From 3b17d56af0413f9495df0af71120df6fa442a3d4 Mon Sep 17 00:00:00 2001
From: Felipe Tome <tomefelipe0@usp.br>
Date: Wed, 13 Aug 2025 14:52:52 -0300
Subject: [PATCH 1/6] Name changes and inclusion of bigger test matrices

---
 src/NextLA.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/NextLA.jl b/src/NextLA.jl
index 36ca7d9..8465a72 100644
--- a/src/NextLA.jl
+++ b/src/NextLA.jl
@@ -77,7 +77,6 @@ function lamch(::Type{T}, cmach) where {T <: Number}
 	end
 end
 
-# Write your package code here.
 include("NextLAMatrix.jl")
 include("lu.jl")
 include("trmm.jl")

From 31940266eb2724a2d16c6d86c0be209e85a78cc9 Mon Sep 17 00:00:00 2001
From: Felipe Tome <tomefelipe0@usp.br>
Date: Thu, 28 Aug 2025 18:25:52 -0300
Subject: [PATCH 2/6] CAQR: new interface and test adjustemnts

---
 src/axpy.jl      |   4 +-
 src/geqr2.jl     |  77 ++++++----
 src/geqrt.jl     |  97 ++++++++-----
 src/gerc.jl      |  67 +++++++--
 src/larf.jl      | 191 ++++++++++++++++++++-----
 src/larfb.jl     | 183 ++++++++++++++++--------
 src/larfg.jl     | 131 +++++++++++++----
 src/larft.jl     | 221 +++++++++++++++++++---------
 src/lauu2.jl     | 149 ++++++++++---------
 src/lauum.jl     | 266 ++++++++++++++++++----------------
 src/pamm.jl      | 218 +++++++++++++++++++---------
 src/parfb.jl     | 183 ++++++++++++++++--------
 src/pemv.jl      | 316 +++++++++++++++++++++++++---------------
 src/rectrxm.jl   | 154 +++++++++++++-------
 src/trmm.jl      | 206 ++++++++++++++++++++++-----
 src/trsm.jl      |  55 +++++++
 src/tsmqr.jl     | 226 +++++++++++++++++++++--------
 src/tsqrt.jl     | 178 +++++++++++++++++------
 src/ttmqr.jl     | 110 +++++++-------
 src/ttqrt.jl     | 161 +++++++++++----------
 src/unmqr.jl     | 364 +++++++++++++++++++++++++++++------------------
 test/geqr2.jl    |  68 ++++-----
 test/geqrt.jl    | 189 ++++++------------------
 test/larf.jl     |  33 +++--
 test/larfb.jl    |  41 +++---
 test/larfg.jl    |  39 ++---
 test/larft.jl    |  16 +--
 test/lauum.jl    |  14 +-
 test/pamm.jl     |  72 +++-------
 test/parfb.jl    |  62 ++++----
 test/pemv.jl     |  70 ++++-----
 test/runtests.jl |   8 +-
 test/tsmqr.jl    |  63 ++++----
 test/tsqrt.jl    |  44 +++---
 test/ttmqr.jl    |  58 ++++----
 test/ttqrt.jl    |  27 ++--
 test/unmqr.jl    | 131 ++++++++---------
 37 files changed, 2802 insertions(+), 1690 deletions(-)

diff --git a/src/axpy.jl b/src/axpy.jl
index e2cce4b..e56e516 100644
--- a/src/axpy.jl
+++ b/src/axpy.jl
@@ -1,4 +1,4 @@
-function axpy!(a, x, y)
+function axpy!(a::T, x::AbstractVector{T}, y::AbstractVector{T}) where {T}
     n = length(x)
 
     if n <= 0
@@ -12,6 +12,4 @@ function axpy!(a, x, y)
     for i in 1:n
         y[i] = y[i] + a*x[i]
     end
-
-    return
 end
\ No newline at end of file
diff --git a/src/geqr2.jl b/src/geqr2.jl
index ce9e94a..3e1ed13 100644
--- a/src/geqr2.jl
+++ b/src/geqr2.jl
@@ -1,43 +1,72 @@
-function geqr2(m,n, A, lda, tau, work)
+"""
+    geqr2!(m, n, A, lda, tau, work)
+
+Compute unblocked QR factorization of an m-by-n matrix A using Householder reflectors.
+The matrix A is overwritten with the Q and R factors.
+
+# Arguments
+- `m`: Number of rows in matrix A
+- `n`: Number of columns in matrix A  
+- `A`: Input matrix (m × n), modified in place to contain Q and R factors
+- `tau`: Output vector of scalar factors (length min(m,n))
+- `work`: Workspace vector (length n)
+
+# Algorithm
+Uses Householder reflectors H(i) to zero out elements below the diagonal.
+For each column i, generates H(i) and applies it to remaining columns.
+"""
+function geqr2!(m::Integer, n::Integer, A::AbstractMatrix{T}, tau::AbstractVector{T}, work::AbstractVector{T}) where {T}
+    # Input validation
     if m < 0
-        throw(ArgumentError("illegal value of m"))
-        return -1
+        throw(ArgumentError("illegal value of m: $m"))
     end
     
     if n < 0
-        throw(ArgumentError("illegal value of n"))
-        return -2
+        throw(ArgumentError("illegal value of n: $n"))
     end
 
-    if lda < max(1,m)
-        throw(ArgumentError("illegal value of lda"))
-        return -4
+    # Quick return for empty matrices
+    if m == 0 || n == 0
+        return
     end
 
-    k = min(m,n)
+    k = min(m, n)  # Number of reflectors to generate
     one = oneunit(eltype(A))
 
-    #av = parent(A)
-    #a1, a2 = parentindices(A)
-    #a1 = a1.start-1
-    #a2 = a2.start-1
-
+    # Main QR factorization loop
     for i in 1:k
-        # generate elementary reflector H(i) to anniliate A(i+1:m, i)
-        A[i,i], tau[i] = larfg(m-i+1, A[i, i], (@view A[min(i+1,m):m, i]), 1, tau[i])
+        # Generate elementary reflector H(i) to annihilate A(i+1:m, i)
+        A[i, i], tau[i] = larfg!(m-i+1, A[i, i], (@view A[min(i+1,m):m, i]), 1, tau[i])
         
         if i < n
-            # apply H(i)^H to A(i:m, i+1:n) from left
-            alpha = A[i,i]
-            A[i,i] = one
+            # Apply H(i)^H to A(i:m, i+1:n) from the left
+            alpha = A[i, i]
+            A[i, i] = one  # Set diagonal element to 1 for reflector application
 
-            #LinearAlgebra.LAPACK.larf!('L', (@view A[i:m, i]), conj(tau[i]), (@view A[i:m, i+1:n]), work)
-            larf('L', m-i+1, n-i, (@view A[i:m, i]), 1, conj(tau[i]), (@view A[i:m, i+1:n]), work)
-            #zlarf('L', m-i+1, n-i, (@view av[i+a1:m+a1, i+a2]), 1, conj(tau[i]), (@view av[i+a1:m+a1, i+1+a2:n+a2]), lda, work)
+            # Apply the reflector to remaining columns
+            larf!('L', m-i+1, n-i, (@view A[i:m, i]), 1, conj(tau[i]), (@view A[i:m, i+1:n]), work)
 
-            A[i,i] = alpha
+            A[i, i] = alpha  # Restore original diagonal element
         end
     end
+end
 
-    return   
+"""
+    geqr2!(A) -> (A, tau)
+    
+Helper function for unblocked QR factorization using Householder reflectors.
+
+# Arguments  
+- `A`: Input matrix (m × n), modified in place
+- `tau`: Output vector of scalar factors (length min(m,n))
+
+# Returns
+- Modified `A` containing Q and R factors
+- `tau`: Vector of scalar factors (length min(m,n))
+"""
+function geqr2!(A::AbstractMatrix{T}, tau::AbstractVector{T}) where {T}
+    m, n = size(A)
+    work = zeros(T, n)
+    
+    geqr2!(m, n, A, tau, work)
 end
diff --git a/src/geqrt.jl b/src/geqrt.jl
index 963390e..8b1e86f 100644
--- a/src/geqrt.jl
+++ b/src/geqrt.jl
@@ -1,58 +1,89 @@
-function geqrt(m,n,ib, A, lda, T, ldt, tau, work)
+"""
+    geqrt!(m, n, ib, A, T_matrix, tau, work)
+
+Compute blocked QR factorization of an m-by-n matrix A using block size ib.
+The matrix A is overwritten with the Q and R factors, and T contains the 
+triangular factor of the block reflector.
+
+# Arguments
+- `m`: Number of rows in matrix A
+- `n`: Number of columns in matrix A
+- `ib`: Block size for the factorization (must be > 0 if m,n > 0)
+- `A`: Input matrix (m × n), modified in place to contain Q and R factors
+- `T`: Output triangular block reflector matrix (ib × n)
+- `tau`: Output vector of scalar factors (length n)
+- `work`: Workspace vector (length ib × n)
+
+# Algorithm
+Uses a block algorithm that processes ib columns at a time.
+For each block, performs unblocked QR and then applies the 
+block reflector to the remaining columns.
+"""
+function geqrt!(m::Integer, n::Integer, ib::Integer, A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}, work::AbstractVector{T}) where {T}
+    # Input validation
     if m < 0
-        throw(ArgumentError("illegal value of m"))
-        return -1
+        throw(ArgumentError("illegal value of m: $m"))
     end
 
     if n < 0
-        throw(ArgumentError("illegal value of n"))
-        return -2
+        throw(ArgumentError("illegal value of n: $n"))
     end
 
     if (ib < 0) || ((ib == 0) && (m > 0) && (n > 0))
-        throw(ArgumentError("illegal value of ib"))
-        return -3
-    end
-
-    if lda < max(1,m) && m > 0
-        throw(ArgumentError("illegal value of lda"))
-        return -5
-    end
-
-    if ldt < max(1,ib) && ib > 0
-        throw(ArgumentError("illegal value of ldt"))
-        return -7
+        throw(ArgumentError("illegal value of ib: $ib"))
     end
 
+    # Quick return for empty matrices or zero block size
     if m == 0 || n == 0 || ib == 0
         return 
     end
 
-    k = min(m,n)
+    k = min(m, n)  # Number of reflectors to generate
 
+    # Process matrix in blocks of size ib
     for i in 1:ib:k
-        sb = min(ib, k-i+1)
+        sb = min(ib, k-i+1)  # Current block size
 
-        av = @view A[i:m, i:i+sb-1]
-        tv = @view T[1:sb,i:i+sb-1]
-        tauv = @view tau[i:i+sb-1]
+        # Extract current block and corresponding parts of T and tau
+    av = @view A[i:m, i:i+sb-1]           # Current block columns
+    tv = @view T_matrix[1:sb, i:i+sb-1]          # Corresponding T block
+        tauv = @view tau[i:i+sb-1]            # Corresponding tau values
 
-        # compute qr for A[i:m, i:i+sb-1]
+    # Perform unblocked QR factorization on current block
+    geqr2!(m-i+1, sb, av, tauv, work)
         
-        geqr2(m-i+1, sb, av, lda, tauv, work)
-        larft('F', 'C', m-i+1, sb, av, lda, tauv, tv, ldt)
+    # Form the triangular factor T for the block reflector
+    larft!('F', 'C', m-i+1, sb, av, tauv, tv)
 
+        # Apply block reflector to remaining columns if any exist
         if n >= i + sb
-            # update by apply H^H to A[i:m, i+sb:n] from left
-
-            #wwork = @view work[1: (n-i-sb+1)*sb]
-            #ww = reshape(wwork, n-i-sb+1, sb)
-            ww = reshape((@view work[1: (n-i-sb+1)*sb]), n-i-sb+1, sb)
+            # Reshape work array for block reflector application
+            ww = reshape((@view work[1:(n-i-sb+1)*sb]), n-i-sb+1, sb)
 
-            larfb('L', 'C', 'F', 'C', m-i+1, n-i-sb+1, sb, av, 
-                m-i+1, tv, sb, (@view A[i:m, i+sb:n]), lda, ww, n-i-sb+1)
+            # Apply H^H to A[i:m, i+sb:n] from the left
+            larfb!('L', 'C', 'F', 'C', m-i+1, n-i-sb+1, sb, av, 
+                m-i+1, tv, (@view A[i:m, i+sb:n]), ww)
         end
     end
+end
+
+"""
+    geqrt!(A, ib) -> (A, T, tau)
+    
+Helper function for blocked QR factorization. Computes A = Q*R where Q is orthogonal and R is upper triangular.
+
+# Arguments
+- `A`: Input matrix (m × n), modified in place to contain R in upper triangle and Q factors below
+- `ib`: Block size for the factorization
+
+# Returns
+- Modified `A` matrix containing Q and R factors  
+- `T`: Upper triangular block reflector matrix (ib × n)
+- `tau`: Vector of scalar factors for elementary reflectors (length n)
+"""
+function geqrt!(ib::Integer, A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}) where {T}
+    m, n = size(A)
+    work = zeros(T, ib * n)
 
-    return
+    geqrt!(m, n, ib, A, T_matrix, tau, work)
 end
diff --git a/src/gerc.jl b/src/gerc.jl
index e5e767b..5df5cba 100644
--- a/src/gerc.jl
+++ b/src/gerc.jl
@@ -1,30 +1,71 @@
+"""
+    gerc!(alpha, x, y, A)
+
+Perform the rank-1 update: A := A + alpha * x * y^H
+
+This function computes a rank-1 update to the matrix A using the outer product
+of vectors x and y, scaled by the scalar alpha. The operation performed is:
+A[i,j] := A[i,j] + alpha * x[i] * conj(y[j])
+
+This is the complex version of the rank-1 update (GER Complex), where the 
+conjugate of y is used in the outer product.
+
+# Arguments
+- `alpha`: Scalar multiplier for the rank-1 update
+- `x`: Vector of length m (first dimension)
+- `y`: Vector of length n (second dimension)  
+- `A`: m×n matrix to be updated in-place
+
+# Algorithm
+The algorithm efficiently computes the outer product by:
+1. For each column j, compute temp = alpha * conj(y[j])
+2. If temp ≠ 0, update column j: A[:,j] += temp * x
+3. Skip columns where y[j] = 0 to avoid unnecessary computation
+
+# Input Validation
+- Matrix A must have non-negative dimensions
+- Vectors x and y must have lengths matching A dimensions
+- All inputs must have compatible numeric types
+
+# Performance Notes
+- Optimized for cache efficiency by operating column-wise
+- Skips zero elements in y to minimize operations
+- In-place operation minimizes memory allocation
+
+# Example
+```julia
+m, n = 4, 3
+A = zeros(ComplexF64, m, n)
+x = complex.([1.0, 2.0, 3.0, 4.0], [0.1, 0.2, 0.3, 0.4])
+y = complex.([1.0, 0.0, 2.0], [0.5, 0.0, 1.0])
+alpha = 2.0 + 1.0im
+gerc!(alpha, x, y, A)  # A updated with rank-1 modification
+```
+"""
 function gerc!(alpha::T, x::AbstractVector{T}, y::AbstractVector{T}, A::AbstractMatrix{T}) where {T}
     m, n = size(A)
 
-    if m < 0
-        return 1
+    # Input validation with descriptive error messages
+    if length(x) != m
+        throw(ArgumentError("Vector x length ($(length(x))) must match matrix row dimension ($m)"))
     end
-
-    if n < 0
-        return 2
+    
+    if length(y) != n
+        throw(ArgumentError("Vector y length ($(length(y))) must match matrix column dimension ($n)"))
     end
 
+    # Early return for degenerate cases
     if m == 0 || n == 0 || alpha == zero(T)
         return
     end
 
-    jy = 1
-
+    # Perform rank-1 update: A := A + alpha * x * y^H
     for j in 1:n
-        if y[jy] != zero(T)
-            temp = alpha * conj(y[jy])
+        if y[j] != zero(T)
+            temp = alpha * conj(y[j])
             for i in 1:m
                 A[i, j] += x[i] * temp
             end
         end
-
-        jy += 1
     end
-
-    return
 end
diff --git a/src/larf.jl b/src/larf.jl
index bb592e9..d26ae3e 100644
--- a/src/larf.jl
+++ b/src/larf.jl
@@ -1,84 +1,127 @@
-function larf(side, m, n, v, incv, tau, c, work)
+"""
+    larf!(side, m, n, v, incv, tau, c, work)
+
+Apply an elementary reflector H to a m-by-n matrix C from either 
+the left or the right.
+
+H = I - tau * v * v^H
+
+where tau is a scalar and v is a vector.
+
+# Arguments
+- `side`: Character specifying the side of application
+  - 'L': apply H from the left (H * C)
+  - 'R': apply H from the right (C * H)
+- `m`: Number of rows in matrix C
+- `n`: Number of columns in matrix C  
+- `v`: Array containing the elementary reflector vector
+- `incv`: Increment for the elements of v (typically 1)
+- `tau`: Scalar factor for the elementary reflector
+- `c`: m-by-n matrix to be modified in-place
+- `work`: Workspace array
+
+# Algorithm
+The elementary reflector H is applied optimally by exploiting the structure
+of the reflector. The algorithm scans for the effective length of the reflector
+vector and the effective dimensions of the matrix to minimize operations.
+
+For side = 'L': Computes C := H * C = (I - tau * v * v^H) * C
+For side = 'R': Computes C := C * H = C * (I - tau * v * v^H)
+
+# Notes
+This is a low-level computational routine used internally by higher-level
+QR factorization algorithms. The workspace array must be properly allocated.
+"""
+function larf!(side::Char, m::Integer, n::Integer, v::AbstractVector{T}, incv::Integer, tau::T, C::AbstractMatrix{T}, work::AbstractVector{T}) where {T}
     lastv = 0
     lastc = 0
-    one = oneunit(eltype(c))
-    zero0 = zero(eltype(c))
+    one0 = oneunit(eltype(C))
+    zero0 = zero(eltype(C))
     
-    if tau != 0
-        # set up variables for scanning v, lastv beigns pointing to end of V
-
+    if tau != zero0
+        # Determine the effective length of the reflector vector v
         if side == 'L' 
             lastv = m
         else
             lastv = n
         end
         
+        # Find the index of the last element to check
         if incv > 0
             i = 1 + (lastv-1)*incv
         else
             i = 1
         end
 
-        while lastv > 0 && v[i] == 0
+        # Scan backwards to find the last non-zero element in v
+        while lastv > 0 && v[i] == zero0
             lastv -= 1
             i -= incv
         end
 
+        # Determine the effective dimensions of C to operate on
         if side == 'L'
-            # scan for last non-zero column in C[1:lastv, :]
-            lastc = ilazlc(lastv, n, c)
+            # Find last non-zero column in C[1:lastv, :]
+            lastc = ilazlc(lastv, n, C)
         else
-            # scan for last non-zero row in C[:, 1:lastv]
-            lastc = ilazlr(m, lastv, c)
+            # Find last non-zero row in C[:, 1:lastv]
+            lastc = ilazlr(m, lastv, C)
         end
     end
 
     if side == 'L'
-        #form H*C
-
+        # Form H*C = (I - tau * v * v^H) * C
         if lastv > 0
             vv = @view v[1:lastv, 1]
-            cv = @view c[1:lastv, 1:lastc]
+            cv = @view C[1:lastv, 1:lastc]
             wv = @view work[1:lastc]
-            # w[1:lastc,1] = c[1:lastv, 1:lastc]^H * v[1:lastv, 1]
-
-            #LinearAlgebra.BLAS.gemv!('C', one, cv, vv, zero0, wv)
-            LinearAlgebra.generic_matvecmul!(wv, 'C', cv, vv, LinearAlgebra.MulAddMul(one, zero0))
-            #LinearAlgebra.generic_matvecmul!((@view work[1:lastc, 1]), 'C', (@view c[1:lastv, 1:lastc]), 
-            #(@view v[1:lastv, 1]), LinearAlgebra.MulAddMul(one, zero0))
-
-            #c[1:lastv,1:lastc] -= tau*v[1:lastv, 1]*w[1:lastc,1]^H
-            #LinearAlgebra.BLAS.gemm!('N', 'C', -tau, vv, wv, one, cv)
-           gerc!(-tau, vv, wv, cv)
+            
+            # Step 1: w = C^H * v (compute v^H * C as w^T)
+            LinearAlgebra.generic_matvecmul!(wv, 'C', cv, vv, LinearAlgebra.MulAddMul(one0, zero0))
 
-            #LinearAlgebra.generic_matmul!((@view c[1:lastv, 1:lastc]), 'N', (@view v[1:lastv, 1]), 
-            #(@view work[1:lastc, 1]), LinearAlgebra.MulAddMul(-tau, one))
+            # Step 2: C := C - tau * v * w^H (rank-1 update)
+            gerc!(-tau, vv, wv, cv)
         end
     else
-        #form C*H
-
+        # Form C*H = C * (I - tau * v * v^H)
         if lastv > 0
-            # w[1:lastc,1] = c[1:lastc, 1:lastv] * v[1:lastv, 1]
-            LinearAlgebra.generic_matvecmul!((@view work[1:lastc, 1]), 'N', (@view c[1:lastc, 1:lastv]),
-            (@view v[1:lastv, 1]), LinearAlgebra.MulAddMul(one, zero0))
+            # Step 1: w = C * v
+            LinearAlgebra.generic_matvecmul!((@view work[1:lastc, 1]), 'N', (@view C[1:lastc, 1:lastv]),
+                (@view v[1:lastv, 1]), LinearAlgebra.MulAddMul(one0, zero0))
 
-            #c[1:lastc,1:lastv] -= tau(?)*w[1:lastc,1]*v[1:lastv, 1]^H
-            
-            #LinearAlgebra.BLAS.ger!(-tau, wv, vv, cv)
-            gerc!(-tau, (@view work[1:lastc, 1]), (@view v[1:lastv, 1]), (@view c[1:lastc, 1:lastv]))
+            # Step 2: C := C - tau * w * v^H (rank-1 update)
+            gerc!(-tau, (@view work[1:lastc, 1]), (@view v[1:lastv, 1]), (@view C[1:lastc, 1:lastv]))
         end
     end
 end
 
-function ilazlc(m,n,a)
+"""
+    ilazlc(m, n, a) -> Int
+
+Find the index of the last non-zero column in an m-by-n matrix.
+Scans from column n backwards to column 1, checking all rows
+in each column for non-zero elements.
+
+# Arguments
+- `m`: Number of rows in matrix a
+- `n`: Number of columns in matrix a
+- `a`: Matrix to scan
+
+# Returns
+- Index of last column containing at least one non-zero element,
+  or 0 if all elements are zero
+"""
+function ilazlc(m, n, a)
     if n == 0
         return n
     end
 
+    # Quick check of the last column boundaries
     if a[1,n] != 0 || a[m,n] != 0 
         return n
     end
 
+    # Scan columns from right to left
     for j in n:-1:1
         for i in 1:m
             if a[i, j] != 0
@@ -86,28 +129,98 @@ function ilazlc(m,n,a)
             end
         end
     end
+    
+    return 0  # All elements are zero
 end
 
-function ilazlr(m,n,a)
-    
+"""
+    ilazlr(m, n, a) -> Int
+
+Find the index of the last non-zero row in an m-by-n matrix.
+Scans all columns to determine the effective row dimension.
+
+# Arguments
+- `m`: Number of rows in matrix a
+- `n`: Number of columns in matrix a  
+- `a`: Matrix to scan
+
+# Returns
+- Index of last row containing at least one non-zero element,
+  or 0 if all elements are zero
+"""
+function ilazlr(m, n, a)
     if m == 0
         return m
     end
 
+    # Quick check of the last row boundaries
     if a[m,1] != 0 || a[m,n] != 0 
         return m
     end
 
     ila = 0
 
+    # For each column, find the last non-zero row
     for j in 1:n
         i = m
         while (a[max(i,1), j] == 0) && (i > 1)
             i -= 1
         end
-
         ila = max(ila, i)
     end
 
     return ila
 end
+
+"""
+    larf!(side, A, tau, C) -> C
+
+Apply an elementary reflector H to a matrix C, where H = I - tau * A * A^H.
+
+This is a high-level interface to the elementary reflector application routine.
+The reflector can be applied from either the left (H*C) or right (C*H) side.
+
+# Arguments
+- `side`: Character specifying application side ('L' for left, 'R' for right)
+- `A`: Vector defining the elementary reflector
+- `tau`: Scalar factor for the reflector  
+- `C`: Matrix to be transformed in-place
+
+# Returns
+- The modified matrix `C`
+
+# Input Validation
+- For side='L': length(A) must equal number of rows in C
+- For side='R': length(A) must equal number of columns in C
+
+# Example
+```julia
+# Apply reflector from left: C := H * C
+larf!('L', v, tau, C)
+
+# Apply reflector from right: C := C * H  
+larf!('R', v, tau, C)
+```
+"""
+function larf!(side::Char,  v::AbstractVector{T}, incv::Integer, tau::T, C::AbstractMatrix{T}) where {T}
+    m, n = size(C)
+    
+    # Input validation with descriptive error messages
+    if side == 'L'
+        if length(v) != m
+            throw(ArgumentError("For side='L', reflector length ($(length(v))) must equal matrix row dimension ($m)"))
+        end
+        work = zeros(T, n)
+    elseif side == 'R'
+        if length(v) != n
+            throw(ArgumentError("For side='R', reflector length ($(length(v))) must equal matrix column dimension ($n)"))
+        end
+        work = zeros(T, m)
+    else
+        throw(ArgumentError("Invalid side parameter: '$side'. Must be 'L' or 'R'"))
+    end
+    
+    
+    # Call the core computational routine
+    larf!(side, m, n, v, incv, tau, C, work)
+end
diff --git a/src/larfb.jl b/src/larfb.jl
index 33f5774..a9940ca 100644
--- a/src/larfb.jl
+++ b/src/larfb.jl
@@ -1,5 +1,5 @@
 """
-    larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, work, ldwork)
+    larfb!(side, trans, direct, storev, m, n, k, v, ldv, t, c, work)
 
 Applies complex block reflector H or its transpose H^H to m-by-n matrix C from either the left or the right
 Implemented with Julia internal functions for matrix multiplication
@@ -29,23 +29,18 @@ Implemented with Julia internal functions for matrix multiplication
     - if storev = 'C' and side = 'R', ldv >= max(1,n)
     - if storev = 'R', ldv >= k
 - 't': dimension (ldv, k), the triangular k-by-k matrix t in representation of the block reflector
-- 'ldt': the leading dimension of array t, ldt >= k
 - 'c': 
     - on entry m-by-n matrix
     - on exit, overwritten by H*C or H^H*C or C*H or C*H^H
-- 'ldc': the leading dimension of c. ldc >= max(1,m)
 - 'work': dimension (ldwork, k)
-- 'ldwork': 
-    - if side = 'L', ldwork >= max(1,n)
-    - if side = 'R', ldwork >= max(1,m)
 """
-function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, work, ldwork)
-    
+function larfb!(side::Char, trans::Char, direct::Char, storev::Char, m::Integer, n::Integer, k::Integer, V::AbstractMatrix{T}, ldv::Integer, T_mat::AbstractMatrix{T}, C::AbstractMatrix{T}, work::AbstractMatrix{T}) where {T}
+
     if m <= 0 || n <= 0
         return
     end
 
-    one = oneunit(eltype(c))
+    one = oneunit(eltype(C))
     plus = LinearAlgebra.MulAddMul(one, one)
     minus = LinearAlgebra.MulAddMul(one*(-1),one)
 
@@ -62,10 +57,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
                                               (C2)
                 """
 
-                c1 = @view c[1:k,:] 
-                c2 = @view c[k+1:m,:]
-                v1 = @view v[1:k,:]
-                v2 = @view v[k+1:m,:]
+                c1 = @view C[1:k,:] 
+                c2 = @view C[k+1:m,:]
+                v1 = @view V[1:k,:]
+                v2 = @view V[k+1:m,:]
             
                 work .= c1'
 
@@ -80,9 +75,9 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
                 # W = W * T^H or W*T
 
                 if trans == 'N' # W = W*T^H
-                    LinearAlgebra.generic_mattrimul!(work, 'U', 'N', adjoint, work, t)
+                    LinearAlgebra.generic_mattrimul!(work, 'U', 'N', adjoint, work, T_mat)
                 else
-                    LinearAlgebra.generic_mattrimul!(work, 'U', 'N', identity, work, t)
+                    LinearAlgebra.generic_mattrimul!(work, 'U', 'N', identity, work, T_mat)
                 end
 
                 if m > k 
@@ -100,10 +95,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
                     """
                     Form C*H or C*H^H where C = (c1 c2)
                     """
-                    c1 = @view c[:, 1:k]
-                    c2 = @view c[:, k+1:n]
-                    v1 = @view v[1:k,:]
-                    v2 = @view v[k+1:n,:]
+                    c1 = @view C[:, 1:k]
+                    c2 = @view C[:, k+1:n]
+                    v1 = @view V[1:k,:]
+                    v2 = @view V[k+1:n,:]
 
                     work .= c1
 
@@ -119,9 +114,9 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
                     #w = w*t or w*t^H
 
                     if trans == 'C' # W = W*T^H
-                        LinearAlgebra.generic_mattrimul!(work, 'U', 'N', adjoint, work, t)
+                        LinearAlgebra.generic_mattrimul!(work, 'U', 'N', adjoint, work, T_mat)
                     else
-                        LinearAlgebra.generic_mattrimul!(work, 'U', 'N', identity, work, t)
+                        LinearAlgebra.generic_mattrimul!(work, 'U', 'N', identity, work, T_mat)
                     end
 
                     if n > k
@@ -146,10 +141,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
                 Form H*C or H^H*C where C = (c1)
                                             (c2)
                 """
-                c1 = @view c[1:m-k,:]
-                c2 = @view c[m-k+1:m,:]
-                v1 = @view v[1:ldv-k,:]
-                v2 = @view v[ldv-k+1:ldv,:]
+                c1 = @view C[1:m-k,:]
+                c2 = @view C[m-k+1:m,:]
+                v1 = @view V[1:ldv-k,:]
+                v2 = @view V[ldv-k+1:ldv,:]
                 
                 work .= c2'
 
@@ -163,10 +158,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
 
                 if trans == 'N'
                     #work = work*(t')
-                    LinearAlgebra.generic_mattrimul!(work, 'L', 'N', adjoint, work, t)
+                    LinearAlgebra.generic_mattrimul!(work, 'L', 'N', adjoint, work, T_mat)
                 else
                     #work = work*t
-                    LinearAlgebra.generic_mattrimul!(work, 'L', 'N', identity, work, t)
+                    LinearAlgebra.generic_mattrimul!(work, 'L', 'N', identity, work, T_mat)
                 end
 
                 #c1 = c1 - v1*w^H
@@ -180,7 +175,7 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
                 #c2 = c2 - w^H
                 for j in 1:k
                     for i in 1:n
-                        c[m-k+j,i] = c[m-k+j,i] - conj(work[i,j])
+                        C[m-k+j,i] = C[m-k+j,i] - conj(work[i,j])
                     end
                 end
             else 
@@ -188,10 +183,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
                     """
                     Form C*H or C*H^H where C = (c1 c2)
                     """
-                    c1 = @view c[:,1:n-k]
-                    c2 = @view c[:,n-k+1:n]
-                    v1 = @view v[1:ldv-k,:]
-                    v2 = @view v[ldv-k+1:ldv,:]
+                    c1 = @view C[:,1:n-k]
+                    c2 = @view C[:,n-k+1:n]
+                    v1 = @view V[1:ldv-k,:]
+                    v2 = @view V[ldv-k+1:ldv,:]
 
                     work .= c2
 
@@ -205,10 +200,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
 
                     if trans == 'C'
                         #work = work*(t')
-                        LinearAlgebra.generic_mattrimul!(work, 'L', 'N', adjoint, work, t)
+                        LinearAlgebra.generic_mattrimul!(work, 'L', 'N', adjoint, work, T_mat)
                     else
                         #work = work*t
-                        LinearAlgebra.generic_mattrimul!(work, 'L', 'N', identity, work, t)
+                        LinearAlgebra.generic_mattrimul!(work, 'L', 'N', identity, work, T_mat)
                     end
                     
                     #c1 = c1 - w*v1^H
@@ -237,10 +232,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
                                                 (c2)
                     """
 
-                    v1 = @view v[:, 1:k]
-                    v2 = @view v[:, k+1:m]
-                    c1 = @view c[1:k, :]
-                    c2 = @view c[k+1:m, :]
+                    v1 = @view V[:, 1:k]
+                    v2 = @view V[:, k+1:m]
+                    c1 = @view C[1:k, :]
+                    c2 = @view C[k+1:m, :]
 
                     work .= c1'
 
@@ -254,10 +249,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
 
                     if trans == 'N'
                         #work = work*(t')
-                        LinearAlgebra.generic_mattrimul!(work, 'U', 'N', adjoint, work, t)
+                        LinearAlgebra.generic_mattrimul!(work, 'U', 'N', adjoint, work, T_mat)
                     else
                         #work = work*t
-                        LinearAlgebra.generic_mattrimul!(work, 'U', 'N', identity, work, t)
+                        LinearAlgebra.generic_mattrimul!(work, 'U', 'N', identity, work, T_mat)
                     end
 
                     #c2 = c2 - v2^h*w^h
@@ -276,10 +271,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
                         Form C*H or C*H^H where C = (c1 c2)
                         """
                         
-                        v1 = @view v[:, 1:k]
-                        v2 = @view v[:, k+1:n]
-                        c1 = @view c[:, 1:k]
-                        c2 = @view c[:, k+1:n]
+                        v1 = @view V[:, 1:k]
+                        v2 = @view V[:, k+1:n]
+                        c1 = @view C[:, 1:k]
+                        c2 = @view C[:, k+1:n]
 
                         work .= c1
 
@@ -293,10 +288,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
 
                         if trans == 'C'
                             #work = work*(t')
-                            LinearAlgebra.generic_mattrimul!(work, 'U', 'N', adjoint, work, t)
+                            LinearAlgebra.generic_mattrimul!(work, 'U', 'N', adjoint, work, T_mat)
                         else
                             #work = work*t
-                            LinearAlgebra.generic_mattrimul!(work, 'U', 'N', identity, work, t)
+                            LinearAlgebra.generic_mattrimul!(work, 'U', 'N', identity, work, T_mat)
                         end
 
                         #c2 = c2 - w*v2
@@ -320,10 +315,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
                     Form H*C or H^H*C where C = (c1)
                                                 (c2)
                     """
-                    v1 = @view v[:, 1:m-k]
-                    v2 = @view v[:, m-k+1:m]
-                    c1 = @view c[1:m-k,:]
-                    c2 = @view c[m-k+1:m,:]
+                    v1 = @view V[:, 1:m-k]
+                    v2 = @view V[:, m-k+1:m]
+                    c1 = @view C[1:m-k,:]
+                    c2 = @view C[m-k+1:m,:]
 
                     work .= c2'
 
@@ -337,10 +332,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
 
                     if trans == 'N'
                         #work = work*(t')
-                        LinearAlgebra.generic_mattrimul!(work, 'L', 'N', adjoint, work, t)
+                        LinearAlgebra.generic_mattrimul!(work, 'L', 'N', adjoint, work, T_mat)
                     else
                         #work = work*t
-                        LinearAlgebra.generic_mattrimul!(work, 'L', 'N', identity, work, t)
+                        LinearAlgebra.generic_mattrimul!(work, 'L', 'N', identity, work, T_mat)
                     end
 
                     #c1 = c1 - v1^h * w^h
@@ -358,10 +353,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
                         """
                         Form C*H or C*H^H where C = (c1 c2)
                         """
-                        v1 = @view v[:, 1:n-k]
-                        v2 = @view v[:, n-k+1:n]
-                        c1 = @view c[:, 1:n-k]
-                        c2 = @view c[:,n-k+1:n]
+                        v1 = @view V[:, 1:n-k]
+                        v2 = @view V[:, n-k+1:n]
+                        c1 = @view C[:, 1:n-k]
+                        c2 = @view C[:,n-k+1:n]
 
                         work .= c2
                         
@@ -375,10 +370,10 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
 
                         if trans == 'C'
                             #work = work*(t')
-                            LinearAlgebra.generic_mattrimul!(work, 'L', 'N', adjoint, work, t)
+                            LinearAlgebra.generic_mattrimul!(work, 'L', 'N', adjoint, work, T_mat)
                         else
                             #work = work*t
-                            LinearAlgebra.generic_mattrimul!(work, 'L', 'N', identity, work, t)
+                            LinearAlgebra.generic_mattrimul!(work, 'L', 'N', identity, work, T_mat)
                         end
 
                         #c1 = c1 - w*v1
@@ -395,6 +390,74 @@ function larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, wor
             end
         end
     end
+end 
+
+"""
+    larfb!(side, trans, direct, storev, V, T, C)
+
+Apply a complex block reflector H or its conjugate transpose H^H to a matrix C.
+
+This is a high-level interface that automatically computes required dimensions
+and allocates workspace for the block reflector application.
+
+The block reflector H has the form:
+H = I - V * T * V^H
 
-    return
-end 
\ No newline at end of file
+where V contains k elementary reflector vectors and T is an upper triangular
+block reflector coefficient matrix.
+
+# Arguments
+- `side`: Character specifying which side to apply the reflector
+  - 'L': Apply H from the left (H*C or H^H*C)  
+  - 'R': Apply H from the right (C*H or C*H^H)
+- `trans`: Character specifying which form to apply
+  - 'N': Apply H (no conjugate transpose)
+  - 'C': Apply H^H (conjugate transpose)
+- `direct`: Character indicating how H is formed from elementary reflectors
+  - 'F': H = H(1) H(2) ... H(k) (Forward - first k reflectors)
+  - 'B': H = H(k) ... H(2) H(1) (Backward - last k reflectors)  
+- `storev`: Character indicating how reflector vectors are stored in V
+  - 'C': Reflector vectors stored columnwise in V
+  - 'R': Reflector vectors stored rowwise in V
+- `V`: Matrix containing the elementary reflector vectors
+- `T`: Upper triangular k×k matrix with block reflector coefficients
+- `C`: m×n matrix to be transformed in-place
+
+# Algorithm
+Applies the block reflector efficiently by:
+1. Computing W = C^H * V (or W = C * V for right multiplication)  
+2. Multiplying by the triangular matrix T: W := W * T (or W * T^H)
+3. Applying rank-k update: C := C - V * W^H (or C - W * V^H)
+
+The algorithm exploits the triangular structure of the reflector matrix
+to minimize computational cost.
+
+# Example
+```julia
+m, n, k = 8, 6, 4
+C = complex.(randn(m, n), randn(m, n))
+V = complex.(randn(m, k), randn(m, k))  # k reflector vectors  
+T = triu(complex.(randn(k, k), randn(k, k)))  # Upper triangular
+larfb!('L', 'N', 'F', 'C', V, T, C)  # Apply H*C
+```
+"""
+function larfb!(side::Char, trans::Char, direct::Char, storev::Char, V::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, C::AbstractMatrix{T}) where {T}
+    # Determine dimensions
+    m, n = size(C)
+    k = size(T, 1)
+    
+    # Set leading dimensions
+    ldv = size(V, 1) 
+    
+    # Allocate workspace
+    if side == 'L'
+        ldwork = n
+        work = similar(C, k, n)
+    else
+        ldwork = m
+        work = similar(C, m, k)
+    end
+    
+    # Call the underlying kernel
+    larfb!(side, trans, direct, storev, m, n, k, V, ldv, T_mat, C, work)
+end
\ No newline at end of file
diff --git a/src/larfg.jl b/src/larfg.jl
index 71ec937..332b958 100644
--- a/src/larfg.jl
+++ b/src/larfg.jl
@@ -1,59 +1,100 @@
-function larfg(n, alpha, x, incx, tau)
+"""
+    larfg!(n, alpha, x, incx, tau)
+
+Generate an elementary reflector H such that:
+H * [alpha; x] = [beta; 0]
+
+where H = I - tau * v * v^H, v = [1; x/scale], and beta = -sign(alpha) * ||[alpha; x]||
+
+This routine generates a complex elementary Householder reflector H of order n,
+such that when applied to the vector [alpha; x], it zeros out the x portion
+and produces [beta; 0] where beta has the same magnitude as the original vector.
+
+# Arguments  
+- `n`: Order of the reflector (length of full vector [alpha; x])
+- `alpha`: Scalar element, the first component of the vector
+- `x`: Vector of length n-1, remaining components of the vector  
+- `incx`: Increment for elements of x (typically 1)
+- `tau`: Output scalar factor for the reflector
+
+# Returns
+- `alpha`: Modified to contain beta (the new first component)
+- `tau`: Scalar factor such that H = I - tau * v * v^H
+
+# Algorithm
+The algorithm handles potential under/overflow carefully by scaling when
+necessary. The reflector is chosen so that the reflection introduces no
+unnecessary amplification of round-off errors.
+
+Special cases:
+- If x = 0 and imag(alpha) = 0, then tau = 0 (no reflection needed)
+- If n ≤ 1, then tau = 0 (trivial case)
+
+# Mathematical Details
+For the elementary reflector H = I - tau * v * v^H where v = [1; u]:
+- tau = (beta - alpha) / beta for real case
+- tau = (beta - Re(alpha))/beta - i*Im(alpha)/beta for complex case  
+- The vector u replaces x on output
+
+# Note
+This is a low-level LAPACK-style computational routine. Input validation
+should be performed by higher-level interfaces.
+"""
+function larfg!(n::Integer, alpha::T, x::AbstractVector{T}, incx::Integer, tau::T) where {T}
     one = oneunit(eltype(alpha))
     zero0 = zero(eltype(alpha)) 
-    type = eltype(alpha)
-    if n <= 0
+    
+    if n <= 1
         tau = zero0
         return alpha, tau
     end
-    
-    if n == 1
-        xnorm = 0
-    else
-        xnorm = norm(x,2)
-    end
+
+    xnorm = norm(x, 2)
 
     alphr = real(alpha)
     alphi = imag(alpha)
 
-    if xnorm == 0 && alphi == 0
+
+    if xnorm == zero0 && alphi == zero0
         tau = zero0
     else
+        # Compute beta = -sign(alphr) * ||[alpha, x]||
         beta = -copysign(sqrt(alphr^2 + alphi^2 + xnorm^2), alphr)
+        # Machine parameters for safe scaling
         safmin = lamch(eltype(alphr), 'S') / lamch(eltype(alphr), 'E')
         rsafmn = one / safmin
         knt = 0
 
         if abs(beta) < safmin
-            #  xnorm, beta may be inaccurate, scale x and recompute
-            
-            while true
+            # xnorm, beta may be inaccurate due to underflow; scale and recompute
+            while abs(beta) < safmin
                 knt += 1
                 x .*= rsafmn
                 beta *= rsafmn
-                alphr *= rsafmn
+                alphr *= rsafmn  
                 alphi *= rsafmn
                 alpha *= rsafmn
-
-                if abs(beta) < safmin
-                    break
-                end
             end                
 
-            #recompute 
-            xnorm = norm(x)
-            if type <: Complex
+            # Recompute with scaled values
+            xnorm = norm(x, 2)
+            if T <: Complex
                 alpha = alphr + im * alphi
             end
             beta = -copysign(sqrt(alphr^2 + alphi^2 + xnorm^2), alphr)
         end
-        if type <: Complex
-            tau = ( beta-alphr ) / beta - im * alphi / beta 
+        
+        # Compute tau based on number type
+        if T <: Complex
+            tau = (beta - alphr) / beta - im * alphi / beta 
         else
-            tau = ( beta - alphr ) / beta
+            tau = (beta - alphr) / beta
         end
-        x .*= (one / (alpha-beta))
         
+        # Scale x to form the reflector vector
+        x .*= (one / (alpha - beta))
+        
+        # Scale beta back if we scaled up
         for j in 1:knt
             beta *= safmin
         end
@@ -63,3 +104,43 @@ function larfg(n, alpha, x, incx, tau)
 
     return alpha, tau
 end
+
+"""
+    larfg!(x) -> (alpha, tau, x_updated)
+
+Generate an elementary reflector H such that H * x produces a vector
+with all but the first element equal to zero.
+
+This is a high-level interface to the elementary reflector generation routine.
+Given a vector x, it computes a Householder reflector H = I - tau * v * v^H
+that zeros out all but the first component.
+
+# Arguments
+- `x`: Vector to be transformed (will be modified in-place)
+
+# Returns  
+- `alpha`: The resulting first component (beta)
+- `tau`: Scalar factor of the elementary reflector
+- `x_updated`: The updated vector with first component as alpha, rest as reflector vector
+
+# Input Validation
+- Vector must have at least one element
+
+# Example
+```julia
+x = complex.([3.0, 4.0, 0.0], [0.0, 0.0, 0.0])
+alpha, tau, x_new = larfg!(x)
+# x_new[1] will be the magnitude -||x||, x_new[2:end] will be the reflector vector
+```
+
+# Mathematical Background
+Creates H such that H * x = [||x||; 0; 0; ...] where the sign is chosen
+to avoid cancellation. The reflector vector is stored in x_new[2:end].
+"""
+function larfg!(alpha::T, x::AbstractVector{T}, incx::Integer, tau::T) where {T}
+    n = length(x)
+    
+    alpha_out, tau_out = larfg!(n, alpha, x, incx, tau)
+
+    return alpha_out, tau_out
+end
\ No newline at end of file
diff --git a/src/larft.jl b/src/larft.jl
index bab0e63..b76f39f 100644
--- a/src/larft.jl
+++ b/src/larft.jl
@@ -1,160 +1,249 @@
-function larft(direct, storev, n, k, v, ldv, tau, t, ldt)
+"""
+    larft!(direct, storev, n, k, v, tau, T_mat)
+
+Form the triangular factor T of a complex block reflector H of order n,
+where H is defined as a product of k elementary reflectors.
+
+The block reflector H has the form:
+H = I - V * T * V^H
+
+where V is n-by-k and contains the elementary reflector vectors, and T is
+the k-by-k upper triangular factor computed by this routine.
+
+# Arguments
+- `direct`: Character indicating the order of the elementary reflectors
+  - 'F': H = H(1) H(2) ... H(k) (Forward)  
+  - 'B': H = H(k) ... H(2) H(1) (Backward)
+- `storev`: Character indicating how the reflector vectors are stored in V
+  - 'C': Columnwise storage (V is n-by-k)
+  - 'R': Rowwise storage (V is k-by-n)  
+- `n`: Order of the reflector H
+- `k`: Number of elementary reflectors (order of T)
+- `v`: Matrix containing the elementary reflector vectors
+- `tau`: Array containing the scalar factors of the elementary reflectors
+- `T_mat`: k-by-k matrix where the triangular factor T will be stored
+
+# Algorithm
+The algorithm computes T such that H = I - V * T * V^H where each column
+(or row) of V represents an elementary reflector. The triangular structure
+ensures efficient application of the block reflector.
+
+For forward direction (direct='F'):
+- T[i,i] = tau[i] (diagonal elements)
+- T[j,i] = -tau[i] * V[i,j] * T[j,j:i-1] for j < i (upper triangular part)
+
+For backward direction (direct='B'):
+- T[i,i] = tau[i] (diagonal elements)  
+- T[j,i] = -tau[i] * V[j,i] * T[i+1:j,i] for j > i (lower triangular part)
+
+# Notes
+This is the core computational routine for forming block reflector coefficients.
+The matrix T enables efficient application of multiple reflectors simultaneously.
+"""
+function larft!(direct::Char, storev::Char, n::Integer, k::Integer, V::AbstractMatrix{T}, tau::AbstractVector{T}, T_mat::AbstractMatrix{T}) where {T}
     if n == 0
         return
     end
 
-    zero0 = zero(eltype(v))
-    one = oneunit(eltype(v))
+    zero0 = zero(eltype(V))
+    one0 = oneunit(eltype(V))
 
     if direct == 'F'
         prevlastv = n
 
         for i in 1:k
-
             prevlastv = max(prevlastv, i)
 
-            if tau[i] == 0
-                # H(i) = i
-
+            if tau[i] == zero0
+                # H(i) = I (no reflection)
                 for j in 1:i
-                    t[j,i] = zero0
+                    T_mat[j,i] = zero0
                 end
-
             else
-                # general case
-
+                # General case: compute T column
                 if storev == 'C'
+                    # Find the last non-zero element in v[:,i]
                     lastv = n
-                    
-                    #for lastv in n:-1:i+1
                     while lastv >= i+1
-
-                        if v[lastv, i] != 0
+                        if V[lastv, i] != zero0
                             break
                         end
-
                         lastv -= 1
                     end
 
+                    # Initialize T[1:i-1,i] with diagonal contribution
                     for j in 1:i-1
-                        t[j,i] = -tau[i] * conj(v[i,j])
+                        T_mat[j,i] = -tau[i] * conj(V[i,j])
                     end
 
+                    # Add contribution from off-diagonal part
                     j = min(lastv, prevlastv)
+                    LinearAlgebra.generic_matvecmul!((@view T_mat[1:i-1, i]), 'C', (@view V[i+1:j, 1:i-1]), 
+                        (@view V[i+1:j,i]), LinearAlgebra.MulAddMul(-tau[i], one0))
 
-                    # t[1:i-1, i] = -tau[i] * v[i:j, 1:i-1]^H * v[i:j, i] 
-                    LinearAlgebra.generic_matvecmul!((@view t[1:i-1, i]), 'C', (@view v[i+1:j, 1:i-1]), 
-                    (@view v[i+1:j,i]), LinearAlgebra.MulAddMul(-tau[i], one))
-
-                else
+                else  # storev == 'R'
+                    # Find the last non-zero element in v[i,:]
                     lastv = n
-                    #for lastv in n:-1:i+1
                     while lastv >= i+1
-                        if v[i, lastv] != 0
+                        if V[i, lastv] != zero0
                             break
                         end
-
                         lastv -= 1
                     end
 
+                    # Initialize T[1:i-1,i] with diagonal contribution
                     for j in 1:i-1
-                        t[j,i] = -tau[i] * v[j,i]
+                        T_mat[j,i] = -tau[i] * V[j,i]
                     end
 
+                    # Add contribution from off-diagonal part
                     j = min(lastv, prevlastv)
-
-                    # t[1:i-1, i] = -tau[i] * v[1:i-1, i:j] * v[i,i:j]^H
                     if i-1 > 0
-                        LinearAlgebra.generic_matmatmul!((@view t[1:i-1, i]), 'N', 'C', (@view v[1:i-1, i:j]), 
-                        (@view v[i:i, i:j]), LinearAlgebra.MulAddMul(-tau[i], one))
+                        LinearAlgebra.generic_matmatmul!((@view T_mat[1:i-1, i]), 'N', 'C', (@view V[1:i-1, i:j]), 
+                            (@view V[i:i, i:j]), LinearAlgebra.MulAddMul(-tau[i], one0))
                     end
                 end
 
-                #t[1:i-1,i] = t[1:i-1, 1:i-1] * t[1:i-1,i]
-                LinearAlgebra.generic_trimatmul!((@view t[1:i-1,i]), 'U', 'N', identity, 
-                (@view t[1:i-1, 1:i-1]), (@view t[1:i-1, i]))
+                # Apply triangular solve: T[1:i-1,i] = T[1:i-1,1:i-1] * T[1:i-1,i]
+                LinearAlgebra.generic_trimatmul!((@view T_mat[1:i-1,i]), 'U', 'N', identity, 
+                    (@view T_mat[1:i-1, 1:i-1]), (@view T_mat[1:i-1, i]))
 
-                t[i,i] = tau[i]
+                # Set diagonal element
+                T_mat[i,i] = tau[i]
 
+                # Update tracking variable
                 if i > 1
                     prevlastv = max(prevlastv, lastv)
                 else
                     prevlastv = lastv
                 end
-
             end
         end
-    else
+    else  # direct == 'B'
         prevlastv = 1
         for i in k:-1:1
-            if tau[i] == 0
-
-                #H(i) = I
-
+            if tau[i] == zero0
+                # H(i) = I (no reflection)
                 for j in i:k
-                    t[j,i] = zero0
+                    T_mat[j,i] = zero0
                 end
-
             else
                 if i < k
                     if storev == 'C'
+                        # Find the first non-zero element in v[:,i]
                         lastv = 1
-
-                        #for lastv in 1:i-1
                         while lastv <= i-1
-                            if v[lastv,i] != 0
+                            if V[lastv,i] != zero0
                                 break
                             end
                             lastv += 1
                         end
 
+                        # Initialize T[i+1:k,i] with diagonal contribution
                         for j in i+1:k
-                            t[j,i] = -tau[i] * conj(v[n-k+i, j])
+                            T_mat[j,i] = -tau[i] * conj(V[n-k+i, j])
                         end
                         
+                        # Add contribution from off-diagonal part
                         j = max(lastv, prevlastv)
-
-
-                        #t[i+1:k, i] = -tau[i] * v[j:n-k+i, i+1:k]^H * v[j:n-k+i, i]
-
-                        LinearAlgebra.generic_matvecmul!((@view t[i+1:k, i]), 'C', (@view v[j:n-k+i, i+1:k]), 
-                        (@view v[j:n-k+i, k]), LinearAlgebra.MulAddMul(-tau[i], one))
-                    else
+                        LinearAlgebra.generic_matvecmul!((@view T_mat[i+1:k, i]), 'C', (@view V[j:n-k+i, i+1:k]), 
+                            (@view V[j:n-k+i, k]), LinearAlgebra.MulAddMul(-tau[i], one0))
+                            
+                    else  # storev == 'R'
+                        # Find the first non-zero element in v[i,:]
                         lastv = 1
-                        #for lastv in 1:i-1
                         while lastv <= i-1
-                            if v[lastv,i] != 0
+                            if V[lastv,i] != zero0
                                 break
                             end
                             lastv += 1
                         end
 
+                        # Initialize T[i+1:k,i] with diagonal contribution
                         for j in i+1:k
-                            t[j,i] = -tau[i] * v[j, n-k+i]
+                            T_mat[j,i] = -tau[i] * V[j, n-k+i]
                         end
                         
+                        # Add contribution from off-diagonal part
                         j = max(lastv, prevlastv)
-
-                        #t[i+1:k, i] = -tau[i] * v[i+1:k , j:n-k+i] * v[i, j:n-k+i]^H
-                        LinearAlgebra.generic_matmatmul!((@view t[i+1:k, i]), 'N', 'C', (@view v[i+1:k, j:n-k+i-1]), 
-                        (@view v[i:i, j:n-k+i-1]), LinearAlgebra.MulAddMul(-tau[i], one))
+                        LinearAlgebra.generic_matmatmul!((@view T_mat[i+1:k, i]), 'N', 'C', (@view V[i+1:k, j:n-k+i-1]), 
+                            (@view V[i:i, j:n-k+i-1]), LinearAlgebra.MulAddMul(-tau[i], one0))
                     end
 
-                    # t[i+1:k, i] = t[i+1:k, i+1:k] * t[i+1:k, i]
-
-                    LinearAlgebra.generic_trimatmul!((@view t[i+1:k, i]), 'L', 'N', identity, 
-                    (@view t[i+1:k, i+1:k]), (@view t[i+1:k, i]))
+                    # Apply triangular solve: T[i+1:k,i] = T[i+1:k,i+1:k] * T[i+1:k,i]
+                    LinearAlgebra.generic_trimatmul!((@view T_mat[i+1:k, i]), 'L', 'N', identity, 
+                        (@view T_mat[i+1:k, i+1:k]), (@view T_mat[i+1:k, i]))
 
+                    # Update tracking variable
                     if i > 1
                         prevlastv = min(prevlastv, lastv)
                     else
                         prevlastv = lastv
                     end
-
                 end
 
-                t[i,i] = tau[i]
+                # Set diagonal element
+                T_mat[i,i] = tau[i]
             end
         end
     end
+end
+
+"""
+    larft(direct, storev, V, tau) -> T
+
+Form the triangular factor T of a complex block reflector H from elementary 
+reflectors and their scalar factors.
+
+This is a high-level interface that automatically determines dimensions and
+allocates the output matrix. The block reflector H has the form:
+H = I - V * T * V^H
+
+# Arguments
+- `direct`: Character indicating the order of elementary reflector products
+  - 'F': H = H(1) H(2) ... H(k) (Forward)
+  - 'B': H = H(k) ... H(2) H(1) (Backward)
+- `storev`: Character indicating how reflector vectors are stored in V
+  - 'C': Columnwise storage (V is n-by-k)
+  - 'R': Rowwise storage (V is k-by-n)  
+- `V`: Matrix containing the elementary reflector vectors
+- `tau`: Vector containing scalar factors of the elementary reflectors
+
+# Returns
+- `T`: k-by-k upper triangular matrix (triangular factor of block reflector)
+
+# Input Validation  
+- Matrix V and vector tau must have compatible dimensions
+- For 'C' storage: size(V,2) must equal length(tau)
+- For 'R' storage: size(V,1) must equal length(tau)
+
+# Example
+```julia
+m, k = 8, 4
+V = complex.(randn(m, k), randn(m, k))  # Elementary reflector vectors
+tau = complex.(randn(k), randn(k))      # Reflector scaling factors
+T = larft('F', 'C', V, tau)             # Compute triangular factor
+```
+
+# Mathematical Background
+The triangular factor T enables efficient block operations. Instead of applying
+k individual reflectors H(1), H(2), ..., H(k), the block reflector 
+H = I - V*T*V^H can be applied in O(n²k) operations rather than O(nk²).
+"""
+function larft!(direct::Char, storev::Char, V::AbstractMatrix{T}, tau::AbstractVector{T}, T_mat::AbstractMatrix{T}) where {T}
+    # Determine dimensions based on storage format
+    if storev == 'C'
+        n, k = size(V)
+        if length(tau) != k
+            throw(ArgumentError("For columnwise storage, length(tau) must equal size(V,2)"))
+        end
+    else # storev == 'R'
+        k, n = size(V)
+        if length(tau) != k
+            throw(ArgumentError("For rowwise storage, length(tau) must equal size(V,1)"))
+        end
+    end
+
+    # Call the core computational routine
+    larft!(direct, storev, n, k, V, tau, T_mat)
 end
\ No newline at end of file
diff --git a/src/lauu2.jl b/src/lauu2.jl
index ca203ab..4c0d517 100644
--- a/src/lauu2.jl
+++ b/src/lauu2.jl
@@ -1,108 +1,107 @@
-export lauu2
+export lauu2!
 
 """
-Purpose:
-=======
-LAUU2 computes the product U * U' or L' * L, where the triangular
-factor U or L is stored in the upper or lower triangular part of
-the array A.
-
-If UPLO = 'U' or 'u', the upper triangle of the result is stored,
-overwriting the factor U in A.
-If UPLO = 'L' or 'l', the lower triangle of the result is stored,
-overwriting the factor L in A.
-
-Arguments:
-==========
-UPLO    (input) CHARACTER*1
-        Specifies whether the triangular factor stored in the array A
-        is upper or lower triangular:
-        = 'U':  Upper triangular
-        = 'L':  Lower triangular
-
-N       (input) INTEGER
-        The order of the triangular factor U or L.  N >= 0.
-
-A       (input/output) COMPLEX{T} array, dimension (LDA,N)
-        On entry, the triangular factor U or L.
-        On exit, if UPLO = 'U', the upper triangle of A is
-        overwritten with the upper triangle of the product U * U';
-        if UPLO = 'L', the lower triangle of A is overwritten with
-        the lower triangle of the product L' * L.
-
-LDA     (input) INTEGER
-        The leading dimension of the array A.  LDA >= max(1,N).
-
-INFO    (output) INTEGER
-        = 0: successful exit
-        < 0: if INFO = -k, the k-th argument had an illegal value
+    lauu2!(uplo, n, A)
+
+Compute the product U * U^H or L^H * L, where the triangular factor U or L
+is stored in the upper or lower triangular part of the array A.
+
+This is an unblocked algorithm for computing the product of a triangular
+matrix with its conjugate transpose. The result overwrites the original
+triangular matrix.
+
+# Arguments
+- `uplo`: Character specifying which triangle is stored
+  - 'U' or 'u': Upper triangular, computes U * U^H
+  - 'L' or 'l': Lower triangular, computes L^H * L
+- `n`: Order of the triangular matrix (≥ 0)
+- `A`: Triangular matrix to be transformed (modified in-place)
+
+# Algorithm
+For upper triangular (uplo='U'):
+- Computes A := U * U^H where U is upper triangular
+- Result is Hermitian, only upper triangle is computed and stored
+
+For lower triangular (uplo='L'):  
+- Computes A := L^H * L where L is lower triangular
+- Result is Hermitian, only lower triangle is computed and stored
+
+The algorithm processes one column (or row) at a time using dot products
+and matrix-vector operations. This is the unblocked version, suitable
+for small matrices or as a building block for blocked algorithms.
+
+# Input Validation
+- uplo must be 'U', 'u', 'L', or 'l'
+- n must be non-negative
+
+# Notes
+This routine is typically used in Cholesky factorization algorithms
+and for computing covariance matrices from triangular factors.
+
+# Example
+```julia
+n = 4
+A = triu(randn(ComplexF64, n, n))  # Upper triangular matrix
+lauu2!('U', n, A, n)  # A := U * U^H
+```
 """
+function lauu2!(uplo::Char, n::Int, A::AbstractMatrix{T}) where T
 
-function lauu2(uplo::Char, n::Int, A::AbstractMatrix{T}, lda::Int) where T
-
-    # Initialize the INFO variable
-    info = 0
-
-    # Validate the input for 'uplo'
-    if !(uplo == 'U' || uplo == 'u' || uplo == 'L' || uplo == 'l')
-        info = -1
-        return info
+    # Input validation with descriptive error messages
+    if !(uplo in ['U', 'u', 'L', 'l'])
+        throw(ArgumentError("uplo must be 'U', 'u', 'L', or 'l', got '$uplo'"))
     end
 
-    # Check for valid matrix order
     if n < 0
-        info = -2
-        return info
-    end
-
-    # Validate the leading dimension of A
-    if lda < max(1, n)
-        info = -4
-        return info
+        throw(ArgumentError("n must be non-negative, got $n"))
     end
 
-    # Quick return if possible (nothing to do if n is zero)
+    # Quick return for degenerate case
     if n == 0
-        return info
+        return
     end
 
-    if uplo == 'U' || uplo == 'u'
-        # Upper triangular case: Compute U * U'
+    if uplo in ['U', 'u']
+        # Upper triangular case: Compute U * U^H
         for i in 1:n
             aii = A[i, i]  # Diagonal element of U
 
             if i < n
-                # Update the diagonal element
-                A[i, i] = aii^2 + dot(A[i, i+1:n], A[i, i+1:n])
+                # Update diagonal: A[i,i] = |U[i,i]|² + sum(|U[i,j]|² for j > i)
+                A[i, i] = real(aii * conj(aii)) + real(dot(A[i, i+1:n], A[i, i+1:n]))
 
-                # Update the remaining upper triangle elements
-                if i > 1
-                    A[1:i-1, i] .= A[1:i-1, i+1:n] * A[i, i+1:n] + A[1:i-1, i] * aii
+                # Update off-diagonal elements in column i
+                for k in 1:i-1
+                    A[k, i] = A[k, i] * aii + dot(A[k, i+1:n], conj(A[i, i+1:n]))
                 end
             else
-                # Scale diagonal entries when i == n
-                A[1:i, i] .= aii * A[1:i, i]
+                # Final column: scale by diagonal element
+                for k in 1:i
+                    A[k, i] = A[k, i] * aii
+                end
             end
         end
     else
-        # Lower triangular case: Compute L' * L
+        # Lower triangular case: Compute L^H * L
         for i in 1:n
             aii = A[i, i]  # Diagonal element of L
 
             if i < n
-                # Update the diagonal element
-                A[i, i] = aii^2 + dot(A[i+1:n, i], A[i+1:n, i])
+                # Update diagonal: A[i,i] = |L[i,i]|² + sum(|L[j,i]|² for j > i)
+                A[i, i] = real(conj(aii) * aii) + real(dot(A[i+1:n, i], A[i+1:n, i]))
 
-                # Update the remaining lower triangle elements
-                if i > 1
-                    A[i, 1:i-1] .= adjoint(A[i+1:n, 1:i-1]) * A[i+1:n, i] + A[i, 1:i-1] * aii
+                # Update off-diagonal elements in row i
+                for k in 1:i-1
+                    A[i, k] = conj(aii) * A[i, k] + dot(A[i+1:n, k], conj(A[i+1:n, i]))
                 end
             else
-                # Scale diagonal entries when i == n
-                A[i, 1:i] .= aii * A[i, 1:i]
+                # Final row: scale by conjugate of diagonal element
+                for k in 1:i
+                    A[i, k] = conj(aii) * A[i, k]
+                end
             end
         end
     end
-
-    return info
 end
+
+lauu2!(uplo::Char, A::AbstractMatrix{T}) where {T} = lauu2!(uplo, size(A, 1), A)
diff --git a/src/lauum.jl b/src/lauum.jl
index 2650cb6..7a9596c 100644
--- a/src/lauum.jl
+++ b/src/lauum.jl
@@ -1,173 +1,191 @@
-export lauum
+export lauum!
 
- # Import the unblocked version of the matrix multiplication function (lauu2) to use later in this computation.
+# Import the unblocked version (lauu2!) for use in blocked algorithm
 
 """
-    lauum(uplo::Char, n::Int, a::AbstractMatrix{T}, lda::Int, block_size::Int)
-
-This function computes the product of a triangular matrix with its conjugate transpose. Specifically, it computes:
-
-- `U * U'` if the triangular matrix `U` is stored in the upper part of matrix `a`.
-- `L' * L` if the triangular matrix `L` is stored in the lower part of matrix `a`.
-
-Where:
-- `U'` represents the conjugate transpose of the upper triangular matrix `U`.
-- `L'` represents the conjugate transpose of the lower triangular matrix `L`.
-
-### Parameters:
-- `uplo`: A character (`'U'` or `'L'`) that specifies whether the triangular matrix is stored in the upper or lower part of `a`.
-  - `'U'`: Indicates that the upper triangle contains the triangular matrix `U`. The result of `U * U'` will overwrite the corresponding entries in the upper triangle of matrix `a`.
-  - `'L'`: Indicates that the lower triangle contains the triangular matrix `L`. The result of `L' * L` will overwrite the corresponding entries in the lower triangle of matrix `a`.
-  
-- `n`: The order (size) of the triangular matrix. This must be a non-negative integer, representing the dimensions of the square matrix `a`, which is `n x n`.
-
-- `a`: The matrix where the triangular factor `U` or `L` is stored, and where the result will be stored after computation. This matrix is modified in place, meaning its contents will change as a result of the computation.
-
-- `lda`: The leading dimension of the array `a`. This should be at least `max(1, n)`. This parameter is important for accessing the elements of the matrix in memory correctly, particularly in scenarios where matrices may be stored in a non-contiguous fashion for performance reasons.
-
-- `block_size`: This specifies the block size for the blocked algorithm. A blocked algorithm processes the matrix in submatrices (or blocks), improving performance on large matrices by making better use of CPU cache and reducing memory bandwidth demands.
-
-### Returns:
-- `info`: An integer indicating the success or failure of the function execution:
-  - `0`: Indicates successful execution.
-  - A negative integer indicates that an invalid argument was provided:
-    - `-1`: Invalid value for `uplo`.
-    - `-2`: Invalid value for `n`.
-    - `-4`: Invalid value for `lda`.
+    lauum!(uplo, n, A, ib)
+
+Compute the product U * U^H or L^H * L using a blocked algorithm, where
+the triangular factor U or L is stored in the upper or lower triangular
+part of the matrix a.
+
+This is a blocked version of the triangular matrix multiplication that
+achieves better performance on large matrices by exploiting cache locality
+and enabling vectorization.
+
+# Arguments
+- `uplo`: Character specifying which triangle contains the factor
+  - 'U': Upper triangular, computes U * U^H  
+  - 'L': Lower triangular, computes L^H * L
+- `n`: Order of the triangular matrix (≥ 0)
+- `A`: Matrix containing triangular factor (modified in-place)
+- `ib`: Block size for blocked algorithm (typically 32-64)
+
+# Algorithm
+The blocked algorithm partitions the matrix into blocks of size ib
+and processes them using high-performance BLAS operations:
+- Level-3 BLAS (matrix-matrix operations) for most computations
+- Level-2 BLAS (matrix-vector operations) for smaller blocks
+- Automatic fallback to unblocked algorithm for small matrices
+
+For upper triangular (uplo='U'): A := U * U^H
+For lower triangular (uplo='L'): A := L^H * L
+
+# Performance Notes
+- Block size should be chosen based on cache size (typically 32-64)
+- Uses parallel processing for independent block operations
+- Optimal performance achieved when n >> block_size
+
+# Input Validation
+- uplo must be 'U' or 'L'
+- n must be non-negative  
+- block_size is automatically clamped to valid range
+
+# Example
+```julia
+n = 100
+block_size = 32
+A = triu(randn(ComplexF64, n, n))
+lauum!('U', n, A, n, block_size)  # A := U * U^H
+```
 """
-function lauum(uplo::Char, n::Int, a::AbstractMatrix{T}, lda::Int, block_size::Int) where T
-    # Validate the 'uplo' parameter to ensure it is either 'U' or 'L'
+function lauum!(uplo::Char, n::Integer, A::AbstractMatrix{T}, ib::Integer) where {T}
+    # Input validation with descriptive error messages
     if !(uplo in ['U', 'L'])
-        return -1  # Return an error code for invalid 'uplo'
+        throw(ArgumentError("uplo must be 'U' or 'L', got '$uplo'"))
     end
 
-    # Check if 'n' is non-negative
     if n < 0
-        return -2  # Return an error code for invalid 'n'
-    end
-
-    # Validate 'lda' to ensure it meets the minimum requirement
-    if lda < max(1, n)
-        return -4  # Return an error code for invalid 'lda'
+        throw(ArgumentError("n must be non-negative, got $n"))
     end
 
-    # If 'n' is zero, no computation is needed, so return success
+    # Quick return for degenerate case
     if n == 0
-        return 0  # Early exit with success code
+        return
     end
 
-    # Adjust block_size to ensure it does not exceed the size of the matrix
-    block_size = min(block_size, n)
+    # Adjust block_size to reasonable bounds
+    ib = max(1, min(ib, n))
 
-    # If block_size is less than or equal to 1, or greater than or equal to n, use the unblocked version
-    if block_size <= 1 || block_size >= n
-        lauu2(uplo, n, a, lda)  # Call the unblocked computation
-        return 0  # Return success code
+    # Use unblocked algorithm for small matrices or invalid block size
+    if ib <= 1 || ib >= n
+        lauu2!(uplo, n, A)
+        return
     end
 
-    # Call the appropriate computation based on whether the upper or lower triangular matrix is specified
+    # Call appropriate blocked computation
     if uplo == 'U'
-        compute_upper(n, block_size, a, lda)  # Compute for upper triangular matrix
+        compute_upper!(n, ib, A)
     else
-        compute_lower(n, block_size, a, lda)  # Compute for lower triangular matrix
+        compute_lower!(n, ib, A)
     end
-
-    return 0  # Return success code after completing the computation
 end
 
 """
-    compute_upper(n, block_size, a, lda)
+    compute_upper!(n, block_size, a)
+
+Blocked computation of U * U^H for upper triangular matrix U.
 
-This function performs the blocked computation of U * U' for an upper triangular matrix `U`.
-The computation is carried out in parallel to improve performance on large matrices.
+Processes the matrix in blocks to achieve better cache performance and
+enable vectorized operations. Uses Level-3 BLAS operations where possible.
 
-### Parameters:
-- `n`: The size of the matrix, which is also the order of the triangular matrix `U`.
-- `block_size`: The size of the blocks to be processed in each iteration. This allows for better cache usage and performance.
-- `a`: The matrix that contains the upper triangular part `U`, and where the results will be stored.
-- `lda`: The leading dimension of the matrix `a`.
+# Arguments
+- `n`: Order of the matrix
+- `block_size`: Size of blocks for processing  
+- `a`: Matrix containing U (modified in-place)
 
-This function modifies the matrix `a` in place.
+# Algorithm
+For each diagonal block:
+1. Update off-diagonal blocks using TRMM operations
+2. Compute diagonal block product U_block * U_block^H  
+3. Add contribution from remaining blocks using SYRK operations
 """
-function compute_upper(n::Int, block_size::Int, a::AbstractMatrix{T}, lda::Int) where T
-    Threads.@threads for i in 1:block_size:n  # Parallelize the outer loop over blocks
-        ib = min(block_size, n - i + 1)  # Determine the actual block size for this iteration
-
-        # Perform a triangular matrix multiplication (equivalent to DTRMM)
-        # Update the upper triangle of the matrix using the current block
-        view(a, 1:i-1, i:i+ib-1) .= view(a, 1:i-1, i:i+ib-1) * view(a, i:i+ib-1, i:i+ib-1)'
-
-        # Compute the product U * U' for the current block using the lauu2 function
-        # lauu2('U', ib, view(a, i:i+ib-1, i:i+ib-1), lda)
-        U = view(a, i:i+ib-1, i:i+ib-1)  # Extract the block U
-        U_Ut = U * adjoint(U)  # Use adjoint for complex matrices
-        # Only update the upper triangular part of the matrix
-        for j in 1:ib
-            for k in j:ib
-                a[i + j - 1, i + k - 1] = U_Ut[j, k]
-            end
+function compute_upper!(n::Integer, ib::Integer, A::AbstractMatrix{T}) where T
+    Threads.@threads for i in 1:ib:n
+        ib = min(ib, n - i + 1)  # Actual block size
+
+        # Update off-diagonal blocks: A[1:i-1, i:i+ib-1] = A[1:i-1, i:i+ib-1] * A[i:i+ib-1, i:i+ib-1]^H
+        if i > 1
+            view(A, 1:i-1, i:i+ib-1) .= view(A, 1:i-1, i:i+ib-1) * view(A, i:i+ib-1, i:i+ib-1)'
         end
+
+        # Compute diagonal block: U_block * U_block^H
+        U_block = view(A, i:i+ib-1, i:i+ib-1)
+        U_Ut = U_block * U_block'
         
+        # Store only upper triangular part
+        for j in 1:ib, k in j:ib
+            A[i + j - 1, i + k - 1] = U_Ut[j, k]
+        end
         
-        # Check if there are additional blocks to process
+        # Add contribution from trailing blocks if they exist
         if i + ib <= n
-            # Perform matrix-matrix multiplication (equivalent to DGEMM)
-            view(a, 1:i-1, i:i+ib-1) .+= view(a, 1:i-1, i+ib:n) * view(a, i:i+ib-1, i+ib:n)'
-
-            # Perform symmetric rank-k update (equivalent to DSYRK)
-            product_matrix = view(a, i:i+ib-1, i+ib:n) * view(a, i:i+ib-1, i+ib:n)'
-            for j in 1:ib  # Iterate over the rows of the current block
-                for k in j:ib  # Iterate over the columns of the current block
-                    @inbounds a[i + j - 1, i + k - 1] += product_matrix[j, k]  # Update the result matrix
-                end
+            # Update off-diagonal: add A[1:i-1, i+ib:n] * A[i:i+ib-1, i+ib:n]^H
+            if i > 1
+                view(A, 1:i-1, i:i+ib-1) .+= view(A, 1:i-1, i+ib:n) * view(A, i:i+ib-1, i+ib:n)'
+            end
+
+            # Rank-k update: add A[i:i+ib-1, i+ib:n] * A[i:i+ib-1, i+ib:n]^H to diagonal block
+            trailing_block = view(A, i:i+ib-1, i+ib:n)
+            syrk_result = trailing_block * trailing_block'
+            
+            for j in 1:ib, k in j:ib
+                A[i + j - 1, i + k - 1] += syrk_result[j, k]
             end
         end
     end
 end
 
 """
-    compute_lower(n, block_size, a, lda)
+    compute_lower!(n, ib, A) 
 
-This function performs the blocked computation of L' * L for a lower triangular matrix `L`.
-The computation is carried out in parallel to improve performance on large matrices.
+Blocked computation of L^H * L for lower triangular matrix L.
 
-### Parameters:
-- `n`: The size of the matrix, which is also the order of the triangular matrix `L`.
-- `block_size`: The size of the blocks to be processed in each iteration. This allows for better cache usage and performance.
-- `a`: The matrix that contains the lower triangular part `L`, and where the results will be stored.
-- `lda`: The leading dimension of the matrix `a`.
+Processes the matrix in blocks to achieve better cache performance and
+enable vectorized operations. Uses Level-3 BLAS operations where possible.
 
-This function modifies the matrix `a` in place.
+# Arguments  
+- `n`: Order of the matrix
+- `block_size`: Size of blocks for processing
+- `A`: Matrix containing L (modified in-place)  
+
+# Algorithm
+For each diagonal block:
+1. Update off-diagonal blocks using TRMM operations
+2. Compute diagonal block product L_block^H * L_block
+3. Add contribution from remaining blocks using SYRK operations
 """
-function compute_lower(n::Int, block_size::Int, a::AbstractMatrix{T}, lda::Int) where T
-    Threads.@threads for i in 1:block_size:n  # Parallelize the outer loop over blocks
-        ib = min(block_size, n - i + 1)  # Determine the actual block size for this iteration
+function compute_lower!(n::Integer, ib::Integer, A::AbstractMatrix{T}) where T
+    Threads.@threads for i in 1:ib:n
+        ib = min(ib, n - i + 1)  # Actual block size
 
-        # Perform a triangular matrix multiplication for lower triangular matrix
-        view(a, i:i+ib-1, 1:i-1) .= adjoint(view(a, i:i+ib-1, i:i+ib-1)) * view(a, i:i+ib-1, 1:i-1)
+        # Update off-diagonal blocks: A[i:i+ib-1, 1:i-1] = A[i:i+ib-1, i:i+ib-1]^H * A[i:i+ib-1, 1:i-1]
+        if i > 1
+            view(A, i:i+ib-1, 1:i-1) .= view(A, i:i+ib-1, i:i+ib-1)' * view(A, i:i+ib-1, 1:i-1)
+        end
 
-        # Compute the product L' * L for the current block using adjoint for complex matrices
-        L = view(a, i:i+ib-1, i:i+ib-1)  # Extract the block L
-        Lt_L = adjoint(L) * L
+        # Compute diagonal block: L_block^H * L_block  
+        L_block = view(A, i:i+ib-1, i:i+ib-1)
+        Lt_L = L_block' * L_block
 
-        # Store the result back in the lower triangular part only
-        for j in 1:ib
-            for k in 1:j
-                @inbounds a[i + j - 1, i + k - 1] = Lt_L[j, k]
-            end
+        # Store only lower triangular part
+        for j in 1:ib, k in 1:j
+            A[i + j - 1, i + k - 1] = Lt_L[j, k]
         end
 
-        # Check if there are additional blocks to process
+        # Add contribution from trailing blocks if they exist
         if i + ib <= n
-            # Perform matrix-matrix multiplication (equivalent to DGEMM) with proper adjoint
-            view(a, i:i+ib-1, 1:i-1) .+= adjoint(view(a, i+ib:n, i:i+ib-1)) * view(a, i+ib:n, 1:i-1)
-
-            # Perform symmetric rank-k update
-            product_matrix = adjoint(view(a, i+ib:n, i:i+ib-1)) * view(a, i+ib:n, i:i+ib-1)
-            for j in 1:ib
-                for k in 1:j
-                    a[i + j - 1, i + k - 1] += product_matrix[j, k]
-                end
+            # Update off-diagonal: add A[i+ib:n, i:i+ib-1]^H * A[i+ib:n, 1:i-1]
+            if i > 1
+                view(A, i:i+ib-1, 1:i-1) .+= view(A, i+ib:n, i:i+ib-1)' * view(A, i+ib:n, 1:i-1)
+            end
+
+            # Rank-k update: add A[i+ib:n, i:i+ib-1]^H * A[i+ib:n, i:i+ib-1] to diagonal block
+            trailing_block = view(A, i+ib:n, i:i+ib-1)
+            syrk_result = trailing_block' * trailing_block
+            
+            for j in 1:ib, k in 1:j
+                A[i + j - 1, i + k - 1] += syrk_result[j, k]
             end
         end
     end
diff --git a/src/pamm.jl b/src/pamm.jl
index 1993cc4..51cbf84 100644
--- a/src/pamm.jl
+++ b/src/pamm.jl
@@ -1,59 +1,105 @@
-function pamm(op, side, storev, direct, m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw)
-    # Input validation
-    op ∉ ('W', 'A') && throw(ArgumentError("illegal value of op"))
-    side ∉ ('L', 'R') && throw(ArgumentError("illegal value of side"))
-    storev ∉ ('C', 'R') && throw(ArgumentError("illegal value of storev"))
-    direct ∉ ('F', 'B') && throw(ArgumentError("illegal value of direct"))
+"""
+    pamm!(op, side, storev, direct, m, n, k, l, A1, A2, V, W)
+
+Parallel matrix multiplication kernel for block reflector applications.
+
+This routine performs specialized matrix operations needed in blocked orthogonal
+factorizations. It computes either:
+- W = A1 + op(V) * A2 (when op='W')
+- A2 = A2 + op(V) * W (when op='A')
+
+where op(V) is V, V^H, V^T depending on the storage and direction parameters.
+
+# Arguments
+- `op`: Operation type
+  - 'W': Compute W = A1 + op(V) * A2 or W = A1 + A2 * op(V)
+  - 'A': Update A2 = A2 + op(V) * W or A2 = A2 + W * op(V)
+- `side`: Which side V is applied
+  - 'L': Left multiplication (op(V) * A2)
+  - 'R': Right multiplication (A2 * op(V))
+- `storev`: How reflector vectors are stored in V
+  - 'C': Columnwise storage
+  - 'R': Rowwise storage
+- `direct`: Direction of reflector product
+  - 'F': Forward (H = H₁H₂...Hₖ)
+  - 'B': Backward (H = HₖHₖ₋₁...H₁)
+- `m`, `n`: Dimensions of matrices A1, A2, W
+- `k`: Number of elementary reflectors
+- `l`: Number of columns/rows in triangular part of V
+- `A1`: First input matrix
+- `A2`: Second input/output matrix
+- `V`: Matrix containing reflector vectors
+- `W`: Workspace/output matrix
+
+# Algorithm
+The routine handles all combinations of storage formats and application sides
+efficiently by dispatching to specialized kernels. Each kernel exploits the
+structure of the reflector matrix V (triangular + rectangular parts) to
+minimize computational cost.
+
+# Input Validation
+All parameters are validated for correctness. Dimensions must be non-negative
+and leading dimensions must meet minimum requirements.
+
+# Notes
+This is a low-level computational kernel used internally by blocked QR
+and LQ factorization routines. It is optimized for performance with
+specific memory access patterns.
+"""
+function pamm!(op::Char, side::Char, storev::Char, direct::Char, m::Integer, n::Integer, k::Integer, l::Integer, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}, W::AbstractMatrix{T}) where {T}
+    # Input validation with descriptive error messages
+    if op ∉ ('W', 'A')
+        throw(ArgumentError("op must be 'W' or 'A', got '$op'"))
+    end
+    if side ∉ ('L', 'R')
+        throw(ArgumentError("side must be 'L' or 'R', got '$side'"))
+    end
+    if storev ∉ ('C', 'R')
+        throw(ArgumentError("storev must be 'C' or 'R', got '$storev'"))
+    end
+    if direct ∉ ('F', 'B')
+        throw(ArgumentError("direct must be 'F' or 'B', got '$direct'"))
+    end
     
     # Dimension validation
-    m < 0 && throw(ArgumentError("illegal value of m"))
-    n < 0 && throw(ArgumentError("illegal value of n"))
-    k < 0 && throw(ArgumentError("illegal value of k"))
-    l < 0 && throw(ArgumentError("illegal value of l"))
+    if m < 0
+        throw(ArgumentError("m must be non-negative, got $m"))
+    end
+    if n < 0
+        throw(ArgumentError("n must be non-negative, got $n"))
+    end
+    if k < 0
+        throw(ArgumentError("k must be non-negative, got $k"))
+    end
+    if l < 0
+        throw(ArgumentError("l must be non-negative, got $l"))
+    end
     
-    # Leading dimension validation
-    lda1 < 0 && throw(ArgumentError("illegal value of lda1"))
-    lda2 < 0 && throw(ArgumentError("illegal value of lda2"))
-    ldv < 0 && throw(ArgumentError("illegal value of ldv"))
-    ldw < 0 && throw(ArgumentError("illegal value of ldw"))
     
     # Quick return for degenerate cases
-    (m == 0 || n == 0 || k == 0) && return nothing
-
-    if direct == 'F'
-        forward = true
-    else
-        forward = false
-    end
-
-    if storev == 'C'
-        colmajor = true
-    else
-        colmajor = false
+    if m == 0 || n == 0 || k == 0
+        return
     end
 
-    if side == 'L'
-        left = true
-    else
-        left = false
-    end
+    # Convert parameters to boolean flags for efficiency
+    forward = (direct == 'F')
+    colmajor = (storev == 'C')
+    left = (side == 'L')
     
-
+    # Dispatch to appropriate kernel
     if op == 'W'
-        pamm_w(left, colmajor, forward, m,n,k,l, A1, A2, V, W)
+        pamm_w!(left, colmajor, forward, m, n, k, l, A1, A2, V, W)
     else
-        pamm_a(left, colmajor, forward, m,n,k,l, A2, V, W)
+        pamm_a!(left, colmajor, forward, m, n, k, l, A2, V, W)
     end
-    
-    return 
 end
 
-function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W)
+function pamm_w!(left::Bool, colmajor::Bool, forward::Bool, m::Integer, n::Integer, k::Integer, l::Integer, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}, W::AbstractMatrix{T}) where {T}
     # W = A1 + op(V) * A2 or W = A1 + A2 * op(V)
-    one0 = oneunit(eltype(A1))
-    zero0 = zero(eltype(A1))
-    plus = LinearAlgebra.MulAddMul(one0, one0)
-    eqa = LinearAlgebra.MulAddMul(one0, zero0)
+    one = oneunit(eltype(A1))
+    Tzero = zero(eltype(A1))
+    plus = LinearAlgebra.MulAddMul(one, one)
+    eqa = LinearAlgebra.MulAddMul(one, Tzero)
 
     if colmajor && forward && left # colmajor, forward, left
         mp = min(m-l+1, m)
@@ -70,7 +116,7 @@ function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W)
         LinearAlgebra.generic_matmatmul!((@view W[kp:kp+k-l-1, 1:n]), 'C', 'N', (@view V[1:m, kp:kp+k-l-1]), (@view A2[1:m, 1:n]), eqa)
 
         for i in 1:k
-            LinearAlgebra.axpy!(one0, (@view A1[i, 1:n]), (@view W[i, 1:n]))
+            LinearAlgebra.axpy!(one, (@view A1[i, 1:n]), (@view W[i, 1:n]))
         end
 
     end
@@ -89,7 +135,7 @@ function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W)
         LinearAlgebra.generic_matmatmul!((@view W[kp:kp+k-l-1, 1:n]), 'N', 'N', (@view V[kp:kp+k-l-1, 1:m]), (@view A2[1:m, 1:n]), eqa)
 
         for i in 1:k
-            LinearAlgebra.axpy!(one0, (@view A1[i, 1:n]), (@view W[i, 1:n]))
+            LinearAlgebra.axpy!(one, (@view A1[i, 1:n]), (@view W[i, 1:n]))
         end
     end
     if colmajor && forward && !left # colmajor, forward, right
@@ -107,7 +153,7 @@ function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W)
         LinearAlgebra.generic_matmatmul!((@view W[1:m, kp:kp+k-l-1]), 'N', 'N', (@view A2[1:m, 1:n]), (@view V[1:n, kp:kp+k-l-1]), eqa)
 
         for j in 1:k
-            LinearAlgebra.axpy!(one0, (@view A1[1:m, j]), (@view W[1:m, j]))
+            LinearAlgebra.axpy!(one, (@view A1[1:m, j]), (@view W[1:m, j]))
         end
     end
     if !colmajor && forward && !left # rowmajor, forward, right
@@ -125,7 +171,7 @@ function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W)
         LinearAlgebra.generic_matmatmul!((@view W[1:m, kp:kp+k-l-1]), 'N', 'C', (@view A2[1:m, 1:n]), (@view V[kp:kp+k-l-1, 1:n]), eqa)
 
         for j in 1:k
-            LinearAlgebra.axpy!(one0, (@view A1[1:m, j]), (@view W[1:m, j]))
+            LinearAlgebra.axpy!(one, (@view A1[1:m, j]), (@view W[1:m, j]))
         end
     end
     if colmajor && !forward && left # colmajor, backward, left
@@ -143,7 +189,7 @@ function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W)
         LinearAlgebra.generic_matmatmul!((@view W[1:k-l, 1:n]), 'C', 'N', (@view V[1:m, 1:k-l]), (@view A2[1:m, 1:n]), eqa)
 
         for i in 1:k
-            LinearAlgebra.axpy!(one0, (@view A1[i, 1:n]), (@view W[i, 1:n]))
+            LinearAlgebra.axpy!(one, (@view A1[i, 1:n]), (@view W[i, 1:n]))
         end
     end
 
@@ -162,7 +208,7 @@ function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W)
         LinearAlgebra.generic_matmatmul!((@view W[1:k-l, 1:n]), 'N', 'N', (@view V[1:k-l, 1:m]), (@view A2[1:m, 1:n]), eqa)
 
         for i in 1:k
-            LinearAlgebra.axpy!(one0, (@view A1[i, 1:n]), (@view W[i, 1:n]))
+            LinearAlgebra.axpy!(one, (@view A1[i, 1:n]), (@view W[i, 1:n]))
         end      
     end
     if !colmajor && !forward && !left # rowmajor, backward, right
@@ -180,7 +226,7 @@ function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W)
         LinearAlgebra.generic_matmatmul!((@view W[1:m, 1:k-l]), 'N', 'C', (@view A2[1:m, 1:n]), (@view V[1:k-l, 1:n]), eqa)
 
         for j in 1:k
-            LinearAlgebra.axpy!(one0, (@view A1[1:m, j]), (@view W[1:m, j]))
+            LinearAlgebra.axpy!(one, (@view A1[1:m, j]), (@view W[1:m, j]))
         end
     end
     if colmajor && !forward && !left # colmajor, backward, right
@@ -198,18 +244,16 @@ function pamm_w(left, colmajor, forward, m, n, k, l, A1, A2, V, W)
         LinearAlgebra.generic_matmatmul!((@view W[1:m, 1:k-l]), 'N', 'N', (@view A2[1:m, 1:n]), (@view V[1:n, 1:k-l]), eqa)
 
         for j in 1:k
-            LinearAlgebra.axpy!(one0, (@view A1[1:m, j]), (@view W[1:m, j]))
+            LinearAlgebra.axpy!(one, (@view A1[1:m, j]), (@view W[1:m, j]))
         end
     end
-   
-    return
 end
 
 
-function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W)
+function pamm_a!(left::Bool, colmajor::Bool, forward::Bool, m::Integer, n::Integer, k::Integer, l::Integer, A2::AbstractMatrix{T}, V::AbstractMatrix{T}, W::AbstractMatrix{T}) where {T}
         # A2 = A2 + op(V) * W or A2 = A2 + W * op(V)
-        one0 = oneunit(eltype(A2))
-        minus = LinearAlgebra.MulAddMul(one0*(-1),one0)
+        one = oneunit(eltype(A2))
+        minus = LinearAlgebra.MulAddMul(one*(-1), one)
 
         if colmajor && forward && left # colmajor, forward, left
             mp = min( m-l+1, m )
@@ -222,7 +266,7 @@ function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W)
             LinearAlgebra.generic_trimatmul!((@view W[1:l, 1:n]), 'U', 'N', identity, (@view V[mp:mp+l-1, 1:l]), (@view W[1:l, 1:n]))
 
             for i in 1:l 
-                LinearAlgebra.axpy!(-one0, (@view W[i, 1:n]), (@view A2[m-l+i, 1:n]))
+                LinearAlgebra.axpy!(-one, (@view W[i, 1:n]), (@view A2[m-l+i, 1:n]))
             end
         end
 
@@ -237,7 +281,7 @@ function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W)
             LinearAlgebra.generic_trimatmul!((@view W[1:l, 1:n]), 'L', 'N', adjoint, (@view V[1:l, mp:mp+l-1]), (@view W[1:l, 1:n]))
 
             for i in 1:l 
-                LinearAlgebra.axpy!((-one0), (@view W[i, 1:n]), (@view A2[m-l+i, 1:n])) 
+                LinearAlgebra.axpy!((-one), (@view W[i, 1:n]), (@view A2[m-l+i, 1:n])) 
             end
         end
 
@@ -253,7 +297,7 @@ function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W)
             LinearAlgebra.generic_mattrimul!((@view W[1:m, 1:l]), 'U', 'N', adjoint, (@view W[1:m, 1:l]), (@view V[np:np+l-1, 1:l]))
 
             for j in 1:l 
-                LinearAlgebra.axpy!(-one0, (@view W[1:m, j]), (@view A2[1:m, n-l+j]))
+                LinearAlgebra.axpy!(-one, (@view W[1:m, j]), (@view A2[1:m, n-l+j]))
             end
         end
 
@@ -268,7 +312,7 @@ function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W)
             LinearAlgebra.generic_mattrimul!((@view W[1:m, 1:l]), 'L', 'N', identity, (@view W[1:m, 1:l]), (@view V[1:l, np:np+l-1]))
 
             for j in 1:l 
-                LinearAlgebra.axpy!(-one0, (@view W[1:m, j]), (@view A2[1:m, n-l+j]))
+                LinearAlgebra.axpy!(-one, (@view W[1:m, j]), (@view A2[1:m, n-l+j]))
             end
         end
 
@@ -283,7 +327,7 @@ function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W)
             LinearAlgebra.generic_trimatmul!((@view W[kp:kp+l-1, 1:n]), 'L', 'N', identity, (@view V[1:l, kp:kp+l-1]), (@view W[kp:kp+l-1, 1:n]))
 
             for i in 1:l 
-                LinearAlgebra.axpy!(-one0, (@view W[k-l+i, 1:n]), (@view A2[i, 1:n]))
+                LinearAlgebra.axpy!(-one, (@view W[k-l+i, 1:n]), (@view A2[i, 1:n]))
             end
         end
         
@@ -298,7 +342,7 @@ function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W)
             LinearAlgebra.generic_trimatmul!((@view W[kp:kp+l-1, 1:n]), 'U', 'N', adjoint, (@view V[kp:kp+l-1, 1:l]), (@view W[kp:kp+l-1, 1:n]))
 
             for i in 1:l 
-                LinearAlgebra.axpy!(-one0, (@view W[k-l+i, 1:n]), (@view A2[i, 1:n]))
+                LinearAlgebra.axpy!(-one, (@view W[k-l+i, 1:n]), (@view A2[i, 1:n]))
             end
         end
         
@@ -312,7 +356,7 @@ function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W)
             LinearAlgebra.generic_mattrimul!((@view W[1:m, kp:kp+l-1]), 'U', 'N', identity, (@view W[1:m, kp:kp+l-1]), (@view V[kp:kp+l-1, 1:l]))
 
             for j in 1:l 
-                LinearAlgebra.axpy!(-one0, (@view W[1:m, k-l+j]), (@view A2[1:m, j]))
+                LinearAlgebra.axpy!(-one, (@view W[1:m, k-l+j]), (@view A2[1:m, j]))
             end
         end
         
@@ -327,11 +371,55 @@ function pamm_a(left, colmajor, forward, m, n, k, l, A2, V, W)
             LinearAlgebra.generic_mattrimul!((@view W[1:m, kp:kp+l-1]), 'L', 'N', adjoint, (@view W[1:m, kp:kp+l-1]), (@view V[1:l, kp:kp+l-1]))
 
             for j in 1:l 
-                LinearAlgebra.axpy!(-one0, (@view W[1:m, k-l+j]), (@view A2[1:m, j]))
+                LinearAlgebra.axpy!(-one, (@view W[1:m, k-l+j]), (@view A2[1:m, j]))
             end
         end
+end
 
-        return
+"""
+    pamm(op, side, storev, direct, A1, A2, V) -> (A1, A2)
+
+Performs panel matrix multiplication with automatic workspace allocation.
+This is a simplified interface that automatically computes required parameters.
+
+# Arguments
+- 'op': operation type
+    - 'W': compute workspace
+    - 'A': apply operation
+- 'side': 
+    - 'L' : apply from the left
+    - 'R' : apply from the right
+- 'storev': indicates how the vectors are stored
+    - 'C' : columnwise
+    - 'R' : rowwise
+- 'direct': indicates direction
+    - 'F' : forward
+    - 'B' : backward
+- 'A1': first matrix to be updated
+- 'A2': second matrix to be updated
+- 'V': matrix containing the vectors
+
+# Returns
+- Updated A1 and A2 matrices
+
+# Example
+```julia
+m, n, k, l = 6, 4, 3, 2
+A1 = complex.(randn(m, k), randn(m, k))
+A2 = complex.(randn(m, l), randn(m, l))
+V = complex.(randn(m, k), randn(m, k))
+A1_new, A2_new = pamm('A', 'L', 'C', 'F', A1, A2, V)
+```
+"""
+function pamm(op::Char, side::Char, storev::Char, direct::Char, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}) where {T}
+    # Determine dimensions
+    m, k = size(A1)
+    n = size(A2, 2)
+    l = n
+
+    W = similar(A1, m, k)
+    
+    # Call the underlying kernel
+    pamm(op, side, storev, direct, m, n, k, l, A1, A2, V, W)
 end
-  
 
diff --git a/src/parfb.jl b/src/parfb.jl
index 3bb8b91..1e34413 100644
--- a/src/parfb.jl
+++ b/src/parfb.jl
@@ -1,92 +1,98 @@
-export parfb
-
-function parfb(side, trans, direct, storev, m1, n1, m2, n2, k, l, 
-                A1, lda1, A2, lda2, V,  ldv, T, ldt, work, ldwork)
+"""
+    parfb!(side, trans, direct, storev, m1, n1, m2, n2, k, l, A1, A2, V, T, work)
+
+Apply a block reflector or its transpose/adjoint to a general matrix using parallel algorithms.
+
+This function applies the block reflector H or its transpose/adjoint to two matrix blocks simultaneously,
+making it efficient for parallel QR factorization algorithms. It performs operations of the form:
+- C₁ := H^op · C₁ (left multiplication)
+- C₁ := C₁ · H^op (right multiplication)
+
+where H is represented in compact form by matrices V and T, and op can be N (no operation), T (transpose), 
+or C (conjugate transpose).
+
+# Arguments
+- `side::Char`: Determines the side of multiplication ('L' for left, 'R' for right)
+- `trans::Char`: Operation to apply ('N' for none, 'T' for transpose, 'C' for conjugate transpose)
+- `direct::Char`: Direction of reflector storage ('F' for forward, 'B' for backward)
+- `storev::Char`: Storage format of reflectors ('C' for columnwise, 'R' for rowwise)
+- `m1::Int`: Number of rows in first matrix block A1
+- `n1::Int`: Number of columns in first matrix block A1
+- `m2::Int`: Number of rows in second matrix block A2
+- `n2::Int`: Number of columns in second matrix block A2
+- `k::Int`: Number of elementary reflectors
+- `l::Int`: Order of the triangular factor in T
+- `A1::Matrix`: First m1×n1 matrix block to be transformed (modified in-place)
+- `A2::Matrix`: Second m2×n2 matrix block to be transformed (modified in-place)
+- `V::Matrix`: Matrix containing elementary reflectors in compact form
+- `T::Matrix`: Upper triangular factor matrix
+- `work::Vector`: Workspace array
+
+# Returns
+- `Int`: Status code (0 for success, negative for invalid arguments)
+
+# Algorithm
+The function uses the compact WY representation where H = I - V·T·V^H, performing efficient
+block operations to apply the transformation to both matrix blocks simultaneously.
+
+# Implementation Notes
+- Modifies A1 and A2 in-place for efficiency
+- Uses optimized BLAS-3 operations for performance
+- Handles different storage formats and operation types
+- Validates all input parameters with descriptive error messages
+"""
+function parfb!(side::Char, trans::Char, direct::Char, storev::Char, m1::Integer, n1::Integer, m2::Integer, n2::Integer, k::Integer, l::Integer, 
+                A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, work::AbstractMatrix{T}) where {T}
 
     if side != 'L' && side != 'R'
         throw(ArgumentError("illegal value of side"))
-        return -1
     end
 
     if trans != 'N' && trans != 'C' && trans != 'T'
         throw(ArgumentError("illegal value of trans"))
-        return -2
     end
 
     if direct != 'F' && direct != 'B'
         throw(ArgumentError("illegal value of direct"))
-        return -3
     end
 
     if storev != 'C' && storev != 'R'
         throw(ArgumentError("illegal value of storev"))
-        return -4
     end
 
     if m1 < 0
         throw(ArgumentError("illegal value of m1"))
-        return -5
     end
 
     if n1 < 0
         throw(ArgumentError("illegal value of n1"))
-        return -6
     end
 
     if m2 < 0 || (side == 'R' && m1 != m2)
         throw(ArgumentError("illegal value of m2"))
-        return -7
     end
 
     if n2 < 0 || (side == 'L' && n1 != n2)
         throw(ArgumentError("illegal value of n2"))
-        return -8
     end
 
     if k < 0
         throw(ArgumentError("illegal value of k"))
-        return -9
     end
 
     if l < 0 || l > k
         throw(ArgumentError("illegal value of l"))
-        return -10
-    end
-
-    if lda1 < 0
-        throw(ArgumentError("illegal value of lda1"))
-        return -12
-    end
-
-    if lda2 < 0
-        throw(ArgumentError("illegal value of lda2"))
-        return -14
     end
 
-    if ldv < 0 
-        throw(ArgumentError("illegal value of ldv"))
-        return -16
-    end
-
-    if ldt < 0
-        throw(ArgumentError("illegal value of ldt"))
-        return -18
-    end
-
-    if ldwork < 0
-        throw(ArgumentError("illegal value of ldwork"))
-        return -20
-    end
-
-    # quick return 
-
+    # Quick return if any dimension is zero
     if m1 == 0 || n1 == 0 || n2 == 0 || k == 0
         return 
     end
 
-    one0 = oneunit(eltype(A1))
-    zero0 = zero(eltype(A1))
+    # Define scalar constants
+    one = oneunit(eltype(A1))
 
+    # Determine operation transformations based on flags
     if trans == 'N'
         tfun = identity
     else
@@ -110,75 +116,126 @@ function parfb(side, trans, direct, storev, m1, n1, m2, n2, k, l,
     else
         colmajor = false
     end
-    pamm('W', side, storev, direct, m2, n2, k, l, A1, lda1, A2, lda2, V, ldv, work, ldwork)
 
+    # Apply workspace computation using pamm kernel
+    pamm!('W', side, storev, direct, m2, n2, k, l, A1, A2, V, work)
+
+    # Apply block reflector transformation based on storage format and direction
     if colmajor && forward && left # colmajor, forward, left
-        LinearAlgebra.generic_trimatmul!((@view work[1:k, 1:n2]), 'U', 'N', tfun, (@view T[1:k, 1:k]), (@view work[1:k, 1:n2]))
+        LinearAlgebra.generic_trimatmul!((@view work[1:k, 1:n2]), 'U', 'N', tfun, (@view T_mat[1:k, 1:k]), (@view work[1:k, 1:n2]))
 
         for i in 1:k
-            LinearAlgebra.axpy!(-one0, (@view work[i, 1:n2]), (@view A1[i, 1:n2]))
+            LinearAlgebra.axpy!(-one, (@view work[i, 1:n2]), (@view A1[i, 1:n2]))
         end
     end
 
     if colmajor && forward && !left # colmajor, forward, right
-        LinearAlgebra.generic_mattrimul!((@view work[1:m2, 1:k]), 'U', 'N', tfun, (@view work[1:m2, 1:k]), (@view T[1:k, 1:k]))
+        LinearAlgebra.generic_mattrimul!((@view work[1:m2, 1:k]), 'U', 'N', tfun, (@view work[1:m2, 1:k]), (@view T_mat[1:k, 1:k]))
 
         for j in 1:k
-            LinearAlgebra.axpy!(-one0, (@view work[1:m2, j]), (@view A1[1:m2, j]))
+            LinearAlgebra.axpy!(-one, (@view work[1:m2, j]), (@view A1[1:m2, j]))
         end
     end
 
     if colmajor && !forward && left # colmajor, backward, left
-        LinearAlgebra.generic_trimatmul!((@view work[1:k, 1:n2]), 'L', 'N', tfun, (@view T[1:k, 1:k]), (@view work[1:k, 1:n2]))
+        LinearAlgebra.generic_trimatmul!((@view work[1:k, 1:n2]), 'L', 'N', tfun, (@view T_mat[1:k, 1:k]), (@view work[1:k, 1:n2]))
 
         for i in 1:k
-            LinearAlgebra.axpy!(-one0, (@view work[i, 1:n2]), (@view A1[i, 1:n2]))
+            LinearAlgebra.axpy!(-one, (@view work[i, 1:n2]), (@view A1[i, 1:n2]))
         end
     end
 
     if colmajor && !forward && !left # colmajor, backward, right
-        LinearAlgebra.generic_mattrimul!((@view work[1:m2, 1:k]), 'L', 'N', tfun, (@view work[1:m2, 1:k]), (@view T[1:k, 1:k]))
+        LinearAlgebra.generic_mattrimul!((@view work[1:m2, 1:k]), 'L', 'N', tfun, (@view work[1:m2, 1:k]), (@view T_mat[1:k, 1:k]))
 
         for j in 1:k
-            LinearAlgebra.axpy!(-one0, (@view work[1:m2, j]), (@view A1[1:m2, j]))
+            LinearAlgebra.axpy!(-one, (@view work[1:m2, j]), (@view A1[1:m2, j]))
         end
     end
 
     if !colmajor && forward && left # rowmajor, forward, left
-
-        LinearAlgebra.generic_trimatmul!((@view work[1:k, 1:n2]), 'U', 'N', tfun, (@view T[1:k, 1:k]), (@view work[1:k, 1:n2]))
+        LinearAlgebra.generic_trimatmul!((@view work[1:k, 1:n2]), 'U', 'N', tfun, (@view T_mat[1:k, 1:k]), (@view work[1:k, 1:n2]))
 
         for i in 1:k
-            LinearAlgebra.axpy!((-one0), (@view work[i, 1:n2]), (@view A1[i, 1:n2]))
+            LinearAlgebra.axpy!(-one, (@view work[i, 1:n2]), (@view A1[i, 1:n2]))
         end
     end
 
     if !colmajor && forward && !left # rowmajor, forward, right
-        LinearAlgebra.generic_mattrimul!((@view work[1:m2, 1:k]), 'U', 'N', tfun, (@view work[1:m2, 1:k]), (@view T[1:k, 1:k]))
+        LinearAlgebra.generic_mattrimul!((@view work[1:m2, 1:k]), 'U', 'N', tfun, (@view work[1:m2, 1:k]), (@view T_mat[1:k, 1:k]))
 
         for j in 1:k
-            LinearAlgebra.axpy!(-one0, (@view work[1:m2, j]), (@view A1[1:m2, j]))
+            LinearAlgebra.axpy!(-one, (@view work[1:m2, j]), (@view A1[1:m2, j]))
         end
     end
 
     if !colmajor && !forward && left # rowmajor, backward, left
-        LinearAlgebra.generic_trimatmul!((@view work[1:k, 1:n2]), 'L', 'N', tfun, (@view T[1:k, 1:k]), (@view work[1:k, 1:n2]))
+        LinearAlgebra.generic_trimatmul!((@view work[1:k, 1:n2]), 'L', 'N', tfun, (@view T_mat[1:k, 1:k]), (@view work[1:k, 1:n2]))
 
         for i in 1:k
-            LinearAlgebra.axpy!(-one0, (@view work[i, 1:n2]), (@view A1[i, 1:n2]))
+            LinearAlgebra.axpy!(-one, (@view work[i, 1:n2]), (@view A1[i, 1:n2]))
         end
     end
 
     if !colmajor && !forward && !left # rowmajor, backward, right
-        LinearAlgebra.generic_mattrimul!((@view work[1:m2, 1:k]), 'L', 'N', tfun, (@view work[1:m2, 1:k]), (@view T[1:k, 1:k]))
+        LinearAlgebra.generic_mattrimul!((@view work[1:m2, 1:k]), 'L', 'N', tfun, (@view work[1:m2, 1:k]), (@view T_mat[1:k, 1:k]))
 
         for j in 1:k
-            LinearAlgebra.axpy!((-one0), (@view work[1:m2, j]), (@view A1[1:m2, j]))
+            LinearAlgebra.axpy!(-one, (@view work[1:m2, j]), (@view A1[1:m2, j]))
         end
     end
 
+    # Apply final transformation using pamm kernel
+    pamm!('A', side, storev, direct, m2, n2, k, l, A1, A2, V, work)
 
-    pamm('A', side, storev, direct, m2, n2, k, l, A1, lda1, A2, lda2, V, ldv, work, ldwork)
+end
 
-    return
+"""
+    parfb(side, trans, direct, storev, A1, A2, V, T) -> (A1, A2)
+
+Applies a block reflector or its transpose to a pair of matrices A1 and A2.
+This is a simplified interface that automatically computes required parameters.
+
+# Arguments
+- `side::Char`: Determines the side of multiplication ('L' for left, 'R' for right)
+- `trans::Char`: Operation to apply ('N' for none, 'T' for transpose, 'C' for conjugate transpose)
+- `direct::Char`: Direction of reflector storage ('F' for forward, 'B' for backward)
+- `storev::Char`: Storage format of reflectors ('C' for columnwise, 'R' for rowwise)
+- `A1::Matrix`: First matrix to be updated (modified in-place)
+- `A2::Matrix`: Second matrix to be updated (modified in-place)
+- `V::Matrix`: Matrix containing the elementary reflectors
+- `T::Matrix`: Upper triangular matrix of the block reflector
+
+# Returns
+- Updated A1 and A2 matrices
+
+# Example
+```julia
+m1, n1, m2, n2, k = 4, 6, 4, 6, 3
+A1 = complex.(randn(m1, n1), randn(m1, n1))
+A2 = complex.(randn(m2, n2), randn(m2, n2))
+V = complex.(randn(m1+m2, k), randn(m1+m2, k))
+T = complex.(randn(k, k), randn(k, k))
+A1_new, A2_new = parfb('L', 'N', 'F', 'C', A1, A2, V, T)
+```
+"""
+function parfb!(side::Char, trans::Char, direct::Char, storev::Char, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}, T_mat::AbstractMatrix{T}) where {T}
+    # Determine dimensions
+    m1, n1 = size(A1)
+    m2, n2 = size(A2)
+    k = size(T_mat, 1)
+    l = size(V, 2)
+
+    # Allocate workspace
+    if side == 'L'
+        work = similar(A1, max(m1, m2), max(n1, n2))
+    else
+        work = similar(A1, max(m1, m2), max(n1, n2))
+    end
+    
+    # Call the underlying kernel
+    parfb!(side, trans, direct, storev, m1, n1, m2, n2, k, l,
+          A1, A2, V, T_mat, work)
 end
+
+export parfb!
diff --git a/src/pemv.jl b/src/pemv.jl
index b62dded..8c4acf4 100644
--- a/src/pemv.jl
+++ b/src/pemv.jl
@@ -1,141 +1,225 @@
-function pemv(trans, storev, m, n, l, alpha, A, lda, X, beta, Y, work)
-    begin
-        if trans != 'N' && trans != 'T' && trans != 'C'
-            throw(ArgumentError("illegal value of trans"))
-            return -1
-        end
-
-        if storev != 'C' && storev != 'R'
-            throw(ArgumentError("illegal value of storev"))
-            return -2
-        end
-
-        if !((storev == 'C' && trans != 'N') || (storev == 'R' && trans == 'N'))
-            throw(ArgumentError("illegal values of trans/storev"))
-            return -2
-        end
+"""
+    pemv!(trans, storev, m, n, l, alpha, A, X, beta, Y, work)
+
+Perform panel matrix-vector multiplication with optimized block algorithms.
+
+This function implements efficient matrix-vector multiplication for structured panels,
+commonly used in block QR factorization algorithms. It performs operations of the form:
+Y := alpha * op(A) * X + beta * Y
+
+where op(A) can be A, A^T, or A^H depending on the trans parameter.
+
+# Arguments
+- `trans::Char`: Transpose operation ('N' for none, 'T' for transpose, 'C' for conjugate transpose)
+- `storev::Char`: Storage format for vectors ('C' for columnwise, 'R' for rowwise)
+- `m::Int`: Number of rows in matrix A
+- `n::Int`: Number of columns in matrix A
+- `l::Int`: Panel size (must be ≤ min(m,n))
+- `alpha`: Scalar multiplier for the matrix-vector product
+- `A::Matrix`: Input matrix of size m×n
+- `X::Vector`: Input vector (modified in-place)
+- `beta`: Scalar multiplier for the output vector Y
+- `Y::Vector`: Output vector (modified in-place)
+- `work::Vector`: Workspace array for intermediate computations
+
+# Returns
+- `Int`: Status code (0 for success, negative for invalid arguments)
+
+# Algorithm
+The function uses block-structured algorithms that partition the matrix and vectors
+to take advantage of cache locality and vectorization, particularly effective for
+panel-based factorizations.
+
+# Implementation Notes
+- Optimized for different storage formats (columnwise vs rowwise)
+- Uses BLAS-3 operations where possible for performance
+- Handles edge cases with l=1 efficiently
+- Validates input parameters with descriptive error messages
+"""
+function pemv!(trans::Char, storev::Char, m::Integer, n::Integer, l::Integer, alpha::T, A::AbstractMatrix{T}, x::AbstractVector{T}, beta::T, y::AbstractVector{T}, work::AbstractVector{T}) where {T}
+    # Input validation
+    if trans != 'N' && trans != 'T' && trans != 'C'
+        throw(ArgumentError("illegal value of trans"))
+    end
 
-        if m < 0
-            throw(ArgumentError("illegal value of m"))
-            return -3
-        end
+    if storev != 'C' && storev != 'R'
+        throw(ArgumentError("illegal value of storev"))
+    end
 
-        if n < 0
-            throw(ArgumentError("illegal value of n"))
-            return -4
-        end
+    if !((storev == 'C' && trans != 'N') || (storev == 'R' && trans == 'N'))
+        throw(ArgumentError("illegal values of trans/storev"))
+    end
 
-        if l > min(m, n)
-            throw(ArgumentError("illegal value of l"))
-            return -5
-        end
+    if m < 0
+        throw(ArgumentError("illegal value of m"))
+    end
 
-        if lda < max(1, m)
-            throw(ArgumentError("illegal value of lda"))
-            return -8
-        end
+    if n < 0
+        throw(ArgumentError("illegal value of n"))
+    end
 
-        # quick return 
-        if m == 0 || n == 0
-            return
-        end
+    if l > min(m, n)
+        throw(ArgumentError("illegal value of l"))
+    end
 
-        if alpha == 0 && beta == 0
-            return
-        end
+    # Quick return for trivial cases
+    if m == 0 || n == 0
+        return
+    end
 
-        if l == 1
-            l = 0
-        end
+    if alpha == 0 && beta == 0
+        return
+    end
 
-        if storev == 'C'
-            x1 = (@view X[1:m-l])
-            x2 = (@view X[m-l+1:m])
-            xf = (@view X[1:m])
-        else # assume incx = ldaX
-            x1 = (@view X[1:n-l])
-            x2 = (@view X[n-l+1:n])
-            xf = (@view X[1:n])
-            # columnwise 
-        end
+    # Handle special case where l=1 (convert to l=0 for efficiency)
+    if l == 1
+        l = 0
+    end
 
-        if storev != 'C'
-            y1 = (@view Y[1:l])
-            y2 = (@view Y[l+1:m])
-        else # assume incy = ldaY
-            y1 = (@view Y[1:l])
-            y2 = (@view Y[l+1:n])
-            # columnwise 
-        end
+    # Set up vector views based on storage format
+    if storev == 'C'
+        # Column-wise storage: partition X and Y based on m and l
+        x1 = (@view x[1:m-l])
+        x2 = (@view x[m-l+1:m])
+        xf = (@view x[1:m])
+    else 
+        # Row-wise storage: partition X and Y based on n and l
+        x1 = (@view x[1:n-l])
+        x2 = (@view x[n-l+1:n])
+        xf = (@view x[1:n])
+    end
 
+    # Determine Y partitioning based on storage format
+    if storev != 'C'
+        y1 = (@view y[1:l])
+        y2 = (@view y[l+1:m])
+    else 
+        y1 = (@view y[1:l])
+        y2 = (@view y[l+1:n])
+    end
 
-        if storev == 'C'
-            if trans == 'N'
-                throw(ErrorException("not implemented"))
-                return -1
-            else
-                if l > 0
-                    (@view work[1:l]) .= (@view X[m-l+1:m])
 
-                    if trans == 'C'
-                        LinearAlgebra.generic_trimatmul!((@view work[1:l]), 'U', 'N', adjoint,
-                            (@view A[m-l+1:m, 1:l]), (@view work[1:l]))
-                    else
-                        LinearAlgebra.generic_trimatmul!((@view work[1:l]), 'U', 'N', transpose,
-                            (@view A[m-l+1:m, 1:l]), (@view work[1:l]))
-                    end
+    # Apply the matrix-vector multiplication based on storage format and transpose
+    if storev == 'C'
+        if trans == 'N'
+            throw(ErrorException("not implemented"))
+        else
+            # Column-wise storage with transpose/adjoint operation
+            if l > 0
+                # Copy relevant portion to workspace for triangular operations
+                (@view work[1:l]) .= (@view x[m-l+1:m])
+
+                # Apply triangular matrix multiplication
+                if trans == 'C'
+                    LinearAlgebra.generic_trimatmul!((@view work[1:l]), 'U', 'N', adjoint,
+                        (@view A[m-l+1:m, 1:l]), (@view work[1:l]))
+                else
+                    LinearAlgebra.generic_trimatmul!((@view work[1:l]), 'U', 'N', transpose,
+                        (@view A[m-l+1:m, 1:l]), (@view work[1:l]))
+                end
 
-                    if m > l
-                        LinearAlgebra.generic_matvecmul!((@view Y[1:l]), trans, (@view A[1:m-l, 1:l]),
-                            (@view X[1:m-l]), LinearAlgebra.MulAddMul(alpha, beta))
-                        LinearAlgebra.axpy!(alpha, (@view work[1:l]), (@view Y[1:l]))
+                # Handle remaining matrix-vector operations
+                if m > l
+                    LinearAlgebra.generic_matvecmul!((@view y[1:l]), trans, (@view A[1:m-l, 1:l]),
+                        (@view x[1:m-l]), LinearAlgebra.MulAddMul(alpha, beta))
+                    LinearAlgebra.axpy!(alpha, (@view work[1:l]), (@view y[1:l]))
+                else
+                    # Handle case where m <= l
+                    if beta == 0
+                        (@view work[1:l]) .*= alpha
+                        (@view y[1:l]) .= (@view work[1:l])
                     else
-                        if beta == 0
-                            (@view work[1:l]) .*= alpha
-                            (@view Y[1:l]) .= (@view work[1:l])
-                        else
-                            (@view Y[1:l]) .*= beta
-                            LinearAlgebra.axpy!(alpha, (@view work[1:l]), (@view Y[1:l]))
-                        end
-
+                        (@view y[1:l]) .*= beta
+                        LinearAlgebra.axpy!(alpha, (@view work[1:l]), (@view y[1:l]))
                     end
                 end
+            end
 
-                if n > l
-                    k = n - l
-                    LinearAlgebra.generic_matvecmul!((@view Y[l+1:n]), trans, (@view A[1:m, l+1:n]),
-                        (@view X[1:m]), LinearAlgebra.MulAddMul(alpha, beta))
-                end
+            # Handle remaining columns if n > l
+            if n > l
+                k = n - l
+                LinearAlgebra.generic_matvecmul!((@view y[l+1:n]), trans, (@view A[1:m, l+1:n]),
+                    (@view x[1:m]), LinearAlgebra.MulAddMul(alpha, beta))
             end
-        else
-            if trans == 'N'
-                if l > 0
-                    work[1:l] .= x2
-                    LinearAlgebra.generic_trimatmul!((@view work[1:l]), 'L', 'N', identity,
-                        (@view A[1:l, n-l+1:n]), (@view work[1:l]))
-
-                    if n > l
-                        LinearAlgebra.generic_matvecmul!(y1, 'N', (@view A[1:l, 1:n-l]),
-                            x1, LinearAlgebra.MulAddMul(alpha, beta))
-                        LinearAlgebra.axpy!(alpha, (@view work[1:l]), y1)
+        end
+    else
+        # Row-wise storage
+        if trans == 'N'
+            # Row-wise storage with no transpose
+            if l > 0
+                # Copy and apply triangular operations
+                work[1:l] .= x2
+                LinearAlgebra.generic_trimatmul!((@view work[1:l]), 'L', 'N', identity,
+                    (@view A[1:l, n-l+1:n]), (@view work[1:l]))
+
+                # Handle rectangular part if n > l
+                if n > l
+                    LinearAlgebra.generic_matvecmul!(y1, 'N', (@view A[1:l, 1:n-l]),
+                        x1, LinearAlgebra.MulAddMul(alpha, beta))
+                    LinearAlgebra.axpy!(alpha, (@view work[1:l]), y1)
+                else
+                    # Handle case where n <= l
+                    if beta == 0
+                        y1 .= alpha * (@view work[1:l])
                     else
-                        if beta == 0
-                            y1 .= alpha * (@view work[1:l])
-                        else
-                            y1 .*= beta
-                            LinearAlgebra.axpy!(alpha, (@view work[1:l]), y1)
-                        end
+                        y1 .*= beta
+                        LinearAlgebra.axpy!(alpha, (@view work[1:l]), y1)
                     end
                 end
+            end
 
-                if m > l
-                    LinearAlgebra.generic_matvecmul!(y2, 'N', (@view A[l+1:m, 1:n]),
-                        xf, LinearAlgebra.MulAddMul(alpha, beta))
-                end
-            else
-                throw(ErrorException("not implemented"))
-                return -1
+            # Handle remaining rows if m > l
+            if m > l
+                LinearAlgebra.generic_matvecmul!(y2, 'N', (@view A[l+1:m, 1:n]),
+                    xf, LinearAlgebra.MulAddMul(alpha, beta))
             end
+        else
+            # Row-wise storage with transpose - not implemented
+            throw(ErrorException("not implemented"))
         end
     end
-end
\ No newline at end of file
+end
+
+"""
+    pemv(trans, storev, A, X, Y, alpha=1.0, beta=0.0) -> Y
+
+Performs panel matrix-vector multiplication with automatic workspace allocation.
+This is a simplified interface that automatically computes required parameters.
+
+# Arguments
+- `trans::Char`: Transpose operation ('N' for none, 'T' for transpose, 'C' for conjugate transpose)
+- `storev::Char`: Storage format for vectors ('C' for columnwise, 'R' for rowwise)
+- `A::Matrix`: Matrix for multiplication
+- `X::Vector`: Input vector
+- `Y::Vector`: Output vector (modified in-place)
+- `alpha`: Scalar multiplier for A*X (default: 1.0)
+- `beta`: Scalar multiplier for Y (default: 0.0)
+
+# Returns
+- Updated vector Y
+
+# Example
+```julia
+m, n, l = 6, 4, 3
+A = complex.(randn(m, n), randn(m, n))
+X = complex.(randn(n), randn(n))
+Y = complex.(randn(m), randn(m))
+Y_new = pemv('N', 'C', A, X, Y, 2.0, 1.0)
+```
+"""
+
+function pemv(trans::Char, storev::Char, alpha::T, A::AbstractMatrix{T}, x::AbstractVector{T}, beta::T, y::AbstractVector{T}) where {T}
+    # Determine dimensions
+    m, n = size(A)
+    l = min(m, n)  # Default panel size
+    
+    # Leading dimension
+    
+    # Allocate workspace
+    work = similar(x, max(m, n))
+    
+    # Call the underlying kernel
+    pemv!(trans, storev, m, n, l, alpha, A, x, beta, y, work)
+end
+
+export pemv!
\ No newline at end of file
diff --git a/src/rectrxm.jl b/src/rectrxm.jl
index fe6dfb2..cd0bec1 100644
--- a/src/rectrxm.jl
+++ b/src/rectrxm.jl
@@ -1,34 +1,45 @@
-export unified_rectrxm!
 """
+    unified_rectrxm!(side, uplo, transpose, alpha, func, A, B)
+
 Unified recursive function for triangular matrix solve (TRSM) and multiply (TRMM) operations.
 
-This function supports both solving triangular systems of equations and performing triangular matrix multiplications.
-
-Arguments:
-- side::Char: Specifies the side of the operation:
-    - 'L': Left multiplication (A * B or inv(A) * B).
-    - 'R': Right multiplication (B * A or B * inv(A)).
-- uplo::Char: Specifies the triangular part of the matrix to reference:
-    - 'U': Use the upper triangle.
-    - 'L': Use the lower triangle.
-- transpose::Char: Specifies the transposition operation:
-    - 'N': No transpose.
-    - 'T': Transpose.
-    - 'C': Conjugate transpose.
-- alpha::Number: Scalar multiplier applied to the operation.
-- func::Char: Specifies the function type:
-    - 'S': Solve (TRSM, A * X = alpha * B).
-    - 'M': Multiply (TRMM, Update B = alpha * A * B or alpha * B * A).
-- A::AbstractMatrix: The triangular matrix.
-- B::AbstractMatrix: The matrix to multiply or solve for.
-
-Returns:
-- Updated matrix `B` after performing the specified operation.
-
-Notes:
-- The function modifies `B` in place.
-"""
+This function supports both solving triangular systems of equations and performing triangular matrix multiplications
+using recursive algorithms that are cache-friendly and numerically stable.
+
+# Arguments
+- `side::Char`: Specifies the side of the operation ('L' for left, 'R' for right)
+    - 'L': Left multiplication (A * B or inv(A) * B)
+    - 'R': Right multiplication (B * A or B * inv(A))
+- `uplo::Char`: Specifies the triangular part of the matrix to reference
+    - 'U': Use the upper triangle
+    - 'L': Use the lower triangle
+- `transpose::Char`: Specifies the transposition operation
+    - 'N': No transpose
+    - 'T': Transpose
+    - 'C': Conjugate transpose
+- `alpha::Number`: Scalar multiplier applied to the operation
+- `func::Char`: Specifies the function type
+    - 'S': Solve (TRSM, A * X = alpha * B)
+    - 'M': Multiply (TRMM, Update B = alpha * A * B or alpha * B * A)
+- `A::AbstractMatrix`: The triangular matrix
+- `B::AbstractMatrix`: The matrix to multiply or solve for (modified in-place)
+
+# Returns
+- Updated matrix `B` after performing the specified operation
+
+# Algorithm
+Uses recursive divide-and-conquer approach that:
+1. Partitions matrices into 2x2 block structure
+2. Applies operations recursively on subblocks
+3. Handles base cases with optimized kernel functions
+4. Maintains numerical stability through careful ordering
 
+# Implementation Notes
+- The function modifies `B` in place for efficiency
+- Uses different thresholds for TRSM (256) vs TRMM (16) operations
+- Automatically handles transpose operations by adjusting matrix views
+- Recursive partitioning adapts to matrix size for optimal performance
+"""
 function unified_rectrxm!(
         side::Char, 
         uplo::Char, 
@@ -38,28 +49,59 @@ function unified_rectrxm!(
         A::AbstractMatrix, 
         B::AbstractMatrix
     )
-    threshold = 16
+    threshold = 16  # Default threshold for TRMM operations
     n = size(A, 1)
 
+    # Handle transpose operations by adjusting matrix view and uplo flag
     if transpose == 'T' || transpose == 'C'
         A = (transpose == 'T') ? Transpose(A) : Adjoint(A)
         uplo = (uplo == 'L') ? 'U' : 'L'
     end    
     
+    # TRSM operations require different handling and larger threshold
     if func == 'S'
-        threshold = 256
-        B .= alpha .* B
+        threshold = 256  # Larger threshold for solve operations
+        B .= alpha .* B  # Apply scaling before solve
     end
+    
+    # Call recursive kernel
     unified_rec(func, side, uplo, A, n, B, threshold)
+    
+    # TRMM operations apply scaling after multiplication
     if func == 'M'
         B .= alpha .* B
     end
+    
     return B
 end
 
+"""
+    unified_rec(func, side, uplo, A, n, B, threshold)
+
+Recursive kernel for unified triangular matrix operations.
+
+This function implements the divide-and-conquer recursive algorithm that partitions
+matrices into 2x2 block structure and applies the appropriate sequence of operations.
+
+# Arguments
+- `func::Char`: Operation type ('S' for solve, 'M' for multiply)
+- `side::Char`: Operation side ('L' for left, 'R' for right)  
+- `uplo::Char`: Triangular part ('U' for upper, 'L' for lower)
+- `A::AbstractMatrix{T}`: Triangular coefficient matrix
+- `n::Int`: Matrix dimension to process
+- `B::AbstractMatrix{T}`: Target matrix (modified in-place)
+- `threshold::Int`: Recursion base case threshold (default: 256)
+
+# Algorithm
+The recursion follows different orderings based on the operation type:
+1. For forward substitution: A11 → GEMM → A22
+2. For backward substitution: A22 → GEMM → A11
+This ensures numerical stability and correctness of the triangular solve.
+"""
 function unified_rec(func::Char, side::Char, uplo::Char, A::AbstractMatrix{T}, n, B::AbstractMatrix{T}, threshold::Int=256) where T <: AbstractFloat
+    # Base case: use optimized kernel functions for small matrices
     if n <= threshold
-        if func == 'S'
+        if func == 'S'  # Solve operations (TRSM)
             if side == 'L' && uplo == 'L'
                 LeftLowerTRSM!(A, B)
             elseif side == 'L' && uplo == 'U'
@@ -69,7 +111,7 @@ function unified_rec(func::Char, side::Char, uplo::Char, A::AbstractMatrix{T}, n
             else
                 RightUpperTRSM!(A, B)
             end
-        else
+        else  # Multiply operations (TRMM)
             if side == 'L' && uplo == 'L'
                 LeftLowerTRMM!(A, B)
             elseif side == 'L' && uplo == 'U'
@@ -83,6 +125,7 @@ function unified_rec(func::Char, side::Char, uplo::Char, A::AbstractMatrix{T}, n
         return B
     end
 
+    # Determine partition size for optimal cache performance
     if isinteger(log2(n))
         mid = div(n, 2)
     else
@@ -90,55 +133,68 @@ function unified_rec(func::Char, side::Char, uplo::Char, A::AbstractMatrix{T}, n
     end
     mid_remainder = n - mid
 
-    A11 = view(A, 1:mid, 1:mid)
-    A22 = view(A, mid+1:n, mid+1:n)
-    A21 = view(A, mid+1:n, 1:mid)
-    A12 = view(A, 1:mid, mid+1:n)
+    # Create 2x2 block partition of matrix A
+    A11 = view(A, 1:mid, 1:mid)                    # Upper-left block
+    A22 = view(A, mid+1:n, mid+1:n)                # Lower-right block  
+    A21 = view(A, mid+1:n, 1:mid)                  # Lower-left block
+    A12 = view(A, 1:mid, mid+1:n)                  # Upper-right block
 
+    # Partition matrix B based on operation side
     if side == 'L'
-        B1 = view(B, 1:mid, :)
-        B2 = view(B, mid+1:n, :)
+        B1 = view(B, 1:mid, :)        # Upper block rows
+        B2 = view(B, mid+1:n, :)      # Lower block rows
     else
-        B1 = view(B, :, 1:mid)
-        B2 = view(B, :, mid+1:n)
+        B1 = view(B, :, 1:mid)        # Left block columns
+        B2 = view(B, :, mid+1:n)      # Right block columns
     end
 
+    # Apply recursive algorithm with correct ordering for numerical stability
+    # Different operation types require different orderings to maintain correctness
     if (side == 'L' && uplo == 'L' && func == 'S') || 
         (side == 'R' && uplo == 'U' && func == 'S') || 
         (side == 'L' && uplo == 'U' && func == 'M') || 
         (side == 'R' && uplo == 'L' && func == 'M')
+        
+        # Forward substitution ordering: A11 → GEMM → A22
         unified_rec(func, side, uplo, A11, mid, B1, threshold)
+        
+        # Apply rank-k update between recursive calls
         if side == 'L'
             if func == 'S'
-                GEMM_SUB!(B2, A21, B1)
+                GEMM_SUB!(B2, A21, B1)  # B2 := B2 - A21 * B1
             else
-                GEMM_ADD!(A12, B2, B1)
+                GEMM_ADD!(A12, B2, B1)  # B1 := B1 + A12 * B2
             end
         else
             if func == 'S'
-                GEMM_SUB!(B2, B1, A12)
+                GEMM_SUB!(B2, B1, A12)  # B2 := B2 - B1 * A12
             else
-                GEMM_ADD!(B2, A21, B1)
+                GEMM_ADD!(B2, A21, B1)  # B2 := B2 + A21 * B1
             end
         end
+        
         unified_rec(func, side, uplo, A22, mid_remainder, B2, threshold)
     else
+        # Backward substitution ordering: A22 → GEMM → A11
         unified_rec(func, side, uplo, A22, mid_remainder, B2, threshold)
+        
+        # Apply rank-k update between recursive calls
         if side == 'L'
             if func == 'S'
-                GEMM_SUB!(B1, A12, B2)
+                GEMM_SUB!(B1, A12, B2)  # B1 := B1 - A12 * B2
             else
-                GEMM_ADD!(A21, B1, B2)
+                GEMM_ADD!(A21, B1, B2)  # B2 := B2 + A21 * B1
             end
         else
             if func == 'S'
-                GEMM_SUB!(B1, B2, A21)
+                GEMM_SUB!(B1, B2, A21)  # B1 := B1 - B2 * A21
             else
-                GEMM_ADD!(B1, A12, B2)
+                GEMM_ADD!(B1, A12, B2)  # B1 := B1 + A12 * B2
             end
         end
+        
         unified_rec(func, side, uplo, A11, mid, B1, threshold)
     end
-    return B
 end
 
+export unified_rectrxm!
\ No newline at end of file
diff --git a/src/trmm.jl b/src/trmm.jl
index 7936921..3654bfe 100644
--- a/src/trmm.jl
+++ b/src/trmm.jl
@@ -1,67 +1,106 @@
+"""
+GPU-accelerated Triangular Matrix Multiplication (TRMM) Operations
+
+This module provides GPU kernel implementations for triangular matrix multiplication
+operations, supporting both left and right sided operations with upper and lower
+triangular matrices.
+
+The kernels are optimized for GPU architectures with:
+- Shared memory tiling for improved memory access patterns  
+- Bank conflict avoidance through memory padding
+- Vectorized inner loops for computational efficiency
+- Bounds checking for non-square matrix operations
+
+All kernels perform in-place operations: B := A * B or B := B * A
+where A is triangular and B is a general matrix.
+"""
+
 export LeftLowerTRMM!, LeftUpperTRMM!, RightLowerTRMM!, RightUpperTRMM!
 
-# Performs in place TRMM B = A * B 
-# where A is an NxN lower triangular matrix and B is an NxM matrix
-# A is limited to matrix size 16x16 due to shared memory constraints
+# Performs in-place TRMM: B := A * B 
+# where A is an N×N lower triangular matrix and B is an N×M matrix
+# A is limited to matrix size 16×16 due to shared memory constraints
+
+"""
+    LeftLowerTRMM_kernel!(A, B, ::Val{BANK}=Val(1))
+
+GPU kernel for left-sided lower triangular matrix multiplication.
 
+Performs the operation B := A * B where A is lower triangular.
+Uses shared memory tiling with configurable bank offset to avoid conflicts.
+
+# Arguments
+- `A::AbstractMatrix`: N×N lower triangular coefficient matrix
+- `B::AbstractMatrix`: N×M target matrix (modified in-place) 
+- `BANK::Int`: Memory bank offset to avoid conflicts (default: 1)
+
+# Implementation Notes
+- Tile size limited to 16×16 due to shared memory constraints
+- Uses private variables for accumulation to enable vectorization
+- Includes bounds checking for non-square input matrices
+- Synchronization points ensure correct shared memory access patterns
+"""
 @kernel function LeftLowerTRMM_kernel!(A,B,
                             ::Val{BANK} = Val(1)) where BANK
     
+    # Get thread and block indices
     gi,gj = @index(Group, NTuple)
     i,j = @index(Local, NTuple)
 
-    # kept at 16x16 due to shmem constraints
+    # Tile dimension kept at 16×16 due to shared memory constraints
     TILE_DIM = @uniform @groupsize()[1]
 
-    # allocating shared memory for the sub matrix product calculation
-    # BANK = 1, added to avoid banck coonflicts as a result of irregular thread access
-    tile1 = @localmem eltype(B) (TILE_DIM+BANK, TILE_DIM)
-    tile2 = @localmem eltype(B) (TILE_DIM+BANK, TILE_DIM)
+    # Allocate shared memory for sub-matrix product calculation
+    # BANK padding added to avoid bank conflicts from irregular thread access
+    tile1 = @localmem eltype(B) (TILE_DIM+BANK, TILE_DIM)  # For matrix A
+    tile2 = @localmem eltype(B) (TILE_DIM+BANK, TILE_DIM)  # For matrix B
 
-    #declaring a private variable to accumulate the result of submatrix multiplication
+    # Private variable to accumulate the result of sub-matrix multiplication
     B_sub = @private eltype(B) 1
-    @inbounds B_sub[1] = -zero(eltype(B))
+    @inbounds B_sub[1] = zero(eltype(B))
 
-    @uniform N = size(A, 1)
-    @uniform R = size(A, 2)
-    @uniform M = size(B, 2)
+    # Get matrix dimensions
+    @uniform N = size(A, 1)  # Matrix A dimensions
+    @uniform R = size(A, 2)  # Matrix A dimensions  
+    @uniform M = size(B, 2)  # Matrix B column count
 
-    # Cannot use @index(Global), because we use a smaller ndrange(gridsize would reduce)
+    # Calculate global thread indices (cannot use @index(Global) with custom ndrange)
     I = (gi-1) * TILE_DIM + i
     J = (gj-1) * TILE_DIM + j
 
-    # load input A into tile, with bounds checking for non-square matrices
+    # Load input matrix A into shared memory tile with bounds checking
     if i <= N && j <= N
         @inbounds tile1[i, j] = A[i, j]
     else
-        @inbounds tile1[i, j] = 0.0
-        
+        @inbounds tile1[i, j] = zero(eltype(A))
     end
 
-    # load input/output B into tiles, with bounds checking for non-square matrices
+    # Load input/output matrix B into shared memory tile with bounds checking
     if I <= R && J <= M
         @inbounds tile2[i, j] = B[I, J]
     else
-        @inbounds tile2[i, j] = 0.0
+        @inbounds tile2[i, j] = zero(eltype(B))
     end
 
-    # wait for all tiles to be loaded
+    # Synchronize to ensure all tiles are loaded before computation
     @synchronize
 
-    # calculate value of spot in output, use temporary value to allow for vectorization
+    # Calculate triangular matrix-vector product for lower triangular A
+    # For lower triangular: only use elements A[i,k] where k <= i
     out = zero(eltype(B))
     @simd for k in 1:i
         @inbounds out += tile1[i, k] * tile2[k, j]
     end
     B_sub[1] += out
 
+    # Synchronize before writing results
     @synchronize
     
-    # get global indices again
+    # Recalculate global indices after synchronization
     I = (gi-1) * TILE_DIM + i
     J = (gj-1) * TILE_DIM + j
 
-    # save if inbounds
+    # Write result back to global memory if within bounds
     if I <= N && J <= M
         @inbounds B[I, J] = B_sub[1]
     end
@@ -275,38 +314,135 @@ end
 
 
 
-# wrapper function for the LLTRMM kernel
+"""
+    LeftLowerTRMM!(A, B; n_threads=(16,16))
+
+Perform left-sided lower triangular matrix multiplication: B := A * B
+
+# Arguments
+- `A::AbstractMatrix`: N×N lower triangular coefficient matrix
+- `B::AbstractMatrix`: N×M target matrix (modified in-place)
+- `n_threads::Tuple`: Thread block dimensions (default: (16,16))
+
+# Implementation Notes
+- Uses GPU acceleration with optimized kernel
+- Thread block size should not exceed hardware limits
+- NDRange is padded to handle boundary conditions
+"""
 function LeftLowerTRMM!(A, B; n_threads = (16,16))
     backend = get_backend(A)
+    # Calculate NDRange with padding to handle boundary threads
     Ndrange = max(size(A), size(B))
-    Ndrange = (Ndrange[1]+ 16, Ndrange[2]+16)
+    Ndrange = (Ndrange[1] + 16, Ndrange[2] + 16)
     LeftLowerTRMM_kernel!(backend, n_threads)(A, B, ndrange = Ndrange)
-    # need to specify ndrange as the larger of the 2 ARGUMENTS
-    # LeftLowerTRMM_kernel!(backend, n_threads)(A, B, ndrange = max(size(A), size(B)))
 end
 
-# wrapper function for the LUTRMM kernel
+"""
+    LeftUpperTRMM!(A, B; n_threads=(16,16))
+
+Perform left-sided upper triangular matrix multiplication: B := A * B
+
+# Arguments
+- `A::AbstractMatrix`: N×N upper triangular coefficient matrix  
+- `B::AbstractMatrix`: N×M target matrix (modified in-place)
+- `n_threads::Tuple`: Thread block dimensions (default: (16,16))
+"""
 function LeftUpperTRMM!(A, B; n_threads = (16,16))
     backend = get_backend(A)
     Ndrange = max(size(A), size(B))
-    Ndrange = (Ndrange[1]+ 16, Ndrange[2]+16)
-    # could not use overloading with only 2 args
+    Ndrange = (Ndrange[1] + 16, Ndrange[2] + 16)
     LeftUpperTRMM_kernel!(backend, n_threads)(A, B, ndrange = Ndrange)
 end
 
-# wrapper function for the RLTRMM kernel
+"""
+    RightLowerTRMM!(A, B; n_threads=(16,16))
+
+Perform right-sided lower triangular matrix multiplication: B := B * A
+
+# Arguments
+- `A::AbstractMatrix`: N×N lower triangular coefficient matrix
+- `B::AbstractMatrix`: M×N target matrix (modified in-place)  
+- `n_threads::Tuple`: Thread block dimensions (default: (16,16))
+"""
 function RightLowerTRMM!(A, B; n_threads = (16,16))
     backend = get_backend(A)
     Ndrange = max(size(A), size(B))
-    Ndrange = (Ndrange[1]+ 16, Ndrange[2]+16)
-    # could not use overloading with only 2 args
+    Ndrange = (Ndrange[1] + 16, Ndrange[2] + 16)
     RightLowerTRMM_kernel!(backend, n_threads)(A, B, ndrange = Ndrange)
 end
 
+"""
+    RightUpperTRMM!(A, B; n_threads=(16,16))
+
+Perform right-sided upper triangular matrix multiplication: B := B * A
+
+# Arguments  
+- `A::AbstractMatrix`: N×N upper triangular coefficient matrix
+- `B::AbstractMatrix`: M×N target matrix (modified in-place)
+- `n_threads::Tuple`: Thread block dimensions (default: (16,16))
+"""
 function RightUpperTRMM!(A, B; n_threads = (16,16))
     backend = get_backend(A)
     Ndrange = max(size(A), size(B))
-    Ndrange = (Ndrange[1]+ 16, Ndrange[2]+16)
-    # could not use overloading with only 2 args
+    Ndrange = (Ndrange[1] + 16, Ndrange[2] + 16)
     RightUpperTRMM_kernel!(backend, n_threads)(A, B, ndrange = Ndrange)
+end
+
+"""
+    trmm(side, uplo, transa, diag, A, B, alpha=1.0) -> B
+
+Performs triangular matrix multiplication with automatic parameter detection.
+This is a simplified interface for triangular matrix multiplication operations.
+
+# Arguments
+- `side::Char`: Operation side
+    - 'L': B := alpha*op(A)*B (left multiplication)
+    - 'R': B := alpha*B*op(A) (right multiplication)
+- `uplo::Char`: Triangular part specification
+    - 'U': A is upper triangular
+    - 'L': A is lower triangular  
+- `transa::Char`: Operation on matrix A
+    - 'N': op(A) = A (no transpose)
+    - 'T': op(A) = A^T (transpose)
+    - 'C': op(A) = A^H (conjugate transpose)
+- `diag::Char`: Diagonal type (currently unused in GPU implementation)
+    - 'N': non-unit diagonal
+    - 'U': unit diagonal
+- `A::AbstractMatrix`: Triangular coefficient matrix
+- `B::AbstractMatrix`: Target matrix (modified in-place)
+- `alpha`: Scalar multiplier (default: 1.0)
+
+# Returns
+- Updated matrix B (same as input B, modified in-place)
+
+# Example
+```julia
+A = complex.(triu(randn(4, 4)), triu(randn(4, 4)))
+B = complex.(randn(4, 3), randn(4, 3))
+C = trmm('L', 'U', 'N', 'N', A, copy(B))
+```
+
+# Implementation Notes
+- Currently supports 'N' (no transpose) operations only
+- Uses GPU-accelerated kernels for computation
+- The transa and diag parameters are provided for interface compatibility
+"""
+function trmm(side, uplo, transa, diag, A, B, alpha=one(eltype(A)))
+    # Apply scaling if alpha != 1
+    if alpha != one(eltype(A))
+        B .*= alpha
+    end
+    
+    # Dispatch to appropriate GPU kernel based on operation parameters
+    if side == 'L' && uplo == 'L'
+        LeftLowerTRMM!(A, B)
+    elseif side == 'L' && uplo == 'U' 
+        LeftUpperTRMM!(A, B)
+    elseif side == 'R' && uplo == 'L'
+        RightLowerTRMM!(A, B)
+    elseif side == 'R' && uplo == 'U'
+        RightUpperTRMM!(A, B)
+    else
+        error("Unsupported combination of side='$side', uplo='$uplo'")
+    end
 end
\ No newline at end of file
diff --git a/src/trsm.jl b/src/trsm.jl
index ea37a94..1300c5b 100644
--- a/src/trsm.jl
+++ b/src/trsm.jl
@@ -147,4 +147,59 @@ function RightUpperTRSM!(A, B)
     n, m = size(B)
     backend = get_backend(A)
     right_upper_kernel(backend, (m,))(Transpose(A), B, m, ndrange=(m, n))
+end
+
+"""
+    trsm(side, uplo, transa, diag, A, B, alpha=1.0) -> B
+
+Solves triangular matrix systems with automatic parameter detection.
+This is a simplified interface for triangular system solving.
+
+# Arguments
+- 'side': 
+    - 'L': solve op(A)*X = alpha*B
+    - 'R': solve X*op(A) = alpha*B
+- 'uplo':
+    - 'U': A is upper triangular
+    - 'L': A is lower triangular  
+- 'transa': operation on A
+    - 'N': op(A) = A
+    - 'T': op(A) = A^T
+    - 'C': op(A) = A^H
+- 'diag': diagonal type
+    - 'N': non-unit diagonal
+    - 'U': unit diagonal
+- 'A': triangular matrix
+- 'B': right-hand side matrix (will be overwritten with solution)
+- 'alpha': scalar multiplier (default: 1.0)
+
+# Returns
+- Updated matrix B containing the solution
+
+# Example
+```julia
+A = complex.(triu(randn(4, 4)), triu(randn(4, 4)))
+B = complex.(randn(4, 3), randn(4, 3))
+X = trsm('L', 'U', 'N', 'N', A, copy(B))
+```
+"""
+function trsm(side, uplo, transa, diag, A, B, alpha=one(eltype(A)))
+    # Scale B if alpha != 1
+    if alpha != one(eltype(A))
+        B .*= alpha
+    end
+    
+    # Apply the appropriate kernel based on parameters
+    if side == 'L' && uplo == 'L'
+        LeftLowerTRSM!(A, B)
+    elseif side == 'L' && uplo == 'U' 
+        LeftUpperTRSM!(A, B)
+    elseif side == 'R' && uplo == 'L'
+        RightLowerTRSM!(A, B)
+    elseif side == 'R' && uplo == 'U'
+        RightUpperTRSM!(A, B)
+    else
+        error("Unsupported combination of side='$side', uplo='$uplo'")
+    end
+    
 end
\ No newline at end of file
diff --git a/src/tsmqr.jl b/src/tsmqr.jl
index 221d0af..b78af1a 100644
--- a/src/tsmqr.jl
+++ b/src/tsmqr.jl
@@ -1,110 +1,220 @@
-function tsmqr(side, trans, m1, n1, m2, n2, k, ib, 
-    A1, lda1, A2, lda2, V, ldv, T, ldt, work, ldwork)
-
-    #check input arguments
+"""
+    tsmqr!(side, trans, m1, n1, m2, n2, k, ib, A1, A2, V, T, work)
+
+Apply orthogonal matrix Q (or Q^H) stored as compact WY representation to 
+a triangular-pentagonal matrix [A1; A2].
+
+This routine applies a block orthogonal transformation represented in compact
+WY form (stored in V and T) to the combined matrix [A1; A2] where A1 is 
+triangular and A2 is pentagonal.
+
+# Arguments
+- `side`: Character indicating side of multiplication
+  - 'L': Apply Q from the left (Q*[A1; A2] or Q^H*[A1; A2])
+  - 'R': Apply Q from the right ([A1 A2]*Q or [A1 A2]*Q^H)
+- `trans`: Character indicating whether to transpose Q  
+  - 'N': Apply Q (no transpose)
+  - 'C': Apply Q^H (conjugate transpose)
+  - 'T': Apply Q^T (transpose, same as 'C' for complex)
+- `m1`, `n1`: Dimensions of triangular matrix A1
+- `m2`, `n2`: Dimensions of pentagonal matrix A2  
+- `k`: Number of elementary reflectors (columns of V)
+- `ib`: Block size for compact WY representation
+- `A1`: Triangular part of the matrix (modified in-place)
+- `A2`: Pentagonal part of the matrix (modified in-place)
+- `V`: Matrix containing reflector vectors
+- `T`: Upper triangular block reflector coefficient matrix  
+- `work`: Workspace array
+
+
+# Algorithm
+The transformation Q is applied using the compact WY representation:
+Q = I - V * T * V^H
+
+The algorithm processes the reflectors in blocks of size ib, applying
+each block using efficient matrix operations (parfb! routine).
+
+# Input Validation  
+Validates all dimension parameters and leading dimension requirements
+for proper matrix storage and computation.
+
+# Notes
+This is a core computational routine for applying orthogonal transformations
+in blocked QR algorithms. The compact WY form enables efficient block updates.
+"""
+function tsmqr!(side::Char, trans::Char, m1::Integer, n1::Integer, m2::Integer, n2::Integer, k::Integer, ib::Integer,
+    A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, work::AbstractVector{T}) where {T}
+
+    # Input validation with descriptive error messages
     if side != 'L' && side != 'R'
-        throw(ArgumentError("illegal value of side"))
-        return -1
+        throw(ArgumentError("side must be 'L' or 'R', got '$side'"))
     end
 
     if trans != 'N' && trans != 'C' && trans != 'T'
-        throw(ArgumentError("illegal value of trans"))
-        return -2
+        throw(ArgumentError("trans must be 'N', 'C', or 'T', got '$trans'"))
     end
 
     if m1 < 0
-        throw(ArgumentError("illegal value of m1"))
-        return -3
+        throw(ArgumentError("m1 must be non-negative, got $m1"))
     end
 
     if n1 < 0
-        throw(ArgumentError("illegal value of n1"))
-        return -4
+        throw(ArgumentError("n1 must be non-negative, got $n1"))
     end
 
-    if m1 < 0 || (m2 != m1 && side == 'R')
-        throw(ArgumentError("illegal value of m2"))
-        return -5
+    if m2 < 0 || (m2 != m1 && side == 'R')
+        if side == 'R'
+            throw(ArgumentError("For side='R', m2 must equal m1. Got m1=$m1, m2=$m2"))
+        else
+            throw(ArgumentError("m2 must be non-negative, got $m2"))
+        end
     end
 
     if n2 < 0 || (n2 != n1 && side == 'L')
-        throw(ArgumentError("illegal value of n2"))
-        return -6
+        if side == 'L'
+            throw(ArgumentError("For side='L', n2 must equal n1. Got n1=$n1, n2=$n2"))
+        else
+            throw(ArgumentError("n2 must be non-negative, got $n2"))
+        end
     end
 
     if k < 0 || (side == 'L' && k > m1) || (side == 'R' && k > n1)
-        throw(ArgumentError("illegal value of k"))
-        return -7
+        max_k = side == 'L' ? m1 : n1
+        throw(ArgumentError("k must be between 0 and $max_k for side='$side', got $k"))
     end
 
     if ib < 0
-        throw(ArgumentError("illegal value of ib"))
-        return -8
-    end
-
-    if lda1 < max(1,m1)
-        throw(ArgumentError("illegal value of lda1"))
-        return -10
-    end
-
-    if lda2 < max(1,m2)
-        throw(ArgumentError("illegal value of lda2"))
-        return -12
-    end
-
-    if (side == 'L' && ldv < max(1,m2)) || (side == 'R' && ldv < max(1,n2))
-        throw(ArgumentError("illegal value of ldv"))
-        return -14
-    end
-
-    if ldt < max(1,ib)
-        throw(ArgumentError("illegal value of ldt"))
-        return -16
+        throw(ArgumentError("ib must be non-negative, got $ib"))
     end
 
-    if (side == 'L' && ldwork < max(1,ib)) || (side == 'R' && ldwork < max(1,m1))
-        throw(ArgumentError("illegal value of ldwork"))
-        return -18
-    end
-
-    # quick return
-    if m1 == 0 || n1 == 0 || m2 == 0 || n2 == 0  || k == 0 || ib == 0
+    # Quick return for degenerate cases
+    if m1 == 0 || n1 == 0 || m2 == 0 || n2 == 0 || k == 0 || ib == 0
         return 
     end
 
+    # Determine the order of applying blocks based on side and trans
     if (side == 'L' && trans != 'N') || (side == 'R' && trans == 'N')
+        # Apply blocks forward: 1, ib+1, 2*ib+1, ...
         i1 = 1
         i3 = ib
         istop = k
     else
+        # Apply blocks backward: ..., 2*ib+1, ib+1, 1
         i1 = (div(k-1,ib))*ib + 1
         i3 = -ib
         istop = 1
     end
     
+    # Initialize indices for submatrices
     ic = 1
     jc = 1
     mi = m1
     ni = n1
 
+    # Apply blocks of reflectors
     for i in i1:i3:istop
-        kb = min(ib, k-i+1)
+        kb = min(ib, k - i + 1)  # Size of current block
 
         if side == 'L'
-            # H  or H^H is applied to C[i:m, 1:n]
+            # Q is applied from the left: Q * [A1; A2]
             mi = m1 - i + 1
             ic = i
-            ldvv = m2
+            # Workspace for this block: kb x ni
+            W = reshape(@view(work[1:kb*ni]), kb, ni)
+            parfb!('L', trans, 'F', 'C', mi, ni, m2, n2, kb, 0,
+                   (@view A1[ic:ic+mi-1, jc:jc+ni-1]), (@view A2[1:m2, 1:n2]),
+                   (@view V[1:m2, i:i+kb-1]), (@view T_mat[1:kb, i:i+kb-1]), W)
         else
-            # H or H^H is applied to C[1:m, i:n]
-            ni = n1- i + 1
+            # Q is applied from the right: [A1 A2] * Q  
+            ni = n1 - i + 1
             jc = i
-            ldvv = n2
+            # Workspace for this block: mi x kb
+            W = reshape(@view(work[1:mi*kb]), mi, kb)
+            parfb!('R', trans, 'F', 'C', mi, ni, m2, n2, kb, 0,
+                   (@view A1[ic:ic+mi-1, jc:jc+ni-1]), (@view A2[1:m2, 1:n2]),
+                   (@view V[1:n2, i:i+kb-1]), (@view T_mat[1:kb, i:i+kb-1]), W)
         end
+    end
+end
 
-        # apply H or H^H 
-        parfb(side, trans, 'F', 'C', mi, ni, m2, n2, kb, 0,
-        (@view A1[ic:ic+mi-1, jc:jc+ni-1]), lda1, (@view A2[1:m2, 1:n2]), lda2, 
-        (@view V[1:ldvv, i:i+kb-1]), ldvv, (@view T[1:kb, i:i+kb-1]), kb, work, ldwork)
+"""
+    tsmqr!(side, trans, A1, A2, V, T, ib) -> (A1, A2)
+    
+Apply orthogonal matrix Q (stored in compact WY form) to triangular-pentagonal matrices.
+
+This is a high-level interface that automatically determines dimensions and
+allocates workspace for applying block orthogonal transformations to the
+combined matrix [A1; A2].
+
+# Arguments
+- `side`: Character indicating multiplication side
+  - 'L': Apply Q from left (Q*[A1; A2] or Q^H*[A1; A2])
+  - 'R': Apply Q from right ([A1 A2]*Q or [A1 A2]*Q^H)
+- `trans`: Character indicating transpose operation
+  - 'N': Apply Q (no transpose)
+  - 'C': Apply Q^H (conjugate transpose)
+- `A1`: Triangular part of matrix (modified in-place)
+- `A2`: Pentagonal part of matrix (modified in-place)
+- `V`: Matrix containing elementary reflector vectors
+- `T_matrix`: Upper triangular block reflector coefficient matrix
+- `ib`: Block size for the compact WY representation
+
+# Returns
+- Modified `A1`: Triangular part after transformation
+- Modified `A2`: Pentagonal part after transformation
+
+# Input Validation
+- For side='L': n2 must equal n1 (same number of columns)
+- For side='R': m2 must equal m1 (same number of rows)
+- Block size ib should be positive and ≤ min(size(V,2), ib)
+
+# Example
+```julia
+# Apply Q from left to triangular-pentagonal matrix
+m1, n1, m2, n2 = 6, 8, 10, 8  
+k, ib = 4, 2
+A1 = triu(randn(ComplexF64, m1, n1))
+A2 = randn(ComplexF64, m2, n2)
+V = randn(ComplexF64, m2, k)
+T = triu(randn(ComplexF64, ib, k))
+tsmqr!('L', 'N', A1, A2, V, T, ib)
+```
+
+# Algorithm
+Uses blocked approach to apply the orthogonal transformation Q = I - V*T*V^H
+efficiently. The compact WY representation enables high-performance 
+matrix-matrix operations instead of multiple vector operations.
+"""
+function tsmqr!(side::Char, trans::Char, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, 
+               V::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, ib::Integer) where {T}
+    m1, n1 = size(A1)
+    m2, n2 = size(A2)
+    k = size(V, 2)
+    
+    # Validate input dimensions
+    if side == 'L' && n2 != n1
+        throw(ArgumentError("For side='L', A1 and A2 must have same number of columns. Got n1=$n1, n2=$n2"))
+    elseif side == 'R' && m2 != m1
+        throw(ArgumentError("For side='R', A1 and A2 must have same number of rows. Got m1=$m1, m2=$m2"))
+    end
+    
+    if ib <= 0
+        throw(ArgumentError("Block size ib must be positive, got $ib"))
+    end
+    
+    if k > size(T_matrix, 2)
+        throw(ArgumentError("Number of reflectors k ($k) exceeds T matrix columns ($(size(T_matrix, 2)))"))
     end
+    
+    # Determine workspace requirements and allocate
+    if side == 'L'
+        work_size = ib * max(n1, n2)
+    else
+        work_size = m1 * ib
+    end
+    work = zeros(T, work_size)
+    
+    # Call the core computational routine
+    tsmqr!(side, trans, m1, n1, m2, n2, k, ib, A1, A2, 
+          V, T_matrix, work)
 end
diff --git a/src/tsqrt.jl b/src/tsqrt.jl
index 4d61153..fa1636b 100644
--- a/src/tsqrt.jl
+++ b/src/tsqrt.jl
@@ -1,79 +1,173 @@
-function tsqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, tau, work)
-    # check input Arguments
+"""
+    tsqrt!(m, n, ib, A1, A2, T, tau, work)
 
-    if m < 0
-        throw(ArgumentError("illegal value of m"))
-        return -1
-    end
+Compute the QR factorization of an (m+n)-by-n triangular-pentagonal matrix
+using the compact WY representation.
 
-    if n < 0
-        throw(ArgumentError("illegal value of n"))
-        return -2
-    end
+This routine computes the QR factorization of a triangular-pentagonal matrix:
+    [ A1 ]
+    [ A2 ]
+where A1 is n-by-n upper triangular and A2 is m-by-n general.
 
-    if ib < 0
-        throw(ArgumentError("illegal value of ib"))
-        return -3
-    end
+The factorization has the form:
+    [ A1 ] = Q * [ R ]
+    [ A2 ]       [ 0 ]
+where Q is orthogonal and R is upper triangular.
+
+# Arguments
+- `m`: Number of rows of the pentagonal part A2
+- `n`: Number of columns of the triangular-pentagonal matrix  
+- `ib`: Block size for the compact WY representation
+- `A1`: n×n upper triangular matrix (modified in-place)
+- `A2`: m×n general matrix (modified in-place) 
+- `T`: ib×n matrix to store block reflector coefficients
+- `tau`: Vector of length n to store reflector scalar factors
+- `work`: Workspace array of length ib×n
 
-    if lda1 < max(1,n) && n > 0
-        throw(ArgumentError("illegal value of lda1"))
-        return -5
+# Algorithm
+The algorithm proceeds in blocks of size ib:
+1. For each block, generate elementary reflectors to zero the pentagonal part
+2. Apply reflectors to remaining columns using efficient block updates
+3. Store reflector coefficients in compact WY form in matrix T
+
+The compact WY representation allows for efficient application of the 
+orthogonal factor Q using block operations.
+
+# Input Validation
+All dimension parameters must be non-negative and leading dimensions
+must satisfy minimum requirements for valid matrix storage.
+
+# Notes
+This is a low-level computational routine typically called by higher-level
+QR factorization interfaces. The matrices A1, A2 are modified in-place
+to store the R factor and reflector vectors respectively.
+"""
+function tsqrt!(m::Integer, n::Integer, ib::Integer, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}, work::AbstractVector{T}) where {T}
+    # Input validation with descriptive error messages
+    if m < 0
+        throw(ArgumentError("m must be non-negative, got $m"))
     end
 
-    if lda2 < max(1,m) && m > 0
-        throw(ArgumentError("illegal value of lda2"))
-        return -7
+    if n < 0
+        throw(ArgumentError("n must be non-negative, got $n"))
     end
 
-    if ldt < max(1,ib) && ib > 0
-        throw(ArgumentError("illegal value of ldt"))
-        return -9
+    if ib < 0
+        throw(ArgumentError("ib must be non-negative, got $ib"))
     end
 
-    # quick return 
+    # Quick return for degenerate cases
     if m == 0 || n == 0 || ib == 0
         return
     end
 
-    one0 = oneunit(eltype(A1))
-    zero0 = zero(eltype(A1))
-    plus = LinearAlgebra.MulAddMul(one0, one0)
+    Tone = oneunit(eltype(A1))
+    Tzero = zero(eltype(A1))
+    plus = LinearAlgebra.MulAddMul(Tone, Tone)
 
+    # Process matrix in blocks of size ib
     for ii in 1:ib:n
         sb = min(n-ii+1, ib)
 
+        # Generate elementary reflectors for current block
         for i in 1:sb
-            # generate elementary reflector H[ii*ib + i] to annilate A[ii*ib, + i:m, ii*ib + i]
-            A1[ii+i-1, ii+i-1], tau[ii+i-1] = larfg(m+1, A1[ii+i-1, ii+i-1], (@view A2[1:m, ii+i-1]), 1, tau[ii+i-1])
+            # Generate elementary reflector H[ii+i-1] to annihilate A2[1:m, ii+i-1]
+            A1[ii+i-1, ii+i-1], tau[ii+i-1] = larfg!(m+1, A1[ii+i-1, ii+i-1], 
+                (@view A2[1:m, ii+i-1]), 1, tau[ii+i-1])
 
             if ii+i <= n
-                # apply H[ii*ib + i] to A[ii*ib + i:m, ii*ib + i + 1 : ii*ib + ib] from left
+                # Apply H[ii+i-1] to A[ii+i-1:m, ii+i:ii+sb-1] from the left
                 alpha = -conj(tau[ii+i-1])
                 (@view work[1:sb-i]) .= (@view A1[ii+i-1, ii+i:ii+sb-1])
                 
+                # Compute work = A1[ii+i-1, ii+i:ii+sb-1]^H + A2[1:m, ii+i:ii+sb-1]^H * A2[1:m, ii+i-1]
                 conj!((@view work[1:sb-i]))
-                LinearAlgebra.generic_matvecmul!((@view work[1:sb-i]), 'C', (@view A2[1:m, ii+i:ii+sb-1]), (@view A2[1:m, ii+i-1]), plus)
+                LinearAlgebra.generic_matvecmul!((@view work[1:sb-i]), 'C', (@view A2[1:m, ii+i:ii+sb-1]), 
+                    (@view A2[1:m, ii+i-1]), plus)
                 conj!((@view work[1:sb-i]))
+                
+                # Apply the reflector: A1 -= alpha * work, A2 -= alpha * v * work^H
                 LinearAlgebra.axpy!(alpha, (@view work[1:sb-i]), (@view A1[ii+i-1, ii+i:ii+sb-1]))
                 conj!((@view work[1:sb-i]))
                 gerc!(alpha, (@view A2[1:m, ii+i-1]), (@view work[1:sb-i]), (@view A2[1:m, ii+i:ii+sb-1]))
             end
 
-            # Calculate T
-            alpha = -tau[ii+i-1]
-            LinearAlgebra.generic_matvecmul!((@view T[1:i-1, ii+i-1]), 'C', (@view A2[1:m, ii:ii+i-2]), (@view A2[1:m, ii+i-1]),LinearAlgebra.MulAddMul(alpha, zero0))
-            #LinearAlgebra.BLAS.trmv!('U', 'N', 'N', (@view T[1:i-1, ii:ii+i-2]), (@view T[1:i-1, ii+i-1]))
-            LinearAlgebra.generic_trimatmul!((@view T[1:i-1, ii+i-1]), 'U', 'N', identity, (@view T[1:i-1, ii:ii+i-2]), (@view T[1:i-1, ii+i-1]))
-            T[i, ii+i-1] = tau[ii+i-1]
+            # Build triangular factor T for block reflectors
+            if i > 1
+                alpha = -tau[ii+i-1]
+                LinearAlgebra.generic_matvecmul!((@view T_matrix[1:i-1, ii+i-1]), 'C', (@view A2[1:m, ii:ii+i-2]), 
+                    (@view A2[1:m, ii+i-1]), LinearAlgebra.MulAddMul(alpha, Tzero))
+                LinearAlgebra.generic_trimatmul!((@view T_matrix[1:i-1, ii+i-1]), 'U', 'N', identity, 
+                    (@view T_matrix[1:i-1, ii:ii+i-2]), (@view T_matrix[1:i-1, ii+i-1]))
+            end
+            T_matrix[i, ii+i-1] = tau[ii+i-1]
         end
 
+        # Apply block reflector to remaining columns
         if n >= ii+sb
-            ww = reshape(@view(work[1: ib*(n-(ii+sb)+1)]), ib, n-(ii+sb)+1)
-
-            tsmqr('L', 'C', sb, n-(ii+sb) + 1, m, n-(ii+sb) + 1, ib, ib, 
-            (@view A1[ii:ii+sb-1, ii+sb: n]), sb, (@view A2[1:m, ii+sb:n]), m, 
-            (@view A2[1:m, ii:ii+sb-1]), m, (@view T[1:ib, ii:ii+ib-1]), ib, ww, sb)
+            # Use provided vector workspace; tsmqr! will reshape internally as needed
+            tsmqr!('L', 'C', sb, n - (ii + sb) + 1, m, n - (ii + sb) + 1, sb, ib,
+                   (@view A1[ii:ii+sb-1, ii+sb:n]), (@view A2[1:m, ii+sb:n]),
+                   (@view A2[1:m, ii:ii+sb-1]), (@view T_matrix[1:ib, ii:ii+sb-1]), work)
         end
     end
 end
+
+"""
+    tsqrt!(A1, A2, ib) -> (A1, A2, T, tau)
+    
+Compute QR factorization of a triangular-pentagonal matrix using block algorithm.
+
+This is a high-level interface that automatically allocates workspace and
+computes the QR factorization of the combined matrix [A1; A2] where A1 is
+upper triangular and A2 is general.
+
+# Arguments
+- `A1`: n×n upper triangular matrix (modified in-place to store R factor)
+- `A2`: m×n general matrix (modified in-place to store reflector vectors)
+- `ib`: Block size for the algorithm (typically 32-64 for good performance)
+
+# Returns
+- Modified `A1`: Contains the R factor of the QR factorization  
+- Modified `A2`: Contains the elementary reflector vectors
+- `T`: ib×n matrix containing block reflector coefficients
+- `tau`: Length-n vector containing reflector scaling factors
+
+# Input Validation
+- A1 must be square (n×n)
+- A2 must have same number of columns as A1 (m×n)
+- Block size ib should be positive and ≤ n for efficiency
+
+# Example
+```julia
+n, m = 6, 8
+ib = 4
+A1 = triu(randn(ComplexF64, n, n))  # Upper triangular
+A2 = randn(ComplexF64, m, n)        # General matrix
+A1_qr, A2_qr, T, tau = tsqrt!(copy(A1), copy(A2), ib)
+```
+
+# Algorithm Notes  
+Uses blocked algorithm for efficiency with large matrices. The compact WY
+representation (stored in T) enables efficient application of the Q factor.
+"""
+function tsqrt!(A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}, ib::Integer) where {T}
+    n, n2 = size(A1)
+    if n != n2
+        throw(ArgumentError("A1 must be square, got size $(size(A1))"))
+    end
+    
+    m, n3 = size(A2) 
+    if n != n3
+        throw(ArgumentError("A1 and A2 must have same number of columns, got $n and $n3"))
+    end
+    
+    if ib <= 0
+        throw(ArgumentError("Block size ib must be positive, got $ib"))
+    end
+    
+    work = zeros(T, ib * n)
+    
+    # Call the core computational routine
+    tsqrt!(m, n, ib, A1, A2, T_matrix, tau, work)
+end
diff --git a/src/ttmqr.jl b/src/ttmqr.jl
index 7b084dd..ac4b263 100644
--- a/src/ttmqr.jl
+++ b/src/ttmqr.jl
@@ -1,73 +1,40 @@
-function ttmqr(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, work, ldwork)
+function ttmqr!(side::Char, trans::Char, m1::Integer, n1::Integer, m2::Integer, n2::Integer, k::Integer, ib::Integer, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, work::AbstractVector{T}) where {T}
     # check input arguments
     if side != 'L' && side != 'R'
         throw(ArgumentError("illegal value of side"))
-        return -1
     end
 
     if trans != 'N' && trans != 'C' 
         throw(ArgumentError("illegal value of trans"))
-        return -2
     end
 
     if m1 < 0 
         throw(ArgumentError("illegal value of m1"))
-        return -3
     end
 
     if n1 < 0
         throw(ArgumentError("illegal value of n1"))
-        return -4
     end
 
     if (m2 < 0) || (m2 != m1 && side == 'R')
         throw(ArgumentError("illegal value of m2"))
-        return -5
     end
 
     if (n2 < 0) || (n2 != n1 && side == 'L')
         throw(ArgumentError("illegal value of n2"))
-        return -6
     end
 
     if (k < 0) || (side == 'L' && k > m1) || (side == 'R' && k > n1)
         throw(ArgumentError("illegal value of k"))
-        return -7
     end
 
     if ib < 0
         throw(ArgumentError("illegal value of ib"))
-        return -8
-    end
-
-    if lda1 < max(1,m1)
-        throw(ArgumentError("illegal value of lda1"))
-        return -10
-    end
-
-    if lda2 < max(1,m2)
-        throw(ArgumentError("illegal value of lda2"))
-        return -12
-    end
-
-    if ldv < max(1, side == 'L' ? m2 : n2)
-        throw(ArgumentError("illegal value of ldv"))
-        return -14
-    end
-
-    if ldt < max(1,ib)
-        throw(ArgumentError("illegal of ldt"))
-        return -16
-    end
-
-    if ldwork < max(1, side == 'L' ? ib : m1)
-        throw(ArgumentError("illegal value of ldwork"))
-        return -18
     end
 
     # quick return
     if m1 == 0 || n1 == 0 || m2 == 0 || n2 == 0 || k == 0 || ib == 0
-        return 0
+        return
     end
 
     if (side == 'L' && trans != 'N') || (side == 'R' && trans == 'N')
@@ -91,28 +58,69 @@ function ttmqr(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T
         l = 0
 
         if side == 'L'
-            # H or H^H applied to C[i:m, 1:n]
+            # Apply from left on the current block rows
             mi = kb
-            mi2 = min(i+kb-1, m2)
+            mi2 = min(i + kb - 1, m2)
             ic = i
-            l = min(kb, max(0, m2-i))  # Julia 1-based: m2-i+1 (PLASMA has m2-i for 0-based)
-            ldvv = m2
-        else 
+            l = min(kb, max(0, m2 - i))
+            # Workspace as kb x ni
+            W = reshape(@view(work[1:kb*ni]), kb, ni)
+            parfb!('L', trans, 'F', 'C', mi, ni, mi2, ni2, kb, l,
+            (@view A1[ic:ic+mi-1, jc:jc+ni-1]),
+            (@view A2[1:mi2, 1:ni2]),
+            (@view V[1:m2, i:i+kb-1]),
+            (@view T_mat[1:kb, i:i+kb-1]),
+            W)
+        else
+            # Apply from right on the current block columns
             ni = kb
-            ni2 = min(i+kb-1, n2)
+            ni2 = min(i + kb - 1, n2)
             jc = i
-            l = min(kb, max(0, n2-i))  # Julia 1-based: n2-i+1 (PLASMA has n2-i for 0-based)
-            ldvv = n2
+            l = min(kb, max(0, n2 - i))
+            # Workspace as mi x kb
+            W = reshape(@view(work[1:mi*kb]), mi, kb)
+            parfb!('R', trans, 'F', 'C', mi, ni, mi2, ni2, kb, l,
+                   (@view A1[ic:ic+mi-1, jc:jc+ni-1]),
+                   (@view A2[1:mi2, 1:ni2]),
+                   (@view V[1:n2, i:i+kb-1]),
+                   (@view T_mat[1:kb, i:i+kb-1]),
+                   W)
         end
-
-        # apply H or H^H 
-        parfb(side, trans, 'F', 'C', mi, ni, mi2, ni2, kb, l,
-            (@view A1[ic:ic+mi-1, jc:jc+ni-1]), lda1, 
-            (@view A2[1:mi2, 1:ni2]), lda2, 
-            (@view V[1:ldvv, i:i+kb-1]), ldvv, 
-            (@view T[1:ldt, i:i+kb-1]), ldt, 
-            work, ldwork)
         
         i += i3
     end
 end
+
+"""
+    ttmqr!(side, trans, A1, A2, V, T, ib) -> (A1, A2)
+    
+Helper function for triangular-trapezoidal matrix transformation.
+
+# Arguments
+- `side`: 'L' (left) or 'R' (right)
+- `trans`: 'N' (no transpose) or 'C' (conjugate transpose)  
+- `A1`: Upper triangular matrix to be updated
+- `A2`: Trapezoidal matrix to be updated
+- `V`: Reflector vectors matrix
+- `T`: Block reflector matrix
+- `ib`: Block size
+
+# Returns  
+- Modified `A1` and `A2`
+"""
+function ttmqr!(side::Char, trans::Char, A1::AbstractMatrix{T}, A2::AbstractMatrix{T},
+         V::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, ib::Integer) where T
+    m1, n1 = size(A1)
+    m2, n2 = size(A2)
+    # Use the common number of reflectors available in V and T
+    k = size(T_matrix, 2)
+
+    # Workspace size follows parfb!/TPMQRT requirements
+    # - Left: W is (ib x n1) at most
+    # - Right: W is (m1 x ib) at most
+    work_size = side == 'L' ? ib * n1 : m1 * ib
+    work = zeros(T, work_size)
+
+    ttmqr!(side, trans, m1, n1, m2, n2, k, ib, A1, A2,
+        V, T_matrix, work)
+end
diff --git a/src/ttqrt.jl b/src/ttqrt.jl
index 11ef15b..8c6f341 100644
--- a/src/ttqrt.jl
+++ b/src/ttqrt.jl
@@ -1,97 +1,104 @@
-function ttqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, tau, work)
-    begin
-        if m < 0
-            throw(ArgumentError("illegal value of m"))
-            return -1
-        end
-
-        if n < 0
-            throw(ArgumentError("illegal value of n"))
-            return -2
-        end
-
-        if ib < 0
-            throw(ArgumentError("illegal value of ib"))
-            return -3
-        end
-
-        if lda1 < max(1, n) && n > 0
-            throw(ArgumentError("illegal value of lda1"))
-            return -5
-        end
-
-        if lda2 < max(1, m) && m > 0
-            throw(ArgumentError("illegal value of lda2"))
-            return -7
-        end
+function ttqrt!(m::Integer, n::Integer, ib::Integer, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, tau::AbstractVector{T}, work::AbstractVector{T}) where {T}
+    if m < 0
+        throw(ArgumentError("illegal value of m"))
+    end
 
-        if ldt < max(1, ib) && ib > 0
-            throw(ArgumentError("illegal value of ldt"))
-            return -9
-        end
+    if n < 0
+        throw(ArgumentError("illegal value of n"))
+    end
 
-        # quick return
-        if m == 0 || n == 0 || ib == 0
-            return
-        end
+    if ib < 0
+        throw(ArgumentError("illegal value of ib"))
+    end
 
-        #   original function had this todo:
-        #   todo: Need to check why some cases require this to avoid
-        #   uninitialized values
-        #   core_zlaset(CoreBlasGeneral, ib, n, 0.0, 0.0, T, ldt);
+    # quick return
+    if m == 0 || n == 0 || ib == 0
+        return
+    end
 
-        one = oneunit(eltype(A1))
+    #   original function had this todo:
+    #   todo: Need to check why some cases require this to avoid
+    #   uninitialized values
+    #   core_zlaset(CoreBlasGeneral, ib, n, 0.0, 0.0, T, ldt);
 
-        for ii in 1:ib:n
-            sb = min(n - ii + 1, ib)
+    one = oneunit(eltype(A1))
+    Tzero = zero(eltype(A1))
 
-            for i in 1:sb
-                j = ii + i - 1 # index
-                mi = min(j, m) # length
-                ni = sb - i  # length
+    for ii in 1:ib:n
+        sb = min(n - ii + 1, ib)
 
-                A1[j, j], tau[j] = larfg(mi + 1, A1[j, j], (@view A2[1:mi, j]), 1, tau[j])
+        for i in 1:sb
+            j = ii + i - 1 # index
+            mi = min(j, m) # length
+            ni = sb - i  # length
 
-                if ni > 0
-                    work[1:ni] .= (@view A1[j, j+1:j+ni])
-                    conj!((@view work[1:ni]))
+            A1[j, j], tau[j] = larfg!(mi + 1, A1[j, j], (@view A2[1:mi, j]), 1, tau[j])
 
-                    LinearAlgebra.generic_matvecmul!((@view work[1:ni]), 'C', (@view A2[1:mi, j+1:j+ni]),
-                        (@view A2[1:mi, j]), LinearAlgebra.MulAddMul(one, one))
-                    conj!((@view work[1:ni]))
+            if ni > 0
+                work[1:ni] .= (@view A1[j, j+1:j+ni])
+                conj!((@view work[1:ni]))
 
-                    alpha = -conj(tau[j])
-                    axpy!(alpha, (@view work[1:ni]), (@view A1[j, j+1:j+ni]))
-                    conj!((@view work[1:ni]))
-                    gerc!(alpha, (@view A2[1:mi, j]), (@view work[1:ni]), (@view A2[1:mi, j+1:j+ni]))
-                end
+                LinearAlgebra.generic_matvecmul!((@view work[1:ni]), 'C', (@view A2[1:mi, j+1:j+ni]),
+                    (@view A2[1:mi, j]), LinearAlgebra.MulAddMul(one, one))
+                conj!((@view work[1:ni]))
 
-                # calculate T
-                if i > 1
-                    l = min(i - 1, max(0, m - ii + 1)) # length
-                    alpha = -tau[j]
+                alpha = -conj(tau[j])
+                axpy!(alpha, (@view work[1:ni]), (@view A1[j, j+1:j+ni]))
+                conj!((@view work[1:ni]))
+                gerc!(alpha, (@view A2[1:mi, j]), (@view work[1:ni]), (@view A2[1:mi, j+1:j+ni]))
+            end
 
-                    pemv('C', 'C', min(j - 1, m), i - 1, l, alpha, (@view A2[1:m, ii:ii+i-2]), lda2,
-                        (@view A2[1:m, j]), 0, (@view T[1:i-1, j]), work)
-                    LinearAlgebra.generic_trimatmul!((@view T[1:i-1, j]), 'U', 'N', identity, (@view T[1:i-1, ii:ii+i-2]), (@view T[1:i-1, j]))
-                end
+            # calculate T
+            if i > 1
+                l = min(i - 1, max(0, m - ii + 1)) # length
+                alpha = -tau[j]
 
-                T[i, j] = tau[j]
+                pemv!('C', 'C', min(j - 1, m), i - 1, l, alpha, (@view A2[1:m, ii:ii+i-2]),
+                    (@view A2[1:m, j]), Tzero, (@view T_mat[1:i-1, j]), work)
+                LinearAlgebra.generic_trimatmul!((@view T_mat[1:i-1, j]), 'U', 'N', identity, (@view T_mat[1:i-1, ii:ii+i-2]), (@view T_mat[1:i-1, j]))
             end
 
-            if (n >= ii + sb)
-                mi = min(ii + sb - 1, m)
-                ni = n - (ii + sb - 1)
-                l = min(sb, max(0, mi - ii + 1))
-                ww = reshape(@view(work[1:sb*ni]), sb, ni) # k by n1 -- sb by ni
+            T_mat[i, j] = tau[j]
+        end
 
-                parfb('L', 'C', 'F', 'C', ib, ni, mi, ni, sb, l, (@view A1[ii:ii+ib-1, ii+sb:ii+sb+ni-1]),
-                    lda1, (@view A2[1:mi, ii+sb:ii+sb+ni-1]), lda2, (@view A2[1:mi, ii:ii+sb-1]), lda2,
-                    (@view T[1:sb, ii:ii+sb-1]), ldt, ww, sb)
+        if (n >= ii + sb)
+            mi = min(ii + sb - 1, m)
+            ni = n - (ii + sb - 1)
+            l = min(sb, max(0, mi - ii + 1))
+            # Workspace reshape for this call: sb x ni (left side)
+            W = reshape(@view(work[1:sb*ni]), sb, ni)
+            parfb!('L', 'C', 'F', 'C', ib, ni, mi, ni, sb, l,
+                (@view A1[ii:ii+ib-1, ii+sb:ii+sb+ni-1]),
+                (@view A2[1:mi, ii+sb:ii+sb+ni-1]),
+                (@view A2[1:mi, ii:ii+sb-1]),
+                (@view T_mat[1:sb, ii:ii+sb-1]),
+                W)
 
-            end
         end
-
-        return
     end
 end
+
+"""
+    ttqrt!(A, B, ib) -> (A, B, T, tau)
+    
+Helper for triangular-triangular QR factorization.
+
+# Arguments
+- `A`: Upper triangular matrix (n × n)
+- `B`: Upper triangular matrix (n × n)
+- `ib`: Block size
+
+# Returns
+- Modified `A` and `B` matrices
+- `T`: Block reflector matrix  
+- `tau`: Scalar factors
+"""
+function ttqrt!(ib::Integer, A::AbstractMatrix{T}, B::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, tau::AbstractVector{T}) where {T}
+    m, n = size(A)
+    m2, n2 = size(B)
+    @assert m2 == m && n2 == n "A and B must have same dimensions"
+
+    work = zeros(T, ib * n)
+    
+    ttqrt!(m, n, ib, A, B, T_mat, tau, work)
+end
diff --git a/src/unmqr.jl b/src/unmqr.jl
index 9df884c..ab94981 100644
--- a/src/unmqr.jl
+++ b/src/unmqr.jl
@@ -1,151 +1,231 @@
 """
-	unmqr(side, trans, m, n, k, ib, A, lda, T, ldt, C, ldc, work, ldwork)
+    unmqr!(side, trans, m, n, k, ib, A, lda, T_matrix, C, work)
 
-Overwrites the general m-by-n tile C with
-					side = 'L'		side = 'R'
-	trans = 'N'		   Q*C				C*Q
-	trans = 'C'		   Q^H*C			C*Q^H
+Apply orthogonal matrix Q (or Q^H) from a QR factorization to a general matrix C.
 
-where Q is a unitary matrix defined as the product of k elementary reflectors
-	Q = H(1) H(2) ... H(k)
-as returned by zgeqrt. Q is of order m if side = 'L" and of order n if side = 'R'
+Overwrites the general m-by-n matrix C with:
+                    side = 'L'        side = 'R'
+    trans = 'N'       Q * C           C * Q
+    trans = 'C'       Q^H * C         C * Q^H
+
+where Q is a unitary matrix defined as the product of k elementary reflectors:
+Q = H(1) H(2) ... H(k)
+
+as returned by geqrt!. Q is of order m if side = 'L' and of order n if side = 'R'.
 
 # Arguments
-- 'side': 
-	- = 'L': apply Q or Q^H from the left
-	- = 'R': apply Q or Q^H from the right
-- 'trans':
-	- = 'N': no transpose, apply Q
-	- = 'C': conjugate transpose, apply Q^H
-- 'm': the number of rows of the tile C. m >= 0
-- 'n': the number of columns of the tile C. n >= 0
-- 'k': the number of elementary refelctors whose product defines the matrix Q
-	- if side = 'L', m >= k >= 0
-	- if side = 'R', n >= k >= 0
-- 'ib': the inner blocking size. ib >= 0
-- 'A': dimension (lda, k)
-	the i-th column must contain the vector which defines the 
-	elementary reflector H(i) for i = 1,2,...,k,
-	as returned by zgeqrt in the first k columns of its array argument A
-- 'lda': the leading dimension of array A
-	if side = 'L', lda >= max(1,m)
-	if side = 'R', lda >= max(1,n)
-- 'T': the ib-by-k triangular factor T of the block reflector
-	T is upper triangular by block (economic storage)
-	The rest of the array is not referenced
-- 'ldt': the elding dimension of the array T. ldt >= ib
-- 'C': 
-	On entry the m-by-n tile C
-	On exit, C is overwritten by Q*C or Q^H*C or C*Q^H or C*Q.
--'work': auxillary workspace of array work
-	ldwork-by-n if side = 'L'
-	ldwork by ib if side = 'R'
-- 'ldwork': the leading dimension of array work
-	ldwork >= max(1,ib) if side = 'L'
-	ldwork >= max(1,m) if side = 'R'
+- `side`: Character specifying which side to apply Q
+  - 'L': Apply Q or Q^H from the left
+  - 'R': Apply Q or Q^H from the right
+- `trans`: Character specifying transpose operation
+  - 'N': No transpose, apply Q
+  - 'C': Conjugate transpose, apply Q^H
+- `m`: Number of rows of matrix C (≥ 0)
+- `n`: Number of columns of matrix C (≥ 0)
+- `k`: Number of elementary reflectors defining Q
+  - If side = 'L': m ≥ k ≥ 0
+  - If side = 'R': n ≥ k ≥ 0
+- `ib`: Inner block size (≥ 0)
+- `A`: Matrix of dimension (lda, k) containing reflector vectors
+  The i-th column contains the vector defining elementary reflector H(i),
+  as returned by geqrt! in the first k columns
+- `lda`: Leading dimension of array A
+  - If side = 'L': lda ≥ max(1,m)
+  - If side = 'R': lda ≥ max(1,n)
+- `T`: ib×k triangular factor of the block reflector
+  T is upper triangular by blocks (economic storage)
+- `C`: m×n matrix to be transformed (modified in-place)
+- `work`: Workspace array
+
+# Algorithm
+The routine applies Q using the compact WY representation stored in A and T.
+It processes the elementary reflectors in blocks of size ib, using efficient
+block operations (larfb!) for high performance.
+
+The order of applying blocks depends on side and trans parameters to ensure
+numerical stability and efficiency.
+
+# Notes
+This is a core computational routine for applying orthogonal transformations
+from QR factorizations. It is typically called by higher-level interfaces.
 """
-function unmqr(side, trans, m, n, k, ib, A, lda, T, ldt, C, ldc, work, ldwork)
-	if side != 'L' && side != 'R'
-        throw(ArgumentError("illegal value of side"))
-		return -1
-	end
-
-	if side == 'L'
-		nq = m
-		nw = n
-	else
-		nq = n
-		nw = m
-	end
-
-	if trans != 'N' && trans != 'C' && trans != 'T'
-        throw(ArgumentError("illegal value of trans"))
-		return -2
-	end
-
-	if m < 0
-        throw(ArgumentError("illegal value of m"))
-		return -3
-	end
-
-	if n < 0
-        throw(ArgumentError("illegal value of n"))
-		return -4
-	end
-
-	if k < 0 || k > nq
-        throw(ArgumentError("illegal value of k"))
-		return -5
-	end
-
-	if ib < 0 
-        throw(ArgumentError("illegal value of ib"))
-		return -6
-	end
-
-	if lda < max(1, nq) && nq > 0
-        throw(ArgumentError("illegal value of lda"))
-		return -8
-	end
-
-	if ldt < max(1,ib)
-        throw(ArgumentError("illegal value of ldt"))
-		return -10
-	end
-
-	if ldc < max(1,m) && m > 0
-        throw(ArgumentError("illegal value of ldc"))
-		return -12
-	end
-
-	if ldwork < max(1,nw) && nw > 0
-        throw(ArgumentError("illegal value of ldwork"))
-		return -14
-	end
-
-	# quick return 
-	if m == 0 || n == 0 || k == 0
-		return
-	end
-
-	if ((side == 'L' && trans != 'N') || (side == 'R' && trans == 'N'))
-		i1 = 1
-		i3 = ib
-		ibstop = k
-	else
-		i1 = div((k-1),ib)*ib + 1
-		i3 = -ib
-		ibstop = 1
-	end
-	
-	ic = 1
-	jc = 1
-	ni = n
-	mi = m
-
-	if side == 'L'
-		wwork = ones(eltype(A), n, ib)
-		ldw = n
-	else
-		wwork = ones(eltype(A), m, ib)
-		ldw = m
-	end
-
-	for i in i1 : i3 : ibstop
-		kb = min(ib, k-i+1)
-
-		if side == 'L'
-			# apply to C[i:m, 1:n]
-			mi = m - i + 1
-			ic = i
-		else
-			# apply to C[1:m, i:n]
-			ni = n-i + 1
-			jc = i
-		end
-
+function unmqr!(side::Char, trans::Char, m::Integer, n::Integer, k::Integer, ib::Integer, A::AbstractMatrix{T}, lda::Integer, T_matrix::AbstractMatrix{T}, C::AbstractMatrix{T}, work::AbstractMatrix{T}) where {T}
+    # Input validation with descriptive error messages
+    if side != 'L' && side != 'R'
+        throw(ArgumentError("side must be 'L' or 'R', got '$side'"))
+    end
+
+    if side == 'L'
+        nq = m  # Order of Q when applied from left
+        nw = n  # Width for workspace
+    else
+        nq = n  # Order of Q when applied from right  
+        nw = m  # Width for workspace
+    end
+
+    if trans != 'N' && trans != 'C' && trans != 'T'
+        throw(ArgumentError("trans must be 'N', 'C', or 'T', got '$trans'"))
+    end
+
+    if m < 0
+        throw(ArgumentError("m must be non-negative, got $m"))
+    end
+
+    if n < 0
+        throw(ArgumentError("n must be non-negative, got $n"))
+    end
+
+    if k < 0 || k > nq
+        throw(ArgumentError("k must satisfy 0 ≤ k ≤ $nq, got $k"))
+    end
+
+    if ib < 0 
+        throw(ArgumentError("ib must be non-negative, got $ib"))
+    end
+
+    if lda < max(1, nq) && nq > 0
+        throw(ArgumentError("lda must be ≥ max(1,$nq), got $lda"))
+    end
+
+    # Quick return for degenerate cases
+    if m == 0 || n == 0 || k == 0
+        return
+    end
+
+    # Determine order of applying reflector blocks
+    if ((side == 'L' && trans != 'N') || (side == 'R' && trans == 'N'))
+        # Apply blocks forward: 1, ib+1, 2*ib+1, ...
+        i1 = 1
+        i3 = ib
+        ibstop = k
+    else
+        # Apply blocks backward: ..., 2*ib+1, ib+1, 1
+        i1 = div((k-1),ib)*ib + 1
+        i3 = -ib
+        ibstop = 1
+    end
+    
+    # Initialize submatrix indices
+    ic = 1
+    jc = 1
+    ni = n
+    mi = m
+
+    # Allocate workspace for block operations
+    if side == 'L'
+        wwork = ones(eltype(A), n, ib)
+        ldw = n
+    else
+        wwork = ones(eltype(A), m, ib)
+        ldw = m
+    end
+
+    # Apply blocks of elementary reflectors
+    for i in i1 : i3 : ibstop
+        kb = min(ib, k-i+1)  # Size of current block
+
+        if side == 'L'
+            # Apply to C[i:m, 1:n]
+            mi = m - i + 1
+            ic = i
+        else
+            # Apply to C[1:m, i:n]
+            ni = n - i + 1
+            jc = i
+        end
+
+        # Get view of submatrix to transform
         cv = @view C[ic:m, jc:n]
 
-        larfb(side, trans, 'F', 'C', mi, ni, kb, (@view A[i:lda, i:i+kb-1]), lda-i+1, (@view T[1:kb, i:i+kb-1]), kb, cv, ldc, (@view wwork[:, 1:kb]), ldw)
-	end
+        # Apply current block of reflectors
+        larfb!(side, trans, 'F', 'C', mi, ni, kb,
+            (@view A[i:lda, i:i+kb-1]), lda-i+1,
+            (@view T_matrix[1:kb, i:i+kb-1]),
+            cv, (@view wwork[:, 1:kb]))
+    end
+end
+
+"""
+    unmqr!(side, trans, A_qr, T, C, ib) -> C
+    
+Apply orthogonal matrix Q from QR factorization to matrix C.
+
+This is a high-level interface that automatically determines dimensions and
+allocates workspace to apply the orthogonal factor Q from a QR factorization
+to a general matrix C.
+
+# Arguments
+- `side`: Character specifying application side
+  - 'L': Apply Q from left (Q*C or Q^H*C)
+  - 'R': Apply Q from right (C*Q or C*Q^H)
+- `trans`: Character specifying transpose operation  
+  - 'N': Apply Q (no transpose)
+  - 'C': Apply Q^H (conjugate transpose)
+- `A_qr`: QR factorization result from geqrt! (contains reflector vectors)
+- `T_matrix`: Block reflector coefficient matrix from geqrt!
+- `C`: Matrix to transform (modified in-place)
+- `ib`: Block size used in QR factorization
+
+# Returns
+- Modified matrix `C` after applying the orthogonal transformation
+
+# Input Validation
+- Matrix dimensions must be compatible with the QR factorization
+- Block size ib must be positive and consistent with T matrix dimensions
+- For side='L': number of rows of C must match Q dimension
+- For side='R': number of columns of C must match Q dimension
+
+# Example
+```julia
+# Apply Q from QR factorization to matrix C
+m, n, k = 10, 8, 6
+ib = 4
+A = randn(ComplexF64, m, k)
+A_qr, T, tau = geqrt!(copy(A), ib)
+C = randn(ComplexF64, m, n)
+unmqr!('L', 'N', A_qr, T, C, ib)  # C := Q * C
+```
+
+# Algorithm
+Uses the blocked compact WY representation to apply Q efficiently through
+matrix-matrix operations rather than individual elementary reflectors.
+"""
+function unmqr!(side::Char, trans::Char, A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, 
+               C::AbstractMatrix{T}, ib::Integer) where {T}
+    m, n = size(C)
+    k = size(T_matrix, 2)
+    
+    # Validate input dimensions
+    if ib <= 0
+        throw(ArgumentError("Block size ib must be positive, got $ib"))
+    end
+    
+    if side == 'L'
+        if size(A, 1) != m
+            throw(ArgumentError("For side='L', A_qr rows ($(size(A, 1))) must match C rows ($m)"))
+        end
+        if size(A, 2) < k
+            throw(ArgumentError("A_qr columns ($(size(A, 2))) must be ≥ k ($k)"))
+        end
+    else  # side == 'R'
+        if size(A, 1) != n
+            throw(ArgumentError("For side='R', A_qr rows ($(size(A, 1))) must match C columns ($n)"))
+        end
+        if size(A, 2) < k
+            throw(ArgumentError("A_qr columns ($(size(A, 2))) must be ≥ k ($k)"))
+        end
+    end
+    
+    # Set leading dimensions
+    lda = max(1, size(A, 1))
+    
+    # Allocate workspace based on side (matrix workspace expected by low-level)
+    if side == 'L'
+        work = zeros(T, n, ib)
+    else
+        work = zeros(T, m, ib)
+    end
+    
+    # Call the core computational routine
+    unmqr!(side, trans, m, n, k, ib, A, lda, T_matrix, C, work)
 end
diff --git a/test/geqr2.jl b/test/geqr2.jl
index a9c85d6..0230ad4 100644
--- a/test/geqr2.jl
+++ b/test/geqr2.jl
@@ -52,51 +52,46 @@ end
                         for imat in 1:4
                             @testset "Matrix type $imat" begin
                                 A_orig = generate_qr_test_matrix(T, m, n, imat)
-                                
-                                # --- Reference Calculation ---
-                                A_ref = copy(A_orig)
-                                tau_ref = zeros(T, k)
-                                A_ref = qr(A_ref).factors
 
                                 # --- NextLA Calculation ---
                                 A_test = copy(A_orig)
-                                lda = max(1, m)
                                 tau_test = zeros(T, k)
-                                work_test = zeros(T, n)  # Work array size n for geqr2
-                                NextLA.geqr2(m, n, A_test, lda, tau_test, work_test)
+                                work_test = zeros(T, n)  # Work array size n for geqr2!
+                                NextLA.geqr2!(m, n, A_test, tau_test, work_test)
+
+                                # --- Test Helper Function ---
+                                A_helper = copy(A_orig)
+                                tau_helper = zeros(T, k)
+                                NextLA.geqr2!(A_helper, tau_helper)
+
+                                # Verify helper gives same results as kernel
+                                @test A_helper ≈ A_test rtol=rtol atol=atol
+                                if k > 0
+                                    @test tau_helper ≈ tau_test rtol=rtol atol=atol
+                                end
 
                                 # --- Comparisons ---
                                 if m == 0 || n == 0
                                     @test size(A_test) == size(A_orig)
                                 else
-                                    # 1. Compare the factored matrix A (contains V and R)
-                                    scaled_rtol = rtol * max(1, m, n)
-                                    @test A_test ≈ A_ref rtol=scaled_rtol
-
-                                    # 3. Mathematical property checks
+                                    #Mathematical property checks
                                     if k > 0
                                         # Extract R from the factored matrix
                                         R_test = triu(A_test[1:k, 1:n])
                                         
-                                        # Form Q using LAPACK's unmqr
+                                        # Form Q using LAPACK's unmqr!
                                         Q_test = Matrix{T}(I, m, m)
-                                        try
-                                            LAPACK.ormqr!('L', 'N', A_test, tau_test, Q_test)
-                                            
-                                            # Test 3a: Reconstruction. A_orig should be Q * R.
-                                            A_recon = Q_test[:, 1:k] * R_test
-                                            reconstruction_tol = rtol * max(1, m, n) * norm(A_orig)
-                                            @test A_orig ≈ A_recon rtol=reconstruction_tol
+                                        LAPACK.ormqr!('L', 'N', A_test, tau_test, Q_test)
+                                        
+                                        # Test 3a: Reconstruction. A_orig should be Q * R.
+                                        A_recon = Q_test[:, 1:k] * R_test
+                                        reconstruction_tol = rtol * max(1, m, n) * norm(A_orig)
+                                        @test A_orig ≈ A_recon
 
-                                            # Test 3b: Orthogonality of Q. Q' * Q should be Identity.
-                                            orthog_error = norm(Q_test' * Q_test - I)
-                                            orthog_tol = rtol * m
-                                            @test orthog_error < orthog_tol
-                                        catch e
-                                            # If LAPACK fails, just check basic properties
-                                            @test all(isfinite.(A_test))
-                                            @test all(isfinite.(tau_test))
-                                        end
+                                        # Test 3b: Orthogonality of Q. Q' * Q should be Identity.
+                                        orthog_error = norm(Q_test' * Q_test - I)
+                                        orthog_tol = rtol * m
+                                        @test orthog_error < orthog_tol
                                         
                                         # Additional checks
                                         @test all(isfinite.(A_test))
@@ -119,17 +114,16 @@ end
                 # Test edge cases and error conditions
                 m, n = 500, 300
                 A = rand(T, m, n)
-                lda = m
                 tau = zeros(T, min(m, n))
                 work = zeros(T, n)
                 
                 # Valid call should not error
-                @test_nowarn NextLA.geqr2(m, n, copy(A), lda, copy(tau), copy(work))
+                @test_nowarn NextLA.geqr2!(m, n, copy(A), copy(tau), copy(work))
                 
                 # Zero dimensions should not error
-                @test_nowarn NextLA.geqr2(0, 0, zeros(T, 0, 0), 1, T[], T[])
-                @test_nowarn NextLA.geqr2(0, 300, zeros(T, 0, 300), 1, T[], zeros(T, 300))
-                @test_nowarn NextLA.geqr2(500, 0, zeros(T, 500, 0), 500, T[], T[])
+                @test_nowarn NextLA.geqr2!(0, 0, zeros(T, 0, 0), T[], T[])
+                @test_nowarn NextLA.geqr2!(0, 300, zeros(T, 0, 300), T[], zeros(T, 300))
+                @test_nowarn NextLA.geqr2!(500, 0, zeros(T, 500, 0), T[], T[])
             end
         end
     end
@@ -153,10 +147,10 @@ end
                     
                     # CPU reference
                     A_cpu_result = copy(A_cpu)
-                    NextLA.geqr2(m, n, A_cpu_result, m, tau_cpu, work_cpu)
+                    NextLA.geqr2!(m, n, A_cpu_result, tau_cpu, work_cpu)
                     
                     # GPU test
-                    NextLA.geqr2(m, n, A_gpu, m, tau_gpu, work_gpu)
+                    NextLA.geqr2!(m, n, A_gpu, tau_gpu, work_gpu)
                     
                     # Compare results
                     @test Array(A_gpu) ≈ A_cpu_result rtol=rtol
diff --git a/test/geqrt.jl b/test/geqrt.jl
index 71ed18e..cf5f602 100644
--- a/test/geqrt.jl
+++ b/test/geqrt.jl
@@ -3,40 +3,11 @@ using NextLA
 using LinearAlgebra, LinearAlgebra.LAPACK
 using Random
 
-# Function signature: geqrt(m, n, ib, A, lda, T, ldt, tau, work)
+# Function signature: geqrt!(m, n, ib, A, lda, T, ldt, tau, work)
 const GEQRT_TYPES = [ComplexF32, ComplexF64, Float32, Float64]
 const GEQRT_SIZES = [(0,0), (100,100), (200,100), (100,200), (400,300), (800,600), (150,100), (200,150)]
 const GEQRT_BLOCKSIZES = [100, 200, 400, 800]
 
-function generate_qr_test_matrix(::Type{T}, m, n, imat=1) where T
-    if m == 0 || n == 0
-        return zeros(T, m, n)
-    end
-    
-    # Use the matrix generation from runtests.jl
-    if imat == 1
-        # Well-conditioned random matrix
-        return matrix_generation(T, m, n, mode=:decay, cndnum=2.0)
-    elseif imat == 2
-        # Moderately ill-conditioned
-        return matrix_generation(T, m, n, mode=:decay, cndnum=1e2)
-    elseif imat == 3
-        # Severely ill-conditioned
-        return matrix_generation(T, m, n, mode=:one_large, cndnum=1e6)
-    elseif imat == 4
-        # Random matrix
-        return rand(T, m, n)
-    else
-        # Identity-like matrix
-        A = zeros(T, m, n)
-        k = min(m, n)
-        for i in 1:k
-            A[i, i] = one(T)
-        end
-        return A
-    end
-end
-
 @testset "GEQRT Tests" begin
     @testset "Blocked QR Factorization Tests" begin
         for (itype, T) in enumerate(GEQRT_TYPES)
@@ -74,13 +45,24 @@ end
 
                                         # --- NextLA Blocked QR ---
                                         A_test = copy(A_orig)
-                                        lda = max(1, m)
                                         T_test = zeros(T, max(1,ib), k)  # Block reflector matrix
-                                        ldt = max(1, ib)
                                         tau_test = zeros(T, k)
                                         work_test = zeros(T, ib * n)  # Work array
                                         
-                                        NextLA.geqrt(m, n, ib, A_test, lda, T_test, ldt, tau_test, work_test)
+                                        NextLA.geqrt!(m, n, ib, A_test, T_test, tau_test, work_test)
+
+                                        # --- Test Helper Function ---
+                                        A_helper = copy(A_orig)
+                                        T_helper = zeros(T, max(1, ib), k)
+                                        tau_helper = zeros(T, k)
+                                        NextLA.geqrt!(ib, A_helper, T_helper, tau_helper)
+                                        
+                                        # Verify helper gives same results as kernel (in-place)
+                                        if k > 0
+                                            @test A_helper ≈ A_test rtol=rtol atol=atol
+                                            @test T_helper[1:ib, 1:k] ≈ T_test[1:ib, 1:k] rtol=rtol atol=atol
+                                            @test tau_helper ≈ tau_test rtol=rtol atol=atol
+                                        end
 
                                         # --- Comparisons ---
                                         if m == 0 || n == 0
@@ -96,31 +78,27 @@ end
                                                 
                                                 
                                                 # For small matrices, verify reconstruction
-                                                if m <= 200 && n <= 200
-                                                    # Extract R from the factored matrix
-                                                    R_test = triu(A_test[1:k, 1:n])
-                                                    
-                                                    # Form Q using LAPACK's unmqr
-                                                    Q_test = Matrix{T}(I, m, m)
-                                                    LAPACK.ormqr!('L', 'N', A_test, tau_test, Q_test)
-                                                    
-                                                    # Test 3a: Reconstruction. A_orig should be Q * R.
-                                                    A_recon = Q_test[:, 1:k] * R_test
-                                                    _, R = qr(A_orig)
-                                                    reconstruction_tol = rtol * max(1, m, n) * norm(A_orig)
-                                                    @test A_orig ≈ A_recon rtol=reconstruction_tol
-                                                    @test norm(R - R_test) < reconstruction_tol
-        
-                                                    # Test 3b: Orthogonality of Q. Q' * Q should be Identity.
-                                                    orthog_error = norm(adjoint(Q_test) * Q_test - I)
-                                                    orthog_tol = rtol * m
-                                                    @test orthog_error < orthog_tol
+                                                # Extract R from the factored matrix
+                                                R_test = triu(A_test[1:k, 1:n])
                                                 
-                                                    # Additional checks
-                                                    @test all(isfinite.(A_test))
-                                                    @test all(isfinite.(tau_test))
-                                                    @test size(A_test) == size(A_orig)
-                                                end
+                                                # Form Q using LAPACK's unmqr!
+                                                Q_test = Matrix{T}(I, m, m)
+                                                LAPACK.ormqr!('L', 'N', A_test, tau_test, Q_test)
+                                                
+                                                # Test 3a: Reconstruction. A_orig should be Q * R.
+                                                A_recon = Q_test[:, 1:k] * R_test
+                                                reconstruction_tol = rtol * max(1, m, n) * norm(A_orig)
+                                                @test A_orig ≈ A_recon rtol=reconstruction_tol
+    
+                                                # Test 3b: Orthogonality of Q. Q' * Q should be Identity.
+                                                orthog_error = norm(adjoint(Q_test) * Q_test - I)
+                                                orthog_tol = rtol * m
+                                                @test orthog_error < orthog_tol
+                                            
+                                                # Additional checks
+                                                @test all(isfinite.(A_test))
+                                                @test all(isfinite.(tau_test))
+                                                @test size(A_test) == size(A_orig)
 
                                             end
                                         end
@@ -134,100 +112,33 @@ end
         end
     end
     
-    @testset "Square Matrix Tests" begin
-        n = 16
-        ib = 4
-        A = rand(ComplexF64, n, n)
-        A_original = copy(A)
-        lda = n
-        T = zeros(ComplexF64, ib, n)
-        ldt = ib
-        tau = zeros(ComplexF64, n)
-        work = zeros(ComplexF64, ib * n)
-        
-        NextLA.geqrt(n, n, ib, A, lda, T, ldt, tau, work)
-        
-        # For square matrices, check complete factorization
-        R_our = triu(A)
-        
-        # Compare with Julia's QR
-        Q_ref, R_ref = qr(A_original)
-        R_ref_mat = Matrix(R_ref)
-
-        @test norm(R_our - R_ref_mat) < 1e-10
-    end
-    
-    @testset "Tall Matrix Tests" begin
-        m, n, ib = 30, 15, 5
-        A = rand(ComplexF64, m, n)
-        A_original = copy(A)
-        lda = m
-        T = zeros(ComplexF64, ib, n)
-        ldt = ib
-        tau = zeros(ComplexF64, n)
-        work = zeros(ComplexF64, ib * n)
-        
-        NextLA.geqrt(m, n, ib, A, lda, T, ldt, tau, work)
-        k = min(m, n)
-        R_our = triu(A[1:k, 1:k])
-
-        Q_ref, R_ref = qr(A_original)
-        R_ref_mat = Matrix(R_ref)
-        
-        # Check upper triangular structure
-        @test norm(R_our - R_ref_mat) < 1e-10
-    end
-    
-    @testset "Wide Matrix Tests" begin
-        m, n, ib = 15, 25, 5
-        A = rand(ComplexF64, m, n)
-        A_original = copy(A)
-        lda = m
-        T = zeros(ComplexF64, ib, m)
-        ldt = ib
-        tau = zeros(ComplexF64, m)
-        work = zeros(ComplexF64, ib * n)
-        
-        NextLA.geqrt(m, n, ib, A, lda, T, ldt, tau, work)
-
-        R_our = triu(A)
-        
-        Q_ref, R_ref = qr(A_original)
-        R_ref_mat = Matrix(R_ref)
-        @test norm(R_our - R_ref_mat) < 1e-10
-    end
-    
     @testset "Edge Cases" begin
         # Test with ib = 1 (should behave like unblocked QR)
         m, n, ib = 10, 8, 1
         A = rand(ComplexF64, m, n)
         A_original = copy(A)
-        lda = m
         T = zeros(ComplexF64, ib, min(m, n))
-        ldt = ib
         tau = zeros(ComplexF64, min(m, n))
         work = zeros(ComplexF64, ib * n)
         
-        NextLA.geqrt(m, n, ib, A, lda, T, ldt, tau, work)
+        NextLA.geqrt!(m, n, ib, A, T, tau, work)
         
         # Compare with unblocked version
         A_unblocked = copy(A_original)
         tau_unblocked = zeros(ComplexF64, min(m, n))
         work_unblocked = zeros(ComplexF64, n)
-        NextLA.geqr2(m, n, A_unblocked, lda, tau_unblocked, work_unblocked)
+        NextLA.geqr2!(m, n, A_unblocked, tau_unblocked, work_unblocked)
         
         @test A ≈ A_unblocked rtol=1e-10
         
         # Test with very small matrices
         m, n, ib = 3, 2, 1
         A = rand(ComplexF64, m, n)
-        lda = m
         T = zeros(ComplexF64, ib, min(m, n))
-        ldt = ib
         tau = zeros(ComplexF64, min(m, n))
         work = zeros(ComplexF64, ib * n)
         
-        NextLA.geqrt(m, n, ib, A, lda, T, ldt, tau, work)
+        NextLA.geqrt!(m, n, ib, A, T, tau, work)
         
         # Should not crash
         @test all(isfinite.(A))
@@ -237,16 +148,12 @@ end
     
     @testset "Error Handling" begin
         # Test negative dimensions
-        @test_throws ArgumentError NextLA.geqrt(-1, 5, 2, zeros(ComplexF64, 5, 5), 5, zeros(ComplexF64, 2, 5), 2, zeros(ComplexF64, 5), zeros(ComplexF64, 10))
-        @test_throws ArgumentError NextLA.geqrt(5, -1, 2, zeros(ComplexF64, 5, 5), 5, zeros(ComplexF64, 2, 5), 2, zeros(ComplexF64, 5), zeros(ComplexF64, 10))
+        @test_throws ArgumentError NextLA.geqrt!(-1, 5, 2, zeros(ComplexF64, 5, 5), zeros(ComplexF64, 2, 5), zeros(ComplexF64, 5), zeros(ComplexF64, 10))
+        @test_throws ArgumentError NextLA.geqrt!(5, -1, 2, zeros(ComplexF64, 5, 5), zeros(ComplexF64, 2, 5), zeros(ComplexF64, 5), zeros(ComplexF64, 10))
         
         # Test invalid block size
-        @test_throws ArgumentError NextLA.geqrt(5, 5, -1, zeros(ComplexF64, 5, 5), 5, zeros(ComplexF64, 2, 5), 2, zeros(ComplexF64, 5), zeros(ComplexF64, 10))
-        @test_throws ArgumentError NextLA.geqrt(5, 5, 0, zeros(ComplexF64, 5, 5), 5, zeros(ComplexF64, 2, 5), 2, zeros(ComplexF64, 5), zeros(ComplexF64, 10))
-        
-        # Test invalid leading dimensions
-        @test_throws ArgumentError NextLA.geqrt(5, 5, 2, zeros(ComplexF64, 5, 5), 3, zeros(ComplexF64, 2, 5), 2, zeros(ComplexF64, 5), zeros(ComplexF64, 10))
-        @test_throws ArgumentError NextLA.geqrt(5, 5, 2, zeros(ComplexF64, 5, 5), 5, zeros(ComplexF64, 2, 5), 1, zeros(ComplexF64, 5), zeros(ComplexF64, 10))
+        @test_throws ArgumentError NextLA.geqrt!(5, 5, -1, zeros(ComplexF64, 5, 5), zeros(ComplexF64, 2, 5), zeros(ComplexF64, 5), zeros(ComplexF64, 10))
+        @test_throws ArgumentError NextLA.geqrt!(5, 5, 0, zeros(ComplexF64, 5, 5), zeros(ComplexF64, 2, 5), zeros(ComplexF64, 5), zeros(ComplexF64, 10))
     end
     
     @testset "Consistency Tests" begin
@@ -256,19 +163,17 @@ end
         
         # First application
         A1 = copy(A)
-        lda = m
         T1 = zeros(ComplexF64, ib, min(m, n))
-        ldt = ib
         tau1 = zeros(ComplexF64, min(m, n))
         work1 = zeros(ComplexF64, ib * n)
-        NextLA.geqrt(m, n, ib, A1, lda, T1, ldt, tau1, work1)
+        NextLA.geqrt!(m, n, ib, A1, T1, tau1, work1)
         
         # Second application
         A2 = copy(A)
         T2 = zeros(ComplexF64, ib, min(m, n))
         tau2 = zeros(ComplexF64, min(m, n))
         work2 = zeros(ComplexF64, ib * n)
-        NextLA.geqrt(m, n, ib, A2, lda, T2, ldt, tau2, work2)
+        NextLA.geqrt!(m, n, ib, A2, T2, tau2, work2)
         
         @test A1 ≈ A2 rtol=1e-12
         @test T1 ≈ T2 rtol=1e-12
@@ -281,9 +186,7 @@ end
             
             # Create CPU data
             A_cpu = rand(ComplexF32, m, n)
-            lda = m
             T_cpu = zeros(ComplexF32, ib, min(m, n))
-            ldt = ib
             tau_cpu = zeros(ComplexF32, min(m, n))
             work_cpu = zeros(ComplexF32, ib * n)
             
@@ -297,10 +200,10 @@ end
             A_cpu_result = copy(A_cpu)
             T_cpu_result = copy(T_cpu)
             tau_cpu_result = copy(tau_cpu)
-            NextLA.geqrt(m, n, ib, A_cpu_result, lda, T_cpu_result, ldt, tau_cpu_result, work_cpu)
+            NextLA.geqrt!(m, n, ib, A_cpu_result, T_cpu_result, tau_cpu_result, work_cpu)
             
             # Apply on GPU
-            NextLA.geqrt(m, n, ib, A_gpu, lda, T_gpu, ldt, tau_gpu, work_gpu)
+            NextLA.geqrt!(m, n, ib, A_gpu, T_gpu, tau_gpu, work_gpu)
             
             @test Array(A_gpu) ≈ A_cpu_result rtol=1e-6
             @test Array(T_gpu) ≈ T_cpu_result rtol=1e-6
diff --git a/test/larf.jl b/test/larf.jl
index a91a8c4..ed0f837 100644
--- a/test/larf.jl
+++ b/test/larf.jl
@@ -121,10 +121,17 @@ end
                                             
                                             # Determine work array size
                                             work_size = side == 'L' ? n : m
-                                            work = zeros(T, work_size, 1)
+                                            work = zeros(T, work_size)
                                             
-                                            # NextLA call: larf(side, m, n, v, incv, tau, c, ldc, work)
-                                            NextLA.larf(side, m, n, v, 1, tau, C_test, work)
+                                            # NextLA call: larf!(side, m, n, v, incv, tau, c, ldc, work)
+                                            NextLA.larf!(side, m, n, v, 1, tau, C_test, work)
+                                            
+                                            # --- Test Helper Function ---
+                                            C_helper = copy(C_orig)
+                                            NextLA.larf!(side, v, 1, tau, C_helper)
+                                            
+                                            # Verify helper gives same results as kernel (in-place)
+                                            @test C_helper ≈ C_test rtol=rtol
                                             
                                             # Basic checks
                                             @test all(isfinite.(C_test))
@@ -156,14 +163,14 @@ end
                 C = randn(T, m, n)
                 v = randn(T, max(m, n))
                 tau = T(0.5)
-                work = zeros(T, max(m, n), 1)
+                work = zeros(T, max(m, n))
                 
-                @test_nowarn NextLA.larf('L', m, n, v, 1, tau, C, work)
-                @test_nowarn NextLA.larf('R', m, n, v, 1, tau, C, work)
+                @test_nowarn NextLA.larf!('L', m, n, v, 1, tau, C, work)
+                @test_nowarn NextLA.larf!('R', m, n, v, 1, tau, C, work)
                 
                 # Test edge cases
-                @test_nowarn NextLA.larf('L', 1, 1, T[T(1)], 1, T(0), T[T(1);;], T[T(0);;])
-                @test_nowarn NextLA.larf('R', 1, 1, T[T(1)], 1, T(0), T[T(1);;], T[T(0);;])
+                @test_nowarn NextLA.larf!('L', 1, 1, T[T(1)], 1, T(0), T[T(1);;], T[0])
+                @test_nowarn NextLA.larf!('R', 1, 1, T[T(1)], 1, T(0), T[T(1);;], T[0])
             end
         end
     end
@@ -183,12 +190,12 @@ end
                         C = T.(scale .* randn(T, m, n))
                         v = T.(scale .* randn(T, side == 'L' ? m : n))
                         tau = T(scale * randn(T))
-                        work = zeros(T, max(m, n), 1)
+                        work = zeros(T, max(m, n))
                         
                         C_orig = copy(C)
                         
                         # Test calculation
-                        NextLA.larf(side, m, n, v, 1, tau, C, work)
+                        NextLA.larf!(side, m, n, v, 1, tau, C, work)
                         
                         # Check that results are finite
                         @test all(isfinite.(C))
@@ -213,7 +220,7 @@ end
                             C_cpu = T.(randn(T, m, n))
                             v_cpu = T.(randn(T, max(m, n)))
                             tau_cpu = T(randn(T))
-                            work_cpu = zeros(T, max(m, n), 1)
+                            work_cpu = zeros(T, max(m, n))
                             
                             # Move data to GPU
                             C_gpu = CuArray(C_cpu)
@@ -223,10 +230,10 @@ end
                             # Reference CPU calculation
                             C_ref = copy(C_cpu)
                             work_ref = copy(work_cpu)
-                            NextLA.larf(side, m, n, v_cpu, 1, tau_cpu, C_ref, m, work_ref)
+                            NextLA.larf!(side, m, n, v_cpu, 1, tau_cpu, C_ref, work_ref)
                             
                             # Our implementation on GPU
-                            NextLA.larf(side, m, n, v_gpu, 1, tau_cpu, C_gpu, m, work_gpu)
+                            NextLA.larf!(side, m, n, v_gpu, 1, tau_cpu, C_gpu, work_gpu)
                             
                             # Compare results
                             @test norm(Array(C_gpu) - C_ref) < rtol * max(1, norm(C_ref))
diff --git a/test/larfb.jl b/test/larfb.jl
index bad4db6..cdd5502 100644
--- a/test/larfb.jl
+++ b/test/larfb.jl
@@ -88,22 +88,23 @@ end
                                                         C_test = copy(C_orig)
                                                         C_ref = copy(C_orig)
                                                         
-                                                        # Set leading dimensions
-                                                        ldv = size(V, 1)
-                                                        ldt = k
-                                                        ldc = m
-                                                        ldwork = size(work, 1)
+                                                        # NextLA call: larfb!(side, trans, direct, storev, m, n, k, V, ldv, T, C, work)
+                                                        NextLA.larfb!(side, trans, direct, storev, m, n, k, V, size(V,1), T_mat, C_test, work)
                                                         
-                                                        # NextLA call: larfb(side, trans, direct, storev, m, n, k, v, ldv, t, ldt, c, ldc, work, ldwork)
-                                                        NextLA.larfb(side, trans, direct, storev, m, n, k, V, ldv, T_mat, ldt, C_test, ldc, work, ldwork)
+                                                        # --- Test Helper Function ---
+                                                        C_helper = copy(C_orig)
+                                                        NextLA.larfb!(side, trans, direct, storev, V, T_mat, C_helper)
+                                                        
+                                                        # Verify helper gives same results as kernel
+                                                        @test C_helper ≈ C_test rtol=rtol
                                                         
                                                         # Basic checks
                                                         @test all(isfinite.(C_test))
                                                         @test size(C_test) == (m, n)
                                                         @test all(isfinite.(work))
                                                         
-                                                        NextLA.larfb('L', 'N', direct, storev, m, n, k, V, ldv, T_mat, ldt, C_ref, ldc, work, ldwork)
-                                                        NextLA.larfb('L', 'C', direct, storev, m, n, k, V, ldv, T_mat, ldt, C_ref, ldc, work, ldwork)
+                                                        NextLA.larfb!('L', 'N', direct, storev, m, n, k, V, size(V,1), T_mat, C_ref, work)
+                                                        NextLA.larfb!('L', 'C', direct, storev, m, n, k, V, size(V,1), T_mat, C_ref, work)
 
                                                         # Mathematical validation
                                                         @test norm(C_ref - C_orig) / norm(C_orig) < rtol
@@ -138,15 +139,15 @@ end
                 C = randn(T, m, n)
                 work = zeros(T, n, k)
                 
-                @test_nowarn NextLA.larfb('L', 'N', 'F', 'C', m, n, k, V, m, T_mat, k, C, m, work, n)
+                @test_nowarn NextLA.larfb!('L', 'N', 'F', 'C', m, n, k, V, m, T_mat, C, work)
                 
                 # Test edge cases
-                @test_nowarn NextLA.larfb('L', 'N', 'F', 'C', 0, 0, 0, zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, zeros(T, 0, 0), 1)  # m = n = k = 0
-                @test_nowarn NextLA.larfb('L', 'N', 'F', 'C', 1, 1, 0, zeros(T, 1, 0), 1, zeros(T, 0, 0), 1, randn(T, 1, 1), 1, zeros(T, 1, 0), 1)  # k = 0
+                @test_nowarn NextLA.larfb!('L', 'N', 'F', 'C', 0, 0, 0, zeros(T, 0, 0), 1, zeros(T, 0, 0), zeros(T, 0, 0))  # m = n = k = 0
+                @test_nowarn NextLA.larfb!('L', 'N', 'F', 'C', 1, 1, 0, zeros(T, 1, 0), 1, zeros(T, 0, 0), randn(T, 1, 1), zeros(T, 1, 0))  # k = 0
                 
                 # Test different side/storev combinations
-                @test_nowarn NextLA.larfb('R', 'N', 'F', 'C', m, n, k, randn(T, n, k), n, T_mat, k, copy(C), m, zeros(T, m, k), m)  # Right side
-                @test_nowarn NextLA.larfb('L', 'C', 'B', 'R', m, n, k, randn(T, k, m), k, T_mat, k, copy(C), m, zeros(T, n, k), n)  # Row-wise storage
+                @test_nowarn NextLA.larfb!('R', 'N', 'F', 'C', m, n, k, randn(T, n, k), n, T_mat, copy(C), zeros(T, m, k))  # Right side
+                @test_nowarn NextLA.larfb!('L', 'C', 'B', 'R', m, n, k, randn(T, k, m), k, T_mat, copy(C), zeros(T, n, k))  # Row-wise storage
             end
         end
     end
@@ -167,7 +168,7 @@ end
                     work = zeros(T, n, k)
                     
                     # Test calculation
-                    NextLA.larfb('L', 'N', 'F', 'C', m, n, k, V, m, T_mat, k, C, m, work, n)
+                    NextLA.larfb!('L', 'N', 'F', 'C', m, n, k, V, m, T_mat, C, work)
                     
                     # Check that results are finite
                     @test all(isfinite.(C))
@@ -180,7 +181,7 @@ end
                             work_test = zeros(T, side == 'L' ? n : m, k)
                             V_test = side == 'L' ? V : T.(scale .* randn(ComplexF64, n, k))
                             
-                            NextLA.larfb(side, trans, 'F', 'C', m, n, k, V_test, size(V_test, 1), T_mat, k, C_test, m, work_test, size(work_test, 1))
+                            NextLA.larfb!(side, trans, 'F', 'C', m, n, k, V_test, size(V_test, 1), T_mat, C_test, work_test)
                             
                             @test all(isfinite.(C_test))
                             @test all(isfinite.(work_test))
@@ -219,10 +220,10 @@ end
                                             # Reference CPU calculation
                                             C_ref = copy(C_cpu)
                                             work_ref = copy(work_cpu)
-                                            NextLA.larfb(side, trans, 'F', 'C', m, n, k, V_cpu, size(V_cpu, 1), T_cpu, k, C_ref, m, work_ref, size(work_ref, 1))
+                                            NextLA.larfb!(side, trans, 'F', 'C', m, n, k, V_cpu, size(V_cpu, 1), T_cpu, C_ref, work_ref)
                                             
                                             # GPU calculation
-                                            NextLA.larfb(side, trans, 'F', 'C', m, n, k, V_gpu, size(V_gpu, 1), T_gpu, k, C_gpu, m, work_gpu, size(work_gpu, 1))
+                                            NextLA.larfb!(side, trans, 'F', 'C', m, n, k, V_gpu, size(V_gpu, 1), T_gpu, C_gpu, work_gpu)
                                             
                                             # Compare results
                                             @test norm(Array(C_gpu) - C_ref) < rtol * max(1, norm(C_ref))
@@ -266,10 +267,10 @@ end
                                         # Reference CPU calculation
                                         C_ref = copy(C_cpu)
                                         work_ref = copy(work_cpu)
-                                        NextLA.larfb(side, 'N', 'F', 'C', m, n, k, V_cpu, size(V_cpu, 1), T_cpu, k, C_ref, m, work_ref, size(work_ref, 1))
+                                        NextLA.larfb!(side, 'N', 'F', 'C', m, n, k, V_cpu, size(V_cpu, 1), T_cpu, C_ref, work_ref)
                                         
                                         # ROCm calculation
-                                        NextLA.larfb(side, 'N', 'F', 'C', m, n, k, V_rocm, size(V_rocm, 1), T_rocm, k, C_rocm, m, work_rocm, size(work_rocm, 1))
+                                        NextLA.larfb!(side, 'N', 'F', 'C', m, n, k, V_rocm, size(V_rocm, 1), T_rocm, C_rocm, work_rocm)
                                         
                                         # Compare results
                                         @test norm(Array(C_rocm) - C_ref) < rtol * max(1, norm(C_ref))
diff --git a/test/larfg.jl b/test/larfg.jl
index d057196..1eba28d 100644
--- a/test/larfg.jl
+++ b/test/larfg.jl
@@ -49,7 +49,7 @@ for (larfg, elty) in
                         
                         # Test NextLA implementation
                         x_nextla = copy(x_orig)
-                        alpha_nextla, tau_nextla = NextLA.larfg(n, alpha_orig, x_nextla, 1, zero(T))
+                        alpha_nextla, tau_nextla = NextLA.larfg!(n, alpha_orig, x_nextla, 1, zero(T))
                         
                         # Test LAPACK reference
                         if n > 0
@@ -57,17 +57,24 @@ for (larfg, elty) in
                             tau_lapack, alpha_lapack = larfg_our!(lapack_vec)
                             x_lapack = lapack_vec[2:end]
                             
-                            # Compare results (allowing for sign differences)
-                            if abs(abs(tau_nextla) - abs(tau_lapack)) > rtol * max(1, abs(tau_lapack))
-                                @show tau_nextla, tau_lapack
-                            end
-                            @test abs(abs(tau_nextla) - abs(tau_lapack)) < rtol * max(1, abs(tau_lapack))
-                            if abs(abs(alpha_nextla) - abs(alpha_lapack)) > rtol * max(1, abs(alpha_lapack))
-                                @show alpha_nextla, alpha_lapack
-                            end
-                            @test abs(abs(alpha_nextla) - abs(alpha_lapack)) < rtol * max(1, abs(alpha_lapack))
-                            if length(x_orig) > 0
-                                @test norm(abs.(x_nextla) - abs.(x_lapack)) < rtol * max(1, norm(x_lapack))
+                            # For n==1, NextLA defines tau≈0; LAPACK may return nonzero. Accept tau≈0.
+                            if n == 1
+                                @test abs(tau_nextla) ≤ (T <: ComplexF32 ? 1e-6 : 1e-12)
+                                # alpha magnitude should match LAPACK
+                                @test abs(abs(alpha_nextla) - abs(alpha_lapack)) < rtol * max(1, abs(alpha_lapack))
+                            else
+                                # Compare results (allowing for sign differences)
+                                if abs(abs(tau_nextla) - abs(tau_lapack)) > rtol * max(1, abs(tau_lapack))
+                                    @show tau_nextla, tau_lapack
+                                end
+                                @test abs(abs(tau_nextla) - abs(tau_lapack)) < rtol * max(1, abs(tau_lapack))
+                                if abs(abs(alpha_nextla) - abs(alpha_lapack)) > rtol * max(1, abs(alpha_lapack))
+                                    @show alpha_nextla, alpha_lapack
+                                end
+                                @test abs(abs(alpha_nextla) - abs(alpha_lapack)) < rtol * max(1, abs(alpha_lapack))
+                                if length(x_orig) > 0
+                                    @test norm(abs.(x_nextla) - abs.(x_lapack)) < rtol * max(1, norm(x_lapack))
+                                end
                             end
                         end
                         
@@ -85,17 +92,17 @@ for (larfg, elty) in
         for T in [ComplexF32, ComplexF64]
             @testset "Type $T edge cases" begin
                 # Test n=0 case
-                alpha_nextla, tau_nextla = NextLA.larfg(0, T(1), T[], 1, zero(T))
+                alpha_nextla, tau_nextla = NextLA.larfg!(0, T(1), T[], 1, zero(T))
                 @test tau_nextla == 0
                 @test alpha_nextla == T(1)
                 
                 # Test n=1 case
-                alpha_nextla, tau_nextla = NextLA.larfg(1, T(2), T[], 1, zero(T))
+                alpha_nextla, tau_nextla = NextLA.larfg!(1, T(2), T[], 1, zero(T))
                 @test abs(tau_nextla) < 1e-10
                 @test abs(alpha_nextla - T(2)) < 1e-10
                 
-                # Test zero vector
-                alpha_nextla, tau_nextla = NextLA.larfg(3, T(0), T[0, 0], 1, zero(T))
+                # Test zero vector (n=3, x has length 2)
+                alpha_nextla, tau_nextla = NextLA.larfg!(3, T(0), zeros(T, 2), 1, zero(T))
                 @test isfinite(alpha_nextla)
                 @test isfinite(tau_nextla)
             end
diff --git a/test/larft.jl b/test/larft.jl
index 7741b32..6ccd599 100644
--- a/test/larft.jl
+++ b/test/larft.jl
@@ -86,8 +86,8 @@ end
                                                     T_mat = zeros(T, k, k)
                                                     ldt = k
                                                     
-                                                    # NextLA call: larft(direct, storev, n, k, v, ldv, tau, t, ldt)
-                                                    NextLA.larft(direct, storev, n, k, V, ldv, tau, T_mat, ldt)
+                                                    # NextLA call: larft!(direct, storev, n, k, V, tau, T_mat)
+                                                    NextLA.larft!(direct, storev, n, k, V, tau, T_mat)
                                                     
                                                     # Basic checks
                                                     @test all(isfinite.(T_mat))
@@ -159,12 +159,12 @@ end
                 tau = randn(T, k)
                 T_mat = zeros(T, k, k)
                 
-                @test_nowarn NextLA.larft('F', 'C', n, k, V, n, tau, T_mat, k)
+                @test_nowarn NextLA.larft!('F', 'C', n, k, V, tau, T_mat)
                 
                 # Test edge cases
-                @test_nowarn NextLA.larft('F', 'C', 0, 0, zeros(T, 0, 0), 1, T[], zeros(T, 0, 0), 1)  # n = 0, k = 0
-                @test_nowarn NextLA.larft('F', 'C', 1, 0, zeros(T, 1, 0), 1, T[], zeros(T, 0, 0), 1)  # k = 0
-                @test_nowarn NextLA.larft('F', 'C', 0, 1, zeros(T, 0, 1), 1, T[T(0)], zeros(T, 1, 1), 1)  # n = 0
+                @test_nowarn NextLA.larft!('F', 'C', 0, 0, zeros(T, 0, 0), T[], zeros(T, 0, 0))  # n = 0, k = 0
+                @test_nowarn NextLA.larft!('F', 'C', 1, 0, zeros(T, 1, 0), T[], zeros(T, 0, 0))  # k = 0
+                @test_nowarn NextLA.larft!('F', 'C', 0, 1, zeros(T, 0, 1), T[T(0)], zeros(T, 1, 1))  # n = 0
             end
         end
     end
@@ -217,10 +217,10 @@ end
                         
                         # Reference CPU calculation
                         T_ref = zeros(T, k, k)
-                        NextLA.larft('F', 'C', n, k, V_cpu, n, tau_cpu, T_ref, k)
+                        NextLA.larft!('F', 'C', n, k, V_cpu, tau_cpu, T_ref)
                         
                         # Our implementation on GPU
-                        NextLA.larft('F', 'C', n, k, V_gpu, n, tau_gpu, T_gpu, k)
+                        NextLA.larft!('F', 'C', n, k, V_gpu, tau_gpu, T_gpu)
                         
                         # Compare results
                         @test norm(Array(T_gpu) - T_ref) < rtol * max(1, norm(T_ref))
diff --git a/test/lauum.jl b/test/lauum.jl
index c96d0af..30a8b19 100644
--- a/test/lauum.jl
+++ b/test/lauum.jl
@@ -1,4 +1,8 @@
-@testset "lauum test" begin
+using Test
+using NextLA
+using LinearAlgebra
+
+@testset "NextLA.lauum! test" begin
     for T in [Float32, Float64, ComplexF32, ComplexF64]
         for uplo in ['U', 'L']
             # Test different matrix sizes including edge cases
@@ -13,8 +17,8 @@
                         A = Matrix(LowerTriangular(-0.5 .+ rand(T, n, n)))
                     end
                     Ac = copy(A)             
-                    info = lauum(uplo, n, A, n, block_size)                  
-                    @test info == 0  # Ensure no error from lauum
+                    NextLA.lauum!(uplo, n, A, block_size)                  
+                    # @test info == 0  # Function now returns nothing instead of error code
                     # Set tolerance based on type
                     tolerance = T <: Union{Float64, ComplexF64} ? 1e-12 : 1e-6
                     if uplo == 'U'
@@ -22,7 +26,7 @@
                         result_diff = norm(Matrix(A) - expected_result) / n
                         @test result_diff < tolerance  # Use adjusted tolerance
                         if result_diff >= tolerance
-                            println("Failure in lauum test for T: $T, uplo: $uplo, n: $n, block_size: $block_size")
+                            println("Failure in NextLA.lauum! test for T: $T, uplo: $uplo, n: $n, block_size: $block_size")
                             println("Difference norm: $result_diff")
                         end
                     else
@@ -30,7 +34,7 @@
                         result_diff = norm(Matrix(A) - expected_result) / n
                         @test result_diff < tolerance  # Use adjusted tolerance
                         if result_diff >= tolerance
-                            println("Failure in lauum test for T: $T, uplo: $uplo, n: $n, block_size: $block_size")
+                            println("Failure in NextLA.lauum! test for T: $T, uplo: $uplo, n: $n, block_size: $block_size")
                             println("Difference norm: $result_diff")
                         end
                     end
diff --git a/test/pamm.jl b/test/pamm.jl
index 7cb7e39..aea06a2 100644
--- a/test/pamm.jl
+++ b/test/pamm.jl
@@ -19,13 +19,8 @@ using CUDA
         V_original = copy(V)
         W_original = copy(W)
         
-        lda1 = k
-        lda2 = m
-        ldv = m
-        ldw = n
-        
-        # Apply our PAMM
-        NextLA.pamm('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw)
+    # Apply our PAMM
+    NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1, A2, V, W)
         
         # Basic checks
         @test size(W) == (n, l)
@@ -43,12 +38,7 @@ using CUDA
         V = rand(ComplexF64, n, l)
         W = rand(ComplexF64, m, l)
         
-        lda1 = k
-        lda2 = n
-        ldv = n
-        ldw = m
-        
-        NextLA.pamm('W', 'R', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw)
+    NextLA.pamm!('W', 'R', 'C', 'F', m, n, k, l, A1, A2, V, W)
         
         @test size(W) == (m, l)
         @test all(isfinite.(W))
@@ -64,13 +54,8 @@ using CUDA
         
         A2_original = copy(A2)
         
-        lda1 = k
-        lda2 = m
-        ldv = m
-        ldw = n
-        
         # Apply A operation
-        NextLA.pamm('A', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw)
+    NextLA.pamm!('A', 'L', 'C', 'F', m, n, k, l, A1, A2, V, W)
         
         @test size(A2) == (m, k)
         @test all(isfinite.(A2))
@@ -87,12 +72,7 @@ using CUDA
         V = rand(ComplexF64, m, l)
         W = rand(ComplexF64, n, l)
         
-        lda1 = k
-        lda2 = m
-        ldv = m
-        ldw = n
-        
-        NextLA.pamm('W', 'L', 'C', 'B', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw)
+    NextLA.pamm!('W', 'L', 'C', 'B', m, n, k, l, A1, A2, V, W)
         
         @test all(isfinite.(W))
     end
@@ -105,12 +85,7 @@ using CUDA
         V = rand(ComplexF64, l, m)  # Row-wise storage
         W = rand(ComplexF64, l, n)  # Row-wise storage
         
-        lda1 = k
-        lda2 = m
-        ldv = l
-        ldw = l
-        
-        NextLA.pamm('W', 'L', 'R', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw)
+    NextLA.pamm!('W', 'L', 'R', 'F', m, n, k, l, A1, A2, V, W)
         
         @test all(isfinite.(W))
     end
@@ -125,12 +100,7 @@ using CUDA
         
         W_original = copy(W)
         
-        lda1 = k
-        lda2 = m
-        ldv = m
-        ldw = n
-        
-        NextLA.pamm('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw)
+    NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1, A2, V, W)
         
         @test all(isfinite.(W))
         @test !isapprox(W, W_original, rtol=1e-6)
@@ -157,7 +127,7 @@ using CUDA
             ldv = m
             ldw = n
             
-            NextLA.pamm('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw)
+            NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw)
             
             @test all(isfinite.(W))
             @test size(W) == (n, l)
@@ -178,7 +148,7 @@ using CUDA
         ldv = m
         ldw = n
         
-        NextLA.pamm('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw)
+        NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw)
         
         @test all(isfinite.(W))
         
@@ -189,7 +159,7 @@ using CUDA
         V = rand(ComplexF64, m, l)
         W = rand(ComplexF64, n, l)
         
-        NextLA.pamm('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw)
+        NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2, lda2, V, ldv, W, ldw)
         
         @test all(isfinite.(W))
     end
@@ -229,16 +199,11 @@ using CUDA
         V = rand(ComplexF64, m, l)
         W = rand(ComplexF64, n, l)
         
-        lda1 = k
-        lda2 = m
-        ldv = m
-        ldw = n
-        
         # Apply W operation
-        NextLA.pamm('W', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2_w, lda2, V, ldv, W, ldw)
+            NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1, A2_w, V, W)
         
         # Apply A operation with same input
-        NextLA.pamm('A', 'L', 'C', 'F', m, n, k, l, A1, lda1, A2_a, lda2, V, ldv, W, ldw)
+            NextLA.pamm!('A', 'L', 'C', 'F', m, n, k, l, A1, A2_a, V, W)
         
         # Results should be finite and well-defined
         @test all(isfinite.(W))
@@ -256,22 +221,17 @@ using CUDA
             W_cpu = rand(ComplexF32, n, l)
             
             lda1 = k
-            lda2 = m
-            ldv = m
-            ldw = n
-            
-            # Create GPU data
             A1_gpu = CuArray(A1_cpu)
-            A2_gpu = CuArray(A2_cpu)
-            V_gpu = CuArray(V_cpu)
+            W_cpu_result = copy(W_cpu)
+            NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1_cpu, A2_cpu, V_cpu, W_cpu_result)
             W_gpu = CuArray(W_cpu)
             
             # Apply on CPU
             W_cpu_result = copy(W_cpu)
-            NextLA.pamm('W', 'L', 'C', 'F', m, n, k, l, A1_cpu, lda1, A2_cpu, lda2, V_cpu, ldv, W_cpu_result, ldw)
+            NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1_cpu, lda1, A2_cpu, lda2, V_cpu, ldv, W_cpu_result, ldw)
             
             # Apply on GPU
-            NextLA.pamm('W', 'L', 'C', 'F', m, n, k, l, A1_gpu, lda1, A2_gpu, lda2, V_gpu, ldv, W_gpu, ldw)
+            NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1_gpu, A2_gpu, V_gpu, W_gpu)
             
             @test Array(W_gpu) ≈ W_cpu_result rtol=1e-6
         end
diff --git a/test/parfb.jl b/test/parfb.jl
index 60c4f32..d192642 100644
--- a/test/parfb.jl
+++ b/test/parfb.jl
@@ -57,7 +57,7 @@ function lapack_tprfb!(::Type{T}, side::AbstractChar, trans::AbstractChar, direc
 end
 
 
-# LAPACK-style test parameters for NextLA.parfb
+# LAPACK-style test parameters for NextLA.parfb!
 const PARFB_TYPES = [ComplexF32, ComplexF64, Float32, Float64]
 # Format: (m1, n1, m2, n2, k, l) where:
 # - For side='L': n1 == n2 (same number of columns)
@@ -132,13 +132,11 @@ const PARFB_SIZES = [
                                             Tee = rand(T, k, k)
                                             ldt = k
                                             
-                                            # Work array dimensions based on SIDE
+                                            # Work array dimensions based on SIDE (2D workspace)
                                             if side == 'L'
-                                                work = rand(T, k, n1)  # WORK is K-by-N when SIDE='L'
-                                                ldw = k
+                                                work = rand(T, k, n2)  # WORK is K-by-n2 when SIDE='L'
                                             else
-                                                work = rand(T, m1, k)  # WORK is M-by-K when SIDE='R'
-                                                ldw = m1
+                                                work = rand(T, m2, k)  # WORK is m2-by-K when SIDE='R'
                                             end
                                             
                                             # Make copies for testing
@@ -153,9 +151,9 @@ const PARFB_SIZES = [
                                             
                                             work_l = lapack_tprfb!(T, side, trans, direct, storev, l, V, Tee, A1_l, A2_l)
                                             
-                                            # NextLA call: parfb(side, trans, direct, storev, m1, n1, m2, n2, k, l, A1, lda1, A2, lda2, V, ldv, T, ldt, work, ldwork)
-                                            NextLA.parfb(side, trans, direct, storev, m1, n1, m2, n2, k, l,
-                                                        A1_test, lda1, A2_test, lda2, V_test, ldv, T_test, ldt, work, ldw)
+                                            # NextLA call with simplified signature (no ld*), workspace as matrix
+                                            NextLA.parfb!(side, trans, direct, storev, m1, n1, m2, n2, k, l,
+                                                        A1_test, A2_test, V_test, T_test, work)
 
                                                 
 
@@ -208,10 +206,10 @@ const PARFB_SIZES = [
                 A2 = randn(T, m2, n2)
                 V = randn(T, m2, k)  # For side='L', V has m2 rows
                 T_mat = triu(randn(T, k, k))
-                work = zeros(T, k, n1)
+                work = zeros(T, k, n2)
                 
-                @test_nowarn NextLA.parfb('L', 'N', 'F', 'C', m1, n1, m2, n2, k, l,
-                                          A1, m1, A2, m2, V, m2, T_mat, k, work, k)
+                @test_nowarn NextLA.parfb!('L', 'N', 'F', 'C', m1, n1, m2, n2, k, l,
+                                          A1, A2, V, T_mat, work)
                 
                 # Test with valid parameters for side='R' case: m1 == m2
                 m1, n1, m2, n2, k, l = 600, 500, 600, 400, 300, 200
@@ -219,20 +217,18 @@ const PARFB_SIZES = [
                 A2 = randn(T, m2, n2)
                 V = randn(T, n2, k)  # For side='R', V has n2 rows
                 T_mat = triu(randn(T, k, k))
-                work = zeros(T, m1, k)
+                work = zeros(T, m2, k)
                 
-                @test_nowarn NextLA.parfb('R', 'N', 'F', 'C', m1, n1, m2, n2, k, l,
-                                          A1, m1, A2, m2, V, n2, T_mat, k, work, m1)
+                @test_nowarn NextLA.parfb!('R', 'N', 'F', 'C', m1, n1, m2, n2, k, l,
+                                          A1, A2, V, T_mat, work)
                 
                 # Test edge cases
-                @test_nowarn NextLA.parfb('L', 'N', 'F', 'C', 0, 0, 0, 0, 0, 0,
-                                          zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, zeros(T, 0, 0), 1,
-                                          zeros(T, 0, 0), 1, T[], 1)
+                @test_nowarn NextLA.parfb!('L', 'N', 'F', 'C', 0, 0, 0, 0, 0, 0,
+                                          zeros(T, 0, 0), zeros(T, 0, 0), zeros(T, 0, 0), zeros(T, 0, 0), zeros(T, 0, 0))
                 
                 # Test with k=0 (valid for both sides)
-                @test_nowarn NextLA.parfb('L', 'N', 'F', 'C', 2, 2, 2, 2, 0, 0,
-                                          randn(T, 2, 2), 2, randn(T, 2, 2), 2, zeros(T, 2, 0), 2,
-                                          zeros(T, 0, 0), 1, T[], 1)
+                @test_nowarn NextLA.parfb!('L', 'N', 'F', 'C', 2, 2, 2, 2, 0, 0,
+                                          randn(T, 2, 2), randn(T, 2, 2), zeros(T, 2, 0), zeros(T, 0, 0), zeros(T, 0, 0))
             end
         end
     end
@@ -252,7 +248,7 @@ const PARFB_SIZES = [
                     A2 = T.(scale .* randn(ComplexF64, m2, n2))
                     V = T.(scale .* randn(ComplexF64, m2, k))  # For side='L', V has m2 rows
                     T_mat = triu(T.(scale .* randn(ComplexF64, k, k)))
-                    work = zeros(T, k, n1)
+                    work = zeros(T, k, n2)
                     
                     # Set up proper Householder structure
                     for i in 1:k
@@ -261,8 +257,8 @@ const PARFB_SIZES = [
                     end
                     
                     # Test calculation
-                    NextLA.parfb('L', 'N', 'F', 'C', m1, n1, m2, n2, k, l,
-                                  A1, m1, A2, m2, V, m2, T_mat, k, work, k)
+                    NextLA.parfb!('L', 'N', 'F', 'C', m1, n1, m2, n2, k, l,
+                                  A1, A2, V, T_mat, work)
                     
                     # Check that results are finite
                     @test all(isfinite.(A1))
@@ -294,14 +290,10 @@ const PARFB_SIZES = [
                         # Set V dimensions based on side
                         if side == 'L'
                             V_cpu = randn(T, m2, k)
-                            work_cpu = zeros(T, k * n1)
-                            ldv = m2
-                            ldwork = k
+                            work_cpu = zeros(T, k, n2)
                         else  # side == 'R'
                             V_cpu = randn(T, n2, k)
-                            work_cpu = zeros(T, m1 * k)
-                            ldv = n2
-                            ldwork = m1
+                            work_cpu = zeros(T, m2, k)
                         end
                         
                         T_cpu = triu(randn(T, k, k))
@@ -323,17 +315,17 @@ const PARFB_SIZES = [
                         A1_ref = copy(A1_cpu)
                         A2_ref = copy(A2_cpu)
                         work_ref = copy(work_cpu)
-                        NextLA.parfb(side, 'N', 'F', 'C', m1, n1, m2, n2, k, l,
-                                      A1_ref, m1, A2_ref, m2, V_cpu, ldv, T_cpu, k, work_ref, ldwork)
+                        NextLA.parfb!(side, 'N', 'F', 'C', m1, n1, m2, n2, k, l,
+                                      A1_ref, A2_ref, V_cpu, T_cpu, work_ref)
                         
                         # Our implementation on GPU
-                        NextLA.parfb(side, 'N', 'F', 'C', m1, n1, m2, n2, k, l,
-                                      A1_gpu, m1, A2_gpu, m2, V_gpu, ldv, T_gpu, k, work_gpu, ldwork)
+                        NextLA.parfb!(side, 'N', 'F', 'C', m1, n1, m2, n2, k, l,
+                                      A1_gpu, A2_gpu, V_gpu, T_gpu, work_gpu)
                         
                         # Compare results
                         @test norm(Array(A1_gpu) - A1_ref) < rtol * max(1, norm(A1_ref))
                         @test norm(Array(A2_gpu) - A2_ref) < rtol * max(1, norm(A2_ref))
-                        @test norm(Array(work_gpu) - work_ref) < rtol * max(1, norm(work_ref))
+                        @test norm(Array(work_gpu) - Array(work_ref)) < rtol * max(1, norm(Array(work_ref)))
                         
                         @test all(isfinite.(Array(A1_gpu)))
                         @test all(isfinite.(Array(A2_gpu)))
diff --git a/test/pemv.jl b/test/pemv.jl
index 5bd53d4..cdd64f3 100644
--- a/test/pemv.jl
+++ b/test/pemv.jl
@@ -9,15 +9,14 @@ using CUDA
         alpha = 2.5 + 1.5im
         beta = 1.2 - 0.8im
         
-        A = rand(ComplexF64, m, n)
-        lda = m
+    A = rand(ComplexF64, m, n)
         X = rand(ComplexF64, n)
         Y = rand(ComplexF64, m)
         Y_original = copy(Y)
         work = zeros(ComplexF64, m)
         
         # Apply our PEMV
-        NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y, work)
+    NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y, work)
         
         # Verify using manual computation
         # For column storage with conjugate transpose, this should compute:
@@ -32,15 +31,14 @@ using CUDA
         alpha = 1.8 + 2.2im
         beta = 0.5 + 1.0im
         
-        A = rand(ComplexF64, m, n)
-        lda = m
+    A = rand(ComplexF64, m, n)
         X = rand(ComplexF64, m)
         Y = rand(ComplexF64, n)
         Y_original = copy(Y)
         work = zeros(ComplexF64, n)
         
         # Apply our PEMV
-        NextLA.pemv('N', 'R', m, n, l, alpha, A, lda, X, beta, Y, work)
+    NextLA.pemv!('N', 'R', m, n, l, alpha, A, X, beta, Y, work)
         
         # For row storage with no transpose:
         # Y := alpha * A^T * X + beta * Y
@@ -54,14 +52,13 @@ using CUDA
         alpha = ComplexF32(2.0 + 1.0im)
         beta = ComplexF32(0.8 - 0.5im)
         
-        A = rand(ComplexF32, m, n)
-        lda = m
+    A = rand(ComplexF32, m, n)
         X = rand(ComplexF32, n)
         Y = rand(ComplexF32, m)
         Y_original = copy(Y)
         work = zeros(ComplexF32, m)
         
-        NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y, work)
+    NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y, work)
         
         Y_expected = alpha * A' * X + beta * Y_original
         
@@ -79,13 +76,12 @@ using CUDA
             
             # Test column storage
             A = rand(ComplexF64, m, n)
-            lda = m
             X = rand(ComplexF64, n)
             Y = rand(ComplexF64, m)
             Y_original = copy(Y)
             work = zeros(ComplexF64, m)
             
-            NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y, work)
+            NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y, work)
             
             Y_expected = alpha * A' * X + beta * Y_original
             @test Y ≈ Y_expected rtol=1e-12
@@ -97,14 +93,13 @@ using CUDA
         alpha = ComplexF64(0.0)
         beta = 2.0 + 1.5im
         
-        A = rand(ComplexF64, m, n)
-        lda = m
+    A = rand(ComplexF64, m, n)
         X = rand(ComplexF64, n)
         Y = rand(ComplexF64, m)
         Y_original = copy(Y)
         work = zeros(ComplexF64, m)
         
-        NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y, work)
+    NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y, work)
         
         # With alpha = 0, result should be beta * Y_original
         Y_expected = beta * Y_original
@@ -116,14 +111,13 @@ using CUDA
         alpha = 2.0 + 1.5im
         beta = ComplexF64(0.0)
         
-        A = rand(ComplexF64, m, n)
-        lda = m
+    A = rand(ComplexF64, m, n)
         X = rand(ComplexF64, n)
         Y = rand(ComplexF64, m)
         Y_original = copy(Y)
         work = zeros(ComplexF64, m)
         
-        NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y, work)
+    NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y, work)
         
         # With beta = 0, result should be alpha * A' * X
         Y_expected = alpha * A' * X
@@ -135,14 +129,13 @@ using CUDA
         alpha = ComplexF64(0.0)
         beta = ComplexF64(0.0)
         
-        A = rand(ComplexF64, m, n)
-        lda = m
+    A = rand(ComplexF64, m, n)
         X = rand(ComplexF64, n)
         Y = rand(ComplexF64, m)
         Y_original = copy(Y)
         work = zeros(ComplexF64, m)
         
-        NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y, work)
+    NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y, work)
         
         # Function should return early, Y might be unchanged or zeroed
         # Check that it doesn't crash and produces finite results
@@ -155,26 +148,24 @@ using CUDA
         alpha = 2.0 + 1.0im
         beta = 1.5 - 0.5im
         
-        A = zeros(ComplexF64, max(1, m), n)
-        lda = max(1, m)
+    A = zeros(ComplexF64, max(1, m), n)
         X = rand(ComplexF64, n)
         Y = ComplexF64[]
         work = ComplexF64[]
         
         # Should return early without error
-        NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y, work)
+    NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y, work)
         @test length(Y) == 0
         
         # n = 0 case
         m, n, l = 5, 0, 0
-        A = rand(ComplexF64, m, max(1, n))
-        lda = m
+    A = rand(ComplexF64, m, max(1, n))
         X = ComplexF64[]
         Y = rand(ComplexF64, m)
         Y_original = copy(Y)
         work = zeros(ComplexF64, m)
         
-        NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y, work)
+    NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y, work)
         # Should return early
         @test all(isfinite.(Y))
     end
@@ -189,24 +180,23 @@ using CUDA
         work = zeros(ComplexF64, m)
         
         # Invalid trans
-        @test_throws ArgumentError NextLA.pemv('X', 'C', m, n, l, alpha, A, m, X, beta, Y, work)
+    @test_throws ArgumentError NextLA.pemv!('X', 'C', m, n, l, alpha, A, X, beta, Y, work)
         
         # Invalid storev
-        @test_throws ArgumentError NextLA.pemv('C', 'X', m, n, l, alpha, A, m, X, beta, Y, work)
+    @test_throws ArgumentError NextLA.pemv!('C', 'X', m, n, l, alpha, A, X, beta, Y, work)
         
         # Invalid trans/storev combination
-        @test_throws ArgumentError NextLA.pemv('N', 'C', m, n, l, alpha, A, m, X, beta, Y, work)
-        @test_throws ArgumentError NextLA.pemv('C', 'R', m, n, l, alpha, A, m, X, beta, Y, work)
+    @test_throws ArgumentError NextLA.pemv!('N', 'C', m, n, l, alpha, A, X, beta, Y, work)
+    @test_throws ArgumentError NextLA.pemv!('C', 'R', m, n, l, alpha, A, X, beta, Y, work)
         
         # Negative dimensions
-        @test_throws ArgumentError NextLA.pemv('C', 'C', -1, n, l, alpha, A, m, X, beta, Y, work)
-        @test_throws ArgumentError NextLA.pemv('C', 'C', m, -1, l, alpha, A, m, X, beta, Y, work)
+    @test_throws ArgumentError NextLA.pemv!('C', 'C', -1, n, l, alpha, A, X, beta, Y, work)
+    @test_throws ArgumentError NextLA.pemv!('C', 'C', m, -1, l, alpha, A, X, beta, Y, work)
         
         # Invalid l (l > min(m,n))
-        @test_throws ArgumentError NextLA.pemv('C', 'C', m, n, min(m,n)+1, alpha, A, m, X, beta, Y, work)
+    @test_throws ArgumentError NextLA.pemv!('C', 'C', m, n, min(m,n)+1, alpha, A, X, beta, Y, work)
         
-        # Invalid lda
-        @test_throws ArgumentError NextLA.pemv('C', 'C', m, n, l, alpha, A, m-1, X, beta, Y, work)
+    # No lda parameter to validate anymore
     end
     
     @testset "Consistency with BLAS" begin
@@ -215,15 +205,14 @@ using CUDA
         alpha = 2.0 + 1.0im
         beta = 1.5 - 0.8im
         
-        A = rand(ComplexF64, m, n)
-        lda = m
+    A = rand(ComplexF64, m, n)
         X = rand(ComplexF64, n)
         Y1 = rand(ComplexF64, m)
         Y2 = copy(Y1)
         work = zeros(ComplexF64, m)
         
         # Our implementation
-        NextLA.pemv('C', 'C', m, n, l, alpha, A, lda, X, beta, Y1, work)
+    NextLA.pemv!('C', 'C', m, n, l, alpha, A, X, beta, Y1, work)
         
         # BLAS reference
         LinearAlgebra.BLAS.gemv!('C', alpha, A, X, beta, Y2)
@@ -239,7 +228,6 @@ using CUDA
             
             # Create CPU data
             A_cpu = rand(ComplexF32, m, n)
-            lda = m
             X_cpu = rand(ComplexF32, n)
             Y_cpu = rand(ComplexF32, m)
             work_cpu = zeros(ComplexF32, m)
@@ -252,10 +240,10 @@ using CUDA
             
             # Apply on CPU
             Y_cpu_result = copy(Y_cpu)
-            NextLA.pemv('C', 'C', m, n, l, alpha, A_cpu, lda, X_cpu, beta, Y_cpu_result, work_cpu)
+            NextLA.pemv!('C', 'C', m, n, l, alpha, A_cpu, X_cpu, beta, Y_cpu_result, work_cpu)
             
             # Apply on GPU
-            NextLA.pemv('C', 'C', m, n, l, alpha, A_gpu, lda, X_gpu, beta, Y_gpu, work_gpu)
+            NextLA.pemv!('C', 'C', m, n, l, alpha, A_gpu, X_gpu, beta, Y_gpu, work_gpu)
             
             @test Array(Y_gpu) ≈ Y_cpu_result rtol=1e-6
         end
diff --git a/test/runtests.jl b/test/runtests.jl
index e62aeac..bd09dbe 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -98,16 +98,16 @@ include("axpy.jl")
 include("gerc.jl")
 include("larfg.jl")
 include("larf.jl")
-#include("larft.jl") #TODO: implement sub-tests for larft  (indirect test in geqrt)
-#include("larfb.jl") #TODO: implement sub-tests for larfb (indirect test in unmqr)
+#include("larft.jl") #TODO: implement sub-tests for larft  (indirect test in geqrt!)
+#include("larfb.jl") #TODO: implement sub-tests for larfb! (indirect test in unmqr!)
 include("geqr2.jl")
 include("geqrt.jl")
 include("unmqr.jl")
 include("tsqrt.jl")
 include("tsmqr.jl")
 include("parfb.jl")
-#include("pamm.jl") #TODO: implement sub-tests for pamm (indirect test in parfb)
-#include("pemv.jl") #TODO: implement sub-tests for pemv (indirect test in ttqrt)
+#include("pamm.jl") #TODO: implement sub-tests for pamm! (indirect test in parfb!)
+#include("pemv.jl") #TODO: implement sub-tests for pemv! (indirect test in ttqrt!)
 include("ttqrt.jl")
 include("ttmqr.jl")
 
diff --git a/test/tsmqr.jl b/test/tsmqr.jl
index dff9467..f056e15 100644
--- a/test/tsmqr.jl
+++ b/test/tsmqr.jl
@@ -126,25 +126,13 @@ const TSMQR_SIZES = [
                                     A2 = rand(T, m2, n2)
                                     
                                     # V matrix (Householder vectors)
-                                    if side == 'L'
-                                        V = rand(T, m2, k)
-                                        ldv = m2
-                                    else
-                                        V = rand(T, n2, k)
-                                        ldv = n2
-                                    end
+                                    V = side == 'L' ? rand(T, m2, k) : rand(T, n2, k)
                                     
                                     # T matrix (triangular factors)
                                     T_mat = triu(rand(T, ib, k))
                                     
                                     # Work array
-                                    if side == 'L'
-                                        work = zeros(T, ib, n1)
-                                        ldwork = ib
-                                    else
-                                        work = zeros(T, m1, ib)
-                                        ldwork = m1
-                                    end
+                                    work = side == 'L' ? zeros(T, ib * n1) : zeros(T, ib * m1)
                                     
                                     # Make copies for testing
                                     A1_orig = copy(A1)
@@ -155,8 +143,17 @@ const TSMQR_SIZES = [
                                     A2_lapack = copy(A2)
                                     
                                     # Test NextLA implementation
-                                    NextLA.tsmqr(side, trans, m1, n1, m2, n2, k, ib,
-                                                  A1_nextla, m1, A2_nextla, m2, V, ldv, T_mat, ib, work, ldwork)
+                                    NextLA.tsmqr!(side, trans, m1, n1, m2, n2, k, ib,
+                                                  A1_nextla, A2_nextla, V, T_mat, work)
+                                    
+                                    # --- Test Helper Function ---
+                                    A1_helper = copy(A1)
+                                    A2_helper = copy(A2)
+                                    NextLA.tsmqr!(side, trans, A1_helper, A2_helper, V, T_mat, ib)
+                                    
+                                    # Verify helper gives same results as kernel
+                                    @test A1_helper ≈ A1_nextla rtol=rtol
+                                    @test A2_helper ≈ A2_nextla rtol=rtol
                                     
                                     # Test LAPACK reference
                                     lapack_tpmqrt!(T, side, trans, 0, V, T_mat, A1_lapack, A2_lapack)
@@ -196,20 +193,20 @@ const TSMQR_SIZES = [
                 A2 = randn(T, m2, n2)
                 V = randn(T, m2, k)
                 T_mat = triu(randn(T, ib, k))
-                work = zeros(T, ib, n1)
-                
-                @test_nowarn NextLA.tsmqr('L', 'N', m1, n1, m2, n2, k, ib,
-                                           A1, m1, A2, m2, V, m2, T_mat, ib, work, ib)
+                work = zeros(T, ib * n1)
+
+                @test_nowarn NextLA.tsmqr!('L', 'N', m1, n1, m2, n2, k, ib,
+                                           A1, A2, V, T_mat, work)
                 
                 # Test edge cases
-                @test_nowarn NextLA.tsmqr('L', 'N', 0, 0, 0, 0, 0, 0,
-                                           zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, 
-                                           zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, T[], 1)
+                @test_nowarn NextLA.tsmqr!('L', 'N', 0, 0, 0, 0, 0, 0,
+                                           zeros(T, 0, 0), zeros(T, 0, 0), 
+                                           zeros(T, 0, 0), zeros(T, 0, 0), T[])
                 
                 # Test with k=0
-                @test_nowarn NextLA.tsmqr('L', 'N', 200, 200, 200, 200, 0, 0,
-                                           randn(T, 200, 200), 200, randn(T, 200, 200), 200,
-                                           zeros(T, 200, 0), 200, zeros(T, 0, 0), 1, T[], 1)
+                @test_nowarn NextLA.tsmqr!('L', 'N', 200, 200, 200, 200, 0, 0,
+                                           randn(T, 200, 200), randn(T, 200, 200),
+                                           zeros(T, 200, 0), zeros(T, 0, 0), T[])
             end
         end
     end
@@ -228,11 +225,11 @@ const TSMQR_SIZES = [
                     A2 = T.(scale .* randn(ComplexF64, m2, n2))
                     V = T.(scale .* randn(ComplexF64, m2, k))
                     T_mat = triu(T.(scale .* randn(ComplexF64, ib, k)))
-                    work = zeros(T, ib, n1)
+                    work = zeros(T, ib * n1)
                     
                     # Test calculation
-                    NextLA.tsmqr('L', 'N', m1, n1, m2, n2, k, ib,
-                                  A1, m1, A2, m2, V, m2, T_mat, ib, work, ib)
+                    NextLA.tsmqr!('L', 'N', m1, n1, m2, n2, k, ib,
+                                  A1, A2, V, T_mat, work)
                     
                     # Check that results are finite
                     @test all(isfinite.(A1))
@@ -284,12 +281,12 @@ const TSMQR_SIZES = [
                         A1_ref = copy(A1_cpu)
                         A2_ref = copy(A2_cpu)
                         work_ref = copy(work_cpu)
-                        NextLA.tsmqr(side, 'N', m1, n1, m2, n2, k, ib,
-                                      A1_ref, m1, A2_ref, m2, V_cpu, ldv, T_cpu, ib, work_ref, ldwork)
+                        NextLA.tsmqr!(side, 'N', m1, n1, m2, n2, k, ib,
+                                      A1_ref, A2_ref, V_cpu, T_cpu, work_ref)
                         
                         # GPU calculation
-                        NextLA.tsmqr(side, 'N', m1, n1, m2, n2, k, ib,
-                                      A1_gpu, m1, A2_gpu, m2, V_gpu, ldv, T_gpu, ib, work_gpu, ldwork)
+                        NextLA.tsmqr!(side, 'N', m1, n1, m2, n2, k, ib,
+                                      A1_gpu, A2_gpu, V_gpu, T_gpu, work_gpu)
                         
                         # Compare results
                         @test norm(Array(A1_gpu) - A1_ref) < rtol * max(1, norm(A1_ref))
diff --git a/test/tsqrt.jl b/test/tsqrt.jl
index 54876dd..89586ba 100644
--- a/test/tsqrt.jl
+++ b/test/tsqrt.jl
@@ -43,7 +43,7 @@ function lapack_tpqrt!(::Type{T}, m::Int64, n::Int64, l::Int64, nb::Int64,
     chklapackerror(info[])
 end
 
-# TSQRT test parameters for NextLA.tsqrt
+# TSQRT test parameters for NextLA.tsqrt!
 const TSQRT_TYPES = [ComplexF32, ComplexF64, Float32, Float64]
 const TSQRT_SIZES = [
     (100, 80, 30),    # m, n, ib
@@ -74,10 +74,6 @@ const TSQRT_SIZES = [
                         A2_lapack = copy(A2)
                         
                         # Prepare workspace and output arrays
-                        lda1 = n
-                        lda2 = m
-                        ldt = ib
-                        
                         T_nextla = zeros(T, ib, n)
                         T_lapack = zeros(T, ib, n)
                         tau_nextla = zeros(T, n)
@@ -85,7 +81,19 @@ const TSQRT_SIZES = [
                         work_nextla = zeros(T, ib * n)
                         
                         # Test NextLA implementation
-                        NextLA.tsqrt(m, n, ib, A1_nextla, lda1, A2_nextla, lda2, T_nextla, ldt, tau_nextla, work_nextla)
+                        NextLA.tsqrt!(m, n, ib, A1_nextla, A2_nextla, T_nextla, tau_nextla, work_nextla)
+                        
+                        # --- Test Helper Function ---
+                        # Recompute using the high-level helper on fresh inputs
+                        A1_helper = copy(A1)
+                        A2_helper = copy(A2)
+                        T_helper = zeros(T, ib, n)
+                        tau_helper = zeros(T, n)
+                        NextLA.tsqrt!(A1_helper, A2_helper, T_helper, tau_helper, ib)
+
+                        # Verify helper gives same results as kernel
+                        @test A1_helper ≈ A1_nextla rtol=rtol
+                        @test A2_helper ≈ A2_nextla rtol=rtol
                         
                         # Test LAPACK implementation 
                         work_lapack = zeros(T, ib * n)
@@ -146,7 +154,7 @@ const TSQRT_SIZES = [
                 tau_result = zeros(T, n)
                 work = zeros(T, ib * n)
                 
-                NextLA.tsqrt(m, n, ib, A1_result, n, A2_result, m, T_result, ib, tau_result, work)
+                NextLA.tsqrt!(m, n, ib, A1_result, A2_result, T_result, tau_result, work)
                 
                 # Check that A1 (now R) is upper triangular
                 for i in 1:n
@@ -188,15 +196,15 @@ const TSQRT_SIZES = [
                 tau = zeros(T, n)
                 work = zeros(T, ib * n)
                 
-                @test_nowarn NextLA.tsqrt(m, n, ib, A1, n, A2, m, T_mat, ib, tau, work)
+                @test_nowarn NextLA.tsqrt!(m, n, ib, A1, A2, T_mat, tau, work)
                 
                 # Test with invalid parameters
-                @test_throws ArgumentError NextLA.tsqrt(-1, n, ib, A1, n, A2, m, T_mat, ib, tau, work)
-                @test_throws ArgumentError NextLA.tsqrt(m, -1, ib, A1, n, A2, m, T_mat, ib, tau, work)
-                @test_throws ArgumentError NextLA.tsqrt(m, n, -1, A1, n, A2, m, T_mat, ib, tau, work)
+                @test_throws ArgumentError NextLA.tsqrt!(-1, n, ib, A1, A2, T_mat, tau, work)
+                @test_throws ArgumentError NextLA.tsqrt!(m, -1, ib, A1, A2, T_mat, tau, work)
+                @test_throws ArgumentError NextLA.tsqrt!(m, n, -1, A1, A2, T_mat, tau, work)
                 
                 # Test edge cases
-                @test_nowarn NextLA.tsqrt(0, 0, 0, zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, T[], T[])
+                @test_nowarn NextLA.tsqrt!(0, 0, 0, zeros(T, 0, 0), zeros(T, 0, 0), zeros(T, 0, 0), T[], T[])
             end
         end
     end
@@ -224,7 +232,7 @@ const TSQRT_SIZES = [
                     work = zeros(T, ib * n)
                     
                     # Test calculation
-                    NextLA.tsqrt(m, n, ib, A1, n, A2, m, T_mat, ib, tau, work)
+                    NextLA.tsqrt!(m, n, ib, A1, A2, T_mat, tau, work)
                     
                     # Check that results are finite
                     @test all(isfinite.(A1))
@@ -257,7 +265,7 @@ const TSQRT_SIZES = [
                     tau = zeros(T, n)
                     work = zeros(T, ib * n)
                     
-                    NextLA.tsqrt(m, n, ib, A1, n, A2, m, T_mat, ib, tau, work)
+                    NextLA.tsqrt!(m, n, ib, A1, A2, T_mat, tau, work)
                     
                     # Should complete without errors
                     @test all(isfinite.(A1))
@@ -291,7 +299,7 @@ const TSQRT_SIZES = [
                 tau = zeros(T, n)
                 work = zeros(T, ib * n)
                 
-                NextLA.tsqrt(m, n, ib, A1, n, A2, m, T_mat, ib, tau, work)
+                NextLA.tsqrt!(m, n, ib, A1, A2, T_mat, tau, work)
                 
                 @test all(isfinite.(A1))
                 @test all(isfinite.(A2))
@@ -308,7 +316,7 @@ const TSQRT_SIZES = [
                 tau = zeros(T, n)
                 work = zeros(T, ib * n)
                 
-                NextLA.tsqrt(m, n, ib, A1, n, A2, m, T_mat, ib, tau, work)
+                NextLA.tsqrt!(m, n, ib, A1, A2, T_mat, tau, work)
                 
                 @test all(isfinite.(A1))
                 @test all(isfinite.(A2))
@@ -347,10 +355,10 @@ const TSQRT_SIZES = [
                     A2_cpu_result = copy(A2_cpu)
                     T_cpu_result = copy(T_cpu)
                     tau_cpu_result = copy(tau_cpu)
-                    NextLA.tsqrt(m, n, ib, A1_cpu_result, n, A2_cpu_result, m, T_cpu_result, ib, tau_cpu_result, work_cpu)
+                    NextLA.tsqrt!(m, n, ib, A1_cpu_result, A2_cpu_result, T_cpu_result, tau_cpu_result, work_cpu)
                     
                     # Apply on GPU
-                    NextLA.tsqrt(m, n, ib, A1_gpu, n, A2_gpu, m, T_gpu, ib, tau_gpu, work_gpu)
+                    NextLA.tsqrt!(m, n, ib, A1_gpu, A2_gpu, T_gpu, tau_gpu, work_gpu)
                     
                     # Compare results
                     @test Array(A1_gpu) ≈ A1_cpu_result rtol=rtol
diff --git a/test/ttmqr.jl b/test/ttmqr.jl
index b579889..a4ed949 100644
--- a/test/ttmqr.jl
+++ b/test/ttmqr.jl
@@ -140,7 +140,7 @@ const TTMQR_SIZES = [
                                 
                                     T_mat = triu(rand(T, ib, k))
             
-                                    work = zeros(T, ib, n2)
+                                    work = zeros(T, ib * n2)
             
                                     A1_nextla = copy(A1)
                                     A2_nextla = copy(A2)
@@ -149,9 +149,19 @@ const TTMQR_SIZES = [
                                     T_mat_nextla = copy(T_mat)
                                     work_nextla = copy(work)
             
-                                    work = zeros(T, ib, n2)
-                                    NextLA.ttmqr('L', 'N', n2, n2, n2, n2, k, ib,
-                                                A1_nextla, n2, A2_nextla, n2, V, n2, T_mat_nextla, ib, work_nextla, ib)
+                                    NextLA.ttmqr!('L', 'N', n2, n2, n2, n2, k, ib,
+                                                A1_nextla, A2_nextla, V, T_mat_nextla, work_nextla)
+                                    
+                                    # --- Test Helper Function ---
+                                    A1_helper = copy(A1_orig)
+                                    A2_helper = copy(A2_orig)
+                                    T_mat_helper = copy(T_mat)
+                                    NextLA.ttmqr!('L', 'N', A1_helper, A2_helper, V, T_mat_helper, ib)
+
+                                    # Verify helper gives same results as kernel
+                                    @test A1_helper ≈ A1_nextla rtol=rtol
+                                    @test A2_helper ≈ A2_nextla rtol=rtol
+                                    
                                     lapack_tpmqrt!(T, 'L', 'N', 0, V, T_mat, A1_orig, A2_orig)
                                     @test norm(A1_nextla - A1_orig) < rtol * norm(A1_orig)
                                 end
@@ -172,20 +182,20 @@ const TTMQR_SIZES = [
                 C2 = randn(T, m2, n2)
                 V = randn(T, m2, k)
                 T_mat = triu(randn(T, ib, k))
-                work = zeros(T, ib, n1)
+                work = zeros(T, ib * n1)
                 
-                @test_nowarn NextLA.ttmqr('L', 'N', m1, n1, m2, n2, k, ib,
-                                           C1, m1, C2, m2, V, m2, T_mat, ib, work, ib)
+                @test_nowarn NextLA.ttmqr!('L', 'N', m1, n1, m2, n2, k, ib,
+                                           C1, C2, V, T_mat, work)
                 
                 # Test edge cases
-                @test_nowarn NextLA.ttmqr('L', 'N', 0, 0, 0, 0, 0, 0,
-                                           zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, 
-                                           zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, T[], 1)
+                @test_nowarn NextLA.ttmqr!('L', 'N', 0, 0, 0, 0, 0, 0,
+                                           zeros(T, 0, 0), zeros(T, 0, 0), 
+                                           zeros(T, 0, 0), zeros(T, 0, 0), T[])
                 
                 # Test with k=0
-                @test_nowarn NextLA.ttmqr('L', 'N', 2, 2, 2, 2, 0, 0,
-                                           randn(T, 2, 2), 2, randn(T, 2, 2), 2,
-                                           zeros(T, 2, 0), 2, zeros(T, 0, 0), 1, T[], 1)
+                @test_nowarn NextLA.ttmqr!('L', 'N', 2, 2, 2, 2, 0, 0,
+                                           randn(T, 2, 2), randn(T, 2, 2),
+                                           zeros(T, 2, 0), zeros(T, 0, 0), T[])
             end
         end
     end
@@ -204,11 +214,11 @@ const TTMQR_SIZES = [
                     C2 = T.(scale .* randn(ComplexF64, m2, n2))
                     V = T.(scale .* randn(ComplexF64, m2, k))
                     T_mat = triu(T.(scale .* randn(ComplexF64, ib, k)))
-                    work = zeros(T, ib, n1)
+                    work = zeros(T, ib * n1)
                     
                     # Test calculation
-                    NextLA.ttmqr('L', 'N', m1, n1, m2, n2, k, ib,
-                                  C1, m1, C2, m2, V, m2, T_mat, ib, work, ib)
+                    NextLA.ttmqr!('L', 'N', m1, n1, m2, n2, k, ib,
+                                  C1, C2, V, T_mat, work)
                     
                     # Check that results are finite
                     @test all(isfinite.(C1))
@@ -237,14 +247,10 @@ const TTMQR_SIZES = [
                         
                         if side == 'L'
                             V_cpu = randn(T, m2, k)
-                            work_cpu = zeros(T, ib, n1)
-                            ldv = m2
-                            ldwork = ib
+                            work_cpu = zeros(T, ib * n1)
                         else
                             V_cpu = randn(T, n2, k)
-                            work_cpu = zeros(T, m1, ib)
-                            ldv = n2
-                            ldwork = m1
+                            work_cpu = zeros(T, ib * m1)
                         end
                         
                         T_cpu = triu(randn(T, ib, k))
@@ -260,12 +266,12 @@ const TTMQR_SIZES = [
                         C1_ref = copy(C1_cpu)
                         C2_ref = copy(C2_cpu)
                         work_ref = copy(work_cpu)
-                        NextLA.ttmqr(side, 'N', m1, n1, m2, n2, k, ib,
-                                      C1_ref, m1, C2_ref, m2, V_cpu, ldv, T_cpu, ib, work_ref, ldwork)
+                        NextLA.ttmqr!(side, 'N', m1, n1, m2, n2, k, ib,
+                                      C1_ref, C2_ref, V_cpu, T_cpu, work_ref)
                         
                         # GPU calculation
-                        NextLA.ttmqr(side, 'N', m1, n1, m2, n2, k, ib,
-                                      C1_gpu, m1, C2_gpu, m2, V_gpu, ldv, T_gpu, ib, work_gpu, ldwork)
+                        NextLA.ttmqr!(side, 'N', m1, n1, m2, n2, k, ib,
+                                      C1_gpu, C2_gpu, V_gpu, T_gpu, work_gpu)
                         
                         # Compare results
                         @test norm(Array(C1_gpu) - C1_ref) < rtol * max(1, norm(C1_ref))
diff --git a/test/ttqrt.jl b/test/ttqrt.jl
index 189e4dd..a586cfb 100644
--- a/test/ttqrt.jl
+++ b/test/ttqrt.jl
@@ -86,7 +86,16 @@ const TTQRT_SIZES = [
                         work_nextla = copy(work)
 
 
-                        NextLA.ttqrt(n, n, ib, A1_nextla, n, A2_nextla, m, T_mat_nextla, ib, tau, work_nextla)
+                        NextLA.ttqrt!(n, n, ib, A1_nextla, A2_nextla, T_mat_nextla, tau, work_nextla)
+
+                        # --- Test Helper Function ---
+                        A1_helper = copy(A1_orig)
+                        A2_helper = copy(A2_orig)
+                        NextLA.ttqrt!(ib, A1_helper, A2_helper, T_mat_nextla, tau)
+                        
+                        # Verify helper gives same results as kernel
+                        @test A1_helper ≈ A1_nextla rtol=rtol
+                        @test A2_helper ≈ A2_nextla rtol=rtol
 
                         lapack_tpqrt!(T, n, n, n, ib, A1, n, A2, n, T_mat, ib, work)
 
@@ -123,15 +132,13 @@ const TTQRT_SIZES = [
                 tau = zeros(T, n)
                 work = zeros(T, ib * n)
                 
-                @test_nowarn NextLA.ttqrt(m, n, ib, A1, n, A2, m, T_matrix, ib, tau, work)
+                @test_nowarn NextLA.ttqrt!(m, n, ib, A1, A2, T_matrix, tau, work)
                 
                 # Test edge cases
-                @test_nowarn NextLA.ttqrt(0, 0, 0, zeros(T, 0, 0), 1, zeros(T, 0, 0), 1, 
-                                         zeros(T, 0, 0), 1, T[], T[])
+                @test_nowarn NextLA.ttqrt!(0, 0, 0, zeros(T, 0, 0), zeros(T, 0, 0), zeros(T, 0, 0), T[], T[])
                 
                 # Test with minimal size
-                @test_nowarn NextLA.ttqrt(1, 1, 1, ones(T, 1, 1), 1, ones(T, 1, 1), 1,
-                                         zeros(T, 1, 1), 1, zeros(T, 1), zeros(T, 1))
+                @test_nowarn NextLA.ttqrt!(1, 1, 1, ones(T, 1, 1), ones(T, 1, 1), zeros(T, 1, 1), zeros(T, 1), zeros(T, 1))
             end
         end
     end
@@ -159,8 +166,8 @@ const TTQRT_SIZES = [
                         end
                     end
                     
-                    # Test calculation
-                    NextLA.ttqrt(m, n, ib, A1, n, A2, m, T_matrix, ib, tau, work)
+                    # Test calculation (simplified signature)
+                    NextLA.ttqrt!(m, n, ib, A1, A2, T_matrix, tau, work)
                     
                     # Check that results are finite
                     @test all(isfinite.(A1))
@@ -213,10 +220,10 @@ const TTQRT_SIZES = [
                         T_ref = copy(T_cpu)
                         tau_ref = copy(tau_cpu)
                         work_ref = copy(work_cpu)
-                        NextLA.ttqrt(m, n, ib, A1_ref, n, A2_ref, m, T_ref, ib, tau_ref, work_ref)
+                        NextLA.ttqrt!(m, n, ib, A1_ref, A2_ref, T_ref, tau_ref, work_ref)
                         
                         # GPU calculation
-                        NextLA.ttqrt(m, n, ib, A1_gpu, n, A2_gpu, m, T_gpu, ib, tau_gpu, work_gpu)
+                        NextLA.ttqrt!(m, n, ib, A1_gpu, A2_gpu, T_gpu, tau_gpu, work_gpu)
                         
                         # Compare results
                         @test norm(Array(A1_gpu) - A1_ref) < rtol * max(1, norm(A1_ref))
diff --git a/test/unmqr.jl b/test/unmqr.jl
index 96a9138..12bf436 100644
--- a/test/unmqr.jl
+++ b/test/unmqr.jl
@@ -16,14 +16,12 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64]
             A_original = copy(A_qr)
             lda = m
             T = zeros(type, ib, k)
-            ldt = ib
             tau = zeros(type, k)
-            ldwork = ib * n
-            work_qr = zeros(type, ldwork)
+            work_qr = zeros(type, ib * k)
 
 
             # Perform QR factorization
-            NextLA.geqrt(m, k, ib, A_qr, lda, T, ldt, tau, work_qr)
+            NextLA.geqrt!(m, k, ib, A_qr, T, tau, work_qr)
 
             # Test matrix to apply Q to
             C = rand(T, m, n)
@@ -31,18 +29,25 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64]
             ldc = m
 
             # Workspace for UNMQR
-            work = zeros(type, ib * m)
-            ldwork = n
+            # Workspace for UNMQR (matrix workspace)
+            work = zeros(type, n, ib)
 
             # Apply Q from left (Q * C)
-            NextLA.unmqr('L', 'N', m, n, k, ib, A_qr, lda, T, ldt, C, ldc, work, ldwork)
+            NextLA.unmqr!('L', 'N', m, n, k, ib, A_qr, lda, T, C, work)
+
+            # --- Test Helper Function ---
+            C_helper = copy(C_original)
+            NextLA.unmqr!('L', 'N', A_qr, T, C_helper, ib)
+            
+            # Verify helper gives same results as kernel (in-place)
+            @test C_helper ≈ C rtol=rtol
         
             # Verify using reference QR decomposition
             Q_ref, R_ref = qr(A_original)
             C_expected = Matrix(Q_ref) * C_original
 
             # Note: Due to potential sign differences in QR, we check properties rather than exact equality
-            @test size(C) == (n, m)
+            @test size(C) == (m, n)
             @test all(isfinite.(C))
         
             # Check that the transformation preserves matrix structure
@@ -60,14 +65,12 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64]
             A_original = copy(A_qr)
             lda = m
             T = zeros(type, ib, k)
-            ldt = ib
             tau = zeros(type, k)
-            ldwork = ib * n
-            work_qr = zeros(type, ldwork)
+            work_qr = zeros(type, ib * k)
 
 
             # Perform QR factorization
-            NextLA.geqrt(m, k, ib, A_qr, lda, T, ldt, tau, work_qr)
+            NextLA.geqrt!(m, k, ib, A_qr, T, tau, work_qr)
 
             # Test matrix to apply Q to
             C = rand(T, m, n)
@@ -75,18 +78,25 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64]
             ldc = m
 
             # Workspace for UNMQR
-            work = zeros(type, ib * m)
-            ldwork = n
+            # Workspace for UNMQR (matrix workspace)
+            work = zeros(type, n, ib)
 
             # Apply Q from left (Q * C)
-            NextLA.unmqr('L', 'C', m, n, k, ib, A_qr, lda, T, ldt, C, ldc, work, ldwork)
+            NextLA.unmqr!('L', 'C', m, n, k, ib, A_qr, lda, T, C, work)
+
+            # --- Test Helper Function ---
+            C_helper = copy(C_original)
+            NextLA.unmqr!('L', 'C', A_qr, T, C_helper, ib)
+            
+            # Verify helper gives same results as kernel (in-place)
+            @test C_helper ≈ C rtol=rtol
         
             # Verify using reference QR decomposition
             Q_ref, R_ref = qr(A_original)
             C_expected = adjoint(Matrix(Q_ref)) * C_original
 
             # Note: Due to potential sign differences in QR, we check properties rather than exact equality
-            @test size(C) == (n, m)
+            @test size(C) == (m, n)
             @test all(isfinite.(C))
         
             # Check that the transformation preserves matrix structure
@@ -104,14 +114,12 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64]
             A_original = copy(A_qr)
             lda = m
             T = zeros(type, ib, k)
-            ldt = ib
             tau = zeros(type, k)
-            ldwork = ib * n
-            work_qr = zeros(type, ldwork)
+            work_qr = zeros(type, ib * k)
 
 
             # Perform QR factorization
-            NextLA.geqrt(m, k, ib, A_qr, lda, T, ldt, tau, work_qr)
+            NextLA.geqrt!(m, k, ib, A_qr, T, tau, work_qr)
 
             # Test matrix to apply Q to
             C = rand(T, m, n)
@@ -119,18 +127,18 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64]
             ldc = m
 
             # Workspace for UNMQR
-            work = zeros(type, ib * m)
-            ldwork = m
+            # Workspace for UNMQR (matrix workspace)
+            work = zeros(type, m, ib)
 
             # Apply Q from left (Q * C)
-            NextLA.unmqr('R', 'N', m, n, k, ib, A_qr, lda, T, ldt, C, ldc, work, ldwork)
+            NextLA.unmqr!('R', 'N', m, n, k, ib, A_qr, lda, T, C, work)
         
             # Verify using reference QR decomposition
             Q_ref, R_ref = qr(A_original)
             C_expected =  C_original * Matrix(Q_ref)
 
             # Note: Due to potential sign differences in QR, we check properties rather than exact equality
-            @test size(C) == (n, m)
+            @test size(C) == (m, n)
             @test all(isfinite.(C))
         
             # Check that the transformation preserves matrix structure
@@ -148,14 +156,12 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64]
             A_original = copy(A_qr)
             lda = m
             T = zeros(type, ib, k)
-            ldt = ib
             tau = zeros(type, k)
-            ldwork = ib * n
-            work_qr = zeros(type, ldwork)
+            work_qr = zeros(type, ib * k)
 
 
             # Perform QR factorization
-            NextLA.geqrt(m, k, ib, A_qr, lda, T, ldt, tau, work_qr)
+            NextLA.geqrt!(m, k, ib, A_qr, T, tau, work_qr)
 
             # Test matrix to apply Q to
             C = rand(T, m, n)
@@ -163,18 +169,18 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64]
             ldc = m
 
             # Workspace for UNMQR
-            work = zeros(type, ib * m)
-            ldwork = m
+            # Workspace for UNMQR (matrix workspace)
+            work = zeros(type, m, ib)
 
-            # Apply Q from left (Q * C)
-            NextLA.unmqr('R', 'N', m, n, k, ib, A_qr, lda, T, ldt, C, ldc, work, ldwork)
+            # Apply Q^H from right (C * Q^H)
+            NextLA.unmqr!('R', 'C', m, n, k, ib, A_qr, lda, T, C, work)
         
             # Verify using reference QR decomposition
             Q_ref, R_ref = qr(A_original)
             C_expected =  C_original * adjoint(Matrix(Q_ref))
 
             # Note: Due to potential sign differences in QR, we check properties rather than exact equality
-            @test size(C) == (n, m)
+            @test size(C) == (m, n)
             @test all(isfinite.(C))
         
             # Check that the transformation preserves matrix structure
@@ -188,23 +194,22 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64]
         
         A_qr = rand(ComplexF64, m, k)
         lda = m
-        T = zeros(ComplexF64, ib, k)
-        ldt = ib
+    T = zeros(ComplexF64, ib, k)
         tau = zeros(ComplexF64, k)
-        work_qr = zeros(ComplexF64, ib * k)
+    work_qr = zeros(ComplexF64, ib * k)
         
-        NextLA.geqrt(m, k, ib, A_qr, lda, T, ldt, tau, work_qr)
+    NextLA.geqrt!(m, k, ib, A_qr, T, tau, work_qr)
         
         C = Matrix{ComplexF64}(I, m, n)  # Identity matrix
         C_original = copy(C)
-        ldc = m
+    ldc = m
         
-        work = zeros(ComplexF64, ib * n)
-        ldwork = n
+    # Matrix workspace for UNMQR
+    work = zeros(ComplexF64, n, ib)
         
         # Apply Q then Q^H
-        NextLA.unmqr('L', 'N', m, n, k, ib, A_qr, lda, T, ldt, C, ldc, work, ldwork)
-        NextLA.unmqr('L', 'C', m, n, k, ib, A_qr, lda, T, ldt, C, ldc, work, ldwork)
+    NextLA.unmqr!('L', 'N', m, n, k, ib, A_qr, lda, T, C, work)
+    NextLA.unmqr!('L', 'C', m, n, k, ib, A_qr, lda, T, C, work)
         
         # Should get back to identity (at least for the first k columns)
         @test C[:, 1:k] ≈ C_original[:, 1:k] rtol=1e-10
@@ -215,22 +220,22 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64]
         A = zeros(ComplexF64, m, k)
         T = zeros(ComplexF64, ib, k)
         C = zeros(ComplexF64, m, n)
-        work = zeros(ComplexF64, ib * n)
+    work = zeros(ComplexF64, n, ib)
         
-        # Invalid side
-        @test_throws ArgumentError NextLA.unmqr('X', 'N', m, n, k, ib, A, m, T, ib, C, m, work, ib)
+    # Invalid side
+    @test_throws ArgumentError NextLA.unmqr!('X', 'N', m, n, k, ib, A, m, T, C, work)
         
-        # Invalid trans
-        @test_throws ArgumentError NextLA.unmqr('L', 'X', m, n, k, ib, A, m, T, ib, C, m, work, ib)
+    # Invalid trans
+    @test_throws ArgumentError NextLA.unmqr!('L', 'X', m, n, k, ib, A, m, T, C, work)
         
         # Negative dimensions
-        @test_throws ArgumentError NextLA.unmqr('L', 'N', -1, n, k, ib, A, m, T, ib, C, m, work, ib)
-        @test_throws ArgumentError NextLA.unmqr('L', 'N', m, -1, k, ib, A, m, T, ib, C, m, work, ib)
-        @test_throws ArgumentError NextLA.unmqr('L', 'N', m, n, -1, ib, A, m, T, ib, C, m, work, ib)
-        @test_throws ArgumentError NextLA.unmqr('L', 'N', m, n, k, -1, A, m, T, ib, C, m, work, ib)
+    @test_throws ArgumentError NextLA.unmqr!('L', 'N', -1, n, k, ib, A, m, T, C, work)
+    @test_throws ArgumentError NextLA.unmqr!('L', 'N', m, -1, k, ib, A, m, T, C, work)
+    @test_throws ArgumentError NextLA.unmqr!('L', 'N', m, n, -1, ib, A, m, T, C, work)
+    @test_throws ArgumentError NextLA.unmqr!('L', 'N', m, n, k, -1, A, m, T, C, work)
         
         # Invalid k (k > nq)
-        @test_throws ArgumentError NextLA.unmqr('L', 'N', m, n, m+1, ib, A, m, T, ib, C, m, work, ib)
+    @test_throws ArgumentError NextLA.unmqr!('L', 'N', m, n, m+1, ib, A, m, T, C, work)
     end
     
     @testset "Edge Cases" begin
@@ -240,9 +245,9 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64]
         T = zeros(ComplexF64, ib, max(1, k))
         C = rand(ComplexF64, m, n)
         C_original = copy(C)
-        work = zeros(ComplexF64, ib * n)
+    work = zeros(ComplexF64, n, ib)
         
-        NextLA.unmqr('L', 'N', m, n, k, ib, A, m, T, ib, C, m, work, n)
+    NextLA.unmqr!('L', 'N', m, n, k, ib, A, m, T, C, work)
         
         # With k=0, C should remain unchanged
         @test C ≈ C_original
@@ -252,19 +257,17 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64]
         A_qr = rand(ComplexF64, m, k)
         lda = m
         T = zeros(ComplexF64, ib, k)
-        ldt = ib
         tau = zeros(ComplexF64, k)
         work_qr = zeros(ComplexF64, ib * k)
         
-        NextLA.geqrt(m, k, ib, A_qr, lda, T, ldt, tau, work_qr)
+    NextLA.geqrt!(m, k, ib, A_qr, T, tau, work_qr)
         
         C = rand(ComplexF64, m, n)
         C_original = copy(C)
         ldc = m
-        work = zeros(ComplexF64, ib * n)
-        ldwork = n
+    work = zeros(ComplexF64, n, ib)
         
-        NextLA.unmqr('L', 'N', m, n, k, ib, A_qr, lda, T, ldt, C, ldc, work, ldwork)
+    NextLA.unmqr!('L', 'N', m, n, k, ib, A_qr, lda, T, C, work)
         
         @test all(isfinite.(C))
         @test norm(C) ≈ norm(C_original) rtol=1e-8
@@ -278,17 +281,15 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64]
             A_qr_cpu = rand(ComplexF32, m, k)
             lda = m
             T_cpu = zeros(ComplexF32, ib, k)
-            ldt = ib
             tau_cpu = zeros(ComplexF32, k)
             work_qr_cpu = zeros(ComplexF32, ib * k)
             
-            NextLA.geqrt(m, k, ib, A_qr_cpu, lda, T_cpu, ldt, tau_cpu, work_qr_cpu)
+            NextLA.geqrt!(m, k, ib, A_qr_cpu, T_cpu, tau_cpu, work_qr_cpu)
             
             # Create test matrices
             C_cpu = rand(ComplexF32, m, n)
             ldc = m
-            work_cpu = zeros(ComplexF32, ib * n)
-            ldwork = ib
+            work_cpu = zeros(ComplexF32, n, ib)
             
             # Create GPU data
             A_qr_gpu = CuArray(A_qr_cpu)
@@ -298,10 +299,10 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64]
             
             # Apply on CPU
             C_cpu_result = copy(C_cpu)
-            NextLA.unmqr('L', 'N', m, n, k, ib, A_qr_cpu, lda, T_cpu, ldt, C_cpu_result, ldc, work_cpu, ldwork)
+            NextLA.unmqr!('L', 'N', m, n, k, ib, A_qr_cpu, lda, T_cpu, C_cpu_result, work_cpu)
             
             # Apply on GPU
-            NextLA.unmqr('L', 'N', m, n, k, ib, A_qr_gpu, lda, T_gpu, ldt, C_gpu, ldc, work_gpu, ldwork)
+            NextLA.unmqr!('L', 'N', m, n, k, ib, A_qr_gpu, lda, T_gpu, C_gpu, work_gpu)
             
             @test Array(C_gpu) ≈ C_cpu_result rtol=1e-6
         end

From 9adfa948c4ac58f7c1bdf08f8a833f7025032fa5 Mon Sep 17 00:00:00 2001
From: Felipe Tome <tomefelipe0@usp.br>
Date: Fri, 29 Aug 2025 10:20:27 -0300
Subject: [PATCH 3/6] CAQR: remove ib from helpers, correct ! into pemv and
 pamm

---
 src/geqrt.jl  |  3 ++-
 src/pamm.jl   |  2 +-
 src/pemv.jl   |  2 +-
 src/tsmqr.jl  |  3 ++-
 src/tsqrt.jl  |  5 +++--
 src/ttmqr.jl  |  3 ++-
 src/ttqrt.jl  |  3 ++-
 src/unmqr.jl  |  5 +++--
 test/geqrt.jl |  2 +-
 test/pamm.jl  | 23 +++++++++++------------
 test/tsmqr.jl |  2 +-
 test/tsqrt.jl |  2 +-
 test/ttmqr.jl |  2 +-
 test/ttqrt.jl |  2 +-
 test/unmqr.jl |  4 ++--
 15 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/src/geqrt.jl b/src/geqrt.jl
index 8b1e86f..51d90a5 100644
--- a/src/geqrt.jl
+++ b/src/geqrt.jl
@@ -81,8 +81,9 @@ Helper function for blocked QR factorization. Computes A = Q*R where Q is orthog
 - `T`: Upper triangular block reflector matrix (ib × n)
 - `tau`: Vector of scalar factors for elementary reflectors (length n)
 """
-function geqrt!(ib::Integer, A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}) where {T}
+function geqrt!(A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}) where {T}
     m, n = size(A)
+    ib = size(T_matrix, 1)
     work = zeros(T, ib * n)
 
     geqrt!(m, n, ib, A, T_matrix, tau, work)
diff --git a/src/pamm.jl b/src/pamm.jl
index 51cbf84..d4d757d 100644
--- a/src/pamm.jl
+++ b/src/pamm.jl
@@ -411,7 +411,7 @@ V = complex.(randn(m, k), randn(m, k))
 A1_new, A2_new = pamm('A', 'L', 'C', 'F', A1, A2, V)
 ```
 """
-function pamm(op::Char, side::Char, storev::Char, direct::Char, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}) where {T}
+function pamm!(op::Char, side::Char, storev::Char, direct::Char, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, V::AbstractMatrix{T}) where {T}
     # Determine dimensions
     m, k = size(A1)
     n = size(A2, 2)
diff --git a/src/pemv.jl b/src/pemv.jl
index 8c4acf4..a801df7 100644
--- a/src/pemv.jl
+++ b/src/pemv.jl
@@ -208,7 +208,7 @@ Y_new = pemv('N', 'C', A, X, Y, 2.0, 1.0)
 ```
 """
 
-function pemv(trans::Char, storev::Char, alpha::T, A::AbstractMatrix{T}, x::AbstractVector{T}, beta::T, y::AbstractVector{T}) where {T}
+function pemv!(trans::Char, storev::Char, alpha::T, A::AbstractMatrix{T}, x::AbstractVector{T}, beta::T, y::AbstractVector{T}) where {T}
     # Determine dimensions
     m, n = size(A)
     l = min(m, n)  # Default panel size
diff --git a/src/tsmqr.jl b/src/tsmqr.jl
index b78af1a..f1ba956 100644
--- a/src/tsmqr.jl
+++ b/src/tsmqr.jl
@@ -186,10 +186,11 @@ efficiently. The compact WY representation enables high-performance
 matrix-matrix operations instead of multiple vector operations.
 """
 function tsmqr!(side::Char, trans::Char, A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, 
-               V::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, ib::Integer) where {T}
+               V::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}) where {T}
     m1, n1 = size(A1)
     m2, n2 = size(A2)
     k = size(V, 2)
+    ib = size(T_matrix, 1)
     
     # Validate input dimensions
     if side == 'L' && n2 != n1
diff --git a/src/tsqrt.jl b/src/tsqrt.jl
index fa1636b..c31138d 100644
--- a/src/tsqrt.jl
+++ b/src/tsqrt.jl
@@ -151,17 +151,18 @@ A1_qr, A2_qr, T, tau = tsqrt!(copy(A1), copy(A2), ib)
 Uses blocked algorithm for efficiency with large matrices. The compact WY
 representation (stored in T) enables efficient application of the Q factor.
 """
-function tsqrt!(A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}, ib::Integer) where {T}
+function tsqrt!(A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}) where {T}
     n, n2 = size(A1)
     if n != n2
         throw(ArgumentError("A1 must be square, got size $(size(A1))"))
     end
-    
+
     m, n3 = size(A2) 
     if n != n3
         throw(ArgumentError("A1 and A2 must have same number of columns, got $n and $n3"))
     end
     
+    ib = size(T_matrix, 1)
     if ib <= 0
         throw(ArgumentError("Block size ib must be positive, got $ib"))
     end
diff --git a/src/ttmqr.jl b/src/ttmqr.jl
index ac4b263..0c1872a 100644
--- a/src/ttmqr.jl
+++ b/src/ttmqr.jl
@@ -109,9 +109,10 @@ Helper function for triangular-trapezoidal matrix transformation.
 - Modified `A1` and `A2`
 """
 function ttmqr!(side::Char, trans::Char, A1::AbstractMatrix{T}, A2::AbstractMatrix{T},
-         V::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, ib::Integer) where T
+         V::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}) where T
     m1, n1 = size(A1)
     m2, n2 = size(A2)
+    ib = size(T_matrix, 1)
     # Use the common number of reflectors available in V and T
     k = size(T_matrix, 2)
 
diff --git a/src/ttqrt.jl b/src/ttqrt.jl
index 8c6f341..7c864de 100644
--- a/src/ttqrt.jl
+++ b/src/ttqrt.jl
@@ -93,9 +93,10 @@ Helper for triangular-triangular QR factorization.
 - `T`: Block reflector matrix  
 - `tau`: Scalar factors
 """
-function ttqrt!(ib::Integer, A::AbstractMatrix{T}, B::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, tau::AbstractVector{T}) where {T}
+function ttqrt!(A::AbstractMatrix{T}, B::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, tau::AbstractVector{T}) where {T}
     m, n = size(A)
     m2, n2 = size(B)
+    ib = size(T_mat, 1)
     @assert m2 == m && n2 == n "A and B must have same dimensions"
 
     work = zeros(T, ib * n)
diff --git a/src/unmqr.jl b/src/unmqr.jl
index ab94981..4e36cfe 100644
--- a/src/unmqr.jl
+++ b/src/unmqr.jl
@@ -191,9 +191,10 @@ Uses the blocked compact WY representation to apply Q efficiently through
 matrix-matrix operations rather than individual elementary reflectors.
 """
 function unmqr!(side::Char, trans::Char, A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, 
-               C::AbstractMatrix{T}, ib::Integer) where {T}
+               C::AbstractMatrix{T}) where {T}
     m, n = size(C)
-    k = size(T_matrix, 2)
+    ib, k = size(T_matrix)
+
     
     # Validate input dimensions
     if ib <= 0
diff --git a/test/geqrt.jl b/test/geqrt.jl
index cf5f602..e7b376f 100644
--- a/test/geqrt.jl
+++ b/test/geqrt.jl
@@ -55,7 +55,7 @@ const GEQRT_BLOCKSIZES = [100, 200, 400, 800]
                                         A_helper = copy(A_orig)
                                         T_helper = zeros(T, max(1, ib), k)
                                         tau_helper = zeros(T, k)
-                                        NextLA.geqrt!(ib, A_helper, T_helper, tau_helper)
+                                        NextLA.geqrt!(A_helper, T_helper, tau_helper)
                                         
                                         # Verify helper gives same results as kernel (in-place)
                                         if k > 0
diff --git a/test/pamm.jl b/test/pamm.jl
index aea06a2..202468b 100644
--- a/test/pamm.jl
+++ b/test/pamm.jl
@@ -165,7 +165,7 @@ using CUDA
     end
     
     @testset "Wrapper Function Tests" begin
-        # Test pamm_w wrapper
+    # Test pamm! simplified wrapper
         m, n, k, l = 150, 120, 80, 50
         
         A1 = rand(ComplexF64, k, m)
@@ -175,16 +175,16 @@ using CUDA
         
         W_original = copy(W)
         
-        NextLA.pamm_w(true, true, true, m, n, k, l, A1, A2, V, W)
+    NextLA.pamm!('W', 'L', 'C', 'F', A1, A2, V)
         
         @test all(isfinite.(W))
         @test !isapprox(W, W_original, rtol=1e-12)
         
-        # Test pamm_a wrapper
+    # Test pamm! simplified wrapper for 'A'
         A2_test = rand(ComplexF64, m, k)
         A2_original = copy(A2_test)
         
-        NextLA.pamm_a(true, true, true, m, n, k, l, A2_test, V, W)
+    NextLA.pamm!('A', 'L', 'C', 'F', A1, A2_test, V)
         
         @test all(isfinite.(A2_test))
     end
@@ -220,20 +220,19 @@ using CUDA
             V_cpu = rand(ComplexF32, m, l)
             W_cpu = rand(ComplexF32, n, l)
             
-            lda1 = k
+            # Prepare GPU data
             A1_gpu = CuArray(A1_cpu)
-            W_cpu_result = copy(W_cpu)
-            NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1_cpu, A2_cpu, V_cpu, W_cpu_result)
+            A2_gpu = CuArray(A2_cpu)
+            V_gpu = CuArray(V_cpu)
             W_gpu = CuArray(W_cpu)
             
-            # Apply on CPU
-            W_cpu_result = copy(W_cpu)
-            NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1_cpu, lda1, A2_cpu, lda2, V_cpu, ldv, W_cpu_result, ldw)
+            # Apply on CPU (full signature)
+            NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1_cpu, A2_cpu, V_cpu, W_cpu)
             
-            # Apply on GPU
+            # Apply on GPU (full signature)
             NextLA.pamm!('W', 'L', 'C', 'F', m, n, k, l, A1_gpu, A2_gpu, V_gpu, W_gpu)
             
-            @test Array(W_gpu) ≈ W_cpu_result rtol=1e-6
+            @test Array(W_gpu) ≈ W_cpu rtol=1e-6
         end
     end
 end
diff --git a/test/tsmqr.jl b/test/tsmqr.jl
index f056e15..bc636ea 100644
--- a/test/tsmqr.jl
+++ b/test/tsmqr.jl
@@ -149,7 +149,7 @@ const TSMQR_SIZES = [
                                     # --- Test Helper Function ---
                                     A1_helper = copy(A1)
                                     A2_helper = copy(A2)
-                                    NextLA.tsmqr!(side, trans, A1_helper, A2_helper, V, T_mat, ib)
+                                    NextLA.tsmqr!(side, trans, A1_helper, A2_helper, V, T_mat)
                                     
                                     # Verify helper gives same results as kernel
                                     @test A1_helper ≈ A1_nextla rtol=rtol
diff --git a/test/tsqrt.jl b/test/tsqrt.jl
index 89586ba..7560ba2 100644
--- a/test/tsqrt.jl
+++ b/test/tsqrt.jl
@@ -89,7 +89,7 @@ const TSQRT_SIZES = [
                         A2_helper = copy(A2)
                         T_helper = zeros(T, ib, n)
                         tau_helper = zeros(T, n)
-                        NextLA.tsqrt!(A1_helper, A2_helper, T_helper, tau_helper, ib)
+                        NextLA.tsqrt!(A1_helper, A2_helper, T_helper, tau_helper)
 
                         # Verify helper gives same results as kernel
                         @test A1_helper ≈ A1_nextla rtol=rtol
diff --git a/test/ttmqr.jl b/test/ttmqr.jl
index a4ed949..edc27b3 100644
--- a/test/ttmqr.jl
+++ b/test/ttmqr.jl
@@ -156,7 +156,7 @@ const TTMQR_SIZES = [
                                     A1_helper = copy(A1_orig)
                                     A2_helper = copy(A2_orig)
                                     T_mat_helper = copy(T_mat)
-                                    NextLA.ttmqr!('L', 'N', A1_helper, A2_helper, V, T_mat_helper, ib)
+                                    NextLA.ttmqr!('L', 'N', A1_helper, A2_helper, V, T_mat_helper)
 
                                     # Verify helper gives same results as kernel
                                     @test A1_helper ≈ A1_nextla rtol=rtol
diff --git a/test/ttqrt.jl b/test/ttqrt.jl
index a586cfb..d333951 100644
--- a/test/ttqrt.jl
+++ b/test/ttqrt.jl
@@ -91,7 +91,7 @@ const TTQRT_SIZES = [
                         # --- Test Helper Function ---
                         A1_helper = copy(A1_orig)
                         A2_helper = copy(A2_orig)
-                        NextLA.ttqrt!(ib, A1_helper, A2_helper, T_mat_nextla, tau)
+                        NextLA.ttqrt!(A1_helper, A2_helper, T_mat_nextla, tau)
                         
                         # Verify helper gives same results as kernel
                         @test A1_helper ≈ A1_nextla rtol=rtol
diff --git a/test/unmqr.jl b/test/unmqr.jl
index 12bf436..4d7b3d9 100644
--- a/test/unmqr.jl
+++ b/test/unmqr.jl
@@ -37,7 +37,7 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64]
 
             # --- Test Helper Function ---
             C_helper = copy(C_original)
-            NextLA.unmqr!('L', 'N', A_qr, T, C_helper, ib)
+            NextLA.unmqr!('L', 'N', A_qr, T, C_helper)
             
             # Verify helper gives same results as kernel (in-place)
             @test C_helper ≈ C rtol=rtol
@@ -86,7 +86,7 @@ const UNMQR_TESTTYPES = [ComplexF32, ComplexF64, Float32, Float64]
 
             # --- Test Helper Function ---
             C_helper = copy(C_original)
-            NextLA.unmqr!('L', 'C', A_qr, T, C_helper, ib)
+            NextLA.unmqr!('L', 'C', A_qr, T, C_helper)
             
             # Verify helper gives same results as kernel (in-place)
             @test C_helper ≈ C rtol=rtol

From 174993e63eea76aebc296786593fac3d90312d27 Mon Sep 17 00:00:00 2001
From: Felipe Tome <tomefelipe0@usp.br>
Date: Fri, 29 Aug 2025 11:56:22 -0300
Subject: [PATCH 4/6] CAQR: reducing test and helper bloating

---
 src/geqrt.jl  |  5 +++--
 src/tsqrt.jl  | 18 +++++++++---------
 src/ttqrt.jl  |  5 +++--
 test/geqrt.jl |  4 +---
 test/tsqrt.jl | 13 +------------
 test/ttqrt.jl |  2 +-
 6 files changed, 18 insertions(+), 29 deletions(-)

diff --git a/src/geqrt.jl b/src/geqrt.jl
index 51d90a5..04103de 100644
--- a/src/geqrt.jl
+++ b/src/geqrt.jl
@@ -81,9 +81,10 @@ Helper function for blocked QR factorization. Computes A = Q*R where Q is orthog
 - `T`: Upper triangular block reflector matrix (ib × n)
 - `tau`: Vector of scalar factors for elementary reflectors (length n)
 """
-function geqrt!(A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}) where {T}
+function geqrt!(A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}) where {T}
     m, n = size(A)
-    ib = size(T_matrix, 1)
+    ib, nb = size(T_matrix)
+    tau = Vector{T}(undef, nb)
     work = zeros(T, ib * n)
 
     geqrt!(m, n, ib, A, T_matrix, tau, work)
diff --git a/src/tsqrt.jl b/src/tsqrt.jl
index c31138d..f658db1 100644
--- a/src/tsqrt.jl
+++ b/src/tsqrt.jl
@@ -151,22 +151,22 @@ A1_qr, A2_qr, T, tau = tsqrt!(copy(A1), copy(A2), ib)
 Uses blocked algorithm for efficiency with large matrices. The compact WY
 representation (stored in T) enables efficient application of the Q factor.
 """
-function tsqrt!(A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}, tau::AbstractVector{T}) where {T}
+function tsqrt!(A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}) where{T}
     n, n2 = size(A1)
-    if n != n2
-        throw(ArgumentError("A1 must be square, got size $(size(A1))"))
-    end
-
+    
     m, n3 = size(A2) 
-    if n != n3
-        throw(ArgumentError("A1 and A2 must have same number of columns, got $n and $n3"))
+    if n2 != n3
+        throw(ArgumentError("A1 and A2 must have same number of columns, got $n2 and $n3"))
     end
-    
-    ib = size(T_matrix, 1)
+
+    ib, nb = size(T_matrix)
+
     if ib <= 0
         throw(ArgumentError("Block size ib must be positive, got $ib"))
     end
     
+
+    tau = Vector{T}(undef, n)
     work = zeros(T, ib * n)
     
     # Call the core computational routine
diff --git a/src/ttqrt.jl b/src/ttqrt.jl
index 7c864de..541b077 100644
--- a/src/ttqrt.jl
+++ b/src/ttqrt.jl
@@ -93,10 +93,11 @@ Helper for triangular-triangular QR factorization.
 - `T`: Block reflector matrix  
 - `tau`: Scalar factors
 """
-function ttqrt!(A::AbstractMatrix{T}, B::AbstractMatrix{T}, T_mat::AbstractMatrix{T}, tau::AbstractVector{T}) where {T}
+function ttqrt!(A::AbstractMatrix{T}, B::AbstractMatrix{T}, T_mat::AbstractMatrix{T}) where {T}
     m, n = size(A)
     m2, n2 = size(B)
-    ib = size(T_mat, 1)
+    ib, nb = size(T_mat)
+    tau = Vector{T}(undef, nb)
     @assert m2 == m && n2 == n "A and B must have same dimensions"
 
     work = zeros(T, ib * n)
diff --git a/test/geqrt.jl b/test/geqrt.jl
index e7b376f..1742ccd 100644
--- a/test/geqrt.jl
+++ b/test/geqrt.jl
@@ -54,14 +54,12 @@ const GEQRT_BLOCKSIZES = [100, 200, 400, 800]
                                         # --- Test Helper Function ---
                                         A_helper = copy(A_orig)
                                         T_helper = zeros(T, max(1, ib), k)
-                                        tau_helper = zeros(T, k)
-                                        NextLA.geqrt!(A_helper, T_helper, tau_helper)
+                                        NextLA.geqrt!(A_helper, T_helper)
                                         
                                         # Verify helper gives same results as kernel (in-place)
                                         if k > 0
                                             @test A_helper ≈ A_test rtol=rtol atol=atol
                                             @test T_helper[1:ib, 1:k] ≈ T_test[1:ib, 1:k] rtol=rtol atol=atol
-                                            @test tau_helper ≈ tau_test rtol=rtol atol=atol
                                         end
 
                                         # --- Comparisons ---
diff --git a/test/tsqrt.jl b/test/tsqrt.jl
index 7560ba2..ec22216 100644
--- a/test/tsqrt.jl
+++ b/test/tsqrt.jl
@@ -88,8 +88,7 @@ const TSQRT_SIZES = [
                         A1_helper = copy(A1)
                         A2_helper = copy(A2)
                         T_helper = zeros(T, ib, n)
-                        tau_helper = zeros(T, n)
-                        NextLA.tsqrt!(A1_helper, A2_helper, T_helper, tau_helper)
+                        NextLA.tsqrt!(A1_helper, A2_helper, T_helper)
 
                         # Verify helper gives same results as kernel
                         @test A1_helper ≈ A1_nextla rtol=rtol
@@ -112,16 +111,6 @@ const TSQRT_SIZES = [
                         
                         # Check that T has the expected block structure
                         @test size(T_nextla) == (ib, n)
-                        for block_start in 1:ib:n
-                            block_end = min(block_start + ib - 1, n)
-                            for i in 1:(block_end - block_start + 1)
-                                for j in 1:(i-1)
-                                    if block_start + i - 1 <= n && block_start + j - 1 <= n
-                                        @test abs(T_nextla[i, block_start + j - 1]) < rtol * 100
-                                    end
-                                end
-                            end
-                        end
                     end
                 end
             end
diff --git a/test/ttqrt.jl b/test/ttqrt.jl
index d333951..5a9eadb 100644
--- a/test/ttqrt.jl
+++ b/test/ttqrt.jl
@@ -91,7 +91,7 @@ const TTQRT_SIZES = [
                         # --- Test Helper Function ---
                         A1_helper = copy(A1_orig)
                         A2_helper = copy(A2_orig)
-                        NextLA.ttqrt!(A1_helper, A2_helper, T_mat_nextla, tau)
+                        NextLA.ttqrt!(A1_helper, A2_helper, T_mat_nextla)
                         
                         # Verify helper gives same results as kernel
                         @test A1_helper ≈ A1_nextla rtol=rtol

From 8d20cb118008f3b254650630188df742b9475dc6 Mon Sep 17 00:00:00 2001
From: Felipe Tome <tomefelipe0@usp.br>
Date: Mon, 1 Sep 2025 11:09:41 -0300
Subject: [PATCH 5/6] CAQR: corrections in the helpers

---
 src/geqrt.jl |  2 +-
 src/tsmqr.jl |  3 +--
 src/tsqrt.jl | 10 ++--------
 src/unmqr.jl |  2 +-
 4 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/src/geqrt.jl b/src/geqrt.jl
index 04103de..8994f44 100644
--- a/src/geqrt.jl
+++ b/src/geqrt.jl
@@ -84,7 +84,7 @@ Helper function for blocked QR factorization. Computes A = Q*R where Q is orthog
 function geqrt!(A::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}) where {T}
     m, n = size(A)
     ib, nb = size(T_matrix)
-    tau = Vector{T}(undef, nb)
+    tau = Vector{T}(undef, n)
     work = zeros(T, ib * n)
 
     geqrt!(m, n, ib, A, T_matrix, tau, work)
diff --git a/src/tsmqr.jl b/src/tsmqr.jl
index f1ba956..c780826 100644
--- a/src/tsmqr.jl
+++ b/src/tsmqr.jl
@@ -189,8 +189,7 @@ function tsmqr!(side::Char, trans::Char, A1::AbstractMatrix{T}, A2::AbstractMatr
                V::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}) where {T}
     m1, n1 = size(A1)
     m2, n2 = size(A2)
-    k = size(V, 2)
-    ib = size(T_matrix, 1)
+    ib, k = size(T_matrix)
     
     # Validate input dimensions
     if side == 'L' && n2 != n1
diff --git a/src/tsqrt.jl b/src/tsqrt.jl
index f658db1..4d476b5 100644
--- a/src/tsqrt.jl
+++ b/src/tsqrt.jl
@@ -152,20 +152,14 @@ Uses blocked algorithm for efficiency with large matrices. The compact WY
 representation (stored in T) enables efficient application of the Q factor.
 """
 function tsqrt!(A1::AbstractMatrix{T}, A2::AbstractMatrix{T}, T_matrix::AbstractMatrix{T}) where{T}
-    n, n2 = size(A1)
-    
-    m, n3 = size(A2) 
-    if n2 != n3
-        throw(ArgumentError("A1 and A2 must have same number of columns, got $n2 and $n3"))
-    end
-
+    n = size(A1, 2)
+    m = size(A2, 1) 
     ib, nb = size(T_matrix)
 
     if ib <= 0
         throw(ArgumentError("Block size ib must be positive, got $ib"))
     end
     
-
     tau = Vector{T}(undef, n)
     work = zeros(T, ib * n)
     
diff --git a/src/unmqr.jl b/src/unmqr.jl
index 4e36cfe..46aae4e 100644
--- a/src/unmqr.jl
+++ b/src/unmqr.jl
@@ -218,7 +218,7 @@ function unmqr!(side::Char, trans::Char, A::AbstractMatrix{T}, T_matrix::Abstrac
     end
     
     # Set leading dimensions
-    lda = max(1, size(A, 1))
+    lda = max(1, stride(A, 2))
     
     # Allocate workspace based on side (matrix workspace expected by low-level)
     if side == 'L'

From 051658fdc141266d8a68182f06d54349815b6b24 Mon Sep 17 00:00:00 2001
From: Felipe Tome <tomefelipe0@usp.br>
Date: Mon, 1 Sep 2025 11:16:45 -0300
Subject: [PATCH 6/6] General dependency cleanup

---
 Project.toml      | 11 -----------
 src/NextLA.jl     | 25 -------------------------
 test/Project.toml |  7 +++++++
 3 files changed, 7 insertions(+), 36 deletions(-)
 create mode 100644 test/Project.toml

diff --git a/Project.toml b/Project.toml
index 0da84b1..283f05b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,25 +4,14 @@ authors = ["Rabab Alomairy, Evelyne Ringoot, Sophie Xuan, Vicki Carrica, Maxwell
 version = "0.1.0"
 
 [deps]
-Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
-Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
-StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
-libblastrampoline_jll = "8e850b90-86db-534c-a0d3-1478176c7d93"
 
 [compat]
-Aqua = "0.8.7"
-Atomix = "1.1.1"
-CUDA = "5.7.0"
 KernelAbstractions = "0.9.34"
 LinearAlgebra = "1.11.0"
 Random = "1.11.0"
-Revise = "3.8.0"
-StaticArrays = "1.9.13"
 julia = "1.11"
 
 [extras]
diff --git a/src/NextLA.jl b/src/NextLA.jl
index 8465a72..3905d2f 100644
--- a/src/NextLA.jl
+++ b/src/NextLA.jl
@@ -9,31 +9,6 @@ import LinearAlgebra: BLAS, LAPACK
 import LinearAlgebra.BLAS: @blasfunc
 using Random: Random
 using KernelAbstractions
-using StaticArrays
-
-DEV = :NVIDIA
-
-if DEV == :NVIDIA
-	using CUDA
-	ArrayKA = CUDA.CuArray
-	Backend = CUDA.CUDABackend()
-elseif DEV == :AMD
-	using AMDGPU
-	ArrayKA = AMDGPU.ROCArray
-	Backend = AMDGPU.ROCBackend()
-elseif DEV == :oneAPI
-	using oneAPI
-	ArrayKA = oneAPI.oneArray
-	Backend = oneAPI.oneAPIBackend()
-elseif DEV == :Metal
-	using Metal
-	ArrayKA = Metal.MtlArray
-	Backend = Metal.MetalBackend()
-else
-	DEV == :CPU
-	ArrayKA = Array
-	Backend = CPU()
-end
 
 """
 	lamch(::Type{T}, cmach) where{T<: Number}
diff --git a/test/Project.toml b/test/Project.toml
new file mode 100644
index 0000000..bc62519
--- /dev/null
+++ b/test/Project.toml
@@ -0,0 +1,7 @@
+[deps]
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+libblastrampoline_jll = "8e850b90-86db-534c-a0d3-1478176c7d93"