diff --git a/py/redrock/archetypes.py b/py/redrock/archetypes.py
index e9943b4e..d94fd225 100644
--- a/py/redrock/archetypes.py
+++ b/py/redrock/archetypes.py
@@ -69,6 +69,9 @@ def __init__(self, filename):
 
         # TODO: Allow Archetype files to specify their IGM model
         self.igm_model = 'Inoue14'
+        # TODO: Allow Archetype files to specify bvls or nnls or pca solver method
+        self._solver_method = 'bvls'
+        #self._solver_method = 'bvls_via_nnls' #This uses the NNLS trick to emulate BVLS
 
         self.method = 'ARCH'  # for API symmetry with Template.method
 
@@ -217,7 +220,7 @@ def eval(self, subtype, coeff, wave, z, R=None, legcoeff=None):
 
         return model
     
-    def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest, zzchi2, trans, per_camera, dedges=None, binned=None, use_gpu=False, prior=None, ncam=None):
+    def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest, zzchi2, trans, per_camera, dedges=None, binned=None, use_gpu=False, prior_sigma=None, ncam=None):
         
         """Nearest neighbour archetype approach; fitting with a combinating of nearest archetypes in chi2 space
         
@@ -235,7 +238,7 @@ def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest,
             dedges (dict): in GPU mode, use pre-computed dict of wavelength bin edges, already on GPU
             binned (dict): already computed dictionary of rebinned fluxes
             use_gpu (bool): use GPU or not
-            prior (2d array): prior matrix on coefficients (1/sig**2)
+            prior_sigma (float): prior to add in the final solution matrix: added as 1/(prior_sigma**2) only for per-camera mode
             ncam (int): Number of camera for given Instrument
         
         Returns:
@@ -249,11 +252,11 @@ def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest,
         #is multiplied by trans in get_best_archetype
         spectra = target.spectra
         nleg = target.nleg
+        bands = None
         legendre = target.legendre(nleg=nleg, use_gpu=False) #Get previously calculated legendre
 
         nleg = legendre[list(legendre.keys())[0]].shape[0]
         iBest = np.argsort(zzchi2)[0:n_nearest]
-        tdata = dict()
         if (binned is not None):
             if (use_gpu):
                 binned = { hs:binned[hs][:,iBest].get() for hs in binned }
@@ -268,17 +271,23 @@ def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest,
                     #Only multiply if trans[hs] is not None
                     #Both arrays are on CPU so no need to wrap with asarray
                     binned[hs] *= trans[hs][:,None]
-        for hs, w in dwave.items():
-            tdata[hs] = binned[hs][None,:,:]
-            if (nleg > 0):
-                tdata[hs] = np.append(tdata[hs], legendre[hs].transpose()[None,:,:], axis=2)
-            nbasis = tdata[hs].shape[2]
             
         if per_camera:
             #Use CPU mode since small tdata
-            (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux, nleg, 1, method='bvls', n_nbh=n_nearest, prior=prior, use_gpu=False, bands=target.bands)
+            #CW - 4/19/24 - pass use_gpu and solver method to per_camera_coeff_with_least_square_batch
+            #it will decide there if narch == 1 to use CPU and it will calculate correct prior array
+            #CW - 5/2/24 - pass binned instead of tdata to put all BVLS/NNLS code into per_camera_coeff_with_least_square_batch
+            (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflux, nleg, 1, method=self._solver_method, n_nbh=n_nearest, prior_sigma=prior_sigma, use_gpu=use_gpu, bands=bands)
+            #(zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflux, nleg, 1, method=self._solver_method, n_nbh=n_nearest, prior_sigma=prior_sigma, use_gpu=use_gpu, ncam=ncam)
         else:
             #Use CPU mode for calc_zchi2 since small tdata
+            #Calculate tdata here because binned is passed to per_camera_coeff_with_least_square_batch
+            tdata = dict()
+            for hs, w in dwave.items():
+                tdata[hs] = binned[hs][None,:,:]
+                if (nleg > 0):
+                    tdata[hs] = np.append(tdata[hs], legendre[hs].transpose()[None,:,:], axis=2)
+                nbasis = tdata[hs].shape[2]
             (zzchi2, zzcoeff) = calc_zchi2_batch(spectra, tdata, weights, flux, wflux, 1, nbasis, use_gpu=False)
 
         sstype = ['%s'%(self._subtype[k]) for k in iBest] # subtypes of best archetypes
@@ -287,7 +296,7 @@ def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest,
         #print(z, zzchi2, zzcoeff, fsstype)
         return zzchi2[0], zzcoeff[0], make_fulltype(self._rrtype, fsstype)
 
-    def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nearest, trans=None, solve_method='bvls', prior=None, use_gpu=False):
+    def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nearest, trans=None, prior_sigma=None, use_gpu=False):
 
         """Get the best archetype for the given redshift and spectype.
 
@@ -301,8 +310,7 @@ def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nea
             per_camera (bool): True if fitting needs to be done in each camera
             n_nearest (int): number of nearest neighbours to be used in chi2 space (including best archetype)
             trans (dict): pass previously calcualated Lyman transmission instead of recalculating
-            solve_method (string): bvls or pca
-            prior (2d array): prior matrix on coefficients (1/sig**2)
+            prior_sigma (float): prior to add in the final solution matrix: added as 1/(prior_sigma**2) only for per-camera mode
             use_gpu (bool): use GPU or not
             
         Returns:
@@ -314,6 +322,7 @@ def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nea
         spectra = target.spectra
         nleg = target.nleg
         legendre = target.legendre(nleg=nleg, use_gpu=use_gpu) #Get previously calculated legendre
+        solve_method = self._solver_method #get solve method from archetype class instead of passing as arg
         bands = target.bands
 
         #Select np or cp for operations as arrtype
@@ -339,9 +348,7 @@ def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nea
         #new_keys = [wkeys[0], wkeys[2], wkeys[1]]
         #obs_wave = np.concatenate([dwave[key] for key in new_keys])
         
-        nleg = legendre[list(legendre.keys())[0]].shape[0]
-        zzchi2 = np.zeros(self._narch, dtype=np.float64)
-        zzcoeff = np.zeros((self._narch,  1+ncam*(nleg)), dtype=np.float64)
+        #nleg = legendre[list(legendre.keys())[0]].shape[0]
         
         #TODO: return best fit model as well
         #zzmodel = np.zeros((self._narch, obs_wave.size), dtype=np.float64)
@@ -358,41 +365,47 @@ def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nea
         #Rebin in batch
         binned = self.rebin_template_batch(z, dwave, trapz=True, dedges=dedges, use_gpu=use_gpu)
 
-        tdata = dict()
-        nbasis = 1
-        
         ## Prior must be redefined to remove nearest neighbour approach, 
         # because prior was defined based on n_nearest argument..
         # this logic is needed because the first fitting is done with just one archetype 
         #and then nearest neighbour approach is implemented
-        if n_nearest is not None:
-            nnearest_prior = prior.copy() # prior corresponding to nearest_nbh method
-            prior = prior[n_nearest-1:,][:,n_nearest-1:] # removing first rows/columns corresponding to the nearest_archetypes, and keeping just one row for one archetype
+
+        ##CW 04/19/24 - no need to redefine prior since it is now not calculated until per_camera_coeff_with_least_square_batch
+        #if n_nearest is not None:
+        #    nnearest_prior = prior.copy() # prior corresponding to nearest_nbh method
+        #    prior = prior[n_nearest-1:,][:,n_nearest-1:] # removing first rows/columns corresponding to the nearest_archetypes, and keeping just one row for one archetype
 
         for hs, wave in dwave.items():
             if (trans[hs] is not None):
                 #Only multiply if trans[hs] is not None
                 binned[hs] *= arrtype.asarray(trans[hs][:,None])
-            #Create 3-d tdata with narch x nwave x nbasis where nbasis = 1+nleg
-            if nleg > 0:
-                tdata[hs] = arrtype.append(binned[hs].transpose()[:,:,None], arrtype.tile(arrtype.asarray(legendre[hs]).transpose()[None,:,:], (self._narch, 1, 1)), axis=2)
-            else:
-                tdata[hs] = binned[hs].transpose()[:,:,None]
-            nbasis = tdata[hs].shape[2]
 
         if per_camera:
+            #Use per_camera_coeff_with_least_square_batch which has all logic associated with BVLS/NNLS solver methods
             if (use_gpu):
-                (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, tdata, gpuweights, gpuflux, gpuwflux, nleg, self._narch, method=solve_method, n_nbh=1, prior=prior, use_gpu=use_gpu, bands=bands)
+                (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, gpuweights, gpuflux, gpuwflux, nleg, self._narch, method=solve_method, n_nbh=1, prior_sigma=prior_sigma, use_gpu=use_gpu, bands=bands)
+                #(zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, gpuweights, gpuflux, gpuwflux, nleg, self._narch, method=solve_method, n_nbh=1, prior_sigma=prior_sigma, use_gpu=use_gpu, ncam=ncam)
             else:
-                (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux, nleg, self._narch, method=solve_method, n_nbh=1, prior=prior, use_gpu=use_gpu, bands=bands)
+                (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflux, nleg, self._narch, method=solve_method, n_nbh=1, prior_sigma=prior_sigma, use_gpu=use_gpu, bands=bands)
+                #(zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflux, nleg, self._narch, method=solve_method, n_nbh=1, prior_sigma=prior_sigma, use_gpu=use_gpu, ncam=ncam)
         else:
+            #Calculate tdata here because binned is passed to per_camera_coeff_with_least_square_batch
+            tdata = dict()
+            nbasis = 1
+            for hs, wave in dwave.items():
+                #Create 3-d tdata with narch x nwave x nbasis where nbasis = 1+nleg
+                if nleg > 0:
+                    tdata[hs] = arrtype.append(binned[hs].transpose()[:,:,None], arrtype.tile(arrtype.asarray(legendre[hs]).transpose()[None,:,:], (self._narch, 1, 1)), axis=2)
+                else:
+                    tdata[hs] = binned[hs].transpose()[:,:,None]
+                nbasis = tdata[hs].shape[2]
             if (use_gpu):
                 (zzchi2, zzcoeff) = calc_zchi2_batch(spectra, tdata, gpuweights, gpuflux, gpuwflux, self._narch, nbasis, use_gpu=use_gpu)
             else:
                 (zzchi2, zzcoeff) = calc_zchi2_batch(spectra, tdata, weights, flux, wflux, self._narch, nbasis, use_gpu=use_gpu)
 
         if n_nearest is not None:
-            best_chi2, best_coeff, best_fulltype = self.nearest_neighbour_model(target,weights,flux,wflux,dwave,z, n_nearest, zzchi2, trans, per_camera, dedges=dedges, binned=binned, use_gpu=use_gpu, prior=nnearest_prior, ncam=ncam)
+            best_chi2, best_coeff, best_fulltype = self.nearest_neighbour_model(target,weights,flux,wflux,dwave,z, n_nearest, zzchi2, trans, per_camera, dedges=dedges, binned=binned, use_gpu=use_gpu, prior_sigma=prior_sigma, ncam=ncam)
             #print(best_chi2, best_coeff, best_fulltype)
             return best_chi2, best_coeff, best_fulltype
         else:
diff --git a/py/redrock/fitz.py b/py/redrock/fitz.py
index 400f3b16..6a10d497 100644
--- a/py/redrock/fitz.py
+++ b/py/redrock/fitz.py
@@ -106,25 +106,13 @@ def minfit(x, y):
 
     return (x0, xerr, y0, zwarn)
 
-def prior_on_coeffs(n_nbh, deg_legendre, sigma, ncamera):
-    
-    """
-    Args:
-        n_nbh (int): number of dominant archetypes
-        deg_legendre (int): number of Legendre polynomials 
-        sigma (int): prior sigma to be used for archetype fitting
-        ncamera (int): number of cameras for given instrument
-    Returns:
-        2d array to be added while solving for archetype fitting
-
-    """
-    
-    nbasis = n_nbh+deg_legendre*ncamera # 3 desi cameras
-    prior = np.zeros((nbasis, nbasis), dtype='float64');np.fill_diagonal(prior, 1/(sigma**2))
-    for i in range(n_nbh):
-        prior[i][i]=0. ## Do not add prior to the archetypes, added only to the Legendre polynomials
-    return prior
+def legendre_calculate(nleg, dwave):
+    wave = np.concatenate([ w for w in dwave.values() ])
+    wmin = wave.min()
+    wmax = wave.max()
+    legendre = { hs:np.array([scipy.special.legendre(i)( (w-wmin)/(wmax-wmin)*2.) for i in range(nleg)]) for hs, w in dwave.items() }
 
+    return legendre
 
 def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=False, deg_legendre=None,
          zminfit_npoints=15, per_camera=False, n_nearest=None, prior_sigma=None):
@@ -293,6 +281,8 @@ def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=
                 coeff = np.zeros(template.nbasis)
                 zwarn |= ZW.Z_FITLIMIT
                 zwarn |= ZW.BAD_MINFIT
+                #Set trans to None so as not to pass an empty dict to archetypes! CW 2/26/24
+                trans = None
             else:
                 #- Unknown problem; re-raise error
                 raise err
@@ -331,12 +321,9 @@ def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=
                 else:
                     ncamera = 1
                 if n_nearest is None:
-                    prior = prior_on_coeffs(1, deg_legendre, prior_sigma, ncamera)
-                else:
-                    prior = prior_on_coeffs(n_nearest, deg_legendre, prior_sigma, ncamera)
-            else:
-                prior=None
-            chi2min, coeff, fulltype = archetype.get_best_archetype(target,weights,flux,wflux,dwave,zbest, per_camera, n_nearest, trans=trans, use_gpu=use_gpu, prior=prior)
+                    n_nearest = 1
+            #Pass prior_sigma as a scalar and move calculation of prior array to zscan.py in per_camera_coeff_with_least_square_batch
+            chi2min, coeff, fulltype = archetype.get_best_archetype(target,weights,flux,wflux,dwave,zbest, per_camera, n_nearest, trans=trans, use_gpu=use_gpu, prior_sigma=prior_sigma)
             del trans
 
             results.append(dict(z=zbest, zerr=zerr, zwarn=zwarn,
diff --git a/py/redrock/optimize/__init__.py b/py/redrock/optimize/__init__.py
new file mode 100644
index 00000000..acfc1e7c
--- /dev/null
+++ b/py/redrock/optimize/__init__.py
@@ -0,0 +1,5 @@
+"""This module contains a GPU implementation of the BVLS algorithm in
+    scipy.optimize.lsq_linear"""
+from .gpu_lsq_linear import gpu_lsq_linear
+
+__all__ = ['gpu_lsq_linear']
diff --git a/py/redrock/optimize/gpu_bvls.py b/py/redrock/optimize/gpu_bvls.py
new file mode 100644
index 00000000..74e7a784
--- /dev/null
+++ b/py/redrock/optimize/gpu_bvls.py
@@ -0,0 +1,258 @@
+"""GPU implementation of Bounded-variable least-squares algorithm."""
+import numpy as np
+import cupy as cp
+#from numpy.linalg import norm, lstsq
+from cupy.linalg import norm, lstsq
+from scipy.optimize import OptimizeResult
+
+from .gpu_common import print_header_linear, print_iteration_linear, compute_grad
+
+def compute_kkt_optimality(g, on_bound):
+    """Compute the maximum violation of KKT conditions."""
+    g_kkt = g * on_bound
+    free_set = on_bound == 0
+    g_kkt[free_set] = cp.abs(g[free_set])
+    return cp.max(g_kkt, axis=1)
+
+
+def bvls(A, b, x_lsq, lb, ub, ib, tol, max_iter, verbose, rcond=None, return_cost=True, return_optimality=True):
+    d, m, n = A.shape
+
+    #x = x_lsq.copy()
+    x = x_lsq
+    on_bound = cp.zeros((d,n), dtype=cp.int32)
+
+    mask = x <= lb
+    x[mask] = lb[mask]
+    on_bound[mask] = -1
+
+    mask = x >= ub
+    x[mask] = ub[mask]
+    on_bound[mask] = 1
+
+    free_set = on_bound == 0
+    active_set = ~free_set
+
+    if return_cost or return_optimality:
+        r = cp.einsum("ijk,ik->ij", A, x) - b
+        #r = A.dot(x) - b
+        cost = 0.5*cp.einsum("ij,ij->i", r, r)
+        #cost = 0.5 * np.dot(r, r)
+        initial_cost = cost
+        if return_optimality:
+            g = compute_grad(A,r)
+            #g = A.T.dot(r)
+    else:
+        cost = None
+        initial_cost = None
+
+    cost_change = None
+    step_norm = None
+    iteration = 0
+
+    if verbose == 2:
+        print_header_linear()
+
+    # This is the initialization loop. The requirement is that the
+    # least-squares solution on free variables is feasible before BVLS starts.
+    # One possible initialization is to set all variables to lower or upper
+    # bounds, but many iterations may be required from this state later on.
+    # The implemented ad-hoc procedure which intuitively should give a better
+    # initial state: find the least-squares solution on current free variables,
+    # if its feasible then stop, otherwise, set violating variables to
+    # corresponding bounds and continue on the reduced set of free variables.
+
+    while free_set.any():
+        if verbose == 2 and return_optimality:
+            optimality = compute_kkt_optimality(g, on_bound)
+            print_iteration_linear(iteration, cost, cost_change, step_norm,
+                                   optimality)
+
+        iteration += 1
+        if verbose == 2:
+            #Only need x_old if verbose == 2
+            x_old = x.copy()
+
+        b_free = b - cp.einsum("ijk,ik->ij", A, x*active_set)
+        A_free = A.swapaxes(1,2).copy()
+        A_free[active_set] = 0
+        A_free = A_free.swapaxes(1,2)
+        z = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A_free), b_free)
+
+        #A_free = A[:, free_set]
+        #b_free = b - A.dot(x * active_set)
+        #z = lstsq(A_free, b_free, rcond=rcond)[0]
+
+        lbv = (z < lb)*free_set
+        ubv = (z > ub)*free_set
+        #lbv = z < lb[free_set]
+        #ubv = z > ub[free_set]
+        v = lbv | ubv
+
+        if cp.any(lbv):
+            #ind = free_set[lbv]
+            #x[ind] = lb[ind]
+            x[lbv] = lb[lbv]
+            #active_set[ind] = True
+            active_set[lbv] = True
+            #on_bound[ind] = -1
+            on_bound[lbv] = -1
+
+        if cp.any(ubv):
+            #ind = free_set[ubv]
+            #x[ind] = ub[ind]
+            x[ubv] = ub[ubv]
+            #active_set[ind] = True
+            active_set[ubv] = True
+            #on_bound[ind] = 1
+            on_bound[ubv] = 1
+
+        #ind = free_set[~v]
+        ind = free_set*~v
+        #x[ind] = z[~v]
+        x[ind] = z[ind]
+
+        r = cp.einsum("ijk,ik->ij", A, x) - b
+        #r = A.dot(x) - b
+        #cost_new = 0.5 * np.dot(r, r)
+        cost_new = 0.5*cp.einsum("ij,ij->i", r, r)
+        if return_cost:
+            cost_change = cost - cost_new
+        cost = cost_new
+        #g = A.T.dot(r)
+        g = compute_grad(A,r)
+        if verbose == 2:
+            #step_norm = norm(x[free_set] - x_free_old)
+            step_norm = norm(x-x_old, axis=1)
+
+        if cp.any(v):
+            free_set[v] = False
+            #free_set = free_set[~v]
+        else:
+            break
+
+    if max_iter is None:
+        max_iter = n
+    max_iter += iteration
+
+    termination_status = None
+
+    # Main BVLS loop.
+    optimality = compute_kkt_optimality(g, on_bound)
+    idx = np.arange(d, dtype=np.int32)
+    for iteration in range(iteration, max_iter):  # BVLS Loop A
+        if verbose == 2 and return_optimality:
+            print_iteration_linear(iteration, cost, cost_change,
+                                   step_norm, optimality)
+
+        if cp.all(optimality < tol):
+            termination_status = 1
+
+        if termination_status is not None:
+            break
+
+        g_on = g * on_bound
+        move_to_free = cp.argmax(g_on, axis=1)
+        #on_bound[idx,move_to_free] = cp.minimum(0, on_bound[idx,move_to_free])
+        on_bound[idx,move_to_free] = (g_on[idx,move_to_free] < 0)*on_bound[idx,move_to_free]
+        
+        while True:   # BVLS Loop B
+            free_set = on_bound == 0
+            active_set = ~free_set
+    
+            if verbose == 2:
+                #Only need x_old if verbose == 2
+                x_old = x.copy()
+
+            b_free = b - cp.einsum("ijk,ik->ij", A, x*active_set)
+            #Set active_set to 0 in A_free
+            A_free = A.swapaxes(1,2).copy()
+            A_free[active_set] = 0
+            A_free = A_free.swapaxes(1,2)
+
+            z = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A_free), b_free)
+
+            #A_free = A[:, free_set]
+            #b_free = b - A.dot(x * active_set)
+            #z = lstsq(A_free, b_free, rcond=rcond)[0]
+
+            lbv = (z < lb)*free_set
+            ubv = (z > ub)*free_set
+            v = (lbv | ubv)
+
+            #lbv, = np.nonzero(z < lb_free)
+            #ubv, = np.nonzero(z > ub_free)
+            #v = np.hstack((lbv, ubv))
+
+            #if v.size > 0:
+            if v.any():
+                #alphas = cp.zeros(x.shape)
+                #av = v.any(axis=1)
+                alphas = cp.full(x.shape, cp.inf)
+                alphas[lbv] = (lb[lbv]-x[lbv]) / (z[lbv]-x[lbv])
+                alphas[ubv] = (ub[ubv]-x[ubv]) / (z[ubv]-x[ubv])
+
+                i = cp.argmin(alphas, axis=1)
+                alpha = alphas[idx,i].reshape((d,1))
+                #x_free = x[v] * (1-alpha[v])
+                #x_free += alpha[v]*z[v]
+
+                #x_free = x[v] * (1-alpha[av])
+                #x_free += alpha[av]*z[v]
+                #x[v] = x_free
+
+                x_free = x*(1-alpha)
+                x_free += alpha*z
+                x[v] = x_free[v]
+
+                #Update on_bound
+                on_bound[cp.where(lbv[idx,i]), i[cp.where(lbv[idx,i])]] = -1
+                on_bound[cp.where(ubv[idx,i]), i[cp.where(ubv[idx,i])]] = 1
+
+                #alphas = np.hstack((
+                #    lb_free[lbv] - x_free[lbv],
+                #    ub_free[ubv] - x_free[ubv])) / (z[v] - x_free[v])
+
+                #i = np.argmin(alphas)
+                #i_free = v[i]
+                #alpha = alphas[i]
+
+                #x_free *= 1 - alpha
+                #x_free += alpha * z
+                #x[free_set] = x_free
+
+                #if i < lbv.size:
+                #    on_bound[free_set[i_free]] = -1
+                #else:
+                #    on_bound[free_set[i_free]] = 1
+            else:
+                x_free = z[free_set]
+                x[free_set] = x_free
+                break
+
+        if verbose == 2:
+            #step_norm = norm(x_free - x_free_old, axis=1)
+            step_norm = norm(x - x_old, axis=1)
+
+        r = cp.einsum("ijk,ik->ij", A, x) - b
+        #r = A.dot(x) - b
+        #cost_new = 0.5 * np.dot(r, r)
+        cost_new = 0.5*cp.einsum("ij,ij->i", r, r)
+        cost_change = cost - cost_new
+
+        if cp.all(cost_change < tol * cost):
+            termination_status = 2
+        cost = cost_new
+
+        #g = A.T.dot(r)
+        g = compute_grad(A,r)
+        optimality = compute_kkt_optimality(g, on_bound)
+        x_lsq = x
+
+    if termination_status is None:
+        termination_status = 0
+
+    return OptimizeResult(
+        x=x_lsq, fun=r, cost=cost, optimality=optimality, active_mask=on_bound,
+        nit=iteration + 1, status=termination_status,
+        initial_cost=initial_cost)
diff --git a/py/redrock/optimize/gpu_common.py b/py/redrock/optimize/gpu_common.py
new file mode 100644
index 00000000..4333507f
--- /dev/null
+++ b/py/redrock/optimize/gpu_common.py
@@ -0,0 +1,77 @@
+"""Functions used by least-squares algorithms."""
+import numpy as np
+import cupy as cp
+
+from scipy.sparse.linalg import LinearOperator, aslinearoperator
+
+
+# Utility functions to work with bound constraints.
+def in_bounds(x, lb, ub, arrtype=None):
+    """Check if a point lies within bounds."""
+    if arrtype is None:
+        arrtype = cp
+    return arrtype.all((x >= lb) & (x <= ub),axis=1)
+
+def all_in_bounds(x, lb, ub):
+    """Check if a point lies within bounds."""
+    return cp.all((x >= lb) & (x <= ub))
+
+
+# Functions to display algorithm's progress.
+
+
+def print_header_nonlinear():
+    print("{:^15}{:^15}{:^15}{:^15}{:^15}{:^15}"
+          .format("Iteration", "Total nfev", "Cost", "Cost reduction",
+                  "Step norm", "Optimality"))
+
+
+def print_iteration_nonlinear(iteration, nfev, cost, cost_reduction,
+                              step_norm, optimality):
+    if cost_reduction is None:
+        cost_reduction = " " * 15
+    else:
+        cost_reduction = f"{cost_reduction:^15.2e}"
+
+    if step_norm is None:
+        step_norm = " " * 15
+    else:
+        step_norm = f"{step_norm:^15.2e}"
+
+    print("{:^15}{:^15}{:^15.4e}{}{}{:^15.2e}"
+          .format(iteration, nfev, cost, cost_reduction,
+                  step_norm, optimality))
+
+
+def print_header_linear():
+    print("{:^15}{:^15}{:^15}{:^15}{:^15}"
+          .format("Iteration", "Cost", "Cost reduction", "Step norm",
+                  "Optimality"))
+
+
+def print_iteration_linear(iteration, cost, cost_reduction, step_norm,
+                           optimality):
+    if cost_reduction is None:
+        cost_reduction = " " * 15
+    else:
+        cost_reduction = f"{cost_reduction:^15.2e}"
+
+    if step_norm is None:
+        step_norm = " " * 15
+    else:
+        step_norm = f"{step_norm:^15.2e}"
+
+    print(f"{iteration:^15}{cost:^15.4e}{cost_reduction}{step_norm}{optimality:^15.2e}")
+
+
+# Simple helper functions.
+
+
+def compute_grad(J, f):
+    """Compute gradient of the least-squares cost function."""
+    if isinstance(J, LinearOperator):
+        return J.rmatvec(f)
+    else:
+        return cp.einsum("ikj,ij->ik", J.swapaxes(1,2), f)
+        #return J.T.dot(f)
+
diff --git a/py/redrock/optimize/gpu_lsq_linear.py b/py/redrock/optimize/gpu_lsq_linear.py
new file mode 100644
index 00000000..0c8163f7
--- /dev/null
+++ b/py/redrock/optimize/gpu_lsq_linear.py
@@ -0,0 +1,404 @@
+"""GPU implementation of Linear least squares with bound constraints on independent variables."""
+import numpy as np
+#from numpy.linalg import norm
+from scipy.sparse import issparse, csr_matrix
+from scipy.sparse.linalg import LinearOperator, lsmr
+from scipy.optimize import OptimizeResult
+from scipy.optimize._minimize import Bounds
+
+import cupy as cp
+from cupy.linalg import norm, lstsq
+
+from .gpu_common import all_in_bounds, in_bounds, compute_grad
+from scipy.optimize._lsq.trf_linear import trf_linear
+from .gpu_bvls import bvls
+import time
+
+
+def prepare_bounds(bounds, n):
+    if len(bounds) != 2:
+        raise ValueError("`bounds` must contain 2 elements.")
+    lb, ub = (cp.asarray(b, dtype=float) for b in bounds)
+
+    if lb.ndim == 0:
+        lb = cp.resize(lb, n)
+
+    if ub.ndim == 0:
+        ub = cp.resize(ub, n)
+
+    return lb, ub
+
+
+TERMINATION_MESSAGES = {
+    -1: "The algorithm was not able to make progress on the last iteration.",
+    0: "The maximum number of iterations is exceeded.",
+    1: "The first-order optimality measure is less than `tol`.",
+    2: "The relative change of the cost function is less than `tol`.",
+    3: "The unconstrained solution is optimal."
+}
+
+
+def gpu_lsq_linear(A, b, bounds=(-cp.inf, cp.inf), method='trf', tol=1e-10,
+               lsq_solver=None, lsmr_tol=None, max_iter=None,
+               verbose=0, *, lsmr_maxiter=None, return_cost=True,
+               return_optimality=True, check_bounds=False):
+    r"""Solve a linear least-squares problem with bounds on the variables.
+
+    Given a m-by-n design matrix A and a target vector b with m elements,
+    `lsq_linear` solves the following optimization problem::
+
+        minimize 0.5 * ||A x - b||**2
+        subject to lb <= x <= ub
+
+    This optimization problem is convex, hence a found minimum (if iterations
+    have converged) is guaranteed to be global.
+
+    Parameters
+    ----------
+    A : array_like, sparse matrix of LinearOperator, shape (m, n)
+        Design matrix. Can be `scipy.sparse.linalg.LinearOperator`.
+    b : array_like, shape (m,)
+        Target vector.
+    bounds : 2-tuple of array_like or `Bounds`, optional
+        Lower and upper bounds on parameters. Defaults to no bounds.
+        There are two ways to specify the bounds:
+
+            - Instance of `Bounds` class.
+
+            - 2-tuple of array_like: Each element of the tuple must be either
+              an array with the length equal to the number of parameters, or a
+              scalar (in which case the bound is taken to be the same for all
+              parameters). Use ``np.inf`` with an appropriate sign to disable
+              bounds on all or some parameters.
+
+    method : 'trf' or 'bvls', optional
+        Method to perform minimization.
+
+            * 'trf' : Trust Region Reflective algorithm adapted for a linear
+              least-squares problem. This is an interior-point-like method
+              and the required number of iterations is weakly correlated with
+              the number of variables.
+            * 'bvls' : Bounded-variable least-squares algorithm. This is
+              an active set method, which requires the number of iterations
+              comparable to the number of variables. Can't be used when `A` is
+              sparse or LinearOperator.
+
+        Default is 'trf'.
+    tol : float, optional
+        Tolerance parameter. The algorithm terminates if a relative change
+        of the cost function is less than `tol` on the last iteration.
+        Additionally, the first-order optimality measure is considered:
+
+            * ``method='trf'`` terminates if the uniform norm of the gradient,
+              scaled to account for the presence of the bounds, is less than
+              `tol`.
+            * ``method='bvls'`` terminates if Karush-Kuhn-Tucker conditions
+              are satisfied within `tol` tolerance.
+
+    lsq_solver : {None, 'exact', 'lsmr'}, optional
+        Method of solving unbounded least-squares problems throughout
+        iterations:
+
+            * 'exact' : Use dense QR or SVD decomposition approach. Can't be
+              used when `A` is sparse or LinearOperator.
+            * 'lsmr' : Use `scipy.sparse.linalg.lsmr` iterative procedure
+              which requires only matrix-vector product evaluations. Can't
+              be used with ``method='bvls'``.
+
+        If None (default), the solver is chosen based on type of `A`.
+    lsmr_tol : None, float or 'auto', optional
+        Tolerance parameters 'atol' and 'btol' for `scipy.sparse.linalg.lsmr`
+        If None (default), it is set to ``1e-2 * tol``. If 'auto', the
+        tolerance will be adjusted based on the optimality of the current
+        iterate, which can speed up the optimization process, but is not always
+        reliable.
+    max_iter : None or int, optional
+        Maximum number of iterations before termination. If None (default), it
+        is set to 100 for ``method='trf'`` or to the number of variables for
+        ``method='bvls'`` (not counting iterations for 'bvls' initialization).
+    verbose : {0, 1, 2}, optional
+        Level of algorithm's verbosity:
+
+            * 0 : work silently (default).
+            * 1 : display a termination report.
+            * 2 : display progress during iterations.
+    lsmr_maxiter : None or int, optional
+        Maximum number of iterations for the lsmr least squares solver,
+        if it is used (by setting ``lsq_solver='lsmr'``). If None (default), it
+        uses lsmr's default of ``min(m, n)`` where ``m`` and ``n`` are the
+        number of rows and columns of `A`, respectively. Has no effect if
+        ``lsq_solver='exact'``.
+
+    return_cost : True = return cost function, False = do not to save time
+    return_optimality: True = return grad function, False = do not to save time
+    check_bounds: True = check that lb < ub; False = do not to save time
+
+    Returns
+    -------
+    OptimizeResult with the following fields defined:
+    x : ndarray, shape (n,)
+        Solution found.
+    cost : float
+        Value of the cost function at the solution.
+    fun : ndarray, shape (m,)
+        Vector of residuals at the solution.
+    optimality : float
+        First-order optimality measure. The exact meaning depends on `method`,
+        refer to the description of `tol` parameter.
+    active_mask : ndarray of int, shape (n,)
+        Each component shows whether a corresponding constraint is active
+        (that is, whether a variable is at the bound):
+
+            *  0 : a constraint is not active.
+            * -1 : a lower bound is active.
+            *  1 : an upper bound is active.
+
+        Might be somewhat arbitrary for the `trf` method as it generates a
+        sequence of strictly feasible iterates and active_mask is determined
+        within a tolerance threshold.
+    unbounded_sol : tuple
+        Unbounded least squares solution tuple returned by the least squares
+        solver (set with `lsq_solver` option). If `lsq_solver` is not set or is
+        set to ``'exact'``, the tuple contains an ndarray of shape (n,) with
+        the unbounded solution, an ndarray with the sum of squared residuals,
+        an int with the rank of `A`, and an ndarray with the singular values
+        of `A` (see NumPy's ``linalg.lstsq`` for more information). If
+        `lsq_solver` is set to ``'lsmr'``, the tuple contains an ndarray of
+        shape (n,) with the unbounded solution, an int with the exit code,
+        an int with the number of iterations, and five floats with
+        various norms and the condition number of `A` (see SciPy's
+        ``sparse.linalg.lsmr`` for more information). This output can be
+        useful for determining the convergence of the least squares solver,
+        particularly the iterative ``'lsmr'`` solver. The unbounded least
+        squares problem is to minimize ``0.5 * ||A x - b||**2``.
+    nit : int
+        Number of iterations. Zero if the unconstrained solution is optimal.
+    status : int
+        Reason for algorithm termination:
+
+            * -1 : the algorithm was not able to make progress on the last
+              iteration.
+            *  0 : the maximum number of iterations is exceeded.
+            *  1 : the first-order optimality measure is less than `tol`.
+            *  2 : the relative change of the cost function is less than `tol`.
+            *  3 : the unconstrained solution is optimal.
+
+    message : str
+        Verbal description of the termination reason.
+    success : bool
+        True if one of the convergence criteria is satisfied (`status` > 0).
+
+    See Also
+    --------
+    nnls : Linear least squares with non-negativity constraint.
+    least_squares : Nonlinear least squares with bounds on the variables.
+
+    Notes
+    -----
+    The algorithm first computes the unconstrained least-squares solution by
+    `numpy.linalg.lstsq` or `scipy.sparse.linalg.lsmr` depending on
+    `lsq_solver`. This solution is returned as optimal if it lies within the
+    bounds.
+
+    Method 'trf' runs the adaptation of the algorithm described in [STIR]_ for
+    a linear least-squares problem. The iterations are essentially the same as
+    in the nonlinear least-squares algorithm, but as the quadratic function
+    model is always accurate, we don't need to track or modify the radius of
+    a trust region. The line search (backtracking) is used as a safety net
+    when a selected step does not decrease the cost function. Read more
+    detailed description of the algorithm in `scipy.optimize.least_squares`.
+
+    Method 'bvls' runs a Python implementation of the algorithm described in
+    [BVLS]_. The algorithm maintains active and free sets of variables, on
+    each iteration chooses a new variable to move from the active set to the
+    free set and then solves the unconstrained least-squares problem on free
+    variables. This algorithm is guaranteed to give an accurate solution
+    eventually, but may require up to n iterations for a problem with n
+    variables. Additionally, an ad-hoc initialization procedure is
+    implemented, that determines which variables to set free or active
+    initially. It takes some number of iterations before actual BVLS starts,
+    but can significantly reduce the number of further iterations.
+
+    References
+    ----------
+    .. [STIR] M. A. Branch, T. F. Coleman, and Y. Li, "A Subspace, Interior,
+              and Conjugate Gradient Method for Large-Scale Bound-Constrained
+              Minimization Problems," SIAM Journal on Scientific Computing,
+              Vol. 21, Number 1, pp 1-23, 1999.
+    .. [BVLS] P. B. Start and R. L. Parker, "Bounded-Variable Least-Squares:
+              an Algorithm and Applications", Computational Statistics, 10,
+              129-141, 1995.
+
+    Examples
+    --------
+    In this example, a problem with a large sparse matrix and bounds on the
+    variables is solved.
+
+    >>> import numpy as np
+    >>> from scipy.sparse import rand
+    >>> from scipy.optimize import lsq_linear
+    >>> rng = np.random.default_rng()
+    ...
+    >>> m = 20000
+    >>> n = 10000
+    ...
+    >>> A = rand(m, n, density=1e-4, random_state=rng)
+    >>> b = rng.standard_normal(m)
+    ...
+    >>> lb = rng.standard_normal(n)
+    >>> ub = lb + 1
+    ...
+    >>> res = lsq_linear(A, b, bounds=(lb, ub), lsmr_tol='auto', verbose=1)
+    # may vary
+    The relative change of the cost function is less than `tol`.
+    Number of iterations 16, initial cost 1.5039e+04, final cost 1.1112e+04,
+    first-order optimality 4.66e-08.
+    """
+    tx1 = time.time()
+    if method not in ['trf', 'bvls']:
+        raise ValueError("`method` must be 'trf' or 'bvls'")
+
+    if lsq_solver not in [None, 'exact', 'lsmr']:
+        raise ValueError("`solver` must be None, 'exact' or 'lsmr'.")
+
+    if verbose not in [0, 1, 2]:
+        raise ValueError("`verbose` must be in [0, 1, 2].")
+
+    if issparse(A):
+        A = csr_matrix(A)
+    elif not isinstance(A, LinearOperator):
+        A = cp.atleast_2d(cp.asarray(A))
+
+    if method == 'bvls':
+        if lsq_solver == 'lsmr':
+            raise ValueError("method='bvls' can't be used with "
+                             "lsq_solver='lsmr'")
+
+        if not isinstance(A, cp.ndarray):
+            raise ValueError("method='bvls' can't be used with `A` being "
+                             "sparse or LinearOperator.")
+
+    if lsq_solver is None:
+        if isinstance(A, cp.ndarray):
+            lsq_solver = 'exact'
+        else:
+            lsq_solver = 'lsmr'
+    elif lsq_solver == 'exact' and not isinstance(A, cp.ndarray):
+        raise ValueError("`exact` solver can't be used when `A` is "
+                         "sparse or LinearOperator.")
+
+    if len(A.shape) != 3:  # No ndim for LinearOperator.
+        raise ValueError("`A` must have 3 dimensions.")
+
+    if max_iter is not None and max_iter <= 0:
+        raise ValueError("`max_iter` must be None or positive integer.")
+
+    z, m, n = A.shape
+
+    if b.ndim != 2:
+        raise ValueError("`b` must have at most 2 dimension.")
+
+    zb, mb = b.shape
+
+    if mb != m:
+        raise ValueError("Inconsistent shapes between `A` and `b`.")
+
+    if isinstance(bounds, Bounds):
+        lb = bounds.lb
+        ub = bounds.ub
+    else:
+        lb, ub = prepare_bounds(bounds, n)
+
+    if lb.shape != (n,) and ub.shape != (n,) and lb.shape != (z,n) and ub.shape != (z,n):
+        raise ValueError("Bounds have wrong shape.")
+
+    if check_bounds:
+        if cp.any(lb >= ub):
+            raise ValueError("Each lower bound must be strictly less than each "
+                         "upper bound.")
+
+    if lsmr_maxiter is not None and lsmr_maxiter < 1:
+        raise ValueError("`lsmr_maxiter` must be None or positive integer.")
+
+    if not ((isinstance(lsmr_tol, float) and lsmr_tol > 0) or
+            lsmr_tol in ('auto', None)):
+        raise ValueError("`lsmr_tol` must be None, 'auto', or positive float.")
+
+    if lsq_solver == 'exact':
+        #unbd_lsq = cp.linalg.lstsq(A, b, rcond=-1)
+        if m == n:
+            #square matrices
+            unbd_lsq = cp.linalg.solve(A,b)
+        else:
+            unbd_lsq = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A), b)
+    elif lsq_solver == 'lsmr':
+        raise NotImplementedError('lsmr is not yet implemetned') 
+        #first_lsmr_tol = lsmr_tol  # tol of first call to lsmr
+        #if lsmr_tol is None or lsmr_tol == 'auto':
+        #    first_lsmr_tol = 1e-2 * tol  # default if lsmr_tol not defined
+        #unbd_lsq = lsmr(A, b, maxiter=lsmr_maxiter,
+        #                atol=first_lsmr_tol, btol=first_lsmr_tol)
+    #x_lsq = unbd_lsq[0]  # extract the solution from the least squares solver
+    x_lsq = unbd_lsq
+
+    #Tile bounds if 1d arrays given
+    if (lb.shape == (n,)):
+        lb = cp.tile(lb, (z,1))
+    if (ub.shape == (n,)):
+        ub = cp.tile(ub, (z,1))
+    ib = in_bounds(x_lsq, lb, ub)
+
+    if ib.get().all():
+        #All solutions within bounds
+        termination_status = 3
+        termination_message = TERMINATION_MESSAGES[termination_status]
+        if (return_cost or return_optimality):
+            r = cp.einsum("ijk,ik->ij", A, x_lsq) - b
+            #r = A @ x_lsq - b
+            cost = 0.5*cp.einsum("ij,ij->i", r, r)
+            #cost = 0.5 * cp.dot(r, r)
+            if (return_optimality):
+                g = compute_grad(A, r)
+                g_norm = norm(g, ord=cp.inf, axis=1)
+            else:
+                g_norm = None
+        else:
+            cost=None
+            r = None
+            g_norm = None
+
+        if verbose > 0:
+            print(termination_message)
+            if (return_cost):
+                print(f"Final cost {cost:.4e}")
+            if (return_optimality):
+                print(f"first-order optimality {g_norm:.2e}")
+
+        return OptimizeResult(
+            x=x_lsq, fun=r, cost=cost, optimality=g_norm,
+            active_mask=np.zeros(n), unbounded_sol=unbd_lsq,
+            nit=0, status=termination_status,
+            message=termination_message, success=True)
+
+    if method == 'trf':
+        raise NotImplementedError('trf_linear is not yet implemented')
+        #res = trf_linear(A, b, x_lsq, lb, ub, tol, lsq_solver, lsmr_tol,
+        #                 max_iter, verbose, lsmr_maxiter=lsmr_maxiter)
+    elif method == 'bvls':
+        res = bvls(A, b, x_lsq, lb, ub, ib, tol, max_iter, verbose,
+               return_cost=return_cost, return_optimality=return_optimality)
+
+    res.unbounded_sol = unbd_lsq
+    res.message = TERMINATION_MESSAGES[res.status]
+    res.success = res.status > 0
+
+    if verbose > 0:
+        print(res.message)
+        print(
+            f"Number of iterations {res.nit}, initial cost {res.initial_cost:.4e}, "
+            f"final cost {res.cost:.4e}, first-order optimality {res.optimality:.2e}."
+        )
+
+    del res.initial_cost
+
+    return res
diff --git a/py/redrock/zfind.py b/py/redrock/zfind.py
index 913186b1..17318c36 100644
--- a/py/redrock/zfind.py
+++ b/py/redrock/zfind.py
@@ -29,7 +29,7 @@
 
 from .results import read_zscan_redrock
 
-from .zscan import calc_zchi2_targets
+from .zscan import calc_zchi2_targets, print_total_fitting_times
 
 from .fitz import fitz, get_dv
 
@@ -408,6 +408,7 @@ def zfind(targets, templates, mp_procs=1, nminima=3, archetypes=None, priors=Non
         results = [ results ]
 
     if am_root:
+        print_total_fitting_times() #print in root the total fitting times
         allresults = dict()
         for p in results:
             allresults.update(p)
diff --git a/py/redrock/zscan.py b/py/redrock/zscan.py
index 0b1fee29..ffd59728 100644
--- a/py/redrock/zscan.py
+++ b/py/redrock/zscan.py
@@ -11,7 +11,7 @@
 import sys
 import traceback
 import numpy as np
-from scipy.optimize import lsq_linear, nnls
+from scipy.optimize import nnls
 
 #try:
 #    import cupy as cp
@@ -35,6 +35,11 @@
 block_size = 512 #Default block size, should work on all modern nVidia GPUs
 
 # cuda_source contains raw CUDA kernels to be loaded as CUPY module
+gbounds = None
+gnbh = None
+
+ftimes = np.zeros(3)
+fmethods = ["PCA", "NNLS", "BVLS"]
 
 cuda_source = r'''
         extern "C" {
@@ -238,8 +243,41 @@ def calc_zchi2_one(spectra, weights, flux, wflux, tdata, solve_matrices_algorith
 
     return zchi2, zcoeff
 
-def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux, nleg, narch, method=None, n_nbh=None, prior=None, use_gpu=False, bands=None):
-    
+def prior_on_coeffs(n_nbh, deg_legendre, sigma, ncamera, add_negative_legendre_terms=False):
+
+    """
+    Args:
+        n_nbh (int): number of dominant archetypes
+        deg_legendre (int): number of Legendre polynomials
+        sigma (int): prior sigma to be used for archetype fitting
+        ncamera (int): number of cameras for given instrument
+        add_negative_legendre_terms (bool): whether to adjust prior
+            to account for negative legendre terms
+    Returns:
+        2d array to be added while solving for archetype fitting
+
+    """
+
+    tot_legendre_terms = deg_legendre
+    if add_negative_legendre_terms:
+        #Double total legendre term count to reflect negative terms
+        tot_legendre_terms = deg_legendre*2
+    nbasis = n_nbh+tot_legendre_terms*ncamera # 3 desi cameras
+    prior = np.zeros((nbasis, nbasis), dtype='float64');np.fill_diagonal(prior, 1/(sigma**2))
+    for i in range(n_nbh):
+        prior[i][i]=0. ## Do not add prior to the archetypes, added only to the Legendre polynomials
+
+    if add_negative_legendre_terms:
+        #Add cross terms to prior array
+        for i in range(ncamera):
+            for j in range(deg_legendre):
+                idx1 = n_nbh+i*tot_legendre_terms+j
+                idx2 = n_nbh+i*tot_legendre_terms+deg_legendre+j
+                prior[idx1][idx2] = -prior[idx1][idx1]
+                prior[idx2][idx1] = -prior[idx2][idx2]
+    return prior
+
+def per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflux, nleg, narch, method=None, n_nbh=None, prior_sigma=None, use_gpu=False, bands=None):
     """This function calculates coefficients for archetype mode in each camera using normal linear algebra matrix solver or BVLS (bounded value least square) method
 
     BVLS described in : https://www.stat.berkeley.edu/~stark/Preprints/bvls.pdf
@@ -248,13 +286,13 @@ def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux
 
     Args:
         target (object): target object
-        tdata (dict): template data for model fit for ALL archetypes
+        binned (dict): template data for model fit for ALL archetypes
         weights (array): concatenated spectral weights (ivar).
         flux (array): concatenated flux values.
         wflux (array): concatenated weighted flux values.
         nleg (int): number of Legendre polynomials
         narch (int): number of archetypes
-        method (string): 'PCA', 'BVLS', 'NMF', or 'NNLS' (same as NMF)
+        method (string): 'PCA', 'BVLS', 'NMF', or 'NNLS' (same as NMF), or 'BVLS_VIA_NNLS'
         n_nbh (int): number of nearest best archetypes
         prior (array): prior matrix added to the Legendre coefficients (1/sigma^2)
         use_gpu (bool): use GPU or not
@@ -264,12 +302,14 @@ def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux
         coefficients and chi2
 
     """
-
-    ### TODO - implement BVLS on GPU
+    #Use globals gbounds and gnbh
+    global gbounds
+    global gnbh
     # number of cameras in DESI: b, r, z
     spectra = target.spectra
+    if bands is None:
+        bands = ['b', 'z', 'r']
     ncam = len(bands)
-    nbasis = n_nbh+nleg*ncam # n_nbh : for actual physical archetype(s), nleg: number of legendre polynomials, ncamera: number of cameras
 
     ret_zcoeff = {}
     ret_zcoeff['alpha'] = []
@@ -279,15 +319,84 @@ def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux
 
     new_bands = sorted(bands) # saves as correct order
 
-    #Setup dict of solver args to pass bounds to solver
     method = method.upper()
+    add_negative_legendre_terms = False
+    if (method == 'BVLS_VIA_NNLS' and use_gpu):
+        #Use trick of adding negative legendre terms and using NNLS
+        #to implement BVLS
+        add_negative_legendre_terms = True
+
+    #Calculate prior here
+    prior = prior_on_coeffs(n_nbh, nleg, prior_sigma, ncam, add_negative_legendre_terms)
+    legendre = target.legendre(nleg=nleg, use_gpu=False) #Get previously calculated legendre
+
+    if add_negative_legendre_terms:
+        #Hard-code NNLS with additional legendre terms if BVLS AND gpu mode
+        method = 'NNLS'
+        #tdata should already have the additional negative legendre terms
+        nleg *= 2
+        if narch == 1:
+            #Use CPU if only 1 narch (nearest neighbor)
+            use_gpu = False
+
+    if narch == 1:
+        use_gpu = False
+
+    #Calculate nbasis after accounting for legendre terms
+    nbasis = n_nbh+nleg*ncam # n_nbh : for actual physical archetype(s), nleg: number of legendre polynomials, ncamera: number of cameras
+
+    #Calculate tdata arrays here instead of in archetypes
+    #Select np or cp for operations as arrtype
+    if (use_gpu):
+        import cupy as cp
+        arrtype = cp
+    else:
+        arrtype = np
+    tdata = dict()
+    for hs in binned: 
+        #Create 3-d tdata with narch x nwave x nbasis where nbasis = 1+nleg
+        if narch == 1:
+            tdata[hs] = binned[hs][None,:,:]
+            if nleg > 0:
+                tdata[hs] = np.append(tdata[hs], legendre[hs].transpose()[None,:,:], axis=2)
+                if add_negative_legendre_terms:
+                #if method == 'BVLS' and use_gpu:
+                    #Using BVLS and GPU - append negative copies of legendre terms as well to use NNLS with additional bases to approximate BVLS
+                    tdata[hs] = np.append(tdata[hs], -1*legendre[hs].transpose()[None,:,:], axis=2)
+        else:
+            if nleg > 0:
+                tdata[hs] = arrtype.append(binned[hs].transpose()[:,:,None], arrtype.tile(arrtype.asarray(legendre[hs]).transpose()[None,:,:], (narch, 1, 1)), axis=2)
+                if add_negative_legendre_terms:
+                #if method == 'BVLS' and use_gpu:
+                    #Using BVLS - append negative copies of legendre terms as well to use NNLS with additional bases to approximate BVLS
+                    tdata[hs] = arrtype.append(tdata[hs], arrtype.tile(-1*arrtype.asarray(legendre[hs]).transpose()[None,:,:], (narch, 1, 1)), axis=2)
+            else:
+                tdata[hs] = binned[hs].transpose()[:,:,None]
+
+    #Setup dict of solver args to pass bounds to solver
     solver_args = dict()
     if (method == 'BVLS'):
         #only positive coefficients are allowed for the archetypes
-        bounds = np.zeros((2, nbasis))
-        bounds[0][n_nbh:]=-np.inf #constant and slope terms in archetype method (can be positive or negative)
-        bounds[1] = np.inf
-        solver_args['bounds'] = bounds
+        if use_gpu:
+            #bounds should always be the same and creating small arrays on
+            #GPU is expensive so use a global variable and create once
+            if gbounds is None:
+                gbounds = cp.zeros((2,nbasis))
+                gbounds[0][n_nbh:] = -cp.inf
+                gbounds[1] = cp.inf
+                gnbh = n_nbh
+            #In practice right now n_nbh will not change on any given run
+            #But future proofing just in case
+            if gnbh != n_nbh:
+                gbounds[0][:n_nbh] = 0
+                gbounds[0][n_nbh:] = -cp.inf
+            solver_args['bounds'] = gbounds
+        else:
+            #On CPU we can create bounds array every time cheaply
+            bounds = np.zeros((2, nbasis))
+            bounds[0][n_nbh:]=-np.inf #constant and slope terms in archetype method (can be positive or negative)
+            bounds[1] = np.inf
+            solver_args['bounds'] = bounds
 
     #Use branching options because GPU is faster in batch in 3d
     #but due to timing weirdness in numpy, CPU is faster looping over
@@ -317,6 +426,8 @@ def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux
                         tdata2[hs][:,k+nleg*i] = tdata[hs][j,:,k] # Legendre polynomials terms
                 tdata2[hs] = tdata2[hs][None,:,:]
             zzchi2[j], zzcoeff[j] = calc_zchi2_batch(spectra, tdata2, weights, flux, wflux, 1, nbasis, solve_matrices_algorithm=method, solver_args=solver_args, prior=prior, use_gpu=use_gpu)
+    if (add_negative_legendre_terms):
+        zzcoeff = remove_extra_legendre_terms(zzcoeff, n_nbh, ncam, nleg)
 
     # saving leading archetype coefficients in correct order
     ret_zcoeff['alpha'] = [zzcoeff[:,k] for k in range(n_nbh)] # archetype coefficient(s)
@@ -337,6 +448,15 @@ def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux
     #print(f'{time.time()-start} [sec] took for per camera BVLS method\n')
     return zzchi2, coeff
 
+def remove_extra_legendre_terms(zzcoeff, n_nbh, ncam, nleg):
+    nleg = nleg // 2
+    zzcoeff2 = np.zeros_like(zzcoeff, shape=(zzcoeff.shape[0], n_nbh+ncam*(nleg)))
+    zzcoeff2[:,:n_nbh] = zzcoeff[:,:n_nbh]
+    for j in range(ncam):
+        for l in range(nleg):
+            zzcoeff2[:,n_nbh+nleg*j+l] = zzcoeff[:,n_nbh+nleg*j*2+l] - zzcoeff[:,n_nbh+nleg*j*2+nleg+l]
+    return zzcoeff2
+
 def batch_dot_product_sparse(spectra, tdata, nz, use_gpu):
     """Calculate a batch dot product of the 3 sparse matrices in spectra
     with every template in tdata.  Sparse matrix libraries are used
@@ -1117,20 +1237,24 @@ def solve_matrices(M, y, solve_algorithm="PCA", solver_args=None, use_gpu=False)
 
     if solve_algorithm.upper() == "PCA":
         #Use PCA via linalg.solve in either numpy or cupy
+        tx = time.time()
         if (use_gpu):
             #Use cupy linalg.solve to solve for zcoeff in batch for all_M and
             #all_y where all_M and all_y are 3d and 2d arrays representing
             #M and y at every redshift bin for the given template.
             #There is no Error thrown by cupy's version.
-            return cp.linalg.solve(M, y)
+            zcoeff = cp.linalg.solve(M, y)
         else:
             #Use numpy linalg.solve which throws exception
             try:
-                return np.linalg.solve(M, y)
+                zcoeff = np.linalg.solve(M, y)
             except np.linalg.LinAlgError:
                 raise
+        add_to_timer(0, time.time()-tx)
+        return zcoeff
     elif solve_algorithm in ("NMF", "NNLS"):
         if (use_gpu):
+            tx = time.time()
             nz = y.shape[0]
             Mcpu = M.get()
             ycpu = y.get()
@@ -1142,41 +1266,44 @@ def solve_matrices(M, y, solve_algorithm="PCA", solver_args=None, use_gpu=False)
                     zcoeff[j,:] = res[0]
                 except Exception:
                     zcoeff[j,:] = 9e99
+            add_to_timer(1, time.time()-tx)
             return cp.asarray(zcoeff)
         else:
             try:
+                tx = time.time()
                 res = nnls(M, y)
                 zcoeff = res[0]
-            except Exception:
+                add_to_timer(1, time.time()-tx)
+            except Exception as ex:
                 raise np.linalg.LinAlgError
             return zcoeff
     elif solve_algorithm == "BVLS":
+        tx = time.time()
         if (solver_args is not None and 'bounds' in solver_args):
             bounds = solver_args['bounds']
         else:
             nbasis = y.shape[-1]
-            bounds = np.zeros((2, nbasis))
+            bounds = np.zeros_like(y, shape=(2, nbasis))
             bounds[0]=-np.inf
             bounds[1]=np.inf
         if (use_gpu):
-            nz = y.shape[0]
-            Mcpu = M.get()
-            ycpu = y.get()
-            zcoeff = np.zeros(y.shape)
-            #Copy to CPU, run scipy.optimize.lsq_linear, copy back to GPU
-            for j in range(nz):
-                try:
-                    res = lsq_linear(Mcpu[j,:,:], ycpu[j,:], bounds=bounds, method='bvls')
-                    zcoeff[j,:] = res.x
-                except np.linalg.LinAlgError:
-                    zcoeff[j,:] = 9e99
-            return cp.asarray(zcoeff)
+            from .optimize import gpu_lsq_linear as lsq_linear
+            lsq_opts = dict(return_cost=False, return_optimality=False)
         else:
-            try:
-                res = lsq_linear(M, y, bounds=bounds, method='bvls')
-                zcoeff = res.x
-            except np.linalg.LinAlgError:
-                raise
-            return zcoeff
+            from scipy.optimize import lsq_linear
+            lsq_opts = dict()
+        res = lsq_linear(M, y, bounds=bounds, method='bvls', **lsq_opts)
+        zcoeff = res.x
+        add_to_timer(2, time.time()-tx)
+        return zcoeff
     else:
         raise NotImplementedError("The solve_algorithm "+solve_algorithm+" is not implemented.")
+
+def add_to_timer(i, t):
+    ftimes[i] += t
+
+def print_total_fitting_times():
+    for j in range(len(ftimes)):
+        if ftimes[j] > 0:
+            print("Total fitting time (method {}): {:0.1f} seconds".format(fmethods[j], ftimes[j]))
+    sys.stdout.flush()
diff --git a/requirements.txt b/requirements.txt
index 2197f552..748dfb8b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 # Based on desimodules/22.2.
+numpy<2.0
 astropy
 scipy
 h5py