diff --git a/py/redrock/archetypes.py b/py/redrock/archetypes.py index e9943b4e..d94fd225 100644 --- a/py/redrock/archetypes.py +++ b/py/redrock/archetypes.py @@ -69,6 +69,9 @@ def __init__(self, filename): # TODO: Allow Archetype files to specify their IGM model self.igm_model = 'Inoue14' + # TODO: Allow Archetype files to specify bvls or nnls or pca solver method + self._solver_method = 'bvls' + #self._solver_method = 'bvls_via_nnls' #This uses the NNLS trick to emulate BVLS self.method = 'ARCH' # for API symmetry with Template.method @@ -217,7 +220,7 @@ def eval(self, subtype, coeff, wave, z, R=None, legcoeff=None): return model - def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest, zzchi2, trans, per_camera, dedges=None, binned=None, use_gpu=False, prior=None, ncam=None): + def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest, zzchi2, trans, per_camera, dedges=None, binned=None, use_gpu=False, prior_sigma=None, ncam=None): """Nearest neighbour archetype approach; fitting with a combinating of nearest archetypes in chi2 space @@ -235,7 +238,7 @@ def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest, dedges (dict): in GPU mode, use pre-computed dict of wavelength bin edges, already on GPU binned (dict): already computed dictionary of rebinned fluxes use_gpu (bool): use GPU or not - prior (2d array): prior matrix on coefficients (1/sig**2) + prior_sigma (float): prior to add in the final solution matrix: added as 1/(prior_sigma**2) only for per-camera mode ncam (int): Number of camera for given Instrument Returns: @@ -249,11 +252,11 @@ def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest, #is multiplied by trans in get_best_archetype spectra = target.spectra nleg = target.nleg + bands = None legendre = target.legendre(nleg=nleg, use_gpu=False) #Get previously calculated legendre nleg = legendre[list(legendre.keys())[0]].shape[0] iBest = np.argsort(zzchi2)[0:n_nearest] - tdata = dict() if (binned is not None): if (use_gpu): binned = { hs:binned[hs][:,iBest].get() for hs in binned } @@ -268,17 +271,23 @@ def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest, #Only multiply if trans[hs] is not None #Both arrays are on CPU so no need to wrap with asarray binned[hs] *= trans[hs][:,None] - for hs, w in dwave.items(): - tdata[hs] = binned[hs][None,:,:] - if (nleg > 0): - tdata[hs] = np.append(tdata[hs], legendre[hs].transpose()[None,:,:], axis=2) - nbasis = tdata[hs].shape[2] if per_camera: #Use CPU mode since small tdata - (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux, nleg, 1, method='bvls', n_nbh=n_nearest, prior=prior, use_gpu=False, bands=target.bands) + #CW - 4/19/24 - pass use_gpu and solver method to per_camera_coeff_with_least_square_batch + #it will decide there if narch == 1 to use CPU and it will calculate correct prior array + #CW - 5/2/24 - pass binned instead of tdata to put all BVLS/NNLS code into per_camera_coeff_with_least_square_batch + (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflux, nleg, 1, method=self._solver_method, n_nbh=n_nearest, prior_sigma=prior_sigma, use_gpu=use_gpu, bands=bands) + #(zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflux, nleg, 1, method=self._solver_method, n_nbh=n_nearest, prior_sigma=prior_sigma, use_gpu=use_gpu, ncam=ncam) else: #Use CPU mode for calc_zchi2 since small tdata + #Calculate tdata here because binned is passed to per_camera_coeff_with_least_square_batch + tdata = dict() + for hs, w in dwave.items(): + tdata[hs] = binned[hs][None,:,:] + if (nleg > 0): + tdata[hs] = np.append(tdata[hs], legendre[hs].transpose()[None,:,:], axis=2) + nbasis = tdata[hs].shape[2] (zzchi2, zzcoeff) = calc_zchi2_batch(spectra, tdata, weights, flux, wflux, 1, nbasis, use_gpu=False) sstype = ['%s'%(self._subtype[k]) for k in iBest] # subtypes of best archetypes @@ -287,7 +296,7 @@ def nearest_neighbour_model(self, target,weights,flux,wflux,dwave,z, n_nearest, #print(z, zzchi2, zzcoeff, fsstype) return zzchi2[0], zzcoeff[0], make_fulltype(self._rrtype, fsstype) - def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nearest, trans=None, solve_method='bvls', prior=None, use_gpu=False): + def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nearest, trans=None, prior_sigma=None, use_gpu=False): """Get the best archetype for the given redshift and spectype. @@ -301,8 +310,7 @@ def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nea per_camera (bool): True if fitting needs to be done in each camera n_nearest (int): number of nearest neighbours to be used in chi2 space (including best archetype) trans (dict): pass previously calcualated Lyman transmission instead of recalculating - solve_method (string): bvls or pca - prior (2d array): prior matrix on coefficients (1/sig**2) + prior_sigma (float): prior to add in the final solution matrix: added as 1/(prior_sigma**2) only for per-camera mode use_gpu (bool): use GPU or not Returns: @@ -314,6 +322,7 @@ def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nea spectra = target.spectra nleg = target.nleg legendre = target.legendre(nleg=nleg, use_gpu=use_gpu) #Get previously calculated legendre + solve_method = self._solver_method #get solve method from archetype class instead of passing as arg bands = target.bands #Select np or cp for operations as arrtype @@ -339,9 +348,7 @@ def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nea #new_keys = [wkeys[0], wkeys[2], wkeys[1]] #obs_wave = np.concatenate([dwave[key] for key in new_keys]) - nleg = legendre[list(legendre.keys())[0]].shape[0] - zzchi2 = np.zeros(self._narch, dtype=np.float64) - zzcoeff = np.zeros((self._narch, 1+ncam*(nleg)), dtype=np.float64) + #nleg = legendre[list(legendre.keys())[0]].shape[0] #TODO: return best fit model as well #zzmodel = np.zeros((self._narch, obs_wave.size), dtype=np.float64) @@ -358,41 +365,47 @@ def get_best_archetype(self,target,weights,flux,wflux,dwave,z, per_camera, n_nea #Rebin in batch binned = self.rebin_template_batch(z, dwave, trapz=True, dedges=dedges, use_gpu=use_gpu) - tdata = dict() - nbasis = 1 - ## Prior must be redefined to remove nearest neighbour approach, # because prior was defined based on n_nearest argument.. # this logic is needed because the first fitting is done with just one archetype #and then nearest neighbour approach is implemented - if n_nearest is not None: - nnearest_prior = prior.copy() # prior corresponding to nearest_nbh method - prior = prior[n_nearest-1:,][:,n_nearest-1:] # removing first rows/columns corresponding to the nearest_archetypes, and keeping just one row for one archetype + + ##CW 04/19/24 - no need to redefine prior since it is now not calculated until per_camera_coeff_with_least_square_batch + #if n_nearest is not None: + # nnearest_prior = prior.copy() # prior corresponding to nearest_nbh method + # prior = prior[n_nearest-1:,][:,n_nearest-1:] # removing first rows/columns corresponding to the nearest_archetypes, and keeping just one row for one archetype for hs, wave in dwave.items(): if (trans[hs] is not None): #Only multiply if trans[hs] is not None binned[hs] *= arrtype.asarray(trans[hs][:,None]) - #Create 3-d tdata with narch x nwave x nbasis where nbasis = 1+nleg - if nleg > 0: - tdata[hs] = arrtype.append(binned[hs].transpose()[:,:,None], arrtype.tile(arrtype.asarray(legendre[hs]).transpose()[None,:,:], (self._narch, 1, 1)), axis=2) - else: - tdata[hs] = binned[hs].transpose()[:,:,None] - nbasis = tdata[hs].shape[2] if per_camera: + #Use per_camera_coeff_with_least_square_batch which has all logic associated with BVLS/NNLS solver methods if (use_gpu): - (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, tdata, gpuweights, gpuflux, gpuwflux, nleg, self._narch, method=solve_method, n_nbh=1, prior=prior, use_gpu=use_gpu, bands=bands) + (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, gpuweights, gpuflux, gpuwflux, nleg, self._narch, method=solve_method, n_nbh=1, prior_sigma=prior_sigma, use_gpu=use_gpu, bands=bands) + #(zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, gpuweights, gpuflux, gpuwflux, nleg, self._narch, method=solve_method, n_nbh=1, prior_sigma=prior_sigma, use_gpu=use_gpu, ncam=ncam) else: - (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux, nleg, self._narch, method=solve_method, n_nbh=1, prior=prior, use_gpu=use_gpu, bands=bands) + (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflux, nleg, self._narch, method=solve_method, n_nbh=1, prior_sigma=prior_sigma, use_gpu=use_gpu, bands=bands) + #(zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflux, nleg, self._narch, method=solve_method, n_nbh=1, prior_sigma=prior_sigma, use_gpu=use_gpu, ncam=ncam) else: + #Calculate tdata here because binned is passed to per_camera_coeff_with_least_square_batch + tdata = dict() + nbasis = 1 + for hs, wave in dwave.items(): + #Create 3-d tdata with narch x nwave x nbasis where nbasis = 1+nleg + if nleg > 0: + tdata[hs] = arrtype.append(binned[hs].transpose()[:,:,None], arrtype.tile(arrtype.asarray(legendre[hs]).transpose()[None,:,:], (self._narch, 1, 1)), axis=2) + else: + tdata[hs] = binned[hs].transpose()[:,:,None] + nbasis = tdata[hs].shape[2] if (use_gpu): (zzchi2, zzcoeff) = calc_zchi2_batch(spectra, tdata, gpuweights, gpuflux, gpuwflux, self._narch, nbasis, use_gpu=use_gpu) else: (zzchi2, zzcoeff) = calc_zchi2_batch(spectra, tdata, weights, flux, wflux, self._narch, nbasis, use_gpu=use_gpu) if n_nearest is not None: - best_chi2, best_coeff, best_fulltype = self.nearest_neighbour_model(target,weights,flux,wflux,dwave,z, n_nearest, zzchi2, trans, per_camera, dedges=dedges, binned=binned, use_gpu=use_gpu, prior=nnearest_prior, ncam=ncam) + best_chi2, best_coeff, best_fulltype = self.nearest_neighbour_model(target,weights,flux,wflux,dwave,z, n_nearest, zzchi2, trans, per_camera, dedges=dedges, binned=binned, use_gpu=use_gpu, prior_sigma=prior_sigma, ncam=ncam) #print(best_chi2, best_coeff, best_fulltype) return best_chi2, best_coeff, best_fulltype else: diff --git a/py/redrock/fitz.py b/py/redrock/fitz.py index 400f3b16..6a10d497 100644 --- a/py/redrock/fitz.py +++ b/py/redrock/fitz.py @@ -106,25 +106,13 @@ def minfit(x, y): return (x0, xerr, y0, zwarn) -def prior_on_coeffs(n_nbh, deg_legendre, sigma, ncamera): - - """ - Args: - n_nbh (int): number of dominant archetypes - deg_legendre (int): number of Legendre polynomials - sigma (int): prior sigma to be used for archetype fitting - ncamera (int): number of cameras for given instrument - Returns: - 2d array to be added while solving for archetype fitting - - """ - - nbasis = n_nbh+deg_legendre*ncamera # 3 desi cameras - prior = np.zeros((nbasis, nbasis), dtype='float64');np.fill_diagonal(prior, 1/(sigma**2)) - for i in range(n_nbh): - prior[i][i]=0. ## Do not add prior to the archetypes, added only to the Legendre polynomials - return prior +def legendre_calculate(nleg, dwave): + wave = np.concatenate([ w for w in dwave.values() ]) + wmin = wave.min() + wmax = wave.max() + legendre = { hs:np.array([scipy.special.legendre(i)( (w-wmin)/(wmax-wmin)*2.) for i in range(nleg)]) for hs, w in dwave.items() } + return legendre def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=False, deg_legendre=None, zminfit_npoints=15, per_camera=False, n_nearest=None, prior_sigma=None): @@ -293,6 +281,8 @@ def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu= coeff = np.zeros(template.nbasis) zwarn |= ZW.Z_FITLIMIT zwarn |= ZW.BAD_MINFIT + #Set trans to None so as not to pass an empty dict to archetypes! CW 2/26/24 + trans = None else: #- Unknown problem; re-raise error raise err @@ -331,12 +321,9 @@ def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu= else: ncamera = 1 if n_nearest is None: - prior = prior_on_coeffs(1, deg_legendre, prior_sigma, ncamera) - else: - prior = prior_on_coeffs(n_nearest, deg_legendre, prior_sigma, ncamera) - else: - prior=None - chi2min, coeff, fulltype = archetype.get_best_archetype(target,weights,flux,wflux,dwave,zbest, per_camera, n_nearest, trans=trans, use_gpu=use_gpu, prior=prior) + n_nearest = 1 + #Pass prior_sigma as a scalar and move calculation of prior array to zscan.py in per_camera_coeff_with_least_square_batch + chi2min, coeff, fulltype = archetype.get_best_archetype(target,weights,flux,wflux,dwave,zbest, per_camera, n_nearest, trans=trans, use_gpu=use_gpu, prior_sigma=prior_sigma) del trans results.append(dict(z=zbest, zerr=zerr, zwarn=zwarn, diff --git a/py/redrock/optimize/__init__.py b/py/redrock/optimize/__init__.py new file mode 100644 index 00000000..acfc1e7c --- /dev/null +++ b/py/redrock/optimize/__init__.py @@ -0,0 +1,5 @@ +"""This module contains a GPU implementation of the BVLS algorithm in + scipy.optimize.lsq_linear""" +from .gpu_lsq_linear import gpu_lsq_linear + +__all__ = ['gpu_lsq_linear'] diff --git a/py/redrock/optimize/gpu_bvls.py b/py/redrock/optimize/gpu_bvls.py new file mode 100644 index 00000000..74e7a784 --- /dev/null +++ b/py/redrock/optimize/gpu_bvls.py @@ -0,0 +1,258 @@ +"""GPU implementation of Bounded-variable least-squares algorithm.""" +import numpy as np +import cupy as cp +#from numpy.linalg import norm, lstsq +from cupy.linalg import norm, lstsq +from scipy.optimize import OptimizeResult + +from .gpu_common import print_header_linear, print_iteration_linear, compute_grad + +def compute_kkt_optimality(g, on_bound): + """Compute the maximum violation of KKT conditions.""" + g_kkt = g * on_bound + free_set = on_bound == 0 + g_kkt[free_set] = cp.abs(g[free_set]) + return cp.max(g_kkt, axis=1) + + +def bvls(A, b, x_lsq, lb, ub, ib, tol, max_iter, verbose, rcond=None, return_cost=True, return_optimality=True): + d, m, n = A.shape + + #x = x_lsq.copy() + x = x_lsq + on_bound = cp.zeros((d,n), dtype=cp.int32) + + mask = x <= lb + x[mask] = lb[mask] + on_bound[mask] = -1 + + mask = x >= ub + x[mask] = ub[mask] + on_bound[mask] = 1 + + free_set = on_bound == 0 + active_set = ~free_set + + if return_cost or return_optimality: + r = cp.einsum("ijk,ik->ij", A, x) - b + #r = A.dot(x) - b + cost = 0.5*cp.einsum("ij,ij->i", r, r) + #cost = 0.5 * np.dot(r, r) + initial_cost = cost + if return_optimality: + g = compute_grad(A,r) + #g = A.T.dot(r) + else: + cost = None + initial_cost = None + + cost_change = None + step_norm = None + iteration = 0 + + if verbose == 2: + print_header_linear() + + # This is the initialization loop. The requirement is that the + # least-squares solution on free variables is feasible before BVLS starts. + # One possible initialization is to set all variables to lower or upper + # bounds, but many iterations may be required from this state later on. + # The implemented ad-hoc procedure which intuitively should give a better + # initial state: find the least-squares solution on current free variables, + # if its feasible then stop, otherwise, set violating variables to + # corresponding bounds and continue on the reduced set of free variables. + + while free_set.any(): + if verbose == 2 and return_optimality: + optimality = compute_kkt_optimality(g, on_bound) + print_iteration_linear(iteration, cost, cost_change, step_norm, + optimality) + + iteration += 1 + if verbose == 2: + #Only need x_old if verbose == 2 + x_old = x.copy() + + b_free = b - cp.einsum("ijk,ik->ij", A, x*active_set) + A_free = A.swapaxes(1,2).copy() + A_free[active_set] = 0 + A_free = A_free.swapaxes(1,2) + z = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A_free), b_free) + + #A_free = A[:, free_set] + #b_free = b - A.dot(x * active_set) + #z = lstsq(A_free, b_free, rcond=rcond)[0] + + lbv = (z < lb)*free_set + ubv = (z > ub)*free_set + #lbv = z < lb[free_set] + #ubv = z > ub[free_set] + v = lbv | ubv + + if cp.any(lbv): + #ind = free_set[lbv] + #x[ind] = lb[ind] + x[lbv] = lb[lbv] + #active_set[ind] = True + active_set[lbv] = True + #on_bound[ind] = -1 + on_bound[lbv] = -1 + + if cp.any(ubv): + #ind = free_set[ubv] + #x[ind] = ub[ind] + x[ubv] = ub[ubv] + #active_set[ind] = True + active_set[ubv] = True + #on_bound[ind] = 1 + on_bound[ubv] = 1 + + #ind = free_set[~v] + ind = free_set*~v + #x[ind] = z[~v] + x[ind] = z[ind] + + r = cp.einsum("ijk,ik->ij", A, x) - b + #r = A.dot(x) - b + #cost_new = 0.5 * np.dot(r, r) + cost_new = 0.5*cp.einsum("ij,ij->i", r, r) + if return_cost: + cost_change = cost - cost_new + cost = cost_new + #g = A.T.dot(r) + g = compute_grad(A,r) + if verbose == 2: + #step_norm = norm(x[free_set] - x_free_old) + step_norm = norm(x-x_old, axis=1) + + if cp.any(v): + free_set[v] = False + #free_set = free_set[~v] + else: + break + + if max_iter is None: + max_iter = n + max_iter += iteration + + termination_status = None + + # Main BVLS loop. + optimality = compute_kkt_optimality(g, on_bound) + idx = np.arange(d, dtype=np.int32) + for iteration in range(iteration, max_iter): # BVLS Loop A + if verbose == 2 and return_optimality: + print_iteration_linear(iteration, cost, cost_change, + step_norm, optimality) + + if cp.all(optimality < tol): + termination_status = 1 + + if termination_status is not None: + break + + g_on = g * on_bound + move_to_free = cp.argmax(g_on, axis=1) + #on_bound[idx,move_to_free] = cp.minimum(0, on_bound[idx,move_to_free]) + on_bound[idx,move_to_free] = (g_on[idx,move_to_free] < 0)*on_bound[idx,move_to_free] + + while True: # BVLS Loop B + free_set = on_bound == 0 + active_set = ~free_set + + if verbose == 2: + #Only need x_old if verbose == 2 + x_old = x.copy() + + b_free = b - cp.einsum("ijk,ik->ij", A, x*active_set) + #Set active_set to 0 in A_free + A_free = A.swapaxes(1,2).copy() + A_free[active_set] = 0 + A_free = A_free.swapaxes(1,2) + + z = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A_free), b_free) + + #A_free = A[:, free_set] + #b_free = b - A.dot(x * active_set) + #z = lstsq(A_free, b_free, rcond=rcond)[0] + + lbv = (z < lb)*free_set + ubv = (z > ub)*free_set + v = (lbv | ubv) + + #lbv, = np.nonzero(z < lb_free) + #ubv, = np.nonzero(z > ub_free) + #v = np.hstack((lbv, ubv)) + + #if v.size > 0: + if v.any(): + #alphas = cp.zeros(x.shape) + #av = v.any(axis=1) + alphas = cp.full(x.shape, cp.inf) + alphas[lbv] = (lb[lbv]-x[lbv]) / (z[lbv]-x[lbv]) + alphas[ubv] = (ub[ubv]-x[ubv]) / (z[ubv]-x[ubv]) + + i = cp.argmin(alphas, axis=1) + alpha = alphas[idx,i].reshape((d,1)) + #x_free = x[v] * (1-alpha[v]) + #x_free += alpha[v]*z[v] + + #x_free = x[v] * (1-alpha[av]) + #x_free += alpha[av]*z[v] + #x[v] = x_free + + x_free = x*(1-alpha) + x_free += alpha*z + x[v] = x_free[v] + + #Update on_bound + on_bound[cp.where(lbv[idx,i]), i[cp.where(lbv[idx,i])]] = -1 + on_bound[cp.where(ubv[idx,i]), i[cp.where(ubv[idx,i])]] = 1 + + #alphas = np.hstack(( + # lb_free[lbv] - x_free[lbv], + # ub_free[ubv] - x_free[ubv])) / (z[v] - x_free[v]) + + #i = np.argmin(alphas) + #i_free = v[i] + #alpha = alphas[i] + + #x_free *= 1 - alpha + #x_free += alpha * z + #x[free_set] = x_free + + #if i < lbv.size: + # on_bound[free_set[i_free]] = -1 + #else: + # on_bound[free_set[i_free]] = 1 + else: + x_free = z[free_set] + x[free_set] = x_free + break + + if verbose == 2: + #step_norm = norm(x_free - x_free_old, axis=1) + step_norm = norm(x - x_old, axis=1) + + r = cp.einsum("ijk,ik->ij", A, x) - b + #r = A.dot(x) - b + #cost_new = 0.5 * np.dot(r, r) + cost_new = 0.5*cp.einsum("ij,ij->i", r, r) + cost_change = cost - cost_new + + if cp.all(cost_change < tol * cost): + termination_status = 2 + cost = cost_new + + #g = A.T.dot(r) + g = compute_grad(A,r) + optimality = compute_kkt_optimality(g, on_bound) + x_lsq = x + + if termination_status is None: + termination_status = 0 + + return OptimizeResult( + x=x_lsq, fun=r, cost=cost, optimality=optimality, active_mask=on_bound, + nit=iteration + 1, status=termination_status, + initial_cost=initial_cost) diff --git a/py/redrock/optimize/gpu_common.py b/py/redrock/optimize/gpu_common.py new file mode 100644 index 00000000..4333507f --- /dev/null +++ b/py/redrock/optimize/gpu_common.py @@ -0,0 +1,77 @@ +"""Functions used by least-squares algorithms.""" +import numpy as np +import cupy as cp + +from scipy.sparse.linalg import LinearOperator, aslinearoperator + + +# Utility functions to work with bound constraints. +def in_bounds(x, lb, ub, arrtype=None): + """Check if a point lies within bounds.""" + if arrtype is None: + arrtype = cp + return arrtype.all((x >= lb) & (x <= ub),axis=1) + +def all_in_bounds(x, lb, ub): + """Check if a point lies within bounds.""" + return cp.all((x >= lb) & (x <= ub)) + + +# Functions to display algorithm's progress. + + +def print_header_nonlinear(): + print("{:^15}{:^15}{:^15}{:^15}{:^15}{:^15}" + .format("Iteration", "Total nfev", "Cost", "Cost reduction", + "Step norm", "Optimality")) + + +def print_iteration_nonlinear(iteration, nfev, cost, cost_reduction, + step_norm, optimality): + if cost_reduction is None: + cost_reduction = " " * 15 + else: + cost_reduction = f"{cost_reduction:^15.2e}" + + if step_norm is None: + step_norm = " " * 15 + else: + step_norm = f"{step_norm:^15.2e}" + + print("{:^15}{:^15}{:^15.4e}{}{}{:^15.2e}" + .format(iteration, nfev, cost, cost_reduction, + step_norm, optimality)) + + +def print_header_linear(): + print("{:^15}{:^15}{:^15}{:^15}{:^15}" + .format("Iteration", "Cost", "Cost reduction", "Step norm", + "Optimality")) + + +def print_iteration_linear(iteration, cost, cost_reduction, step_norm, + optimality): + if cost_reduction is None: + cost_reduction = " " * 15 + else: + cost_reduction = f"{cost_reduction:^15.2e}" + + if step_norm is None: + step_norm = " " * 15 + else: + step_norm = f"{step_norm:^15.2e}" + + print(f"{iteration:^15}{cost:^15.4e}{cost_reduction}{step_norm}{optimality:^15.2e}") + + +# Simple helper functions. + + +def compute_grad(J, f): + """Compute gradient of the least-squares cost function.""" + if isinstance(J, LinearOperator): + return J.rmatvec(f) + else: + return cp.einsum("ikj,ij->ik", J.swapaxes(1,2), f) + #return J.T.dot(f) + diff --git a/py/redrock/optimize/gpu_lsq_linear.py b/py/redrock/optimize/gpu_lsq_linear.py new file mode 100644 index 00000000..0c8163f7 --- /dev/null +++ b/py/redrock/optimize/gpu_lsq_linear.py @@ -0,0 +1,404 @@ +"""GPU implementation of Linear least squares with bound constraints on independent variables.""" +import numpy as np +#from numpy.linalg import norm +from scipy.sparse import issparse, csr_matrix +from scipy.sparse.linalg import LinearOperator, lsmr +from scipy.optimize import OptimizeResult +from scipy.optimize._minimize import Bounds + +import cupy as cp +from cupy.linalg import norm, lstsq + +from .gpu_common import all_in_bounds, in_bounds, compute_grad +from scipy.optimize._lsq.trf_linear import trf_linear +from .gpu_bvls import bvls +import time + + +def prepare_bounds(bounds, n): + if len(bounds) != 2: + raise ValueError("`bounds` must contain 2 elements.") + lb, ub = (cp.asarray(b, dtype=float) for b in bounds) + + if lb.ndim == 0: + lb = cp.resize(lb, n) + + if ub.ndim == 0: + ub = cp.resize(ub, n) + + return lb, ub + + +TERMINATION_MESSAGES = { + -1: "The algorithm was not able to make progress on the last iteration.", + 0: "The maximum number of iterations is exceeded.", + 1: "The first-order optimality measure is less than `tol`.", + 2: "The relative change of the cost function is less than `tol`.", + 3: "The unconstrained solution is optimal." +} + + +def gpu_lsq_linear(A, b, bounds=(-cp.inf, cp.inf), method='trf', tol=1e-10, + lsq_solver=None, lsmr_tol=None, max_iter=None, + verbose=0, *, lsmr_maxiter=None, return_cost=True, + return_optimality=True, check_bounds=False): + r"""Solve a linear least-squares problem with bounds on the variables. + + Given a m-by-n design matrix A and a target vector b with m elements, + `lsq_linear` solves the following optimization problem:: + + minimize 0.5 * ||A x - b||**2 + subject to lb <= x <= ub + + This optimization problem is convex, hence a found minimum (if iterations + have converged) is guaranteed to be global. + + Parameters + ---------- + A : array_like, sparse matrix of LinearOperator, shape (m, n) + Design matrix. Can be `scipy.sparse.linalg.LinearOperator`. + b : array_like, shape (m,) + Target vector. + bounds : 2-tuple of array_like or `Bounds`, optional + Lower and upper bounds on parameters. Defaults to no bounds. + There are two ways to specify the bounds: + + - Instance of `Bounds` class. + + - 2-tuple of array_like: Each element of the tuple must be either + an array with the length equal to the number of parameters, or a + scalar (in which case the bound is taken to be the same for all + parameters). Use ``np.inf`` with an appropriate sign to disable + bounds on all or some parameters. + + method : 'trf' or 'bvls', optional + Method to perform minimization. + + * 'trf' : Trust Region Reflective algorithm adapted for a linear + least-squares problem. This is an interior-point-like method + and the required number of iterations is weakly correlated with + the number of variables. + * 'bvls' : Bounded-variable least-squares algorithm. This is + an active set method, which requires the number of iterations + comparable to the number of variables. Can't be used when `A` is + sparse or LinearOperator. + + Default is 'trf'. + tol : float, optional + Tolerance parameter. The algorithm terminates if a relative change + of the cost function is less than `tol` on the last iteration. + Additionally, the first-order optimality measure is considered: + + * ``method='trf'`` terminates if the uniform norm of the gradient, + scaled to account for the presence of the bounds, is less than + `tol`. + * ``method='bvls'`` terminates if Karush-Kuhn-Tucker conditions + are satisfied within `tol` tolerance. + + lsq_solver : {None, 'exact', 'lsmr'}, optional + Method of solving unbounded least-squares problems throughout + iterations: + + * 'exact' : Use dense QR or SVD decomposition approach. Can't be + used when `A` is sparse or LinearOperator. + * 'lsmr' : Use `scipy.sparse.linalg.lsmr` iterative procedure + which requires only matrix-vector product evaluations. Can't + be used with ``method='bvls'``. + + If None (default), the solver is chosen based on type of `A`. + lsmr_tol : None, float or 'auto', optional + Tolerance parameters 'atol' and 'btol' for `scipy.sparse.linalg.lsmr` + If None (default), it is set to ``1e-2 * tol``. If 'auto', the + tolerance will be adjusted based on the optimality of the current + iterate, which can speed up the optimization process, but is not always + reliable. + max_iter : None or int, optional + Maximum number of iterations before termination. If None (default), it + is set to 100 for ``method='trf'`` or to the number of variables for + ``method='bvls'`` (not counting iterations for 'bvls' initialization). + verbose : {0, 1, 2}, optional + Level of algorithm's verbosity: + + * 0 : work silently (default). + * 1 : display a termination report. + * 2 : display progress during iterations. + lsmr_maxiter : None or int, optional + Maximum number of iterations for the lsmr least squares solver, + if it is used (by setting ``lsq_solver='lsmr'``). If None (default), it + uses lsmr's default of ``min(m, n)`` where ``m`` and ``n`` are the + number of rows and columns of `A`, respectively. Has no effect if + ``lsq_solver='exact'``. + + return_cost : True = return cost function, False = do not to save time + return_optimality: True = return grad function, False = do not to save time + check_bounds: True = check that lb < ub; False = do not to save time + + Returns + ------- + OptimizeResult with the following fields defined: + x : ndarray, shape (n,) + Solution found. + cost : float + Value of the cost function at the solution. + fun : ndarray, shape (m,) + Vector of residuals at the solution. + optimality : float + First-order optimality measure. The exact meaning depends on `method`, + refer to the description of `tol` parameter. + active_mask : ndarray of int, shape (n,) + Each component shows whether a corresponding constraint is active + (that is, whether a variable is at the bound): + + * 0 : a constraint is not active. + * -1 : a lower bound is active. + * 1 : an upper bound is active. + + Might be somewhat arbitrary for the `trf` method as it generates a + sequence of strictly feasible iterates and active_mask is determined + within a tolerance threshold. + unbounded_sol : tuple + Unbounded least squares solution tuple returned by the least squares + solver (set with `lsq_solver` option). If `lsq_solver` is not set or is + set to ``'exact'``, the tuple contains an ndarray of shape (n,) with + the unbounded solution, an ndarray with the sum of squared residuals, + an int with the rank of `A`, and an ndarray with the singular values + of `A` (see NumPy's ``linalg.lstsq`` for more information). If + `lsq_solver` is set to ``'lsmr'``, the tuple contains an ndarray of + shape (n,) with the unbounded solution, an int with the exit code, + an int with the number of iterations, and five floats with + various norms and the condition number of `A` (see SciPy's + ``sparse.linalg.lsmr`` for more information). This output can be + useful for determining the convergence of the least squares solver, + particularly the iterative ``'lsmr'`` solver. The unbounded least + squares problem is to minimize ``0.5 * ||A x - b||**2``. + nit : int + Number of iterations. Zero if the unconstrained solution is optimal. + status : int + Reason for algorithm termination: + + * -1 : the algorithm was not able to make progress on the last + iteration. + * 0 : the maximum number of iterations is exceeded. + * 1 : the first-order optimality measure is less than `tol`. + * 2 : the relative change of the cost function is less than `tol`. + * 3 : the unconstrained solution is optimal. + + message : str + Verbal description of the termination reason. + success : bool + True if one of the convergence criteria is satisfied (`status` > 0). + + See Also + -------- + nnls : Linear least squares with non-negativity constraint. + least_squares : Nonlinear least squares with bounds on the variables. + + Notes + ----- + The algorithm first computes the unconstrained least-squares solution by + `numpy.linalg.lstsq` or `scipy.sparse.linalg.lsmr` depending on + `lsq_solver`. This solution is returned as optimal if it lies within the + bounds. + + Method 'trf' runs the adaptation of the algorithm described in [STIR]_ for + a linear least-squares problem. The iterations are essentially the same as + in the nonlinear least-squares algorithm, but as the quadratic function + model is always accurate, we don't need to track or modify the radius of + a trust region. The line search (backtracking) is used as a safety net + when a selected step does not decrease the cost function. Read more + detailed description of the algorithm in `scipy.optimize.least_squares`. + + Method 'bvls' runs a Python implementation of the algorithm described in + [BVLS]_. The algorithm maintains active and free sets of variables, on + each iteration chooses a new variable to move from the active set to the + free set and then solves the unconstrained least-squares problem on free + variables. This algorithm is guaranteed to give an accurate solution + eventually, but may require up to n iterations for a problem with n + variables. Additionally, an ad-hoc initialization procedure is + implemented, that determines which variables to set free or active + initially. It takes some number of iterations before actual BVLS starts, + but can significantly reduce the number of further iterations. + + References + ---------- + .. [STIR] M. A. Branch, T. F. Coleman, and Y. Li, "A Subspace, Interior, + and Conjugate Gradient Method for Large-Scale Bound-Constrained + Minimization Problems," SIAM Journal on Scientific Computing, + Vol. 21, Number 1, pp 1-23, 1999. + .. [BVLS] P. B. Start and R. L. Parker, "Bounded-Variable Least-Squares: + an Algorithm and Applications", Computational Statistics, 10, + 129-141, 1995. + + Examples + -------- + In this example, a problem with a large sparse matrix and bounds on the + variables is solved. + + >>> import numpy as np + >>> from scipy.sparse import rand + >>> from scipy.optimize import lsq_linear + >>> rng = np.random.default_rng() + ... + >>> m = 20000 + >>> n = 10000 + ... + >>> A = rand(m, n, density=1e-4, random_state=rng) + >>> b = rng.standard_normal(m) + ... + >>> lb = rng.standard_normal(n) + >>> ub = lb + 1 + ... + >>> res = lsq_linear(A, b, bounds=(lb, ub), lsmr_tol='auto', verbose=1) + # may vary + The relative change of the cost function is less than `tol`. + Number of iterations 16, initial cost 1.5039e+04, final cost 1.1112e+04, + first-order optimality 4.66e-08. + """ + tx1 = time.time() + if method not in ['trf', 'bvls']: + raise ValueError("`method` must be 'trf' or 'bvls'") + + if lsq_solver not in [None, 'exact', 'lsmr']: + raise ValueError("`solver` must be None, 'exact' or 'lsmr'.") + + if verbose not in [0, 1, 2]: + raise ValueError("`verbose` must be in [0, 1, 2].") + + if issparse(A): + A = csr_matrix(A) + elif not isinstance(A, LinearOperator): + A = cp.atleast_2d(cp.asarray(A)) + + if method == 'bvls': + if lsq_solver == 'lsmr': + raise ValueError("method='bvls' can't be used with " + "lsq_solver='lsmr'") + + if not isinstance(A, cp.ndarray): + raise ValueError("method='bvls' can't be used with `A` being " + "sparse or LinearOperator.") + + if lsq_solver is None: + if isinstance(A, cp.ndarray): + lsq_solver = 'exact' + else: + lsq_solver = 'lsmr' + elif lsq_solver == 'exact' and not isinstance(A, cp.ndarray): + raise ValueError("`exact` solver can't be used when `A` is " + "sparse or LinearOperator.") + + if len(A.shape) != 3: # No ndim for LinearOperator. + raise ValueError("`A` must have 3 dimensions.") + + if max_iter is not None and max_iter <= 0: + raise ValueError("`max_iter` must be None or positive integer.") + + z, m, n = A.shape + + if b.ndim != 2: + raise ValueError("`b` must have at most 2 dimension.") + + zb, mb = b.shape + + if mb != m: + raise ValueError("Inconsistent shapes between `A` and `b`.") + + if isinstance(bounds, Bounds): + lb = bounds.lb + ub = bounds.ub + else: + lb, ub = prepare_bounds(bounds, n) + + if lb.shape != (n,) and ub.shape != (n,) and lb.shape != (z,n) and ub.shape != (z,n): + raise ValueError("Bounds have wrong shape.") + + if check_bounds: + if cp.any(lb >= ub): + raise ValueError("Each lower bound must be strictly less than each " + "upper bound.") + + if lsmr_maxiter is not None and lsmr_maxiter < 1: + raise ValueError("`lsmr_maxiter` must be None or positive integer.") + + if not ((isinstance(lsmr_tol, float) and lsmr_tol > 0) or + lsmr_tol in ('auto', None)): + raise ValueError("`lsmr_tol` must be None, 'auto', or positive float.") + + if lsq_solver == 'exact': + #unbd_lsq = cp.linalg.lstsq(A, b, rcond=-1) + if m == n: + #square matrices + unbd_lsq = cp.linalg.solve(A,b) + else: + unbd_lsq = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A), b) + elif lsq_solver == 'lsmr': + raise NotImplementedError('lsmr is not yet implemetned') + #first_lsmr_tol = lsmr_tol # tol of first call to lsmr + #if lsmr_tol is None or lsmr_tol == 'auto': + # first_lsmr_tol = 1e-2 * tol # default if lsmr_tol not defined + #unbd_lsq = lsmr(A, b, maxiter=lsmr_maxiter, + # atol=first_lsmr_tol, btol=first_lsmr_tol) + #x_lsq = unbd_lsq[0] # extract the solution from the least squares solver + x_lsq = unbd_lsq + + #Tile bounds if 1d arrays given + if (lb.shape == (n,)): + lb = cp.tile(lb, (z,1)) + if (ub.shape == (n,)): + ub = cp.tile(ub, (z,1)) + ib = in_bounds(x_lsq, lb, ub) + + if ib.get().all(): + #All solutions within bounds + termination_status = 3 + termination_message = TERMINATION_MESSAGES[termination_status] + if (return_cost or return_optimality): + r = cp.einsum("ijk,ik->ij", A, x_lsq) - b + #r = A @ x_lsq - b + cost = 0.5*cp.einsum("ij,ij->i", r, r) + #cost = 0.5 * cp.dot(r, r) + if (return_optimality): + g = compute_grad(A, r) + g_norm = norm(g, ord=cp.inf, axis=1) + else: + g_norm = None + else: + cost=None + r = None + g_norm = None + + if verbose > 0: + print(termination_message) + if (return_cost): + print(f"Final cost {cost:.4e}") + if (return_optimality): + print(f"first-order optimality {g_norm:.2e}") + + return OptimizeResult( + x=x_lsq, fun=r, cost=cost, optimality=g_norm, + active_mask=np.zeros(n), unbounded_sol=unbd_lsq, + nit=0, status=termination_status, + message=termination_message, success=True) + + if method == 'trf': + raise NotImplementedError('trf_linear is not yet implemented') + #res = trf_linear(A, b, x_lsq, lb, ub, tol, lsq_solver, lsmr_tol, + # max_iter, verbose, lsmr_maxiter=lsmr_maxiter) + elif method == 'bvls': + res = bvls(A, b, x_lsq, lb, ub, ib, tol, max_iter, verbose, + return_cost=return_cost, return_optimality=return_optimality) + + res.unbounded_sol = unbd_lsq + res.message = TERMINATION_MESSAGES[res.status] + res.success = res.status > 0 + + if verbose > 0: + print(res.message) + print( + f"Number of iterations {res.nit}, initial cost {res.initial_cost:.4e}, " + f"final cost {res.cost:.4e}, first-order optimality {res.optimality:.2e}." + ) + + del res.initial_cost + + return res diff --git a/py/redrock/zfind.py b/py/redrock/zfind.py index 913186b1..17318c36 100644 --- a/py/redrock/zfind.py +++ b/py/redrock/zfind.py @@ -29,7 +29,7 @@ from .results import read_zscan_redrock -from .zscan import calc_zchi2_targets +from .zscan import calc_zchi2_targets, print_total_fitting_times from .fitz import fitz, get_dv @@ -408,6 +408,7 @@ def zfind(targets, templates, mp_procs=1, nminima=3, archetypes=None, priors=Non results = [ results ] if am_root: + print_total_fitting_times() #print in root the total fitting times allresults = dict() for p in results: allresults.update(p) diff --git a/py/redrock/zscan.py b/py/redrock/zscan.py index 0b1fee29..ffd59728 100644 --- a/py/redrock/zscan.py +++ b/py/redrock/zscan.py @@ -11,7 +11,7 @@ import sys import traceback import numpy as np -from scipy.optimize import lsq_linear, nnls +from scipy.optimize import nnls #try: # import cupy as cp @@ -35,6 +35,11 @@ block_size = 512 #Default block size, should work on all modern nVidia GPUs # cuda_source contains raw CUDA kernels to be loaded as CUPY module +gbounds = None +gnbh = None + +ftimes = np.zeros(3) +fmethods = ["PCA", "NNLS", "BVLS"] cuda_source = r''' extern "C" { @@ -238,8 +243,41 @@ def calc_zchi2_one(spectra, weights, flux, wflux, tdata, solve_matrices_algorith return zchi2, zcoeff -def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux, nleg, narch, method=None, n_nbh=None, prior=None, use_gpu=False, bands=None): - +def prior_on_coeffs(n_nbh, deg_legendre, sigma, ncamera, add_negative_legendre_terms=False): + + """ + Args: + n_nbh (int): number of dominant archetypes + deg_legendre (int): number of Legendre polynomials + sigma (int): prior sigma to be used for archetype fitting + ncamera (int): number of cameras for given instrument + add_negative_legendre_terms (bool): whether to adjust prior + to account for negative legendre terms + Returns: + 2d array to be added while solving for archetype fitting + + """ + + tot_legendre_terms = deg_legendre + if add_negative_legendre_terms: + #Double total legendre term count to reflect negative terms + tot_legendre_terms = deg_legendre*2 + nbasis = n_nbh+tot_legendre_terms*ncamera # 3 desi cameras + prior = np.zeros((nbasis, nbasis), dtype='float64');np.fill_diagonal(prior, 1/(sigma**2)) + for i in range(n_nbh): + prior[i][i]=0. ## Do not add prior to the archetypes, added only to the Legendre polynomials + + if add_negative_legendre_terms: + #Add cross terms to prior array + for i in range(ncamera): + for j in range(deg_legendre): + idx1 = n_nbh+i*tot_legendre_terms+j + idx2 = n_nbh+i*tot_legendre_terms+deg_legendre+j + prior[idx1][idx2] = -prior[idx1][idx1] + prior[idx2][idx1] = -prior[idx2][idx2] + return prior + +def per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflux, nleg, narch, method=None, n_nbh=None, prior_sigma=None, use_gpu=False, bands=None): """This function calculates coefficients for archetype mode in each camera using normal linear algebra matrix solver or BVLS (bounded value least square) method BVLS described in : https://www.stat.berkeley.edu/~stark/Preprints/bvls.pdf @@ -248,13 +286,13 @@ def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux Args: target (object): target object - tdata (dict): template data for model fit for ALL archetypes + binned (dict): template data for model fit for ALL archetypes weights (array): concatenated spectral weights (ivar). flux (array): concatenated flux values. wflux (array): concatenated weighted flux values. nleg (int): number of Legendre polynomials narch (int): number of archetypes - method (string): 'PCA', 'BVLS', 'NMF', or 'NNLS' (same as NMF) + method (string): 'PCA', 'BVLS', 'NMF', or 'NNLS' (same as NMF), or 'BVLS_VIA_NNLS' n_nbh (int): number of nearest best archetypes prior (array): prior matrix added to the Legendre coefficients (1/sigma^2) use_gpu (bool): use GPU or not @@ -264,12 +302,14 @@ def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux coefficients and chi2 """ - - ### TODO - implement BVLS on GPU + #Use globals gbounds and gnbh + global gbounds + global gnbh # number of cameras in DESI: b, r, z spectra = target.spectra + if bands is None: + bands = ['b', 'z', 'r'] ncam = len(bands) - nbasis = n_nbh+nleg*ncam # n_nbh : for actual physical archetype(s), nleg: number of legendre polynomials, ncamera: number of cameras ret_zcoeff = {} ret_zcoeff['alpha'] = [] @@ -279,15 +319,84 @@ def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux new_bands = sorted(bands) # saves as correct order - #Setup dict of solver args to pass bounds to solver method = method.upper() + add_negative_legendre_terms = False + if (method == 'BVLS_VIA_NNLS' and use_gpu): + #Use trick of adding negative legendre terms and using NNLS + #to implement BVLS + add_negative_legendre_terms = True + + #Calculate prior here + prior = prior_on_coeffs(n_nbh, nleg, prior_sigma, ncam, add_negative_legendre_terms) + legendre = target.legendre(nleg=nleg, use_gpu=False) #Get previously calculated legendre + + if add_negative_legendre_terms: + #Hard-code NNLS with additional legendre terms if BVLS AND gpu mode + method = 'NNLS' + #tdata should already have the additional negative legendre terms + nleg *= 2 + if narch == 1: + #Use CPU if only 1 narch (nearest neighbor) + use_gpu = False + + if narch == 1: + use_gpu = False + + #Calculate nbasis after accounting for legendre terms + nbasis = n_nbh+nleg*ncam # n_nbh : for actual physical archetype(s), nleg: number of legendre polynomials, ncamera: number of cameras + + #Calculate tdata arrays here instead of in archetypes + #Select np or cp for operations as arrtype + if (use_gpu): + import cupy as cp + arrtype = cp + else: + arrtype = np + tdata = dict() + for hs in binned: + #Create 3-d tdata with narch x nwave x nbasis where nbasis = 1+nleg + if narch == 1: + tdata[hs] = binned[hs][None,:,:] + if nleg > 0: + tdata[hs] = np.append(tdata[hs], legendre[hs].transpose()[None,:,:], axis=2) + if add_negative_legendre_terms: + #if method == 'BVLS' and use_gpu: + #Using BVLS and GPU - append negative copies of legendre terms as well to use NNLS with additional bases to approximate BVLS + tdata[hs] = np.append(tdata[hs], -1*legendre[hs].transpose()[None,:,:], axis=2) + else: + if nleg > 0: + tdata[hs] = arrtype.append(binned[hs].transpose()[:,:,None], arrtype.tile(arrtype.asarray(legendre[hs]).transpose()[None,:,:], (narch, 1, 1)), axis=2) + if add_negative_legendre_terms: + #if method == 'BVLS' and use_gpu: + #Using BVLS - append negative copies of legendre terms as well to use NNLS with additional bases to approximate BVLS + tdata[hs] = arrtype.append(tdata[hs], arrtype.tile(-1*arrtype.asarray(legendre[hs]).transpose()[None,:,:], (narch, 1, 1)), axis=2) + else: + tdata[hs] = binned[hs].transpose()[:,:,None] + + #Setup dict of solver args to pass bounds to solver solver_args = dict() if (method == 'BVLS'): #only positive coefficients are allowed for the archetypes - bounds = np.zeros((2, nbasis)) - bounds[0][n_nbh:]=-np.inf #constant and slope terms in archetype method (can be positive or negative) - bounds[1] = np.inf - solver_args['bounds'] = bounds + if use_gpu: + #bounds should always be the same and creating small arrays on + #GPU is expensive so use a global variable and create once + if gbounds is None: + gbounds = cp.zeros((2,nbasis)) + gbounds[0][n_nbh:] = -cp.inf + gbounds[1] = cp.inf + gnbh = n_nbh + #In practice right now n_nbh will not change on any given run + #But future proofing just in case + if gnbh != n_nbh: + gbounds[0][:n_nbh] = 0 + gbounds[0][n_nbh:] = -cp.inf + solver_args['bounds'] = gbounds + else: + #On CPU we can create bounds array every time cheaply + bounds = np.zeros((2, nbasis)) + bounds[0][n_nbh:]=-np.inf #constant and slope terms in archetype method (can be positive or negative) + bounds[1] = np.inf + solver_args['bounds'] = bounds #Use branching options because GPU is faster in batch in 3d #but due to timing weirdness in numpy, CPU is faster looping over @@ -317,6 +426,8 @@ def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux tdata2[hs][:,k+nleg*i] = tdata[hs][j,:,k] # Legendre polynomials terms tdata2[hs] = tdata2[hs][None,:,:] zzchi2[j], zzcoeff[j] = calc_zchi2_batch(spectra, tdata2, weights, flux, wflux, 1, nbasis, solve_matrices_algorithm=method, solver_args=solver_args, prior=prior, use_gpu=use_gpu) + if (add_negative_legendre_terms): + zzcoeff = remove_extra_legendre_terms(zzcoeff, n_nbh, ncam, nleg) # saving leading archetype coefficients in correct order ret_zcoeff['alpha'] = [zzcoeff[:,k] for k in range(n_nbh)] # archetype coefficient(s) @@ -337,6 +448,15 @@ def per_camera_coeff_with_least_square_batch(target, tdata, weights, flux, wflux #print(f'{time.time()-start} [sec] took for per camera BVLS method\n') return zzchi2, coeff +def remove_extra_legendre_terms(zzcoeff, n_nbh, ncam, nleg): + nleg = nleg // 2 + zzcoeff2 = np.zeros_like(zzcoeff, shape=(zzcoeff.shape[0], n_nbh+ncam*(nleg))) + zzcoeff2[:,:n_nbh] = zzcoeff[:,:n_nbh] + for j in range(ncam): + for l in range(nleg): + zzcoeff2[:,n_nbh+nleg*j+l] = zzcoeff[:,n_nbh+nleg*j*2+l] - zzcoeff[:,n_nbh+nleg*j*2+nleg+l] + return zzcoeff2 + def batch_dot_product_sparse(spectra, tdata, nz, use_gpu): """Calculate a batch dot product of the 3 sparse matrices in spectra with every template in tdata. Sparse matrix libraries are used @@ -1117,20 +1237,24 @@ def solve_matrices(M, y, solve_algorithm="PCA", solver_args=None, use_gpu=False) if solve_algorithm.upper() == "PCA": #Use PCA via linalg.solve in either numpy or cupy + tx = time.time() if (use_gpu): #Use cupy linalg.solve to solve for zcoeff in batch for all_M and #all_y where all_M and all_y are 3d and 2d arrays representing #M and y at every redshift bin for the given template. #There is no Error thrown by cupy's version. - return cp.linalg.solve(M, y) + zcoeff = cp.linalg.solve(M, y) else: #Use numpy linalg.solve which throws exception try: - return np.linalg.solve(M, y) + zcoeff = np.linalg.solve(M, y) except np.linalg.LinAlgError: raise + add_to_timer(0, time.time()-tx) + return zcoeff elif solve_algorithm in ("NMF", "NNLS"): if (use_gpu): + tx = time.time() nz = y.shape[0] Mcpu = M.get() ycpu = y.get() @@ -1142,41 +1266,44 @@ def solve_matrices(M, y, solve_algorithm="PCA", solver_args=None, use_gpu=False) zcoeff[j,:] = res[0] except Exception: zcoeff[j,:] = 9e99 + add_to_timer(1, time.time()-tx) return cp.asarray(zcoeff) else: try: + tx = time.time() res = nnls(M, y) zcoeff = res[0] - except Exception: + add_to_timer(1, time.time()-tx) + except Exception as ex: raise np.linalg.LinAlgError return zcoeff elif solve_algorithm == "BVLS": + tx = time.time() if (solver_args is not None and 'bounds' in solver_args): bounds = solver_args['bounds'] else: nbasis = y.shape[-1] - bounds = np.zeros((2, nbasis)) + bounds = np.zeros_like(y, shape=(2, nbasis)) bounds[0]=-np.inf bounds[1]=np.inf if (use_gpu): - nz = y.shape[0] - Mcpu = M.get() - ycpu = y.get() - zcoeff = np.zeros(y.shape) - #Copy to CPU, run scipy.optimize.lsq_linear, copy back to GPU - for j in range(nz): - try: - res = lsq_linear(Mcpu[j,:,:], ycpu[j,:], bounds=bounds, method='bvls') - zcoeff[j,:] = res.x - except np.linalg.LinAlgError: - zcoeff[j,:] = 9e99 - return cp.asarray(zcoeff) + from .optimize import gpu_lsq_linear as lsq_linear + lsq_opts = dict(return_cost=False, return_optimality=False) else: - try: - res = lsq_linear(M, y, bounds=bounds, method='bvls') - zcoeff = res.x - except np.linalg.LinAlgError: - raise - return zcoeff + from scipy.optimize import lsq_linear + lsq_opts = dict() + res = lsq_linear(M, y, bounds=bounds, method='bvls', **lsq_opts) + zcoeff = res.x + add_to_timer(2, time.time()-tx) + return zcoeff else: raise NotImplementedError("The solve_algorithm "+solve_algorithm+" is not implemented.") + +def add_to_timer(i, t): + ftimes[i] += t + +def print_total_fitting_times(): + for j in range(len(ftimes)): + if ftimes[j] > 0: + print("Total fitting time (method {}): {:0.1f} seconds".format(fmethods[j], ftimes[j])) + sys.stdout.flush() diff --git a/requirements.txt b/requirements.txt index 2197f552..748dfb8b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ # Based on desimodules/22.2. +numpy<2.0 astropy scipy h5py