diff --git a/py/redrock/archetypes.py b/py/redrock/archetypes.py
index aa32eae..d94fd22 100644
--- a/py/redrock/archetypes.py
+++ b/py/redrock/archetypes.py
@@ -71,6 +71,7 @@ def __init__(self, filename):
         self.igm_model = 'Inoue14'
         # TODO: Allow Archetype files to specify bvls or nnls or pca solver method
         self._solver_method = 'bvls'
+        #self._solver_method = 'bvls_via_nnls' #This uses the NNLS trick to emulate BVLS
         self.method = 'ARCH' # for API symmetry with Template.method
diff --git a/py/redrock/optimize/__init__.py b/py/redrock/optimize/__init__.py
new file mode 100644
index 0000000..acfc1e7
--- /dev/null
+++ b/py/redrock/optimize/__init__.py
@@ -0,0 +1,5 @@
+"""This module contains a GPU implementation of the BVLS algorithm used by
+   scipy.optimize.lsq_linear."""
+from .gpu_lsq_linear import gpu_lsq_linear
+
+__all__ = ['gpu_lsq_linear']
diff --git a/py/redrock/optimize/gpu_bvls.py b/py/redrock/optimize/gpu_bvls.py
new file mode 100644
index 0000000..315cfeb
--- /dev/null
+++ b/py/redrock/optimize/gpu_bvls.py
@@ -0,0 +1,244 @@
+"""GPU implementation of the bounded-variable least-squares (BVLS) algorithm."""
+import numpy as np
+import cupy as cp
+#from numpy.linalg import norm, lstsq
+from cupy.linalg import norm, lstsq
+from scipy.optimize import OptimizeResult
+
+from .gpu_common import print_header_linear, print_iteration_linear, compute_grad
+
+
+def compute_kkt_optimality(g, on_bound):
+    """Compute the maximum violation of the KKT conditions for each problem."""
+    g_kkt = g * on_bound
+    free_set = on_bound == 0
+    g_kkt[free_set] = cp.abs(g[free_set])
+    return cp.max(g_kkt, axis=1)
+
+
+def bvls(A, b, x_lsq, lb, ub, ib, tol, max_iter, verbose, rcond=None,
+         return_cost=True, return_optimality=True):
+    d, m, n = A.shape
+
+    #x = x_lsq.copy()
+    x = x_lsq
+    on_bound = cp.zeros((d,n), dtype=cp.int32)
+
+    mask = x <= lb
+    x[mask] = lb[mask]
+    on_bound[mask] = -1
+
+    mask = x >= ub
+    x[mask] = ub[mask]
+    on_bound[mask] = 1
+
+    free_set = on_bound == 0
+    active_set = ~free_set
+
+    if return_cost or return_optimality:
+        r = cp.einsum("ijk,ik->ij", A, x) - b
+        #r = A.dot(x) - b
+        cost = 0.5*cp.einsum("ij,ij->i", r, r)
+        #cost = 0.5 * np.dot(r, r)
+        initial_cost = cost
+        if return_optimality:
+            g = compute_grad(A, r)
+            #g = A.T.dot(r)
+    else:
+        cost = None
+        initial_cost = None
+
+    cost_change = None
+    step_norm = None
+    iteration = 0
+
+    if verbose == 2:
+        print_header_linear()
+
+    # This is the initialization loop. The requirement is that the
+    # least-squares solution on free variables is feasible before BVLS starts.
+    # One possible initialization is to set all variables to lower or upper
+    # bounds, but many iterations may be required from this state later on.
+    # The implemented ad-hoc procedure intuitively should give a better
+    # initial state: find the least-squares solution on the current free
+    # variables; if it's feasible then stop, otherwise set the violating
+    # variables to their corresponding bounds and continue on the reduced set
+    # of free variables.
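+    # Shape conventions in this batched port (the scipy reference solves a
+    # single problem at a time): A is (d, m, n), i.e. d independent problems;
+    # b is (d, m); x, lb, ub and on_bound are (d, n). Per-problem products
+    # such as A[i] @ x[i] are evaluated for the whole batch with
+    # cp.einsum("ijk,ik->ij", A, x).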
+
+    while free_set.size > 0:
+        if verbose == 2 and return_optimality:
+            optimality = compute_kkt_optimality(g, on_bound)
+            print_iteration_linear(iteration, cost, cost_change, step_norm,
+                                   optimality)
+
+        iteration += 1
+        if verbose == 2:
+            #Only need x_old if verbose == 2
+            x_old = x.copy()
+
+        b_free = b - cp.einsum("ijk,ik->ij", A, x*active_set)
+        A_free = A.swapaxes(1,2).copy()
+        A_free[active_set] = 0
+        A_free = A_free.swapaxes(1,2)
+        z = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A_free), b_free)
+
+        #A_free = A[:, free_set]
+        #b_free = b - A.dot(x * active_set)
+        #z = lstsq(A_free, b_free, rcond=rcond)[0]
+
+        lbv = z < lb
+        ubv = z > ub
+        #lbv = z < lb[free_set]
+        #ubv = z > ub[free_set]
+        v = lbv | ubv
+
+        if cp.any(lbv):
+            #ind = free_set[lbv]
+            #x[ind] = lb[ind]
+            x[lbv] = lb[lbv]
+            #active_set[ind] = True
+            active_set[lbv] = True
+            #on_bound[ind] = -1
+            on_bound[lbv] = -1
+
+        if cp.any(ubv):
+            #ind = free_set[ubv]
+            #x[ind] = ub[ind]
+            x[ubv] = ub[ubv]
+            #active_set[ind] = True
+            active_set[ubv] = True
+            #on_bound[ind] = 1
+            on_bound[ubv] = 1
+
+        #ind = free_set[~v]
+        #x[ind] = z[~v]
+        x[~v] = z[~v]
+
+        r = cp.einsum("ijk,ik->ij", A, x) - b
+        #r = A.dot(x) - b
+        #cost_new = 0.5 * np.dot(r, r)
+        cost_new = 0.5*cp.einsum("ij,ij->i", r, r)
+        if return_cost:
+            cost_change = cost - cost_new
+        cost = cost_new
+        #g = A.T.dot(r)
+        g = compute_grad(A, r)
+        if verbose == 2:
+            #step_norm = norm(x[free_set] - x_free_old)
+            step_norm = norm(x-x_old, axis=1)
+
+        if cp.any(v):
+            free_set = free_set[~v]
+        else:
+            break
+
+    if max_iter is None:
+        max_iter = n
+    max_iter += iteration
+
+    termination_status = None
+
+    # Main BVLS loop.
+    optimality = compute_kkt_optimality(g, on_bound)
+    idx = np.arange(d, dtype=np.int32)
+    for iteration in range(iteration, max_iter):  # BVLS Loop A
+        if verbose == 2 and return_optimality:
+            print_iteration_linear(iteration, cost, cost_change,
+                                   step_norm, optimality)
+
+        if cp.all(optimality < tol):
+            termination_status = 1
+
+        if termination_status is not None:
+            break
+
+        move_to_free = cp.argmax(g * on_bound, axis=1)
+        on_bound[idx,move_to_free] = 0
+
+        while True:  # BVLS Loop B
+            free_set = on_bound == 0
+            active_set = ~free_set
+
+            if verbose == 2:
+                #Only need x_old if verbose == 2
+                x_old = x.copy()
+
+            b_free = b - cp.einsum("ijk,ik->ij", A, x*active_set)
+            #Set active_set entries to 0 in A_free
+            A_free = A.swapaxes(1,2).copy()
+            A_free[active_set] = 0
+            A_free = A_free.swapaxes(1,2)
+
+            z = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A_free), b_free)
+
+            #A_free = A[:, free_set]
+            #b_free = b - A.dot(x * active_set)
+            #z = lstsq(A_free, b_free, rcond=rcond)[0]
+
+            lbv = z < lb
+            ubv = z > ub
+            v = (lbv | ubv)*free_set
+            #lbv, = np.nonzero(z < lb_free)
+            #ubv, = np.nonzero(z > ub_free)
+            #v = np.hstack((lbv, ubv))
+
+            #if v.size > 0:
+            if v.any():
+                alphas = cp.zeros(x.shape)
+                alphas[lbv] = (lb[lbv]-x[lbv]) / (z[lbv]-x[lbv])
+                alphas[ubv] = (ub[ubv]-x[ubv]) / (z[ubv]-x[ubv])
+
+                i = cp.argmin(alphas, axis=1)
+                #Broadcast the per-problem step length across columns so the
+                #(d, n) boolean mask v can index it
+                alpha = cp.broadcast_to(alphas[idx,i][:,None], x.shape)
+                x_free = x[v] * (1-alpha[v])
+                x_free += alpha[v]*z[v]
+                x[v] = x_free
+
+                #Update on_bound
+                on_bound[cp.where(lbv[idx,i]), i[cp.where(lbv[idx,i])]] = -1
+                on_bound[cp.where(ubv[idx,i]), i[cp.where(ubv[idx,i])]] = 1
+
+                #alphas = np.hstack((
+                #    lb_free[lbv] - x_free[lbv],
+                #    ub_free[ubv] - x_free[ubv])) / (z[v] - x_free[v])
+
+                #i = np.argmin(alphas)
+                #i_free = v[i]
+                #alpha = alphas[i]
+
+                #x_free *= 1 - alpha
+                #x_free += alpha * z
+                #x[free_set] = x_free
+
+                #if i < lbv.size:
+                #    on_bound[free_set[i_free]] = -1
+                #else:
+                #    on_bound[free_set[i_free]] = 1
+            else:
+                x_free = z[free_set]
+                x[free_set] = x_free
+                break
+
+        if verbose == 2:
+            #step_norm = norm(x_free - x_free_old, axis=1)
+            step_norm = norm(x - x_old, axis=1)
+
+        r = cp.einsum("ijk,ik->ij", A, x) - b
+        #r = A.dot(x) - b
+        #cost_new = 0.5 * np.dot(r, r)
+        cost_new = 0.5*cp.einsum("ij,ij->i", r, r)
+        cost_change = cost - cost_new
+
+        if cp.all(cost_change < tol * cost):
+            termination_status = 2
+        cost = cost_new
+
+        #g = A.T.dot(r)
+        g = compute_grad(A, r)
+        optimality = compute_kkt_optimality(g, on_bound)
+    x_lsq = x
+
+    if termination_status is None:
+        termination_status = 0
+
+    return OptimizeResult(
+        x=x_lsq, fun=r, cost=cost, optimality=optimality, active_mask=on_bound,
+        nit=iteration + 1, status=termination_status,
+        initial_cost=initial_cost)
diff --git a/py/redrock/optimize/gpu_common.py b/py/redrock/optimize/gpu_common.py
new file mode 100644
index 0000000..a0661ba
--- /dev/null
+++ b/py/redrock/optimize/gpu_common.py
@@ -0,0 +1,77 @@
+"""Functions used by least-squares algorithms."""
+import numpy as np
+import cupy as cp
+
+from scipy.sparse.linalg import LinearOperator, aslinearoperator
+
+
+# Utility functions to work with bound constraints.
+def in_bounds(x, lb, ub, arrtype=None):
+    """Check whether each point in the batch lies within its bounds."""
+    if arrtype is None:
+        arrtype = cp
+    return arrtype.all((x >= lb) & (x <= ub), axis=1)
+
+
+def all_in_bounds(x, lb, ub):
+    """Check whether every point in the batch lies within its bounds."""
+    return cp.all((x >= lb) & (x <= ub))
+
+
+# Functions to display algorithm's progress.
+
+
+def print_header_nonlinear():
+    print("{:^15}{:^15}{:^15}{:^15}{:^15}{:^15}"
+          .format("Iteration", "Total nfev", "Cost", "Cost reduction",
+                  "Step norm", "Optimality"))
+
+
+def print_iteration_nonlinear(iteration, nfev, cost, cost_reduction,
+                              step_norm, optimality):
+    if cost_reduction is None:
+        cost_reduction = " " * 15
+    else:
+        cost_reduction = f"{cost_reduction:^15.2e}"
+
+    if step_norm is None:
+        step_norm = " " * 15
+    else:
+        step_norm = f"{step_norm:^15.2e}"
+
+    print("{:^15}{:^15}{:^15.4e}{}{}{:^15.2e}"
+          .format(iteration, nfev, cost, cost_reduction,
+                  step_norm, optimality))
+
+
+def print_header_linear():
+    print("{:^15}{:^15}{:^15}{:^15}{:^15}"
+          .format("Iteration", "Cost", "Cost reduction", "Step norm",
+                  "Optimality"))
+
+
+def print_iteration_linear(iteration, cost, cost_reduction, step_norm,
+                           optimality):
+    if cost_reduction is None:
+        cost_reduction = " " * 15
+    else:
+        cost_reduction = f"{cost_reduction:^15.2e}"
+
+    if step_norm is None:
+        step_norm = " " * 15
+    else:
+        step_norm = f"{step_norm:^15.2e}"
+
+    print(f"{iteration:^15}{cost:^15.4e}{cost_reduction}{step_norm}{optimality:^15.2e}")
+
+
+# Simple helper functions.
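+# (Batched convention: the Jacobian J passed to compute_grad has shape
+# (d, m, n) for d independent problems, the residual f has shape (d, m),
+# and the returned gradient g[i] = J[i].T @ f[i] has shape (d, n).)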
+
+
+def compute_grad(J, f):
+    """Compute gradient of the least-squares cost function."""
+    if isinstance(J, LinearOperator):
+        return J.rmatvec(f)
+    else:
+        #Batched equivalent of J.T.dot(f): sum over the row index of J
+        return cp.einsum("ikj,ik->ij", J, f)
+        #return J.T.dot(f)
+
diff --git a/py/redrock/optimize/gpu_lsq_linear.py b/py/redrock/optimize/gpu_lsq_linear.py
new file mode 100644
index 0000000..0c8163f
--- /dev/null
+++ b/py/redrock/optimize/gpu_lsq_linear.py
@@ -0,0 +1,404 @@
+"""GPU implementation of linear least squares with bound constraints on independent variables."""
+import numpy as np
+#from numpy.linalg import norm
+from scipy.sparse import issparse, csr_matrix
+from scipy.sparse.linalg import LinearOperator, lsmr
+from scipy.optimize import OptimizeResult
+from scipy.optimize._minimize import Bounds
+
+import cupy as cp
+from cupy.linalg import norm, lstsq
+
+from .gpu_common import all_in_bounds, in_bounds, compute_grad
+from scipy.optimize._lsq.trf_linear import trf_linear
+from .gpu_bvls import bvls
+import time
+
+
+def prepare_bounds(bounds, n):
+    if len(bounds) != 2:
+        raise ValueError("`bounds` must contain 2 elements.")
+    lb, ub = (cp.asarray(b, dtype=float) for b in bounds)
+
+    if lb.ndim == 0:
+        lb = cp.resize(lb, n)
+
+    if ub.ndim == 0:
+        ub = cp.resize(ub, n)
+
+    return lb, ub
+
+
+TERMINATION_MESSAGES = {
+    -1: "The algorithm was not able to make progress on the last iteration.",
+    0: "The maximum number of iterations is exceeded.",
+    1: "The first-order optimality measure is less than `tol`.",
+    2: "The relative change of the cost function is less than `tol`.",
+    3: "The unconstrained solution is optimal."
+}
+
+
+def gpu_lsq_linear(A, b, bounds=(-cp.inf, cp.inf), method='trf', tol=1e-10,
+                   lsq_solver=None, lsmr_tol=None, max_iter=None,
+                   verbose=0, *, lsmr_maxiter=None, return_cost=True,
+                   return_optimality=True, check_bounds=False):
+    r"""Solve a batch of linear least-squares problems with bounds on the variables.
+
+    Given an m-by-n design matrix A and a target vector b with m elements,
+    `lsq_linear` solves the following optimization problem::
+
+        minimize 0.5 * ||A x - b||**2
+        subject to lb <= x <= ub
+
+    This GPU port solves z independent problems of this form simultaneously:
+    `A` must have shape (z, m, n) and `b` shape (z, m).
+
+    This optimization problem is convex, hence a found minimum (if iterations
+    have converged) is guaranteed to be global.
+
+    Parameters
+    ----------
+    A : array_like, shape (z, m, n)
+        Batch of design matrices.
+    b : array_like, shape (z, m)
+        Batch of target vectors.
+    bounds : 2-tuple of array_like or `Bounds`, optional
+        Lower and upper bounds on parameters. Defaults to no bounds.
+        There are two ways to specify the bounds:
+
+            - Instance of `Bounds` class.
+
+            - 2-tuple of array_like: Each element of the tuple must be either
+              an array with the length equal to the number of parameters, or a
+              scalar (in which case the bound is taken to be the same for all
+              parameters). Use ``np.inf`` with an appropriate sign to disable
+              bounds on all or some parameters.
+
+    method : 'trf' or 'bvls', optional
+        Method to perform minimization.
+
+            * 'trf' : Trust Region Reflective algorithm adapted for a linear
+              least-squares problem. This is an interior-point-like method
+              and the required number of iterations is weakly correlated with
+              the number of variables. (Not yet implemented in this GPU port.)
+            * 'bvls' : Bounded-variable least-squares algorithm. This is
+              an active set method, which requires the number of iterations
+              comparable to the number of variables. Can't be used when `A` is
+              sparse or LinearOperator.
+
+        Default is 'trf'.
+    tol : float, optional
+        Tolerance parameter. The algorithm terminates if a relative change
+        of the cost function is less than `tol` on the last iteration.
+        Additionally, the first-order optimality measure is considered:
+
+            * ``method='trf'`` terminates if the uniform norm of the gradient,
+              scaled to account for the presence of the bounds, is less than
+              `tol`.
+            * ``method='bvls'`` terminates if Karush-Kuhn-Tucker conditions
+              are satisfied within `tol` tolerance.
+
+    lsq_solver : {None, 'exact', 'lsmr'}, optional
+        Method of solving unbounded least-squares problems throughout
+        iterations:
+
+            * 'exact' : Use dense QR or SVD decomposition approach. Can't be
+              used when `A` is sparse or LinearOperator.
+            * 'lsmr' : Use `scipy.sparse.linalg.lsmr` iterative procedure
+              which requires only matrix-vector product evaluations. Can't
+              be used with ``method='bvls'``. (Not yet implemented in this
+              GPU port.)
+
+        If None (default), the solver is chosen based on type of `A`.
+    lsmr_tol : None, float or 'auto', optional
+        Tolerance parameters 'atol' and 'btol' for `scipy.sparse.linalg.lsmr`.
+        If None (default), it is set to ``1e-2 * tol``. If 'auto', the
+        tolerance will be adjusted based on the optimality of the current
+        iterate, which can speed up the optimization process, but is not
+        always reliable.
+    max_iter : None or int, optional
+        Maximum number of iterations before termination. If None (default), it
+        is set to 100 for ``method='trf'`` or to the number of variables for
+        ``method='bvls'`` (not counting iterations for 'bvls' initialization).
+    verbose : {0, 1, 2}, optional
+        Level of algorithm's verbosity:
+
+            * 0 : work silently (default).
+            * 1 : display a termination report.
+            * 2 : display progress during iterations.
+
+    lsmr_maxiter : None or int, optional
+        Maximum number of iterations for the lsmr least squares solver,
+        if it is used (by setting ``lsq_solver='lsmr'``). If None (default), it
+        uses lsmr's default of ``min(m, n)`` where ``m`` and ``n`` are the
+        number of rows and columns of `A`, respectively. Has no effect if
+        ``lsq_solver='exact'``.
+    return_cost : bool, optional
+        If True (default), return the value of the cost function at the
+        solution; set to False to skip computing it and save time.
+    return_optimality : bool, optional
+        If True (default), return the first-order optimality measure; set to
+        False to skip computing it and save time.
+    check_bounds : bool, optional
+        If True, check that ``lb < ub`` elementwise; False (default) skips
+        the check to save time.
+
+    Returns
+    -------
+    OptimizeResult with the following fields defined:
+    x : ndarray, shape (z, n)
+        Solutions found, one row per problem in the batch.
+    cost : ndarray, shape (z,)
+        Value of the cost function at each solution.
+    fun : ndarray, shape (z, m)
+        Vector of residuals at each solution.
+    optimality : ndarray, shape (z,)
+        First-order optimality measure for each problem. The exact meaning
+        depends on `method`, refer to the description of `tol` parameter.
+    active_mask : ndarray of int, shape (z, n)
+        Each component shows whether a corresponding constraint is active
+        (that is, whether a variable is at the bound):
+
+            * 0 : a constraint is not active.
+            * -1 : a lower bound is active.
+            * 1 : an upper bound is active.
+
+        Might be somewhat arbitrary for the `trf` method as it generates a
+        sequence of strictly feasible iterates and active_mask is determined
+        within a tolerance threshold.
+    unbounded_sol : ndarray, shape (z, n)
+        Unbounded least-squares solutions computed with the direct solver
+        (``cupy.linalg.solve`` for square systems, otherwise the
+        pseudoinverse). Unlike SciPy's `lsq_linear`, which returns the full
+        tuple from ``numpy.linalg.lstsq`` (solution, sum of squared residuals,
+        rank of `A`, and singular values of `A`; see NumPy's ``linalg.lstsq``
+        for more information)
+        or from ``scipy.sparse.linalg.lsmr``, this port stores only the
+        solution array. This output can be useful for determining the
+        convergence of the unbounded solver. The unbounded least squares
+        problem is to minimize ``0.5 * ||A x - b||**2``.
+    nit : int
+        Number of iterations. Zero if the unconstrained solution is optimal.
+    status : int
+        Reason for algorithm termination:
+
+            * -1 : the algorithm was not able to make progress on the last
+              iteration.
+            * 0 : the maximum number of iterations is exceeded.
+            * 1 : the first-order optimality measure is less than `tol`.
+            * 2 : the relative change of the cost function is less than
+              `tol`.
+            * 3 : the unconstrained solution is optimal.
+
+    message : str
+        Verbal description of the termination reason.
+    success : bool
+        True if one of the convergence criteria is satisfied (`status` > 0).
+
+    See Also
+    --------
+    nnls : Linear least squares with non-negativity constraint.
+    least_squares : Nonlinear least squares with bounds on the variables.
+
+    Notes
+    -----
+    The algorithm first computes the unconstrained least-squares solution by
+    `numpy.linalg.lstsq` or `scipy.sparse.linalg.lsmr` depending on
+    `lsq_solver`. This solution is returned as optimal if it lies within the
+    bounds.
+
+    Method 'trf' runs the adaptation of the algorithm described in [STIR]_ for
+    a linear least-squares problem. The iterations are essentially the same as
+    in the nonlinear least-squares algorithm, but as the quadratic function
+    model is always accurate, we don't need to track or modify the radius of
+    a trust region. The line search (backtracking) is used as a safety net
+    when a selected step does not decrease the cost function. Read more
+    detailed description of the algorithm in `scipy.optimize.least_squares`.
+
+    Method 'bvls' runs a Python implementation of the algorithm described in
+    [BVLS]_. The algorithm maintains active and free sets of variables, on
+    each iteration chooses a new variable to move from the active set to the
+    free set and then solves the unconstrained least-squares problem on free
+    variables. This algorithm is guaranteed to give an accurate solution
+    eventually, but may require up to n iterations for a problem with n
+    variables. Additionally, an ad-hoc initialization procedure is
+    implemented, that determines which variables to set free or active
+    initially. It takes some number of iterations before actual BVLS starts,
+    but can significantly reduce the number of further iterations.
+
+    References
+    ----------
+    .. [STIR] M. A. Branch, T. F. Coleman, and Y. Li, "A Subspace, Interior,
+              and Conjugate Gradient Method for Large-Scale Bound-Constrained
+              Minimization Problems," SIAM Journal on Scientific Computing,
+              Vol. 21, Number 1, pp 1-23, 1999.
+    .. [BVLS] P. B. Stark and R. L. Parker, "Bounded-Variable Least-Squares:
+              an Algorithm and Applications", Computational Statistics, 10,
+              129-141, 1995.
+
+    Examples
+    --------
+    In this example, a problem with a large sparse matrix and bounds on the
+    variables is solved. (The example is inherited from the CPU
+    `scipy.optimize.lsq_linear`; this GPU port instead expects dense, batched,
+    3-D input.)
+
+    >>> import numpy as np
+    >>> from scipy.sparse import rand
+    >>> from scipy.optimize import lsq_linear
+    >>> rng = np.random.default_rng()
+    ...
+    >>> m = 20000
+    >>> n = 10000
+    ...
+    >>> A = rand(m, n, density=1e-4, random_state=rng)
+    >>> b = rng.standard_normal(m)
+    ...
+    >>> lb = rng.standard_normal(n)
+    >>> ub = lb + 1
+    ...
+    >>> res = lsq_linear(A, b, bounds=(lb, ub), lsmr_tol='auto', verbose=1)
+    The relative change of the cost function is less than `tol`.  # may vary
+    Number of iterations 16, initial cost 1.5039e+04, final cost 1.1112e+04,
+    first-order optimality 4.66e-08.
+    """
+    tx1 = time.time()
+    if method not in ['trf', 'bvls']:
+        raise ValueError("`method` must be 'trf' or 'bvls'")
+
+    if lsq_solver not in [None, 'exact', 'lsmr']:
+        raise ValueError("`lsq_solver` must be None, 'exact' or 'lsmr'.")
+
+    if verbose not in [0, 1, 2]:
+        raise ValueError("`verbose` must be in [0, 1, 2].")
+
+    if issparse(A):
+        A = csr_matrix(A)
+    elif not isinstance(A, LinearOperator):
+        A = cp.atleast_2d(cp.asarray(A))
+
+    if method == 'bvls':
+        if lsq_solver == 'lsmr':
+            raise ValueError("method='bvls' can't be used with "
+                             "lsq_solver='lsmr'")
+
+        if not isinstance(A, cp.ndarray):
+            raise ValueError("method='bvls' can't be used with `A` being "
+                             "sparse or LinearOperator.")
+
+    if lsq_solver is None:
+        if isinstance(A, cp.ndarray):
+            lsq_solver = 'exact'
+        else:
+            lsq_solver = 'lsmr'
+    elif lsq_solver == 'exact' and not isinstance(A, cp.ndarray):
+        raise ValueError("`exact` solver can't be used when `A` is "
+                         "sparse or LinearOperator.")
+
+    if len(A.shape) != 3:  # No ndim for LinearOperator.
+        raise ValueError("`A` must have 3 dimensions.")
+
+    if max_iter is not None and max_iter <= 0:
+        raise ValueError("`max_iter` must be None or positive integer.")
+
+    z, m, n = A.shape
+
+    if b.ndim != 2:
+        raise ValueError("`b` must have 2 dimensions.")
+
+    zb, mb = b.shape
+
+    if mb != m:
+        raise ValueError("Inconsistent shapes between `A` and `b`.")
+
+    if isinstance(bounds, Bounds):
+        lb = bounds.lb
+        ub = bounds.ub
+    else:
+        lb, ub = prepare_bounds(bounds, n)
+
+    if lb.shape not in ((n,), (z, n)) or ub.shape not in ((n,), (z, n)):
+        raise ValueError("Bounds have wrong shape.")
+
+    if check_bounds:
+        if cp.any(lb >= ub):
+            raise ValueError("Each lower bound must be strictly less than "
+                             "each upper bound.")
+
+    if lsmr_maxiter is not None and lsmr_maxiter < 1:
+        raise ValueError("`lsmr_maxiter` must be None or positive integer.")
+
+    if not ((isinstance(lsmr_tol, float) and lsmr_tol > 0) or
+            lsmr_tol in ('auto', None)):
+        raise ValueError("`lsmr_tol` must be None, 'auto', or positive float.")
+
+    if lsq_solver == 'exact':
+        #unbd_lsq = cp.linalg.lstsq(A, b, rcond=-1)
+        if m == n:
+            #square matrices
+            unbd_lsq = cp.linalg.solve(A, b)
+        else:
+            unbd_lsq = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A), b)
+    elif lsq_solver == 'lsmr':
+        raise NotImplementedError('lsmr is not yet implemented')
+        #first_lsmr_tol = lsmr_tol  # tol of first call to lsmr
+        #if lsmr_tol is None or lsmr_tol == 'auto':
+        #    first_lsmr_tol = 1e-2 * tol  # default if lsmr_tol not defined
+        #unbd_lsq = lsmr(A, b, maxiter=lsmr_maxiter,
+        #                atol=first_lsmr_tol, btol=first_lsmr_tol)
+    #x_lsq = unbd_lsq[0]  # extract the solution from the least squares solver
+    x_lsq = unbd_lsq
+
+    #Tile bounds if 1d arrays given
+    if (lb.shape == (n,)):
+        lb = cp.tile(lb, (z,1))
+    if (ub.shape == (n,)):
+        ub = cp.tile(ub, (z,1))
+    ib = in_bounds(x_lsq, lb, ub)
+
+    if ib.get().all():
+        #All solutions within bounds
+        termination_status = 3
+        termination_message = TERMINATION_MESSAGES[termination_status]
+        if (return_cost or return_optimality):
+            r = cp.einsum("ijk,ik->ij", A, x_lsq) - b
+            #r = A @ x_lsq - b
+            cost = 0.5*cp.einsum("ij,ij->i", r, r)
+            #cost = 0.5 * cp.dot(r, r)
+            if (return_optimality):
+                g = compute_grad(A, r)
+                g_norm = norm(g, ord=cp.inf, axis=1)
+            else:
+                g_norm = None
+        else:
+            cost = None
+            r = None
+            g_norm = None
+
+        if verbose > 0:
+            print(termination_message)
+            if (return_cost):
+                #cost and g_norm are per-problem arrays, so print them
+                #without float format specifiers
+                print("Final cost (per problem):", cost)
+            if (return_optimality):
+                print("First-order optimality (per problem):", g_norm)
+
+        return OptimizeResult(
+            x=x_lsq, fun=r, cost=cost, optimality=g_norm,
+            active_mask=np.zeros(n), unbounded_sol=unbd_lsq,
+            nit=0, status=termination_status,
+            message=termination_message, success=True)
+
+    if method == 'trf':
+        raise NotImplementedError('trf_linear is not yet implemented')
+        #res = trf_linear(A, b, x_lsq, lb, ub, tol, lsq_solver, lsmr_tol,
+        #                 max_iter, verbose, lsmr_maxiter=lsmr_maxiter)
+    elif method == 'bvls':
+        res = bvls(A, b, x_lsq, lb, ub, ib, tol, max_iter, verbose,
+                   return_cost=return_cost, return_optimality=return_optimality)
+
+    res.unbounded_sol = unbd_lsq
+    res.message = TERMINATION_MESSAGES[res.status]
+    res.success = res.status > 0
+
+    if verbose > 0:
+        print(res.message)
+        #res.cost and res.optimality are per-problem arrays; avoid float
+        #format specifiers here
+        print("Number of iterations {}, initial cost {}, final cost {}, "
+              "first-order optimality {}.".format(res.nit, res.initial_cost,
+                                                  res.cost, res.optimality))
+
+    del res.initial_cost
+
+    return res
diff --git a/py/redrock/zfind.py b/py/redrock/zfind.py
index 913186b..17318c3 100644
--- a/py/redrock/zfind.py
+++ b/py/redrock/zfind.py
@@ -29,7 +29,7 @@
 
 from .results import read_zscan_redrock
 
-from .zscan import calc_zchi2_targets
+from .zscan import calc_zchi2_targets, print_total_fitting_times
 
 from .fitz import fitz, get_dv
 
@@ -408,6 +408,7 @@ def zfind(targets, templates, mp_procs=1, nminima=3, archetypes=None, priors=Non
         results = [ results ]
 
     if am_root:
+        print_total_fitting_times() #print in root the total fitting times
         allresults = dict()
         for p in results:
             allresults.update(p)
diff --git a/py/redrock/zscan.py b/py/redrock/zscan.py
index 10a0c8d..ffd5972 100644
--- a/py/redrock/zscan.py
+++ b/py/redrock/zscan.py
@@ -11,7 +11,8 @@
 import sys
 import traceback
+import time
 import numpy as np
-from scipy.optimize import lsq_linear, nnls
+from scipy.optimize import nnls
 
 #try:
 #    import cupy as cp
@@ -35,6 +35,11 @@
 block_size = 512 #Default block size, should work on all modern nVidia GPUs
 
+gbounds = None
+gnbh = None
+
+ftimes = np.zeros(3) #Accumulated per-method fitting times, see print_total_fitting_times()
+fmethods = ["PCA", "NNLS", "BVLS"]
 # cuda_source contains raw CUDA kernels to be loaded as CUPY module
 cuda_source = r'''
 extern "C" {
@@ -287,7 +292,7 @@ def per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflu
         wflux (array): concatenated weighted flux values.
         nleg (int): number of Legendre polynomials
         narch (int): number of archetypes
-        method (string): 'PCA', 'BVLS', 'NMF', or 'NNLS' (same as NMF)
+        method (string): 'PCA', 'BVLS', 'NMF', 'NNLS' (same as NMF), or 'BVLS_VIA_NNLS'
         n_nbh (int): number of nearest best archetypes
         prior (array): prior matrix added to the Legendre coefficients (1/sigma^2)
         use_gpu (bool): use GPU or not
@@ -297,8 +302,9 @@ def per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflu
         coefficients and chi2
 
     """
-    ### TODO - implement BVLS on GPU
+    #Use globals gbounds and gnbh
+    global gbounds
+    global gnbh
     # number of cameras in DESI: b, r, z
     spectra = target.spectra
     if bands is None:
@@ -315,7 +321,9 @@ def per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflu
     method = method.upper()
 
     add_negative_legendre_terms = False
-    if (method == 'BVLS' and use_gpu):
+    if (method == 'BVLS_VIA_NNLS' and use_gpu):
+        #Use trick of adding negative legendre terms and using NNLS
+        #to implement BVLS
         add_negative_legendre_terms = True
 
     #Calculate prior here
@@ -331,6 +339,9 @@ def per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflu
         #Use CPU if only 1 narch (nearest neighbor)
         use_gpu = False
 
+    if narch == 1:
+        use_gpu = False
+
     #Calculate nbasis after accounting for legendre terms
     nbasis = n_nbh+nleg*ncam # n_nbh : for actual physical archetype(s), nleg: number of legendre polynomials, ncamera: number of cameras
@@ -366,10 +377,26 @@ def per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflu
     solver_args = dict()
     if (method == 'BVLS'):
         #only positive coefficients are allowed for the archetypes
-        bounds = np.zeros((2, nbasis))
-        bounds[0][n_nbh:]=-np.inf #constant and slope terms in archetype method (can be positive or negative)
-        bounds[1] = np.inf
-        solver_args['bounds'] = bounds
+        if use_gpu:
+            #bounds should always be the same and creating small arrays on
+            #GPU is expensive so use a global variable and create once
+            if gbounds is None:
+                gbounds = cp.zeros((2, nbasis))
+                gbounds[0][n_nbh:] = -cp.inf
+                gbounds[1] = cp.inf
+                gnbh = n_nbh
+            #In practice right now n_nbh will not change on any given run
+            #But future proofing just in case
+            if gnbh != n_nbh:
+                gbounds[0][:n_nbh] = 0
+                gbounds[0][n_nbh:] = -cp.inf
+                gnbh = n_nbh
+            solver_args['bounds'] = gbounds
+        else:
+            #On CPU we can create the bounds array cheaply every time
+            bounds = np.zeros((2, nbasis))
+            bounds[0][n_nbh:]=-np.inf #constant and slope terms in archetype method (can be positive or negative)
+            bounds[1] = np.inf
+            solver_args['bounds'] = bounds
 
     #Use branching options because GPU is faster in batch in 3d
     #but due to timing weirdness in numpy, CPU is faster looping over
@@ -1210,20 +1237,24 @@ def solve_matrices(M, y, solve_algorithm="PCA", solver_args=None, use_gpu=False)
 
     if solve_algorithm.upper() == "PCA":
         #Use PCA via linalg.solve in either numpy or cupy
+        tx = time.time()
         if (use_gpu):
             #Use cupy linalg.solve to solve for zcoeff in batch for all_M and
             #all_y where all_M and all_y are 3d and 2d arrays representing
             #M and y at every redshift bin for the given template.
             #There is no Error thrown by cupy's version.
-            return cp.linalg.solve(M, y)
+            zcoeff = cp.linalg.solve(M, y)
         else:
             #Use numpy linalg.solve which throws exception
             try:
-                return np.linalg.solve(M, y)
+                zcoeff = np.linalg.solve(M, y)
             except np.linalg.LinAlgError:
                 raise
+        add_to_timer(0, time.time()-tx)
+        return zcoeff
     elif solve_algorithm in ("NMF", "NNLS"):
         if (use_gpu):
+            tx = time.time()
             nz = y.shape[0]
             Mcpu = M.get()
             ycpu = y.get()
@@ -1235,41 +1266,44 @@ def solve_matrices(M, y, solve_algorithm="PCA", solver_args=None, use_gpu=False)
                     zcoeff[j,:] = res[0]
                 except Exception:
                     zcoeff[j,:] = 9e99
+            add_to_timer(1, time.time()-tx)
             return cp.asarray(zcoeff)
         else:
             try:
+                tx = time.time()
                 res = nnls(M, y)
                 zcoeff = res[0]
-            except Exception:
+                add_to_timer(1, time.time()-tx)
+            except Exception as ex:
                 raise np.linalg.LinAlgError
             return zcoeff
     elif solve_algorithm == "BVLS":
+        tx = time.time()
        if (solver_args is not None and 'bounds' in solver_args):
             bounds = solver_args['bounds']
         else:
             nbasis = y.shape[-1]
-            bounds = np.zeros((2, nbasis))
+            bounds = np.zeros_like(y, shape=(2, nbasis))
             bounds[0]=-np.inf
             bounds[1]=np.inf
         if (use_gpu):
-            nz = y.shape[0]
-            Mcpu = M.get()
-            ycpu = y.get()
-            zcoeff = np.zeros(y.shape)
-            #Copy to CPU, run scipy.optimize.lsq_linear, copy back to GPU
-            for j in range(nz):
-                try:
-                    res = lsq_linear(Mcpu[j,:,:], ycpu[j,:], bounds=bounds, method='bvls')
-                    zcoeff[j,:] = res.x
-                except np.linalg.LinAlgError:
-                    zcoeff[j,:] = 9e99
-            return cp.asarray(zcoeff)
+            from .optimize import gpu_lsq_linear as lsq_linear
+            lsq_opts = dict(return_cost=False, return_optimality=False)
         else:
-            try:
-                res = lsq_linear(M, y, bounds=bounds, method='bvls')
-                zcoeff = res.x
-            except np.linalg.LinAlgError:
-                raise
-            return zcoeff
+            from scipy.optimize import lsq_linear
+            lsq_opts = dict()
+        res = lsq_linear(M, y, bounds=bounds, method='bvls', **lsq_opts)
+        zcoeff = res.x
+        add_to_timer(2, time.time()-tx)
+        return zcoeff
     else:
         raise NotImplementedError("The solve_algorithm "+solve_algorithm+" is not implemented.")
+
+
+def add_to_timer(i, t):
+    ftimes[i] += t
+
+
+def print_total_fitting_times():
+    for j in range(len(ftimes)):
+        if ftimes[j] > 0:
+            print("Total fitting time (method {}): {:0.1f} seconds".format(fmethods[j], ftimes[j]))
+    sys.stdout.flush()
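+
+# Minimal usage sketch of the GPU path (illustrative only; `M3d` and `y2d`
+# are hypothetical stand-ins for the batched arrays that solve_matrices
+# receives when use_gpu is True):
+#
+#   from redrock.optimize import gpu_lsq_linear
+#   res = gpu_lsq_linear(M3d, y2d, bounds=(lb, ub), method='bvls',
+#                        return_cost=False, return_optimality=False)
+#   zcoeff = res.x  # (nz, nbasis): one row of coefficients per redshift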