diff --git a/py/redrock/archetypes.py b/py/redrock/archetypes.py
index aa32eae..d94fd22 100644
--- a/py/redrock/archetypes.py
+++ b/py/redrock/archetypes.py
@@ -71,6 +71,7 @@ def __init__(self, filename):
         self.igm_model = 'Inoue14'
         # TODO: Allow Archetype files to specify bvls or nnls or pca solver method
         self._solver_method = 'bvls'
+        #self._solver_method = 'bvls_via_nnls' #This uses the NNLS trick to emulate BVLS
         self.method = 'ARCH' # for API symmetry with Template.method
diff --git a/py/redrock/optimize/__init__.py b/py/redrock/optimize/__init__.py
new file mode 100644
index 0000000..acfc1e7
--- /dev/null
+++ b/py/redrock/optimize/__init__.py
@@ -0,0 +1,5 @@
+"""This module contains a GPU implementation of the BVLS algorithm used by
+   scipy.optimize.lsq_linear."""
+from .gpu_lsq_linear import gpu_lsq_linear
+
+__all__ = ['gpu_lsq_linear']
diff --git a/py/redrock/optimize/gpu_bvls.py b/py/redrock/optimize/gpu_bvls.py
new file mode 100644
index 0000000..315cfeb
--- /dev/null
+++ b/py/redrock/optimize/gpu_bvls.py
@@ -0,0 +1,244 @@
+"""GPU implementation of the bounded-variable least-squares (BVLS) algorithm."""
+import numpy as np
+import cupy as cp
+#from numpy.linalg import norm, lstsq
+from cupy.linalg import norm, lstsq
+from scipy.optimize import OptimizeResult
+
+from .gpu_common import print_header_linear, print_iteration_linear, compute_grad
+
+
+def compute_kkt_optimality(g, on_bound):
+    """Compute the maximum violation of the KKT conditions for each problem."""
+    g_kkt = g * on_bound
+    free_set = on_bound == 0
+    g_kkt[free_set] = cp.abs(g[free_set])
+    return cp.max(g_kkt, axis=1)
+
+
+def bvls(A, b, x_lsq, lb, ub, ib, tol, max_iter, verbose, rcond=None,
+         return_cost=True, return_optimality=True):
+    d, m, n = A.shape
+
+    #x = x_lsq.copy()
+    x = x_lsq
+    on_bound = cp.zeros((d,n), dtype=cp.int32)
+
+    mask = x <= lb
+    x[mask] = lb[mask]
+    on_bound[mask] = -1
+
+    mask = x >= ub
+    x[mask] = ub[mask]
+    on_bound[mask] = 1
+
+    free_set = on_bound == 0
+    active_set = ~free_set
+
+    if return_cost or return_optimality:
+        r = cp.einsum("ijk,ik->ij", A, x) - b
+        #r = A.dot(x) - b
+        cost = 0.5*cp.einsum("ij,ij->i", r, r)
+        #cost = 0.5 * np.dot(r, r)
+        initial_cost = cost
+        if return_optimality:
+            g = compute_grad(A, r)
+            #g = A.T.dot(r)
+    else:
+        cost = None
+        initial_cost = None
+
+    cost_change = None
+    step_norm = None
+    iteration = 0
+
+    if verbose == 2:
+        print_header_linear()
+
+    # This is the initialization loop. The requirement is that the
+    # least-squares solution on free variables is feasible before BVLS starts.
+    # One possible initialization is to set all variables to lower or upper
+    # bounds, but many iterations may be required from this state later on.
+    # The implemented ad-hoc procedure intuitively should give a better
+    # initial state: find the least-squares solution on the current free
+    # variables; if it's feasible then stop, otherwise set the violating
+    # variables to their corresponding bounds and continue on the reduced set
+    # of free variables.
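+    # Shape conventions in this batched port (the scipy reference solves a
+    # single problem at a time): A is (d, m, n), i.e. d independent problems;
+    # b is (d, m); x, lb, ub and on_bound are (d, n). Per-problem products
+    # such as A[i] @ x[i] are evaluated for the whole batch with
+    # cp.einsum("ijk,ik->ij", A, x).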
+
+    while free_set.size > 0:
+        if verbose == 2 and return_optimality:
+            optimality = compute_kkt_optimality(g, on_bound)
+            print_iteration_linear(iteration, cost, cost_change, step_norm,
+                                   optimality)
+
+        iteration += 1
+        if verbose == 2:
+            #Only need x_old if verbose == 2
+            x_old = x.copy()
+
+        b_free = b - cp.einsum("ijk,ik->ij", A, x*active_set)
+        A_free = A.swapaxes(1,2).copy()
+        A_free[active_set] = 0
+        A_free = A_free.swapaxes(1,2)
+        z = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A_free), b_free)
+
+        #A_free = A[:, free_set]
+        #b_free = b - A.dot(x * active_set)
+        #z = lstsq(A_free, b_free, rcond=rcond)[0]
+
+        lbv = z < lb
+        ubv = z > ub
+        #lbv = z < lb[free_set]
+        #ubv = z > ub[free_set]
+        v = lbv | ubv
+
+        if cp.any(lbv):
+            #ind = free_set[lbv]
+            #x[ind] = lb[ind]
+            x[lbv] = lb[lbv]
+            #active_set[ind] = True
+            active_set[lbv] = True
+            #on_bound[ind] = -1
+            on_bound[lbv] = -1
+
+        if cp.any(ubv):
+            #ind = free_set[ubv]
+            #x[ind] = ub[ind]
+            x[ubv] = ub[ubv]
+            #active_set[ind] = True
+            active_set[ubv] = True
+            #on_bound[ind] = 1
+            on_bound[ubv] = 1
+
+        #ind = free_set[~v]
+        #x[ind] = z[~v]
+        x[~v] = z[~v]
+
+        r = cp.einsum("ijk,ik->ij", A, x) - b
+        #r = A.dot(x) - b
+        #cost_new = 0.5 * np.dot(r, r)
+        cost_new = 0.5*cp.einsum("ij,ij->i", r, r)
+        if return_cost:
+            cost_change = cost - cost_new
+        cost = cost_new
+        #g = A.T.dot(r)
+        g = compute_grad(A, r)
+        if verbose == 2:
+            #step_norm = norm(x[free_set] - x_free_old)
+            step_norm = norm(x-x_old, axis=1)
+
+        if cp.any(v):
+            free_set = free_set[~v]
+        else:
+            break
+
+    if max_iter is None:
+        max_iter = n
+    max_iter += iteration
+
+    termination_status = None
+
+    # Main BVLS loop.
+    optimality = compute_kkt_optimality(g, on_bound)
+    idx = np.arange(d, dtype=np.int32)
+    for iteration in range(iteration, max_iter):  # BVLS Loop A
+        if verbose == 2 and return_optimality:
+            print_iteration_linear(iteration, cost, cost_change,
+                                   step_norm, optimality)
+
+        if cp.all(optimality < tol):
+            termination_status = 1
+
+        if termination_status is not None:
+            break
+
+        move_to_free = cp.argmax(g * on_bound, axis=1)
+        on_bound[idx,move_to_free] = 0
+
+        while True:  # BVLS Loop B
+            free_set = on_bound == 0
+            active_set = ~free_set
+
+            if verbose == 2:
+                #Only need x_old if verbose == 2
+                x_old = x.copy()
+
+            b_free = b - cp.einsum("ijk,ik->ij", A, x*active_set)
+            #Set active_set entries to 0 in A_free
+            A_free = A.swapaxes(1,2).copy()
+            A_free[active_set] = 0
+            A_free = A_free.swapaxes(1,2)
+
+            z = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A_free), b_free)
+
+            #A_free = A[:, free_set]
+            #b_free = b - A.dot(x * active_set)
+            #z = lstsq(A_free, b_free, rcond=rcond)[0]
+
+            lbv = z < lb
+            ubv = z > ub
+            v = (lbv | ubv)*free_set
+            #lbv, = np.nonzero(z < lb_free)
+            #ubv, = np.nonzero(z > ub_free)
+            #v = np.hstack((lbv, ubv))
+
+            #if v.size > 0:
+            if v.any():
+                alphas = cp.zeros(x.shape)
+                alphas[lbv] = (lb[lbv]-x[lbv]) / (z[lbv]-x[lbv])
+                alphas[ubv] = (ub[ubv]-x[ubv]) / (z[ubv]-x[ubv])
+
+                i = cp.argmin(alphas, axis=1)
+                #Broadcast the per-problem step length across columns so the
+                #(d, n) boolean mask v can index it
+                alpha = cp.broadcast_to(alphas[idx,i][:,None], x.shape)
+                x_free = x[v] * (1-alpha[v])
+                x_free += alpha[v]*z[v]
+                x[v] = x_free
+
+                #Update on_bound
+                on_bound[cp.where(lbv[idx,i]), i[cp.where(lbv[idx,i])]] = -1
+                on_bound[cp.where(ubv[idx,i]), i[cp.where(ubv[idx,i])]] = 1
+
+                #alphas = np.hstack((
+                #    lb_free[lbv] - x_free[lbv],
+                #    ub_free[ubv] - x_free[ubv])) / (z[v] - x_free[v])
+
+                #i = np.argmin(alphas)
+                #i_free = v[i]
+                #alpha = alphas[i]
+
+                #x_free *= 1 - alpha
+                #x_free += alpha * z
+                #x[free_set] = x_free
+
+                #if i < lbv.size:
+                #    on_bound[free_set[i_free]] = -1
+                #else:
+                #    on_bound[free_set[i_free]] = 1
+            else:
+                x_free = z[free_set]
+                x[free_set] = x_free
+                break
+
+        if verbose == 2:
+            #step_norm = norm(x_free - x_free_old, axis=1)
+            step_norm = norm(x - x_old, axis=1)
+
+        r = cp.einsum("ijk,ik->ij", A, x) - b
+        #r = A.dot(x) - b
+        #cost_new = 0.5 * np.dot(r, r)
+        cost_new = 0.5*cp.einsum("ij,ij->i", r, r)
+        cost_change = cost - cost_new
+
+        if cp.all(cost_change < tol * cost):
+            termination_status = 2
+        cost = cost_new
+
+        #g = A.T.dot(r)
+        g = compute_grad(A, r)
+        optimality = compute_kkt_optimality(g, on_bound)
+    x_lsq = x
+
+    if termination_status is None:
+        termination_status = 0
+
+    return OptimizeResult(
+        x=x_lsq, fun=r, cost=cost, optimality=optimality, active_mask=on_bound,
+        nit=iteration + 1, status=termination_status,
+        initial_cost=initial_cost)
diff --git a/py/redrock/optimize/gpu_common.py b/py/redrock/optimize/gpu_common.py
new file mode 100644
index 0000000..a0661ba
--- /dev/null
+++ b/py/redrock/optimize/gpu_common.py
@@ -0,0 +1,77 @@
+"""Functions used by least-squares algorithms."""
+import numpy as np
+import cupy as cp
+
+from scipy.sparse.linalg import LinearOperator, aslinearoperator
+
+
+# Utility functions to work with bound constraints.
+def in_bounds(x, lb, ub, arrtype=None):
+    """Check whether each point in the batch lies within its bounds."""
+    if arrtype is None:
+        arrtype = cp
+    return arrtype.all((x >= lb) & (x <= ub), axis=1)
+
+
+def all_in_bounds(x, lb, ub):
+    """Check whether every point in the batch lies within its bounds."""
+    return cp.all((x >= lb) & (x <= ub))
+
+
+# Functions to display algorithm's progress.
+
+
+def print_header_nonlinear():
+    print("{:^15}{:^15}{:^15}{:^15}{:^15}{:^15}"
+          .format("Iteration", "Total nfev", "Cost", "Cost reduction",
+                  "Step norm", "Optimality"))
+
+
+def print_iteration_nonlinear(iteration, nfev, cost, cost_reduction,
+                              step_norm, optimality):
+    if cost_reduction is None:
+        cost_reduction = " " * 15
+    else:
+        cost_reduction = f"{cost_reduction:^15.2e}"
+
+    if step_norm is None:
+        step_norm = " " * 15
+    else:
+        step_norm = f"{step_norm:^15.2e}"
+
+    print("{:^15}{:^15}{:^15.4e}{}{}{:^15.2e}"
+          .format(iteration, nfev, cost, cost_reduction,
+                  step_norm, optimality))
+
+
+def print_header_linear():
+    print("{:^15}{:^15}{:^15}{:^15}{:^15}"
+          .format("Iteration", "Cost", "Cost reduction", "Step norm",
+                  "Optimality"))
+
+
+def print_iteration_linear(iteration, cost, cost_reduction, step_norm,
+                           optimality):
+    if cost_reduction is None:
+        cost_reduction = " " * 15
+    else:
+        cost_reduction = f"{cost_reduction:^15.2e}"
+
+    if step_norm is None:
+        step_norm = " " * 15
+    else:
+        step_norm = f"{step_norm:^15.2e}"
+
+    print(f"{iteration:^15}{cost:^15.4e}{cost_reduction}{step_norm}{optimality:^15.2e}")
+
+
+# Simple helper functions.
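+# (Batched convention: the Jacobian J passed to compute_grad has shape
+# (d, m, n) for d independent problems, the residual f has shape (d, m),
+# and the returned gradient g[i] = J[i].T @ f[i] has shape (d, n).)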
+
+
+def compute_grad(J, f):
+    """Compute gradient of the least-squares cost function."""
+    if isinstance(J, LinearOperator):
+        return J.rmatvec(f)
+    else:
+        #Batched equivalent of J.T.dot(f): sum over the row index of J
+        return cp.einsum("ikj,ik->ij", J, f)
+        #return J.T.dot(f)
+
diff --git a/py/redrock/optimize/gpu_lsq_linear.py b/py/redrock/optimize/gpu_lsq_linear.py
new file mode 100644
index 0000000..0c8163f
--- /dev/null
+++ b/py/redrock/optimize/gpu_lsq_linear.py
@@ -0,0 +1,404 @@
+"""GPU implementation of linear least squares with bound constraints on independent variables."""
+import numpy as np
+#from numpy.linalg import norm
+from scipy.sparse import issparse, csr_matrix
+from scipy.sparse.linalg import LinearOperator, lsmr
+from scipy.optimize import OptimizeResult
+from scipy.optimize._minimize import Bounds
+
+import cupy as cp
+from cupy.linalg import norm, lstsq
+
+from .gpu_common import all_in_bounds, in_bounds, compute_grad
+from scipy.optimize._lsq.trf_linear import trf_linear
+from .gpu_bvls import bvls
+import time
+
+
+def prepare_bounds(bounds, n):
+    if len(bounds) != 2:
+        raise ValueError("`bounds` must contain 2 elements.")
+    lb, ub = (cp.asarray(b, dtype=float) for b in bounds)
+
+    if lb.ndim == 0:
+        lb = cp.resize(lb, n)
+
+    if ub.ndim == 0:
+        ub = cp.resize(ub, n)
+
+    return lb, ub
+
+
+TERMINATION_MESSAGES = {
+    -1: "The algorithm was not able to make progress on the last iteration.",
+    0: "The maximum number of iterations is exceeded.",
+    1: "The first-order optimality measure is less than `tol`.",
+    2: "The relative change of the cost function is less than `tol`.",
+    3: "The unconstrained solution is optimal."
+}
+
+
+def gpu_lsq_linear(A, b, bounds=(-cp.inf, cp.inf), method='trf', tol=1e-10,
+                   lsq_solver=None, lsmr_tol=None, max_iter=None,
+                   verbose=0, *, lsmr_maxiter=None, return_cost=True,
+                   return_optimality=True, check_bounds=False):
+    r"""Solve a batch of linear least-squares problems with bounds on the variables.
+
+    Given an m-by-n design matrix A and a target vector b with m elements,
+    `lsq_linear` solves the following optimization problem::
+
+        minimize 0.5 * ||A x - b||**2
+        subject to lb <= x <= ub
+
+    This GPU port solves z independent problems of this form simultaneously:
+    `A` must have shape (z, m, n) and `b` shape (z, m).
+
+    This optimization problem is convex, hence a found minimum (if iterations
+    have converged) is guaranteed to be global.
+
+    Parameters
+    ----------
+    A : array_like, shape (z, m, n)
+        Batch of design matrices.
+    b : array_like, shape (z, m)
+        Batch of target vectors.
+    bounds : 2-tuple of array_like or `Bounds`, optional
+        Lower and upper bounds on parameters. Defaults to no bounds.
+        There are two ways to specify the bounds:
+
+            - Instance of `Bounds` class.
+
+            - 2-tuple of array_like: Each element of the tuple must be either
+              an array with the length equal to the number of parameters, or a
+              scalar (in which case the bound is taken to be the same for all
+              parameters). Use ``np.inf`` with an appropriate sign to disable
+              bounds on all or some parameters.
+
+    method : 'trf' or 'bvls', optional
+        Method to perform minimization.
+
+            * 'trf' : Trust Region Reflective algorithm adapted for a linear
+              least-squares problem. This is an interior-point-like method
+              and the required number of iterations is weakly correlated with
+              the number of variables. (Not yet implemented in this GPU port.)
+            * 'bvls' : Bounded-variable least-squares algorithm. This is
+              an active set method, which requires the number of iterations
+              comparable to the number of variables. Can't be used when `A` is
+              sparse or LinearOperator.
+
+        Default is 'trf'.
+    tol : float, optional
+        Tolerance parameter. The algorithm terminates if a relative change
+        of the cost function is less than `tol` on the last iteration.
+        Additionally, the first-order optimality measure is considered:
+
+            * ``method='trf'`` terminates if the uniform norm of the gradient,
+              scaled to account for the presence of the bounds, is less than
+              `tol`.
+            * ``method='bvls'`` terminates if Karush-Kuhn-Tucker conditions
+              are satisfied within `tol` tolerance.
+
+    lsq_solver : {None, 'exact', 'lsmr'}, optional
+        Method of solving unbounded least-squares problems throughout
+        iterations:
+
+            * 'exact' : Use dense QR or SVD decomposition approach. Can't be
+              used when `A` is sparse or LinearOperator.
+            * 'lsmr' : Use `scipy.sparse.linalg.lsmr` iterative procedure
+              which requires only matrix-vector product evaluations. Can't
+              be used with ``method='bvls'``. (Not yet implemented in this
+              GPU port.)
+
+        If None (default), the solver is chosen based on type of `A`.
+    lsmr_tol : None, float or 'auto', optional
+        Tolerance parameters 'atol' and 'btol' for `scipy.sparse.linalg.lsmr`.
+        If None (default), it is set to ``1e-2 * tol``. If 'auto', the
+        tolerance will be adjusted based on the optimality of the current
+        iterate, which can speed up the optimization process, but is not
+        always reliable.
+    max_iter : None or int, optional
+        Maximum number of iterations before termination. If None (default), it
+        is set to 100 for ``method='trf'`` or to the number of variables for
+        ``method='bvls'`` (not counting iterations for 'bvls' initialization).
+    verbose : {0, 1, 2}, optional
+        Level of algorithm's verbosity:
+
+            * 0 : work silently (default).
+            * 1 : display a termination report.
+            * 2 : display progress during iterations.
+
+    lsmr_maxiter : None or int, optional
+        Maximum number of iterations for the lsmr least squares solver,
+        if it is used (by setting ``lsq_solver='lsmr'``). If None (default), it
+        uses lsmr's default of ``min(m, n)`` where ``m`` and ``n`` are the
+        number of rows and columns of `A`, respectively. Has no effect if
+        ``lsq_solver='exact'``.
+    return_cost : bool, optional
+        If True (default), return the value of the cost function at the
+        solution; set to False to skip computing it and save time.
+    return_optimality : bool, optional
+        If True (default), return the first-order optimality measure; set to
+        False to skip computing it and save time.
+    check_bounds : bool, optional
+        If True, check that ``lb < ub`` elementwise; False (default) skips
+        the check to save time.
+
+    Returns
+    -------
+    OptimizeResult with the following fields defined:
+    x : ndarray, shape (z, n)
+        Solutions found, one row per problem in the batch.
+    cost : ndarray, shape (z,)
+        Value of the cost function at each solution.
+    fun : ndarray, shape (z, m)
+        Vector of residuals at each solution.
+    optimality : ndarray, shape (z,)
+        First-order optimality measure for each problem. The exact meaning
+        depends on `method`, refer to the description of `tol` parameter.
+    active_mask : ndarray of int, shape (z, n)
+        Each component shows whether a corresponding constraint is active
+        (that is, whether a variable is at the bound):
+
+            * 0 : a constraint is not active.
+            * -1 : a lower bound is active.
+            * 1 : an upper bound is active.
+
+        Might be somewhat arbitrary for the `trf` method as it generates a
+        sequence of strictly feasible iterates and active_mask is determined
+        within a tolerance threshold.
+    unbounded_sol : ndarray, shape (z, n)
+        Unbounded least-squares solutions computed with the direct solver
+        (``cupy.linalg.solve`` for square systems, otherwise the
+        pseudoinverse). Unlike SciPy's `lsq_linear`, which returns the full
+        tuple from ``numpy.linalg.lstsq`` (solution, sum of squared residuals,
+        rank of `A`, and singular values of `A`; see NumPy's ``linalg.lstsq``
+        for more information)
+        or from ``scipy.sparse.linalg.lsmr``, this port stores only the
+        solution array. This output can be useful for determining the
+        convergence of the unbounded solver. The unbounded least squares
+        problem is to minimize ``0.5 * ||A x - b||**2``.
+    nit : int
+        Number of iterations. Zero if the unconstrained solution is optimal.
+    status : int
+        Reason for algorithm termination:
+
+            * -1 : the algorithm was not able to make progress on the last
+              iteration.
+            * 0 : the maximum number of iterations is exceeded.
+            * 1 : the first-order optimality measure is less than `tol`.
+            * 2 : the relative change of the cost function is less than
+              `tol`.
+            * 3 : the unconstrained solution is optimal.
+
+    message : str
+        Verbal description of the termination reason.
+    success : bool
+        True if one of the convergence criteria is satisfied (`status` > 0).
+
+    See Also
+    --------
+    nnls : Linear least squares with non-negativity constraint.
+    least_squares : Nonlinear least squares with bounds on the variables.
+
+    Notes
+    -----
+    The algorithm first computes the unconstrained least-squares solution by
+    `numpy.linalg.lstsq` or `scipy.sparse.linalg.lsmr` depending on
+    `lsq_solver`. This solution is returned as optimal if it lies within the
+    bounds.
+
+    Method 'trf' runs the adaptation of the algorithm described in [STIR]_ for
+    a linear least-squares problem. The iterations are essentially the same as
+    in the nonlinear least-squares algorithm, but as the quadratic function
+    model is always accurate, we don't need to track or modify the radius of
+    a trust region. The line search (backtracking) is used as a safety net
+    when a selected step does not decrease the cost function. Read more
+    detailed description of the algorithm in `scipy.optimize.least_squares`.
+
+    Method 'bvls' runs a Python implementation of the algorithm described in
+    [BVLS]_. The algorithm maintains active and free sets of variables, on
+    each iteration chooses a new variable to move from the active set to the
+    free set and then solves the unconstrained least-squares problem on free
+    variables. This algorithm is guaranteed to give an accurate solution
+    eventually, but may require up to n iterations for a problem with n
+    variables. Additionally, an ad-hoc initialization procedure is
+    implemented, that determines which variables to set free or active
+    initially. It takes some number of iterations before actual BVLS starts,
+    but can significantly reduce the number of further iterations.
+
+    References
+    ----------
+    .. [STIR] M. A. Branch, T. F. Coleman, and Y. Li, "A Subspace, Interior,
+              and Conjugate Gradient Method for Large-Scale Bound-Constrained
+              Minimization Problems," SIAM Journal on Scientific Computing,
+              Vol. 21, Number 1, pp 1-23, 1999.
+    .. [BVLS] P. B. Stark and R. L. Parker, "Bounded-Variable Least-Squares:
+              an Algorithm and Applications", Computational Statistics, 10,
+              129-141, 1995.
+
+    Examples
+    --------
+    In this example, a problem with a large sparse matrix and bounds on the
+    variables is solved. (The example is inherited from the CPU
+    `scipy.optimize.lsq_linear`; this GPU port instead expects dense, batched,
+    3-D input.)
+
+    >>> import numpy as np
+    >>> from scipy.sparse import rand
+    >>> from scipy.optimize import lsq_linear
+    >>> rng = np.random.default_rng()
+    ...
+    >>> m = 20000
+    >>> n = 10000
+    ...
+    >>> A = rand(m, n, density=1e-4, random_state=rng)
+    >>> b = rng.standard_normal(m)
+    ...
+    >>> lb = rng.standard_normal(n)
+    >>> ub = lb + 1
+    ...
+    >>> res = lsq_linear(A, b, bounds=(lb, ub), lsmr_tol='auto', verbose=1)
+    The relative change of the cost function is less than `tol`.  # may vary
+    Number of iterations 16, initial cost 1.5039e+04, final cost 1.1112e+04,
+    first-order optimality 4.66e-08.
+    """
+    tx1 = time.time()
+    if method not in ['trf', 'bvls']:
+        raise ValueError("`method` must be 'trf' or 'bvls'")
+
+    if lsq_solver not in [None, 'exact', 'lsmr']:
+        raise ValueError("`lsq_solver` must be None, 'exact' or 'lsmr'.")
+
+    if verbose not in [0, 1, 2]:
+        raise ValueError("`verbose` must be in [0, 1, 2].")
+
+    if issparse(A):
+        A = csr_matrix(A)
+    elif not isinstance(A, LinearOperator):
+        A = cp.atleast_2d(cp.asarray(A))
+
+    if method == 'bvls':
+        if lsq_solver == 'lsmr':
+            raise ValueError("method='bvls' can't be used with "
+                             "lsq_solver='lsmr'")
+
+        if not isinstance(A, cp.ndarray):
+            raise ValueError("method='bvls' can't be used with `A` being "
+                             "sparse or LinearOperator.")
+
+    if lsq_solver is None:
+        if isinstance(A, cp.ndarray):
+            lsq_solver = 'exact'
+        else:
+            lsq_solver = 'lsmr'
+    elif lsq_solver == 'exact' and not isinstance(A, cp.ndarray):
+        raise ValueError("`exact` solver can't be used when `A` is "
+                         "sparse or LinearOperator.")
+
+    if len(A.shape) != 3:  # No ndim for LinearOperator.
+        raise ValueError("`A` must have 3 dimensions.")
+
+    if max_iter is not None and max_iter <= 0:
+        raise ValueError("`max_iter` must be None or positive integer.")
+
+    z, m, n = A.shape
+
+    if b.ndim != 2:
+        raise ValueError("`b` must have 2 dimensions.")
+
+    zb, mb = b.shape
+
+    if mb != m:
+        raise ValueError("Inconsistent shapes between `A` and `b`.")
+
+    if isinstance(bounds, Bounds):
+        lb = bounds.lb
+        ub = bounds.ub
+    else:
+        lb, ub = prepare_bounds(bounds, n)
+
+    if lb.shape not in ((n,), (z, n)) or ub.shape not in ((n,), (z, n)):
+        raise ValueError("Bounds have wrong shape.")
+
+    if check_bounds:
+        if cp.any(lb >= ub):
+            raise ValueError("Each lower bound must be strictly less than "
+                             "each upper bound.")
+
+    if lsmr_maxiter is not None and lsmr_maxiter < 1:
+        raise ValueError("`lsmr_maxiter` must be None or positive integer.")
+
+    if not ((isinstance(lsmr_tol, float) and lsmr_tol > 0) or
+            lsmr_tol in ('auto', None)):
+        raise ValueError("`lsmr_tol` must be None, 'auto', or positive float.")
+
+    if lsq_solver == 'exact':
+        #unbd_lsq = cp.linalg.lstsq(A, b, rcond=-1)
+        if m == n:
+            #square matrices
+            unbd_lsq = cp.linalg.solve(A, b)
+        else:
+            unbd_lsq = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A), b)
+    elif lsq_solver == 'lsmr':
+        raise NotImplementedError('lsmr is not yet implemented')
+        #first_lsmr_tol = lsmr_tol  # tol of first call to lsmr
+        #if lsmr_tol is None or lsmr_tol == 'auto':
+        #    first_lsmr_tol = 1e-2 * tol  # default if lsmr_tol not defined
+        #unbd_lsq = lsmr(A, b, maxiter=lsmr_maxiter,
+        #                atol=first_lsmr_tol, btol=first_lsmr_tol)
+    #x_lsq = unbd_lsq[0]  # extract the solution from the least squares solver
+    x_lsq = unbd_lsq
+
+    #Tile bounds if 1d arrays given
+    if (lb.shape == (n,)):
+        lb = cp.tile(lb, (z,1))
+    if (ub.shape == (n,)):
+        ub = cp.tile(ub, (z,1))
+    ib = in_bounds(x_lsq, lb, ub)
+
+    if ib.get().all():
+        #All solutions within bounds
+        termination_status = 3
+        termination_message = TERMINATION_MESSAGES[termination_status]
+        if (return_cost or return_optimality):
+            r = cp.einsum("ijk,ik->ij", A, x_lsq) - b
+            #r = A @ x_lsq - b
+            cost = 0.5*cp.einsum("ij,ij->i", r, r)
+            #cost = 0.5 * cp.dot(r, r)
+            if (return_optimality):
+                g = compute_grad(A, r)
+                g_norm = norm(g, ord=cp.inf, axis=1)
+            else:
+                g_norm = None
+        else:
+            cost = None
+            r = None
+            g_norm = None
+
+        if verbose > 0:
+            print(termination_message)
+            if (return_cost):
+                #cost and g_norm are per-problem arrays, so print them
+                #without float format specifiers
+                print("Final cost (per problem):", cost)
+            if (return_optimality):
+                print("First-order optimality (per problem):", g_norm)
+
+        return OptimizeResult(
+            x=x_lsq, fun=r, cost=cost, optimality=g_norm,
+            active_mask=np.zeros(n), unbounded_sol=unbd_lsq,
+            nit=0, status=termination_status,
+            message=termination_message, success=True)
+
+    if method == 'trf':
+        raise NotImplementedError('trf_linear is not yet implemented')
+        #res = trf_linear(A, b, x_lsq, lb, ub, tol, lsq_solver, lsmr_tol,
+        #                 max_iter, verbose, lsmr_maxiter=lsmr_maxiter)
+    elif method == 'bvls':
+        res = bvls(A, b, x_lsq, lb, ub, ib, tol, max_iter, verbose,
+                   return_cost=return_cost, return_optimality=return_optimality)
+
+    res.unbounded_sol = unbd_lsq
+    res.message = TERMINATION_MESSAGES[res.status]
+    res.success = res.status > 0
+
+    if verbose > 0:
+        print(res.message)
+        #res.cost and res.optimality are per-problem arrays; avoid float
+        #format specifiers here
+        print("Number of iterations {}, initial cost {}, final cost {}, "
+              "first-order optimality {}.".format(res.nit, res.initial_cost,
+                                                  res.cost, res.optimality))
+
+    del res.initial_cost
+
+    return res
diff --git a/py/redrock/zfind.py b/py/redrock/zfind.py
index 913186b..17318c3 100644
--- a/py/redrock/zfind.py
+++ b/py/redrock/zfind.py
@@ -29,7 +29,7 @@
 
 from .results import read_zscan_redrock
 
-from .zscan import calc_zchi2_targets
+from .zscan import calc_zchi2_targets, print_total_fitting_times
 
 from .fitz import fitz, get_dv
 
@@ -408,6 +408,7 @@ def zfind(targets, templates, mp_procs=1, nminima=3, archetypes=None, priors=Non
         results = [ results ]
 
     if am_root:
+        print_total_fitting_times() #print in root the total fitting times
         allresults = dict()
         for p in results:
             allresults.update(p)
diff --git a/py/redrock/zscan.py b/py/redrock/zscan.py
index 10a0c8d..ffd5972 100644
--- a/py/redrock/zscan.py
+++ b/py/redrock/zscan.py
@@ -11,7 +11,8 @@
 import sys
 import traceback
+import time
 import numpy as np
-from scipy.optimize import lsq_linear, nnls
+from scipy.optimize import nnls
 
 #try:
 #    import cupy as cp
@@ -35,6 +35,11 @@
 block_size = 512 #Default block size, should work on all modern nVidia GPUs
 
+gbounds = None
+gnbh = None
+
+ftimes = np.zeros(3) #Accumulated per-method fitting times, see print_total_fitting_times()
+fmethods = ["PCA", "NNLS", "BVLS"]
 # cuda_source contains raw CUDA kernels to be loaded as CUPY module
 cuda_source = r'''
 extern "C" {
@@ -287,7 +292,7 @@ def per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflu
         wflux (array): concatenated weighted flux values.
         nleg (int): number of Legendre polynomials
         narch (int): number of archetypes
-        method (string): 'PCA', 'BVLS', 'NMF', or 'NNLS' (same as NMF)
+        method (string): 'PCA', 'BVLS', 'NMF', 'NNLS' (same as NMF), or 'BVLS_VIA_NNLS'
         n_nbh (int): number of nearest best archetypes
         prior (array): prior matrix added to the Legendre coefficients (1/sigma^2)
         use_gpu (bool): use GPU or not
@@ -297,8 +302,9 @@ def per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflu
         coefficients and chi2
 
     """
-    ### TODO - implement BVLS on GPU
+    #Use globals gbounds and gnbh
+    global gbounds
+    global gnbh
     # number of cameras in DESI: b, r, z
     spectra = target.spectra
     if bands is None:
@@ -315,7 +321,9 @@ def per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflu
     method = method.upper()
 
     add_negative_legendre_terms = False
-    if (method == 'BVLS' and use_gpu):
+    if (method == 'BVLS_VIA_NNLS' and use_gpu):
+        #Use trick of adding negative legendre terms and using NNLS
+        #to implement BVLS
         add_negative_legendre_terms = True
 
     #Calculate prior here
@@ -331,6 +339,9 @@ def per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflu
         #Use CPU if only 1 narch (nearest neighbor)
         use_gpu = False
 
+    if narch == 1:
+        use_gpu = False
+
     #Calculate nbasis after accounting for legendre terms
     nbasis = n_nbh+nleg*ncam # n_nbh : for actual physical archetype(s), nleg: number of legendre polynomials, ncamera: number of cameras
@@ -366,10 +377,26 @@ def per_camera_coeff_with_least_square_batch(target, binned, weights, flux, wflu
     solver_args = dict()
     if (method == 'BVLS'):
         #only positive coefficients are allowed for the archetypes
-        bounds = np.zeros((2, nbasis))
-        bounds[0][n_nbh:]=-np.inf #constant and slope terms in archetype method (can be positive or negative)
-        bounds[1] = np.inf
-        solver_args['bounds'] = bounds
+        if use_gpu:
+            #bounds should always be the same and creating small arrays on
+            #GPU is expensive so use a global variable and create once
+            if gbounds is None:
+                gbounds = cp.zeros((2, nbasis))
+                gbounds[0][n_nbh:] = -cp.inf
+                gbounds[1] = cp.inf
+                gnbh = n_nbh
+            #In practice right now n_nbh will not change on any given run
+            #But future proofing just in case
+            if gnbh != n_nbh:
+                gbounds[0][:n_nbh] = 0
+                gbounds[0][n_nbh:] = -cp.inf
+                gnbh = n_nbh
+            solver_args['bounds'] = gbounds
+        else:
+            #On CPU we can create the bounds array cheaply every time
+            bounds = np.zeros((2, nbasis))
+            bounds[0][n_nbh:]=-np.inf #constant and slope terms in archetype method (can be positive or negative)
+            bounds[1] = np.inf
+            solver_args['bounds'] = bounds
 
     #Use branching options because GPU is faster in batch in 3d
     #but due to timing weirdness in numpy, CPU is faster looping over
@@ -1210,20 +1237,24 @@ def solve_matrices(M, y, solve_algorithm="PCA", solver_args=None, use_gpu=False)
 
     if solve_algorithm.upper() == "PCA":
         #Use PCA via linalg.solve in either numpy or cupy
+        tx = time.time()
         if (use_gpu):
             #Use cupy linalg.solve to solve for zcoeff in batch for all_M and
             #all_y where all_M and all_y are 3d and 2d arrays representing
             #M and y at every redshift bin for the given template.
             #There is no Error thrown by cupy's version.
-            return cp.linalg.solve(M, y)
+            zcoeff = cp.linalg.solve(M, y)
         else:
             #Use numpy linalg.solve which throws exception
             try:
-                return np.linalg.solve(M, y)
+                zcoeff = np.linalg.solve(M, y)
             except np.linalg.LinAlgError:
                 raise
+        add_to_timer(0, time.time()-tx)
+        return zcoeff
     elif solve_algorithm in ("NMF", "NNLS"):
         if (use_gpu):
+            tx = time.time()
             nz = y.shape[0]
             Mcpu = M.get()
             ycpu = y.get()
@@ -1235,41 +1266,44 @@ def solve_matrices(M, y, solve_algorithm="PCA", solver_args=None, use_gpu=False)
                     zcoeff[j,:] = res[0]
                 except Exception:
                     zcoeff[j,:] = 9e99
+            add_to_timer(1, time.time()-tx)
             return cp.asarray(zcoeff)
         else:
             try:
+                tx = time.time()
                 res = nnls(M, y)
                 zcoeff = res[0]
-            except Exception:
+                add_to_timer(1, time.time()-tx)
+            except Exception as ex:
                 raise np.linalg.LinAlgError
             return zcoeff
     elif solve_algorithm == "BVLS":
+        tx = time.time()
        if (solver_args is not None and 'bounds' in solver_args):
             bounds = solver_args['bounds']
         else:
             nbasis = y.shape[-1]
-            bounds = np.zeros((2, nbasis))
+            bounds = np.zeros_like(y, shape=(2, nbasis))
             bounds[0]=-np.inf
             bounds[1]=np.inf
         if (use_gpu):
-            nz = y.shape[0]
-            Mcpu = M.get()
-            ycpu = y.get()
-            zcoeff = np.zeros(y.shape)
-            #Copy to CPU, run scipy.optimize.lsq_linear, copy back to GPU
-            for j in range(nz):
-                try:
-                    res = lsq_linear(Mcpu[j,:,:], ycpu[j,:], bounds=bounds, method='bvls')
-                    zcoeff[j,:] = res.x
-                except np.linalg.LinAlgError:
-                    zcoeff[j,:] = 9e99
-            return cp.asarray(zcoeff)
+            from .optimize import gpu_lsq_linear as lsq_linear
+            lsq_opts = dict(return_cost=False, return_optimality=False)
         else:
-            try:
-                res = lsq_linear(M, y, bounds=bounds, method='bvls')
-                zcoeff = res.x
-            except np.linalg.LinAlgError:
-                raise
-            return zcoeff
+            from scipy.optimize import lsq_linear
+            lsq_opts = dict()
+        res = lsq_linear(M, y, bounds=bounds, method='bvls', **lsq_opts)
+        zcoeff = res.x
+        add_to_timer(2, time.time()-tx)
+        return zcoeff
     else:
         raise NotImplementedError("The solve_algorithm "+solve_algorithm+" is not implemented.")
+
+
+def add_to_timer(i, t):
+    ftimes[i] += t
+
+
+def print_total_fitting_times():
+    for j in range(len(ftimes)):
+        if ftimes[j] > 0:
+            print("Total fitting time (method {}): {:0.1f} seconds".format(fmethods[j], ftimes[j]))
+    sys.stdout.flush()
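+
+# Minimal usage sketch of the GPU path (illustrative only; `M3d` and `y2d`
+# are hypothetical stand-ins for the batched arrays that solve_matrices
+# receives when use_gpu is True):
+#
+#   from redrock.optimize import gpu_lsq_linear
+#   res = gpu_lsq_linear(M3d, y2d, bounds=(lb, ub), method='bvls',
+#                        return_cost=False, return_optimality=False)
+#   zcoeff = res.x  # (nz, nbasis): one row of coefficients per redshift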