Implemented a CuPy-based GPU version of scipy.optimize.lsq_linear's BVLS
algorithm. This is in the optimize folder and will be submitted to CuPy
for inclusion, at which point it will be removed from Redrock.

zscan has been updated to use this GPU-based version instead of our NNLS
trick of adding negative Legendre terms and using NNLS to emulate BVLS,
although that trick is preserved via self._solver_method =
"bvls_via_nnls" in archetypes.py, where Archetype files are allowed to
specify their preferred solver method (pca, nnls, bvls, or
bvls_via_nnls).
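
For illustration, here is a minimal sketch of what dispatching on that
solver method could look like (hypothetical helper; the actual dispatch
code in archetypes.py is not shown in this view, and the bound choices
are illustrative):

    # Hypothetical sketch of solver dispatch; not the actual archetypes.py code.
    import numpy as np
    import scipy.optimize

    def solve_coeffs(Tb, flux, method='bvls'):
        """Solve for template coefficients with the named method (illustrative)."""
        if method == 'pca':
            return np.linalg.lstsq(Tb, flux, rcond=None)[0]      # unconstrained
        if method == 'nnls':
            return scipy.optimize.nnls(Tb, flux)[0]              # all coeffs >= 0
        if method == 'bvls':
            lb = np.full(Tb.shape[1], -np.inf); lb[0] = 0.0      # e.g. non-negative amplitude
            return scipy.optimize.lsq_linear(Tb, flux, bounds=(lb, np.inf),
                                             method='bvls').x
        if method == 'bvls_via_nnls':
            # The NNLS trick: append negated Legendre columns so NNLS can
            # represent signed coefficients, then fold the halves back together.
            n = Tb.shape[1]
            c = scipy.optimize.nnls(np.hstack([Tb, -Tb[:, 1:]]), flux)[0]
            return np.concatenate([c[:1], c[1:n] - c[n:]])
        raise ValueError(f"unknown solver method: {method}")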

This branch is forked from the branch legendre_nnls_merge_with_main,
which was ready to merge with main and had addressed conflicts that had
arisen during parallel development by Abhijeet and myself.

zfind is modified slightly to add a timing print statement for the time
spent in each solver method.

Timing Results:
With 4 GPUs and 4 CPUs, BVLS on the GPU takes ~17s of fitting for a fine
redshift scan time of 72s, compared to the NNLS trick's ~14s of fitting
for a fine redshift scan time of 76s (the NNLS fitting is faster but its
overall time slower because other operations act on the larger arrays
that carry the added negative Legendre terms). This compares to scipy's
CPU-based BVLS, which takes ~63s for the fitting and 119s for the fine
redshift scan.
craigwarner-ufastro committed Jul 26, 2024
1 parent 425d3fc commit 382f6c0
Showing 7 changed files with 798 additions and 32 deletions.
1 change: 1 addition & 0 deletions py/redrock/archetypes.py
@@ -71,6 +71,7 @@ def __init__(self, filename):
self.igm_model = 'Inoue14'
# TODO: Allow Archetype files to specify bvls or nnls or pca solver method
self._solver_method = 'bvls'
#self._solver_method = 'bvls_via_nnls' #This uses the NNLS trick to emulate BVLS

self.method = 'ARCH' # for API symmetry with Template.method

5 changes: 5 additions & 0 deletions py/redrock/optimize/__init__.py
@@ -0,0 +1,5 @@
"""This module contains a GPU implementation of the BVLS algorithm in
scipy.optimize.lsq_linear"""
from .gpu_lsq_linear import gpu_lsq_linear

__all__ = ['gpu_lsq_linear']
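
The wrapper itself is not shown in this view; assuming gpu_lsq_linear
mirrors scipy.optimize.lsq_linear's interface (an assumption, not
confirmed by this diff), a call might look like:

    import cupy as cp
    from redrock.optimize import gpu_lsq_linear

    # Hypothetical usage with a scipy.optimize.lsq_linear-style signature.
    A = cp.random.rand(100, 3)       # design matrix on the GPU
    b = cp.random.rand(100)          # data vector
    res = gpu_lsq_linear(A, b, bounds=(0.0, cp.inf), method='bvls')
    print(res.x, res.status)         # bounded coefficients, termination code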
244 changes: 244 additions & 0 deletions py/redrock/optimize/gpu_bvls.py
@@ -0,0 +1,244 @@
"""GPU implementation of Bounded-variable least-squares algorithm."""
import cupy as cp
from cupy.linalg import norm
from scipy.optimize import OptimizeResult

from .gpu_common import print_header_linear, print_iteration_linear, compute_grad

def compute_kkt_optimality(g, on_bound):
"""Compute the maximum violation of KKT conditions."""
g_kkt = g * on_bound
free_set = on_bound == 0
g_kkt[free_set] = cp.abs(g[free_set])
return cp.max(g_kkt, axis=1)
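
For intuition, a toy evaluation of this measure (illustrative values
only): one system with a free variable, a variable at its lower bound,
and a variable at its upper bound.

    g_demo = cp.array([[0.5, -2.0, 1.0]])
    on_bound_demo = cp.array([[0, -1, 1]])   # free, at lower bound, at upper bound
    print(compute_kkt_optimality(g_demo, on_bound_demo))
    # -> [2.] : the lower-bound variable has the largest violation, since its
    # negative gradient says the cost would drop by moving it into the interior.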


def bvls(A, b, x_lsq, lb, ub, ib, tol, max_iter, verbose, rcond=None, return_cost=True, return_optimality=True):
    """Batched bounded-variable least-squares.

    Solves d independent problems min ||A[i].dot(x) - b[i]|| subject to
    lb[i] <= x <= ub[i], where A has shape (d, m, n), b has shape (d, m),
    and x_lsq, lb, ub have shape (d, n). x_lsq holds the unconstrained
    least-squares solutions used as the starting point.
    """
    d, m, n = A.shape

#x = x_lsq.copy()
x = x_lsq
on_bound = cp.zeros((d,n), dtype=cp.int32)

mask = x <= lb
x[mask] = lb[mask]
on_bound[mask] = -1

mask = x >= ub
x[mask] = ub[mask]
on_bound[mask] = 1

free_set = on_bound == 0
active_set = ~free_set

if return_cost or return_optimality:
r = cp.einsum("ijk,ik->ij", A, x) - b
#r = A.dot(x) - b
cost = 0.5*cp.einsum("ij,ij->i", r, r)
#cost = 0.5 * np.dot(r, r)
initial_cost = cost
if return_optimality:
g = compute_grad(A,r)
#g = A.T.dot(r)
else:
cost = None
initial_cost = None

cost_change = None
step_norm = None
iteration = 0

if verbose == 2:
print_header_linear()

    # This is the initialization loop. The requirement is that the
    # least-squares solution on free variables is feasible before BVLS starts.
    # One possible initialization is to set all variables to lower or upper
    # bounds, but many iterations may be required from this state later on.
    # The implemented ad-hoc procedure intuitively should give a better
    # initial state: find the least-squares solution on the current free
    # variables; if it's feasible, stop; otherwise, set violating variables
    # to their corresponding bounds and continue on the reduced set of free
    # variables.

    while cp.any(free_set):
if verbose == 2 and return_optimality:
optimality = compute_kkt_optimality(g, on_bound)
print_iteration_linear(iteration, cost, cost_change, step_norm,
optimality)

iteration += 1
if verbose == 2:
#Only need x_old if verbose == 2
x_old = x.copy()

        b_free = b - cp.einsum("ijk,ik->ij", A, x*active_set)
        # Zero the active columns so the batched pseudo-inverse solves each
        # system over its free variables only (analog of A[:, free_set]).
        A_free = A.swapaxes(1,2).copy()
        A_free[active_set] = 0
        A_free = A_free.swapaxes(1,2)
        z = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A_free), b_free)

        # Restrict bound violations to the free set; active variables keep
        # their bound values (their zeroed columns make z meaningless there).
        lbv = (z < lb) & free_set
        ubv = (z > ub) & free_set
        v = lbv | ubv

        if cp.any(lbv):
            x[lbv] = lb[lbv]
            active_set[lbv] = True
            on_bound[lbv] = -1

        if cp.any(ubv):
            x[ubv] = ub[ubv]
            active_set[ubv] = True
            on_bound[ubv] = 1

        # Free, non-violating variables take their least-squares values.
        x[free_set & ~v] = z[free_set & ~v]

r = cp.einsum("ijk,ik->ij", A, x) - b
#r = A.dot(x) - b
#cost_new = 0.5 * np.dot(r, r)
cost_new = 0.5*cp.einsum("ij,ij->i", r, r)
if return_cost:
cost_change = cost - cost_new
cost = cost_new
#g = A.T.dot(r)
g = compute_grad(A,r)
if verbose == 2:
#step_norm = norm(x[free_set] - x_free_old)
step_norm = norm(x-x_old, axis=1)

        if cp.any(v):
            free_set = free_set & ~v
        else:
            break

if max_iter is None:
max_iter = n
max_iter += iteration

termination_status = None

# Main BVLS loop.
optimality = compute_kkt_optimality(g, on_bound)
    idx = cp.arange(d)
for iteration in range(iteration, max_iter): # BVLS Loop A
if verbose == 2 and return_optimality:
print_iteration_linear(iteration, cost, cost_change,
step_norm, optimality)

if cp.all(optimality < tol):
termination_status = 1

if termination_status is not None:
break

move_to_free = cp.argmax(g * on_bound, axis=1)
on_bound[idx,move_to_free] = 0

while True: # BVLS Loop B
free_set = on_bound == 0
active_set = ~free_set

if verbose == 2:
#Only need x_old if verbose == 2
x_old = x.copy()

b_free = b - cp.einsum("ijk,ik->ij", A, x*active_set)
#Set active_set to 0 in A_free
A_free = A.swapaxes(1,2).copy()
A_free[active_set] = 0
A_free = A_free.swapaxes(1,2)

z = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A_free), b_free)

#A_free = A[:, free_set]
#b_free = b - A.dot(x * active_set)
#z = lstsq(A_free, b_free, rcond=rcond)[0]

            # Restrict violations to the free set.
            lbv = (z < lb) & free_set
            ubv = (z > ub) & free_set
            v = lbv | ubv

            if v.any():
                # Largest feasible step toward z per system: a violating free
                # variable allows only the fraction alpha of the step that
                # stays inside its bound; non-violators allow the full step.
                alphas = cp.ones(x.shape)
                alphas[lbv] = (lb[lbv] - x[lbv]) / (z[lbv] - x[lbv])
                alphas[ubv] = (ub[ubv] - x[ubv]) / (z[ubv] - x[ubv])

                i = cp.argmin(alphas, axis=1)        # blocking variable per system
                alpha = alphas[idx, i][:, None]      # (d, 1) for broadcasting

                # Move every free variable part of the way toward z.
                x_step = (1 - alpha)*x + alpha*z
                x[free_set] = x_step[free_set]

                # Put each system's blocking variable onto the bound it hit.
                hit_lb = lbv[idx, i]
                hit_ub = ubv[idx, i]
                on_bound[idx[hit_lb], i[hit_lb]] = -1
                on_bound[idx[hit_ub], i[hit_ub]] = 1
            else:
                x[free_set] = z[free_set]
                break

if verbose == 2:
#step_norm = norm(x_free - x_free_old, axis=1)
step_norm = norm(x - x_old, axis=1)

r = cp.einsum("ijk,ik->ij", A, x) - b
#r = A.dot(x) - b
#cost_new = 0.5 * np.dot(r, r)
cost_new = 0.5*cp.einsum("ij,ij->i", r, r)
cost_change = cost - cost_new

if cp.all(cost_change < tol * cost):
termination_status = 2
cost = cost_new

#g = A.T.dot(r)
g = compute_grad(A,r)
optimality = compute_kkt_optimality(g, on_bound)
x_lsq = x

if termination_status is None:
termination_status = 0

return OptimizeResult(
x=x_lsq, fun=r, cost=cost, optimality=optimality, active_mask=on_bound,
nit=iteration + 1, status=termination_status,
initial_cost=initial_cost)
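
Because this bvls is batched over d independent systems, a driver sketch
(hypothetical shapes and tolerance; ib is passed as None here only
because the code above never uses it) could look like:

    import cupy as cp
    from redrock.optimize.gpu_bvls import bvls

    d, m, n = 4, 100, 3                                    # 4 systems, 100 samples, 3 coefficients
    A = cp.random.rand(d, m, n)
    b = cp.einsum("ijk,ik->ij", A, cp.random.rand(d, n))   # consistent right-hand sides
    lb = cp.zeros((d, n))                                  # e.g. non-negative coefficients
    ub = cp.full((d, n), cp.inf)
    x_lsq = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A), b)  # unconstrained starting point
    res = bvls(A, b, x_lsq, lb, ub, None, tol=1e-10, max_iter=None, verbose=0)
    print(res.x.shape, res.status)                         # (4, 3), termination status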
77 changes: 77 additions & 0 deletions py/redrock/optimize/gpu_common.py
@@ -0,0 +1,77 @@
"""Functions used by least-squares algorithms."""
import cupy as cp

from scipy.sparse.linalg import LinearOperator


# Utility functions to work with bound constraints.
def in_bounds(x, lb, ub, arrtype=None):
    """Check per system whether each point lies within its bounds."""
    if arrtype is None:
        arrtype = cp
    return arrtype.all((x >= lb) & (x <= ub), axis=1)

def all_in_bounds(x, lb, ub):
    """Check whether every point in the batch lies within bounds."""
    return cp.all((x >= lb) & (x <= ub))


# Functions to display algorithm's progress.


def print_header_nonlinear():
print("{:^15}{:^15}{:^15}{:^15}{:^15}{:^15}"
.format("Iteration", "Total nfev", "Cost", "Cost reduction",
"Step norm", "Optimality"))


def print_iteration_nonlinear(iteration, nfev, cost, cost_reduction,
step_norm, optimality):
if cost_reduction is None:
cost_reduction = " " * 15
else:
cost_reduction = f"{cost_reduction:^15.2e}"

if step_norm is None:
step_norm = " " * 15
else:
step_norm = f"{step_norm:^15.2e}"

print("{:^15}{:^15}{:^15.4e}{}{}{:^15.2e}"
.format(iteration, nfev, cost, cost_reduction,
step_norm, optimality))


def print_header_linear():
print("{:^15}{:^15}{:^15}{:^15}{:^15}"
.format("Iteration", "Cost", "Cost reduction", "Step norm",
"Optimality"))


def print_iteration_linear(iteration, cost, cost_reduction, step_norm,
optimality):
if cost_reduction is None:
cost_reduction = " " * 15
else:
cost_reduction = f"{cost_reduction:^15.2e}"

if step_norm is None:
step_norm = " " * 15
else:
step_norm = f"{step_norm:^15.2e}"

print(f"{iteration:^15}{cost:^15.4e}{cost_reduction}{step_norm}{optimality:^15.2e}")


# Simple helper functions.


def compute_grad(J, f):
    """Compute gradient of the least-squares cost function."""
    if isinstance(J, LinearOperator):
        return J.rmatvec(f)
    else:
        # Batched J.T.dot(f): contract each system over its data axis.
        return cp.einsum("ikj,ik->ij", J, f)
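
As a quick sanity check of the batched gradient, a short sketch (toy
shapes, illustrative only) comparing compute_grad against the
per-system J.T.dot(f) it replaces:

    import cupy as cp
    from redrock.optimize.gpu_common import compute_grad

    d, m, n = 2, 5, 3
    J = cp.random.rand(d, m, n)
    f = cp.random.rand(d, m)
    g = compute_grad(J, f)                               # shape (d, n)
    g_ref = cp.stack([J[i].T @ f[i] for i in range(d)])  # explicit per-system J^T f
    assert cp.allclose(g, g_ref)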
