Commit
Merge pull request #83 from kundajelab/reducemem3
Sparse Matrices Memory Reduction
AvantiShri authored Feb 11, 2021
2 parents 3e5f5e8 + b437469 commit a0ec95b
Showing 10 changed files with 2,947 additions and 3,331 deletions.
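
The change this PR makes, visible in the diffs below, is to avoid materializing dense all-vs-all affinity matrices: each seqlet keeps only its top-k neighbor indices and the corresponding similarities, which can then be assembled into a scipy sparse matrix. A minimal sketch of that representation (sizes and placeholder values are illustrative, not taken from the diff):

import numpy as np
from scipy.sparse import coo_matrix

n, k = 50000, 30  # e.g. 50k seqlets, 30 retained neighbors per seqlet
# per-row neighbor indices and similarities (placeholders for illustration)
neighbors = [np.arange(k) for _ in range(n)]
sims = [np.ones(k) for _ in range(n)]
rows = np.repeat(np.arange(n), k)
sparse_affmat = coo_matrix(
    (np.concatenate(sims), (rows, np.concatenate(neighbors))),
    shape=(n, n))
# dense float64 n x n matrix: ~20 GB; this sparse top-k form: tens of MB
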
5,718 changes: 2,469 additions & 3,249 deletions examples/simulated_TAL_GATA_deeplearning/TF_MoDISco_TAL_GATA.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion modisco.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: modisco
Version: 0.5.10.2
Version: 0.5.11.0
Summary: TF MOtif Discovery from Importance SCOres
Home-page: https://github.com/kundajelab/tfmodisco
License: UNKNOWN
147 changes: 132 additions & 15 deletions modisco/affinitymat/core.py
@@ -7,6 +7,7 @@
import time
import itertools
import scipy.stats
from scipy.sparse import coo_matrix
import gc
import sklearn
from joblib import Parallel, delayed
@@ -94,6 +95,11 @@ def __call__(self, seqlets):
raise NotImplementedError()


class AbstractSparseAffmatFromFwdAndRevOneDVecs(object):
def __call__(self, fwd_vecs, rev_vecs):
raise NotImplementedError()


class AbstractAffinityMatrixFromOneD(object):

def __call__(self, vecs1, vecs2):
@@ -111,7 +117,65 @@ def sparse_cosine_similarity(sparse_mat_1, sparse_mat_2):
return normed_sparse_mat_1.dot(normed_sparse_mat_2.transpose())


class SparseNumpyCosineSimFromFwdAndRevOneDVecs():
#take the dot product of a slice of fwd_vecs with
# fwd_vecs and rev_vecs, take the max over the fwd and rev sims, then return
# the top k per row
def top_k_fwdandrev_dot_prod(fwd_vecs, rev_vecs, slice_start, slice_end, k,
initclusters):
if (initclusters is not None):
assert len(initclusters)==fwd_vecs.shape[0]
fwd_vecs_slice = fwd_vecs[slice_start:slice_end]
initclusters_slice = (None if initclusters is None
else initclusters[slice_start:slice_end])
k = min(k, fwd_vecs.shape[0])
if (scipy.sparse.issparse(fwd_vecs_slice)):
fwd_dot = np.array(fwd_vecs_slice.dot(fwd_vecs.transpose()).todense())
else:
fwd_dot = np.matmul(fwd_vecs_slice,fwd_vecs.T)
if (rev_vecs is not None):
if (scipy.sparse.issparse(fwd_vecs_slice)):
rev_dot = np.array(fwd_vecs_slice.dot(rev_vecs.transpose())
.todense())
else:
rev_dot = np.matmul(fwd_vecs_slice, rev_vecs.T)
dotprod = np.maximum(fwd_dot, rev_dot)
else:
dotprod = fwd_dot

#dotprod has shape batchsize X num_seqlets
dotprod_argsort = np.argsort(-dotprod, axis=-1)
sorted_topk_indices = []
sorted_topk_sims = []
for row_idx,argsort_row in enumerate(dotprod_argsort):
combined_neighbor_row = []
neighbor_row_topnn = argsort_row[:k]
neighbor_set_topnn = set(neighbor_row_topnn)
#combined_neighbor_row ends up being the union of the standard nearest
# neighbors and the nearest neighbors restricted to seqlets in the same
# initcluster (when initclusters is provided)
combined_neighbor_row.extend(neighbor_row_topnn)
if (initclusters_slice is not None):
combined_neighbor_row.extend([
y for y in ([x for x in argsort_row
if initclusters_slice[x]==initclusters_slice[row_idx]][:k])
if y not in neighbor_set_topnn])
sorted_topk_indices.append(
np.array(combined_neighbor_row).astype("int"))
sorted_topk_sims.append(dotprod[row_idx][combined_neighbor_row])
##get the top k indices
#top_k_indices = np.argpartition(dotprod, -k, axis=1)[:,-k:]
#sims = np.take_along_axis(arr=dotprod, indices=top_k_indices, axis=1)

##sort by similarity
#sims_argsort_result = np.argsort(sims, axis=-1)
#sorted_topk_sims = np.take_along_axis(arr=sims,
# indices=sims_argsort_result, axis=1)
#sorted_topk_indices = np.take_along_axis(arr=top_k_indices,
# indices=sims_argsort_result, axis=1)
return (sorted_topk_indices, sorted_topk_sims)
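
# Hedged usage sketch (illustrative, not part of this diff): the helper above
# can be exercised on small dense arrays; the names and sizes below are made up.
#   toy_fwd = np.random.RandomState(0).randn(6, 4)
#   nbrs, sims = top_k_fwdandrev_dot_prod(
#       fwd_vecs=toy_fwd, rev_vecs=None,
#       slice_start=0, slice_end=6, k=3, initclusters=None)
#   # nbrs[i]: indices of the 3 rows with the largest dot product against
#   # row i (row i itself included); sims[i]: the matching dot products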


class SparseNumpyCosineSimFromFwdAndRevOneDVecs(
AbstractSparseAffmatFromFwdAndRevOneDVecs):

def __init__(self, n_neighbors, verbose, nn_n_jobs,
memory_cap_gb=1.0):
@@ -120,11 +184,14 @@ def __init__(self, n_neighbors, verbose, nn_n_jobs,
self.verbose = verbose
self.memory_cap_gb = memory_cap_gb

def __call__(self, fwd_vecs, rev_vecs):
def __call__(self, fwd_vecs, rev_vecs, initclusters):

#normalize the vectors
fwd_vecs = magnitude_norm_sparsemat(sparse_mat=fwd_vecs)
rev_vecs = magnitude_norm_sparsemat(sparse_mat=rev_vecs)
if (rev_vecs is not None):
rev_vecs = magnitude_norm_sparsemat(sparse_mat=rev_vecs)
else:
rev_vecs = None

#fwd_sims = fwd_vecs.dot(fwd_vecs.transpose())
#rev_sims = fwd_vecs.dot(rev_vecs.transpose())
@@ -133,21 +200,22 @@ def __call__(self, fwd_vecs, rev_vecs):
# to use given the memory cap
memory_cap_gb = (self.memory_cap_gb if rev_vecs
is None else self.memory_cap_gb/2.0)
batch_size = int(memory_cap_gb*(2**30)/(len(fwd_vecs)*8))
batch_size = min(max(1,batch_size),len(fwd_vecs))
batch_size = int(memory_cap_gb*(2**30)/(fwd_vecs.shape[0]*8))
batch_size = min(max(1,batch_size),fwd_vecs.shape[0])
if (self.verbose):
print("Batching in slices of size",batch_size)
sys.stdout.flush()

topk_cosine_sim_results = []
for i in tqdm(range(0,len(fwd_vecs),batch_size)):
topk_cosine_sim_results.append(
top_k_fwdandrev_dot_prod(fwd_vecs[i:i+batch_size],
neighbors, sims = [], []
for i in tqdm(range(0,fwd_vecs.shape[0],batch_size)):
neighbors_batch, sims_batch = top_k_fwdandrev_dot_prod(fwd_vecs,
rev_vecs,
fwd_vecs, self.n_neighbors+1))
neighbors = np.concatenate(
[x[0] for x in topk_cosine_sim_results], axis=0)
sims = np.concatenate([x[1] for x in topk_cosine_sim_results], axis=0)
slice_start=i,
slice_end=(i+batch_size),
k=self.n_neighbors+1,
initclusters=initclusters)
neighbors.extend(neighbors_batch)
sims.extend(sims_batch)

return sims, neighbors
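
# Worked example of the memory-cap batching above (numbers are illustrative,
# not from the diff): each row of a batch holds one float64 dot product per
# seqlet, i.e. roughly 8*N bytes for N seqlets, so with memory_cap_gb=1.0 and
# N=50,000 seqlets (and rev_vecs present, halving the cap to 0.5 GB):
#   batch_size = int(0.5 * 2**30 / (50000*8)) = 1342
# About 1,342 rows of the similarity matrix are materialized at a time rather
# than the full 50,000 x 50,000 dense matrix (~20 GB).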

@@ -308,6 +376,54 @@ def __call__(self, seqlets):
else np.array(affinity_mat_fwd))


class SparseAffmatFromFwdAndRevSeqletEmbeddings(
AbstractAffinityMatrixFromSeqlets):

def __init__(self, seqlets_to_1d_embedder,
sparse_affmat_from_fwdnrev1dvecs, verbose):
self.seqlets_to_1d_embedder = seqlets_to_1d_embedder
self.sparse_affmat_from_fwdnrev1dvecs =\
sparse_affmat_from_fwdnrev1dvecs
self.verbose = verbose

def __call__(self, seqlets, initclusters):

cp1_time = time.time()
if (self.verbose):
print("Beginning embedding computation")
print_memory_use()
sys.stdout.flush()

embedding_fwd, embedding_rev = self.seqlets_to_1d_embedder(seqlets)
gc.collect()

cp2_time = time.time()
if (self.verbose):
print("Finished embedding computation in",
round(cp2_time-cp1_time,2),"s")
print_memory_use()
sys.stdout.flush()

if (self.verbose):
print("Starting affinity matrix computations")
print_memory_use()
sys.stdout.flush()

sparse_affmat, neighbors = self.sparse_affmat_from_fwdnrev1dvecs(
fwd_vecs=embedding_fwd,
rev_vecs=embedding_rev,
initclusters=initclusters)

cp3_time = time.time()

if (self.verbose):
print("Finished affinity matrix computations in",
round(cp3_time-cp2_time,2),"s")
print_memory_use()
sys.stdout.flush()
return sparse_affmat, neighbors
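
# Hedged composition sketch (names like my_embedder and my_seqlets are
# hypothetical, not from the diff): the class above simply chains a seqlet
# embedder with a sparse affinity-matrix computer.
#   sparse_affmat_calc = SparseAffmatFromFwdAndRevSeqletEmbeddings(
#       seqlets_to_1d_embedder=my_embedder,
#       sparse_affmat_from_fwdnrev1dvecs=
#           SparseNumpyCosineSimFromFwdAndRevOneDVecs(
#               n_neighbors=30, verbose=True, nn_n_jobs=4),
#       verbose=True)
#   sparse_affmat, neighbors = sparse_affmat_calc(my_seqlets,
#                                                 initclusters=None)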


class MaxCrossMetricAffinityMatrixFromSeqlets(
AbstractAffinityMatrixFromSeqlets):

@@ -439,7 +555,7 @@ def __call__(self, seqlets, filter_seqlets=None,
(affmat_rev is not None) else np.array(affmat_fwd))
else:
if (len(affmat_fwd[0].shape)==2):
#dims are N x N x 2, where first entry of last idx is sim,
#dims are N x neighbs x 2, where first entry of last idx is sim,
# and the second entry is the alignment.
if (affmat_rev is None):
affmat = affmat_fwd
@@ -484,7 +600,8 @@ def __call__(self, filters, things_to_scan, min_overlap,
if (neighbors_of_things_to_scan is None):
neighbors_of_things_to_scan = [list(range(len(filters)))
for x in things_to_scan]
assert len(neighbors_of_things_to_scan) == things_to_scan.shape[0]
assert (len(neighbors_of_things_to_scan) == things_to_scan.shape[0]),\
(len(neighbors_of_things_to_scan), things_to_scan.shape[0])
assert np.max([np.max(x) for x in neighbors_of_things_to_scan])\
< filters.shape[0]
assert len(things_to_scan.shape)==3
96 changes: 90 additions & 6 deletions modisco/affinitymat/transformers.py
@@ -5,7 +5,7 @@
import sklearn
import sklearn.manifold
import scipy
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.neighbors import NearestNeighbors
import sys
from ..cluster import phenograph as ph
@@ -245,12 +245,96 @@ def __call__(self, affinity_mat):
return to_return


class AffToDistViaLogInv(AbstractAffToDistMat):
class AbstractNNTsneProbs(AbstractAffMatTransformer):

def __call__(self, affinity_mat):
to_return = np.log(1.0/np.maximum(affinity_mat, 0.0000001))
to_return = np.maximum(to_return, 0.0) #eliminate tiny neg floats
return to_return
def __init__(self, perplexity, aff_to_dist_mat, verbose=1):
self.perplexity = perplexity
self.verbose=verbose
self.aff_to_dist_mat = aff_to_dist_mat

def __call__(self, affinity_mat, nearest_neighbors):

#pad the affmat and the nearest neighbors so they all have the
# same length...
max_neighbors = max([len(x) for x in nearest_neighbors])
affinity_mat = np.array([list(row)+[-np.inf for x in
range(max_neighbors-len(row))]
for row in affinity_mat])
nearest_neighbors = [list(row)+[np.nan for x in
range(max_neighbors-len(row))]
for row in nearest_neighbors]

#assert that affinity_mat has the same dims as nearest_neighbors
assert affinity_mat.shape==(len(nearest_neighbors),
len(nearest_neighbors[0]))
#assert all rows of nearest_neighbors have the same length (i.e.
# they have been padded)
assert len(set([len(x) for x in nearest_neighbors]))==1

distmat_nn = self.aff_to_dist_mat(
affinity_mat=affinity_mat)
#assert that the distances are increasing to the right
assert np.min(distmat_nn[:,1:] - distmat_nn[:,:-1]) >= 0.0
#assert that the self-distances are 0
assert np.min(distmat_nn[:,0])==0
assert np.max(distmat_nn[:,0])==0
#assert that each idx is its own nearest neighbor
assert all([i==nearest_neighbors[i][0] for
i in range(len(nearest_neighbors))])
# Compute the number of nearest neighbors to find.
# LvdM uses 3 * perplexity as the number of neighbors.
# In the event that we have very small # of points
# set the neighbors to n - 1.
n_samples = distmat_nn.shape[0]
k = min(n_samples - 1, int(3. * self.perplexity + 1))
assert k < distmat_nn.shape[1],(
"Not enough neighbors for perplexity calc! Need over"
+" "+str(k)+" but have "+str(distmat_nn.shape[1]))
P = self.tsne_probs_calc(distances_nn=distmat_nn[:,1:(k+1)],
neighbors_nn=[row[1:(k+1)] for row in
nearest_neighbors])
return P
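
# Worked example of the neighbor count above (illustrative numbers): with
# perplexity = 30 and n_samples = 10,000,
#   k = min(10000 - 1, int(3. * 30 + 1)) = 91
# so each padded row must carry at least 92 entries (self at index 0 plus 91
# neighbors) for the k < distmat_nn.shape[1] assertion to pass.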


class NNTsneConditionalProbs(AbstractNNTsneProbs):

def tsne_probs_calc(self, distances_nn, neighbors_nn):
t0 = time.time()
# Compute conditional probabilities such that they approximately match
# the desired perplexity
assert len(set([len(row) for row in neighbors_nn]))==1
n_samples, k = len(neighbors_nn),len(neighbors_nn[0])
distances = distances_nn.astype(np.float32, copy=False)
neighbors = neighbors_nn
try:
conditional_P = sklearn.manifold._utils._binary_search_perplexity(
distances, np.array(neighbors).astype("int"),
self.perplexity, self.verbose)
except:
#API changed in v0.22 to not require the redundant neighbors
# argument
conditional_P = sklearn.manifold._utils._binary_search_perplexity(
distances, self.perplexity, self.verbose)
#for some reason, likely a sklearn bug, a few of
#the rows don't sum to 1...for now, fix by making them sum to 1
assert np.all(np.isfinite(conditional_P)), \
"All probabilities should be finite"
#normalize the conditional_P to sum to 1 across the rows
conditional_P = conditional_P/np.sum(conditional_P, axis=-1)[:,None]

data = []
rows = []
cols = []
for row_idx,(ps,neigh_row) in enumerate(zip(conditional_P, neighbors)):
data.extend([p for p,neighbor in zip(ps, neigh_row)
if np.isnan(neighbor)==False])
rows.extend([row_idx for neighbor in neigh_row
if np.isnan(neighbor)==False])
cols.extend([neighbor for neighbor in neigh_row
if np.isnan(neighbor)==False])
P = coo_matrix((data, (rows, cols)),
shape=(len(neighbors), len(neighbors)))
return P
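
# Toy illustration of the sparse assembly above (made-up values): padded NaN
# neighbors are skipped, every retained (row, col) pair gets its conditional
# probability, and all other entries stay implicit zeros.
#   data, rows, cols = [0.6, 0.4, 1.0], [0, 0, 1], [2, 3, 0]
#   P = coo_matrix((data, (rows, cols)), shape=(4, 4))
#   # P.toarray()[0, 2] == 0.6, P.toarray()[0, 3] == 0.4, P.toarray()[1, 0] == 1.0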


class AbstractTsneProbs(AbstractAffMatTransformer):
