Skip to content

Commit

Permalink
Merge pull request #39 from kundajelab/norevcomp
Browse files Browse the repository at this point in the history
Added support for different manual positive and negative thresholds
  • Loading branch information
AvantiShri authored Mar 8, 2019
2 parents ffe7952 + 68bef15 commit 2fd98a2
Show file tree
Hide file tree
Showing 7 changed files with 1,156 additions and 1,369 deletions.
2 changes: 1 addition & 1 deletion modisco.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: modisco
Version: 0.5.1.0
Version: 0.5.1.1
Summary: TF MOtif Discovery from Importance SCOres
Home-page: https://github.com/kundajelab/tfmodisco
License: UNKNOWN
Expand Down
73 changes: 50 additions & 23 deletions modisco/coordproducers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
from sklearn.neighbors.kde import KernelDensity
import sys
import time
from .value_provider import AbstractValTransformer, AbsPercentileValTransformer
from .value_provider import (
AbstractValTransformer, AbsPercentileValTransformer,
SignedPercentileValTransformer)


class TransformAndThresholdResults(object):
def get_simple_window_sum_function(window_size):
    """Return a function that computes sliding-window sums of width
    `window_size` for each 1-D array in a list.

    The returned function maps a list of 1-D arrays to a list of
    np.ndarrays; each output has length len(arr) - window_size + 1,
    where entry i is sum(arr[i:i+window_size]).
    """
    def window_sum_function(arrs):
        to_return = []
        for arr in arrs:
            # Prepend 0 to the cumulative sum so that
            # cumsum[j] == sum(arr[0:j]); the window sum is then a
            # single vectorized difference rather than a Python loop.
            cumsum = np.cumsum(arr)
            cumsum = np.array([0] + list(cumsum))
            to_return.append(cumsum[window_size:] - cumsum[:-window_size])
        return to_return
    return window_sum_function

Expand Down Expand Up @@ -311,16 +309,19 @@ def __init__(self, sliding,
target_fdr,
min_passing_windows_frac,
max_passing_windows_frac,
separate_pos_neg_thresholds=False,
max_seqlets_total=None,
progress_update=5000,
verbose=True):
verbose=True,
):
self.sliding = sliding
self.flank = flank
self.suppress = suppress
self.target_fdr = target_fdr
assert max_passing_windows_frac >= min_passing_windows_frac
self.min_passing_windows_frac = min_passing_windows_frac
self.max_passing_windows_frac = max_passing_windows_frac
self.separate_pos_neg_thresholds = separate_pos_neg_thresholds
self.max_seqlets_total = None
self.progress_update = progress_update
self.verbose = verbose
Expand All @@ -333,6 +334,7 @@ def from_hdf5(cls, grp):
target_fdr = grp.attrs["target_fdr"]
min_passing_windows_frac = grp.attrs["min_passing_windows_frac"]
max_passing_windows_frac = grp.attrs["max_passing_windows_frac"]
separate_pos_neg_thresholds = grp.attrs["separate_pos_neg_thresholds"]
if ("max_seqlets_total" in grp.attrs):
max_seqlets_total = grp.attrs["max_seqlets_total"]
else:
Expand All @@ -344,6 +346,7 @@ def from_hdf5(cls, grp):
target_fdr=target_fdr,
min_passing_windows_frac=min_passing_windows_frac,
max_passing_windows_frac=max_passing_windows_frac,
separate_pos_neg_thresholds=separate_pos_neg_thresholds,
max_seqlets_total=max_seqlets_total,
progress_update=progress_update, verbose=verbose)

Expand All @@ -355,6 +358,8 @@ def save_hdf5(self, grp):
grp.attrs["target_fdr"] = self.target_fdr
grp.attrs["min_passing_windows_frac"] = self.min_passing_windows_frac
grp.attrs["max_passing_windows_frac"] = self.max_passing_windows_frac
grp.attrs["separate_pos_neg_thresholds"] =\
self.separate_pos_neg_thresholds
#TODO: save min_seqlets feature
if (self.max_seqlets_total is not None):
grp.attrs["max_seqlets_total"] = self.max_seqlets_total
Expand Down Expand Up @@ -435,22 +440,42 @@ def __call__(self, score_track, null_track, tnt_results=None):
print("Passing windows frac was",
frac_passing_windows,", which is below ",
self.min_passing_windows_frac,"; adjusting")
pos_threshold = np.percentile(
a=np.abs(orig_vals),
q=100*(1-self.min_passing_windows_frac))
neg_threshold = -pos_threshold
if (self.separate_pos_neg_thresholds):
pos_threshold = np.percentile(
a=[x for x in orig_vals if x > 0],
q=100*(1-self.min_passing_windows_frac))
neg_threshold = np.percentile(
a=[x for x in orig_vals if x < 0],
q=100*(self.min_passing_windows_frac))
else:
pos_threshold = np.percentile(
a=np.abs(orig_vals),
q=100*(1-self.min_passing_windows_frac))
neg_threshold = -pos_threshold
if (frac_passing_windows > self.max_passing_windows_frac):
if (self.verbose):
print("Passing windows frac was",
frac_passing_windows,", which is above ",
self.max_passing_windows_frac,"; adjusting")
pos_threshold = np.percentile(
a=np.abs(orig_vals),
q=100*(1-self.max_passing_windows_frac))
neg_threshold = -pos_threshold

val_transformer = AbsPercentileValTransformer(
distribution=orig_vals)
if (self.separate_pos_neg_thresholds):
pos_threshold = np.percentile(
a=[x for x in orig_vals if x > 0],
q=100*(1-self.max_passing_windows_frac))
neg_threshold = np.percentile(
a=[x for x in orig_vals if x < 0],
q=100*(self.max_passing_windows_frac))
else:
pos_threshold = np.percentile(
a=np.abs(orig_vals),
q=100*(1-self.max_passing_windows_frac))
neg_threshold = -pos_threshold

if (self.separate_pos_neg_thresholds):
val_transformer = SignedPercentileValTransformer(
distribution=orig_vals)
else:
val_transformer = AbsPercentileValTransformer(
distribution=orig_vals)

if (self.verbose):
print("Final raw thresholds are",
Expand Down Expand Up @@ -516,12 +541,12 @@ def __call__(self, score_track, null_track, tnt_results=None):
neg_threshold = tnt_results.neg_threshold
pos_threshold = tnt_results.pos_threshold

summed_score_track = [x.copy() for x in original_summed_score_track]
summed_score_track = [np.array(x) for x in original_summed_score_track]

#if a position is less than the threshold, set it to -np.inf
summed_score_track = [
np.array([np.abs(y) if (y >= pos_threshold
or y <= neg_threshold)
np.array([np.abs(y) if (y > pos_threshold
or y < neg_threshold)
else -np.inf for y in x])
for x in summed_score_track]

Expand Down Expand Up @@ -550,6 +575,8 @@ def __call__(self, score_track, null_track, tnt_results=None):
start=argmax-self.flank,
end=argmax+self.sliding+self.flank,
score=original_summed_score_track[example_idx][argmax])
assert (coord.score > pos_threshold
or coord.score < neg_threshold)
coords.append(coord)
else:
assert False,\
Expand Down
5 changes: 4 additions & 1 deletion modisco/tfmodisco_workflow/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ def __init__(self,
target_seqlet_fdr=0.2,
min_passing_windows_frac=0.03,
max_passing_windows_frac=0.2,
separate_pos_neg_thresholds=False,
verbose=True,
min_seqlets_per_task=None):

Expand All @@ -177,6 +178,7 @@ def __init__(self,
self.max_seqlets_per_metacluster = max_seqlets_per_metacluster
self.min_passing_windows_frac = min_passing_windows_frac
self.max_passing_windows_frac = max_passing_windows_frac
self.separate_pos_neg_thresholds = separate_pos_neg_thresholds
self.verbose = verbose

self.build()
Expand Down Expand Up @@ -206,6 +208,7 @@ def __call__(self, task_names, contrib_scores,
target_fdr=self.target_seqlet_fdr,
min_passing_windows_frac=self.min_passing_windows_frac,
max_passing_windows_frac=self.max_passing_windows_frac,
separate_pos_neg_thresholds=self.separate_pos_neg_thresholds,
max_seqlets_total=None,
verbose=self.verbose)

Expand Down Expand Up @@ -234,7 +237,7 @@ def __call__(self, task_names, contrib_scores,
abs(x.tnt_results.transformed_neg_threshold))
for x in (multitask_seqlet_creation_results.
task_name_to_coord_producer_results.values())]) -
0.0000001) #subtract 1e-7 to avoid numerical issues
0.0000001) #subtract 1e-7 to avoid weird numerical issues
print("Across all tasks, the weakest transformed threshold used"
+" was: "+str(weakest_transformed_thresh))

Expand Down
36 changes: 35 additions & 1 deletion modisco/value_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def __call__(self, seqlet):
def get_val(self, seqlet):
    """Sum the track values over the central `self.central_window`
    positions of `seqlet`, ignoring equal-sized flanks on either side.

    Uses an explicit end index (len(seqlet) - flank_to_ignore) rather
    than a negative index: when flank_to_ignore is 0, a slice ending at
    -0 would be empty and the sum would wrongly be 0.
    """
    flank_to_ignore = int(0.5*(len(seqlet)-self.central_window))
    track_values = seqlet[self.track_name]\
                    .fwd[flank_to_ignore:(len(seqlet)-flank_to_ignore)]
    return np.sum(track_values)

def save_hdf5(self, grp):
Expand Down Expand Up @@ -95,3 +95,37 @@ def __call__(self, val):
return np.sign(val)*np.searchsorted(
a=self.distribution,
v=abs(val))/float(len(self.distribution))


class SignedPercentileValTransformer(AbstractValTransformer):
    """Map a signed value to its percentile rank within the same-signed
    part of a reference distribution.

    Positive values are ranked against the positive entries of
    `distribution`; negative values are ranked (by magnitude) against
    the absolute values of the negative entries. Zero maps to 0.
    """

    def __init__(self, distribution):
        self.distribution = np.array(distribution)
        self.pos_dist = np.array(
            sorted(x for x in self.distribution if x > 0))
        self.abs_neg_dist = np.array(
            sorted(abs(x) for x in self.distribution if x < 0))

    @classmethod
    def from_hdf5(cls, grp):
        """Reconstruct the transformer from its saved distribution."""
        return cls(distribution=np.array(grp["distribution"][:]))

    def save_hdf5(self, grp):
        """Persist the class name and the raw reference distribution."""
        grp.attrs["class"] = type(self).__name__
        grp.create_dataset("distribution", data=self.distribution)

    def __call__(self, val):
        if (val == 0):
            return 0
        # Rank against the distribution matching the sign of val.
        reference = self.pos_dist if val > 0 else self.abs_neg_dist
        # Add 1E-7 for complicated numerical stability issues —
        # basically need robustness when dealing with ties.
        rank = np.searchsorted(a=reference, v=(abs(val) + 1E-7))
        return rank / float(len(reference))
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
description='TF MOtif Discovery from Importance SCOres',
long_description="""Algorithm for discovering consolidated patterns from base-pair-level importance scores""",
url='https://github.com/kundajelab/tfmodisco',
version='0.5.1.0',
version='0.5.1.1',
packages=find_packages(),
package_data={
'': ['cluster/phenograph/louvain/*convert*', 'cluster/phenograph/louvain/*community*', 'cluster/phenograph/louvain/*hierarchy*']
Expand Down
Loading

0 comments on commit 2fd98a2

Please sign in to comment.