Skip to content

Commit

Permalink
Merge pull request #39 from kundajelab/norevcomp
Browse files Browse the repository at this point in the history
Added support for different manual positive and negative thresholds
  • Loading branch information
AvantiShri authored Mar 8, 2019
2 parents ffe7952 + 68bef15 commit 2fd98a2
Show file tree
Hide file tree
Showing 7 changed files with 1,156 additions and 1,369 deletions.
2 changes: 1 addition & 1 deletion modisco.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: modisco
Version: 0.5.1.0
Version: 0.5.1.1
Summary: TF MOtif Discovery from Importance SCOres
Home-page: https://github.com/kundajelab/tfmodisco
License: UNKNOWN
Expand Down
73 changes: 50 additions & 23 deletions modisco/coordproducers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
from sklearn.neighbors.kde import KernelDensity
import sys
import time
from .value_provider import AbstractValTransformer, AbsPercentileValTransformer
from .value_provider import (
AbstractValTransformer, AbsPercentileValTransformer,
SignedPercentileValTransformer)


class TransformAndThresholdResults(object):
def get_simple_window_sum_function(window_size):
    """Return a function that computes sliding-window sums of width
    `window_size` for each 1-D array in a list.

    The returned function maps a list of 1-D arrays to a list of
    np.ndarrays; each output has length len(arr) - window_size + 1,
    where entry i is sum(arr[i:i+window_size]).
    """
    def window_sum_function(arrs):
        to_return = []
        for arr in arrs:
            # Prepend 0 to the cumulative sum so that
            # cumsum[j] == sum(arr[0:j]); the window sum is then a
            # single vectorized difference rather than a Python loop.
            cumsum = np.cumsum(arr)
            cumsum = np.array([0] + list(cumsum))
            to_return.append(cumsum[window_size:] - cumsum[:-window_size])
        return to_return
    return window_sum_function

Expand Down Expand Up @@ -311,16 +309,19 @@ def __init__(self, sliding,
target_fdr,
min_passing_windows_frac,
max_passing_windows_frac,
separate_pos_neg_thresholds=False,
max_seqlets_total=None,
progress_update=5000,
verbose=True):
verbose=True,
):
self.sliding = sliding
self.flank = flank
self.suppress = suppress
self.target_fdr = target_fdr
assert max_passing_windows_frac >= min_passing_windows_frac
self.min_passing_windows_frac = min_passing_windows_frac
self.max_passing_windows_frac = max_passing_windows_frac
self.separate_pos_neg_thresholds = separate_pos_neg_thresholds
self.max_seqlets_total = None
self.progress_update = progress_update
self.verbose = verbose
Expand All @@ -333,6 +334,7 @@ def from_hdf5(cls, grp):
target_fdr = grp.attrs["target_fdr"]
min_passing_windows_frac = grp.attrs["min_passing_windows_frac"]
max_passing_windows_frac = grp.attrs["max_passing_windows_frac"]
separate_pos_neg_thresholds = grp.attrs["separate_pos_neg_thresholds"]
if ("max_seqlets_total" in grp.attrs):
max_seqlets_total = grp.attrs["max_seqlets_total"]
else:
Expand All @@ -344,6 +346,7 @@ def from_hdf5(cls, grp):
target_fdr=target_fdr,
min_passing_windows_frac=min_passing_windows_frac,
max_passing_windows_frac=max_passing_windows_frac,
separate_pos_neg_thresholds=separate_pos_neg_thresholds,
max_seqlets_total=max_seqlets_total,
progress_update=progress_update, verbose=verbose)

Expand All @@ -355,6 +358,8 @@ def save_hdf5(self, grp):
grp.attrs["target_fdr"] = self.target_fdr
grp.attrs["min_passing_windows_frac"] = self.min_passing_windows_frac
grp.attrs["max_passing_windows_frac"] = self.max_passing_windows_frac
grp.attrs["separate_pos_neg_thresholds"] =\
self.separate_pos_neg_thresholds
#TODO: save min_seqlets feature
if (self.max_seqlets_total is not None):
grp.attrs["max_seqlets_total"] = self.max_seqlets_total
Expand Down Expand Up @@ -435,22 +440,42 @@ def __call__(self, score_track, null_track, tnt_results=None):
print("Passing windows frac was",
frac_passing_windows,", which is below ",
self.min_passing_windows_frac,"; adjusting")
pos_threshold = np.percentile(
a=np.abs(orig_vals),
q=100*(1-self.min_passing_windows_frac))
neg_threshold = -pos_threshold
if (self.separate_pos_neg_thresholds):
pos_threshold = np.percentile(
a=[x for x in orig_vals if x > 0],
q=100*(1-self.min_passing_windows_frac))
neg_threshold = np.percentile(
a=[x for x in orig_vals if x < 0],
q=100*(self.min_passing_windows_frac))
else:
pos_threshold = np.percentile(
a=np.abs(orig_vals),
q=100*(1-self.min_passing_windows_frac))
neg_threshold = -pos_threshold
if (frac_passing_windows > self.max_passing_windows_frac):
if (self.verbose):
print("Passing windows frac was",
frac_passing_windows,", which is above ",
self.max_passing_windows_frac,"; adjusting")
pos_threshold = np.percentile(
a=np.abs(orig_vals),
q=100*(1-self.max_passing_windows_frac))
neg_threshold = -pos_threshold

val_transformer = AbsPercentileValTransformer(
distribution=orig_vals)
if (self.separate_pos_neg_thresholds):
pos_threshold = np.percentile(
a=[x for x in orig_vals if x > 0],
q=100*(1-self.max_passing_windows_frac))
neg_threshold = np.percentile(
a=[x for x in orig_vals if x < 0],
q=100*(self.max_passing_windows_frac))
else:
pos_threshold = np.percentile(
a=np.abs(orig_vals),
q=100*(1-self.max_passing_windows_frac))
neg_threshold = -pos_threshold

if (self.separate_pos_neg_thresholds):
val_transformer = SignedPercentileValTransformer(
distribution=orig_vals)
else:
val_transformer = AbsPercentileValTransformer(
distribution=orig_vals)

if (self.verbose):
print("Final raw thresholds are",
Expand Down Expand Up @@ -516,12 +541,12 @@ def __call__(self, score_track, null_track, tnt_results=None):
neg_threshold = tnt_results.neg_threshold
pos_threshold = tnt_results.pos_threshold

summed_score_track = [x.copy() for x in original_summed_score_track]
summed_score_track = [np.array(x) for x in original_summed_score_track]

#if a position is less than the threshold, set it to -np.inf
summed_score_track = [
np.array([np.abs(y) if (y >= pos_threshold
or y <= neg_threshold)
np.array([np.abs(y) if (y > pos_threshold
or y < neg_threshold)
else -np.inf for y in x])
for x in summed_score_track]

Expand Down Expand Up @@ -550,6 +575,8 @@ def __call__(self, score_track, null_track, tnt_results=None):
start=argmax-self.flank,
end=argmax+self.sliding+self.flank,
score=original_summed_score_track[example_idx][argmax])
assert (coord.score > pos_threshold
or coord.score < neg_threshold)
coords.append(coord)
else:
assert False,\
Expand Down
5 changes: 4 additions & 1 deletion modisco/tfmodisco_workflow/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ def __init__(self,
target_seqlet_fdr=0.2,
min_passing_windows_frac=0.03,
max_passing_windows_frac=0.2,
separate_pos_neg_thresholds=False,
verbose=True,
min_seqlets_per_task=None):

Expand All @@ -177,6 +178,7 @@ def __init__(self,
self.max_seqlets_per_metacluster = max_seqlets_per_metacluster
self.min_passing_windows_frac = min_passing_windows_frac
self.max_passing_windows_frac = max_passing_windows_frac
self.separate_pos_neg_thresholds = separate_pos_neg_thresholds
self.verbose = verbose

self.build()
Expand Down Expand Up @@ -206,6 +208,7 @@ def __call__(self, task_names, contrib_scores,
target_fdr=self.target_seqlet_fdr,
min_passing_windows_frac=self.min_passing_windows_frac,
max_passing_windows_frac=self.max_passing_windows_frac,
separate_pos_neg_thresholds=self.separate_pos_neg_thresholds,
max_seqlets_total=None,
verbose=self.verbose)

Expand Down Expand Up @@ -234,7 +237,7 @@ def __call__(self, task_names, contrib_scores,
abs(x.tnt_results.transformed_neg_threshold))
for x in (multitask_seqlet_creation_results.
task_name_to_coord_producer_results.values())]) -
0.0000001) #subtract 1e-7 to avoid numerical issues
0.0000001) #subtract 1e-7 to avoid weird numerical issues
print("Across all tasks, the weakest transformed threshold used"
+" was: "+str(weakest_transformed_thresh))

Expand Down
36 changes: 35 additions & 1 deletion modisco/value_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def __call__(self, seqlet):
def get_val(self, seqlet):
    """Sum the track values over the central `self.central_window`
    positions of `seqlet`, ignoring equal-sized flanks on either side.

    Uses an explicit end index (len(seqlet) - flank_to_ignore) rather
    than a negative index: when flank_to_ignore is 0, a slice ending at
    -0 would be empty and the sum would wrongly be 0.
    """
    flank_to_ignore = int(0.5*(len(seqlet)-self.central_window))
    track_values = seqlet[self.track_name]\
                    .fwd[flank_to_ignore:(len(seqlet)-flank_to_ignore)]
    return np.sum(track_values)

def save_hdf5(self, grp):
Expand Down Expand Up @@ -95,3 +95,37 @@ def __call__(self, val):
return np.sign(val)*np.searchsorted(
a=self.distribution,
v=abs(val))/float(len(self.distribution))


class SignedPercentileValTransformer(AbstractValTransformer):
    """Map a signed value to its percentile rank within the same-signed
    part of a reference distribution.

    Positive values are ranked against the positive entries of
    `distribution`; negative values are ranked (by magnitude) against
    the absolute values of the negative entries. Zero maps to 0.
    """

    def __init__(self, distribution):
        self.distribution = np.array(distribution)
        self.pos_dist = np.array(
            sorted(x for x in self.distribution if x > 0))
        self.abs_neg_dist = np.array(
            sorted(abs(x) for x in self.distribution if x < 0))

    @classmethod
    def from_hdf5(cls, grp):
        """Reconstruct the transformer from its saved distribution."""
        return cls(distribution=np.array(grp["distribution"][:]))

    def save_hdf5(self, grp):
        """Persist the class name and the raw reference distribution."""
        grp.attrs["class"] = type(self).__name__
        grp.create_dataset("distribution", data=self.distribution)

    def __call__(self, val):
        if (val == 0):
            return 0
        # Rank against the distribution matching the sign of val.
        reference = self.pos_dist if val > 0 else self.abs_neg_dist
        # Add 1E-7 for complicated numerical stability issues —
        # basically need robustness when dealing with ties.
        rank = np.searchsorted(a=reference, v=(abs(val) + 1E-7))
        return rank / float(len(reference))
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
description='TF MOtif Discovery from Importance SCOres',
long_description="""Algorithm for discovering consolidated patterns from base-pair-level importance scores""",
url='https://github.com/kundajelab/tfmodisco',
version='0.5.1.0',
version='0.5.1.1',
packages=find_packages(),
package_data={
'': ['cluster/phenograph/louvain/*convert*', 'cluster/phenograph/louvain/*community*', 'cluster/phenograph/louvain/*hierarchy*']
Expand Down
Loading

0 comments on commit 2fd98a2

Please sign in to comment.