From 716ae263d2291f4146a325b04d1788019148ac86 Mon Sep 17 00:00:00 2001
From: Ziga Avsec <avsec@in.tum.de>
Date: Sun, 17 Feb 2019 14:12:54 -0800
Subject: [PATCH 1/3] added min_metacluster_size_frac

---
 modisco/coordproducers.py              |  4 ++++
 modisco/tfmodisco_workflow/workflow.py | 10 ++++++++++
 2 files changed, 14 insertions(+)

diff --git a/modisco/coordproducers.py b/modisco/coordproducers.py
index 2fcc726..33166af 100644
--- a/modisco/coordproducers.py
+++ b/modisco/coordproducers.py
@@ -170,6 +170,7 @@ def __call__(self, values):
         if (self.min_seqlets is not None):
             num_pos_passing = np.sum(pos_values > pos_threshold)
             num_neg_passing = np.sum(neg_values < neg_threshold)
+            print("testing if {} + {} < {}".format(num_pos_passing, num_neg_passing, self.min_seqlets))
             if (num_pos_passing + num_neg_passing < self.min_seqlets):
                 #manually adjust the threshold
                 shifted_values = values - mu
@@ -179,6 +180,9 @@ def __call__(self, values):
                     print("Manually adjusting thresholds to get desired num seqlets")
                 pos_threshold = abs_threshold
                 neg_threshold = -abs_threshold
+            else:
+                print("thresholds were not adjusted")
+                    
         
         pos_threshold_cdf = 1-np.exp(-pos_threshold/pos_laplace_b)
         neg_threshold_cdf = 1-np.exp(neg_threshold/neg_laplace_b)
diff --git a/modisco/tfmodisco_workflow/workflow.py b/modisco/tfmodisco_workflow/workflow.py
index 30be284..59d1f1a 100644
--- a/modisco/tfmodisco_workflow/workflow.py
+++ b/modisco/tfmodisco_workflow/workflow.py
@@ -142,6 +142,7 @@ def __init__(self,
                  histogram_bins=100, percentiles_in_bandwidth=10, 
                  overlap_portion=0.5,
                  min_metacluster_size=100,
+                 min_metacluster_size_frac=0.01,
                  target_seqlet_fdr=0.05,
                  weak_threshold_for_counting_sign=0.99,
                  max_seqlets_per_metacluster=20000,
@@ -156,6 +157,7 @@ def __init__(self,
         self.percentiles_in_bandwidth = percentiles_in_bandwidth
         self.overlap_portion = overlap_portion
         self.min_metacluster_size = min_metacluster_size
+        self.min_metacluster_size_frac = min_metacluster_size_frac
         self.target_seqlet_fdr = target_seqlet_fdr
         self.weak_threshold_for_counting_sign =\
             weak_threshold_for_counting_sign
@@ -218,6 +220,14 @@ def __call__(self, task_names, contrib_scores,
             print("WARNING: you found relatively few seqlets."
                   +" Consider dropping target_seqlet_fdr") 
 
+        
+        if int(self.min_metacluster_size_frac * len(seqlets)) > self.min_metacluster_size:
+            print("min_metacluster_size_frac * len(seqlets) = {0} is more than min_metacluster_size={1}.".\
+                  format(int(self.min_metacluster_size_frac * len(seqlets)), self.min_metacluster_size))
+            print("Using it as a new min_metacluster_size")
+            self.min_metacluster_size = int(self.min_metacluster_size_frac * len(seqlets))
+
+
         if (self.weak_threshold_for_counting_sign is None):
             weak_threshold_for_counting_sign = laplace_threshold_cdf
         else:

From 0f3a7c375e37c3b8e36da334f79ea67513f7a147 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=BDiga=20Avsec?= <Avsecz@users.noreply.github.com>
Date: Tue, 26 Feb 2019 01:54:52 +0100
Subject: [PATCH 2/3] Update coordproducers.py

---
 modisco/coordproducers.py | 75 ---------------------------------------
 1 file changed, 75 deletions(-)

diff --git a/modisco/coordproducers.py b/modisco/coordproducers.py
index 3bf473b..358ab71 100644
--- a/modisco/coordproducers.py
+++ b/modisco/coordproducers.py
@@ -189,81 +189,6 @@ def __call__(self, score_track, windowsize, original_summed_score_track):
         l_edge = bin_edges2[peak2]
         r_edge = bin_edges2[peak2+1]
         mu = (l_edge + r_edge) / 2
-        print("peak(mu)=", mu)
-
-        pos_values = np.array(sorted(values[values > mu] - mu))
-        neg_values = np.array(sorted(values[values < mu] - mu, key=lambda x: -x))
-
-        #We assume that the null is governed by a laplace, because
-        #that's what I (Av Shrikumar) have personally observed
-        #But we calculate a different laplace distribution for
-        # positive and negative values, in case they are
-        # distributed slightly differently
-        #estimate b using the percentile
-        #for x below 0:
-        #cdf = 0.5*exp(x/b)
-        #b = x/(log(cdf/0.5))
-        neg_laplace_b = np.percentile(neg_values, 95)/(np.log(0.95))
-        pos_laplace_b = (-np.percentile(pos_values, 5))/(np.log(0.95))
-
-        #for the pos and neg, compute the expected number above a
-        #particular threshold based on the total number of examples,
-        #and use this to estimate the fdr
-        #for pos_null_above, we estimate the total num of examples
-        #as 2*len(pos_values)
-        pos_fdrs = (len(pos_values)*(np.exp(-pos_values/pos_laplace_b)))/(
-                    len(pos_values)-np.arange(len(pos_values)))
-        pos_fdrs = np.minimum(pos_fdrs, 1.0)
-        neg_fdrs = (len(neg_values)*(np.exp(neg_values/neg_laplace_b)))/(
-                    len(neg_values)-np.arange(len(neg_values)))
-        neg_fdrs = np.minimum(neg_fdrs, 1.0)
-
-        pos_fdrs_passing_thresh = [x for x in zip(pos_values, pos_fdrs)
-                                   if x[1] <= self.target_fdr]
-        neg_fdrs_passing_thresh = [x for x in zip(neg_values, neg_fdrs)
-                                   if x[1] <= self.target_fdr]
-        if (len(pos_fdrs_passing_thresh) > 0):
-            pos_threshold, pos_thresh_fdr = pos_fdrs_passing_thresh[0]
-        else:
-            pos_threshold, pos_thresh_fdr = pos_values[-1], pos_fdrs[-1]
-            pos_threshold += 0.0000001
-        if (len(neg_fdrs_passing_thresh) > 0):
-            neg_threshold, neg_thresh_fdr = neg_fdrs_passing_thresh[0]
-            neg_threshold = neg_threshold - 0.0000001
-        else:
-            neg_threshold, neg_thresh_fdr = neg_values[-1], neg_fdrs[-1]
-
-        if (self.min_seqlets is not None):
-            num_pos_passing = np.sum(pos_values > pos_threshold)
-            num_neg_passing = np.sum(neg_values < neg_threshold)
-            print("testing if {} + {} < {}".format(num_pos_passing, num_neg_passing, self.min_seqlets))
-            if (num_pos_passing + num_neg_passing < self.min_seqlets):
-                #manually adjust the threshold
-                shifted_values = values - mu
-                values_sorted_by_abs = sorted(np.abs(shifted_values), key=lambda x: -x)
-                abs_threshold = values_sorted_by_abs[self.min_seqlets-1]
-                if (self.verbose):
-                    print("Manually adjusting thresholds to get desired num seqlets")
-                pos_threshold = abs_threshold
-                neg_threshold = -abs_threshold
-            else:
-                print("thresholds were not adjusted")
-                    
-        
-        pos_threshold_cdf = 1-np.exp(-pos_threshold/pos_laplace_b)
-        neg_threshold_cdf = 1-np.exp(neg_threshold/neg_laplace_b)
-        #neg_threshold = np.log((1-self.threshold_cdf)*2)*neg_laplace_b
-        #pos_threshold = -np.log((1-self.threshold_cdf)*2)*pos_laplace_b
-        
-        neg_threshold += mu
-        pos_threshold += mu
-        neg_threshold = min(neg_threshold, 0)
-        pos_threshold = max(pos_threshold, 0)
-        
-        
-
-        #plot the result
-
         if (self.verbose):
             print("peak(mu)=", mu)
 

From 44dda5a3c6c78d09f584ca7763cd215533bb7bec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=BDiga=20Avsec?= <Avsecz@users.noreply.github.com>
Date: Tue, 26 Feb 2019 01:56:27 +0100
Subject: [PATCH 3/3] Update setup.py

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 578f865..b2029dc 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
           description='TF MOtif Discovery from Importance SCOres',
           long_description="""Algorithm for discovering consolidated patterns from base-pair-level importance scores""",
           url='https://github.com/kundajelab/tfmodisco',
-          version='0.5.0.0',
+          version='0.5.0.1',
           packages=find_packages(),
           package_data={
                 '': ['cluster/phenograph/louvain/*convert*', 'cluster/phenograph/louvain/*community*', 'cluster/phenograph/louvain/*hierarchy*']