From 716ae263d2291f4146a325b04d1788019148ac86 Mon Sep 17 00:00:00 2001 From: Ziga Avsec Date: Sun, 17 Feb 2019 14:12:54 -0800 Subject: [PATCH 1/3] added min_metacluster_size_frac --- modisco/coordproducers.py | 4 ++++ modisco/tfmodisco_workflow/workflow.py | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/modisco/coordproducers.py b/modisco/coordproducers.py index 2fcc726..33166af 100644 --- a/modisco/coordproducers.py +++ b/modisco/coordproducers.py @@ -170,6 +170,7 @@ def __call__(self, values): if (self.min_seqlets is not None): num_pos_passing = np.sum(pos_values > pos_threshold) num_neg_passing = np.sum(neg_values < neg_threshold) + print("testing if {} + {} < {}".format(num_pos_passing, num_neg_passing, self.min_seqlets)) if (num_pos_passing + num_neg_passing < self.min_seqlets): #manually adjust the threshold shifted_values = values - mu @@ -179,6 +180,9 @@ def __call__(self, values): print("Manually adjusting thresholds to get desired num seqlets") pos_threshold = abs_threshold neg_threshold = -abs_threshold + else: + print("thresholds were not adjusted") + pos_threshold_cdf = 1-np.exp(-pos_threshold/pos_laplace_b) neg_threshold_cdf = 1-np.exp(neg_threshold/neg_laplace_b) diff --git a/modisco/tfmodisco_workflow/workflow.py b/modisco/tfmodisco_workflow/workflow.py index 30be284..59d1f1a 100644 --- a/modisco/tfmodisco_workflow/workflow.py +++ b/modisco/tfmodisco_workflow/workflow.py @@ -142,6 +142,7 @@ def __init__(self, histogram_bins=100, percentiles_in_bandwidth=10, overlap_portion=0.5, min_metacluster_size=100, + min_metacluster_size_frac=0.01, target_seqlet_fdr=0.05, weak_threshold_for_counting_sign=0.99, max_seqlets_per_metacluster=20000, @@ -156,6 +157,7 @@ def __init__(self, self.percentiles_in_bandwidth = percentiles_in_bandwidth self.overlap_portion = overlap_portion self.min_metacluster_size = min_metacluster_size + self.min_metacluster_size_frac = min_metacluster_size_frac self.target_seqlet_fdr = target_seqlet_fdr self.weak_threshold_for_counting_sign =\ weak_threshold_for_counting_sign @@ -218,6 +220,14 @@ def __call__(self, task_names, contrib_scores, print("WARNING: you found relatively few seqlets." +" Consider dropping target_seqlet_fdr") + + if int(self.min_metacluster_size_frac * len(seqlets)) > self.min_metacluster_size: + print("min_metacluster_size_frac * len(seqlets) = {0} is more than min_metacluster_size={1}.".\ + format(int(self.min_metacluster_size_frac * len(seqlets)), self.min_metacluster_size)) + print("Using it as a new min_metacluster_size") + self.min_metacluster_size = int(self.min_metacluster_size_frac * len(seqlets)) + + if (self.weak_threshold_for_counting_sign is None): weak_threshold_for_counting_sign = laplace_threshold_cdf else: From 0f3a7c375e37c3b8e36da334f79ea67513f7a147 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=BDiga=20Avsec?= Date: Tue, 26 Feb 2019 01:54:52 +0100 Subject: [PATCH 2/3] Update coordproducers.py --- modisco/coordproducers.py | 75 --------------------------------------- 1 file changed, 75 deletions(-) diff --git a/modisco/coordproducers.py b/modisco/coordproducers.py index 3bf473b..358ab71 100644 --- a/modisco/coordproducers.py +++ b/modisco/coordproducers.py @@ -189,81 +189,6 @@ def __call__(self, score_track, windowsize, original_summed_score_track): l_edge = bin_edges2[peak2] r_edge = bin_edges2[peak2+1] mu = (l_edge + r_edge) / 2 - print("peak(mu)=", mu) - - pos_values = np.array(sorted(values[values > mu] - mu)) - neg_values = np.array(sorted(values[values < mu] - mu, key=lambda x: -x)) - - #We assume that the null is governed by a laplace, because - #that's what I (Av Shrikumar) have personally observed - #But we calculate a different laplace distribution for - # positive and negative values, in case they are - # distributed slightly differently - #estimate b using the percentile - #for x below 0: - #cdf = 0.5*exp(x/b) - #b = x/(log(cdf/0.5)) - neg_laplace_b = np.percentile(neg_values, 95)/(np.log(0.95)) - pos_laplace_b = (-np.percentile(pos_values, 5))/(np.log(0.95)) - - #for the pos and neg, compute the expected number above a - #particular threshold based on the total number of examples, - #and use this to estimate the fdr - #for pos_null_above, we estimate the total num of examples - #as 2*len(pos_values) - pos_fdrs = (len(pos_values)*(np.exp(-pos_values/pos_laplace_b)))/( - len(pos_values)-np.arange(len(pos_values))) - pos_fdrs = np.minimum(pos_fdrs, 1.0) - neg_fdrs = (len(neg_values)*(np.exp(neg_values/neg_laplace_b)))/( - len(neg_values)-np.arange(len(neg_values))) - neg_fdrs = np.minimum(neg_fdrs, 1.0) - - pos_fdrs_passing_thresh = [x for x in zip(pos_values, pos_fdrs) - if x[1] <= self.target_fdr] - neg_fdrs_passing_thresh = [x for x in zip(neg_values, neg_fdrs) - if x[1] <= self.target_fdr] - if (len(pos_fdrs_passing_thresh) > 0): - pos_threshold, pos_thresh_fdr = pos_fdrs_passing_thresh[0] - else: - pos_threshold, pos_thresh_fdr = pos_values[-1], pos_fdrs[-1] - pos_threshold += 0.0000001 - if (len(neg_fdrs_passing_thresh) > 0): - neg_threshold, neg_thresh_fdr = neg_fdrs_passing_thresh[0] - neg_threshold = neg_threshold - 0.0000001 - else: - neg_threshold, neg_thresh_fdr = neg_values[-1], neg_fdrs[-1] - - if (self.min_seqlets is not None): - num_pos_passing = np.sum(pos_values > pos_threshold) - num_neg_passing = np.sum(neg_values < neg_threshold) - print("testing if {} + {} < {}".format(num_pos_passing, num_neg_passing, self.min_seqlets)) - if (num_pos_passing + num_neg_passing < self.min_seqlets): - #manually adjust the threshold - shifted_values = values - mu - values_sorted_by_abs = sorted(np.abs(shifted_values), key=lambda x: -x) - abs_threshold = values_sorted_by_abs[self.min_seqlets-1] - if (self.verbose): - print("Manually adjusting thresholds to get desired num seqlets") - pos_threshold = abs_threshold - neg_threshold = -abs_threshold - else: - print("thresholds were not adjusted") - - - pos_threshold_cdf = 1-np.exp(-pos_threshold/pos_laplace_b) - neg_threshold_cdf = 1-np.exp(neg_threshold/neg_laplace_b) - #neg_threshold = np.log((1-self.threshold_cdf)*2)*neg_laplace_b - #pos_threshold = -np.log((1-self.threshold_cdf)*2)*pos_laplace_b - - neg_threshold += mu - pos_threshold += mu - neg_threshold = min(neg_threshold, 0) - pos_threshold = max(pos_threshold, 0) - - - - #plot the result - if (self.verbose): print("peak(mu)=", mu) From 44dda5a3c6c78d09f584ca7763cd215533bb7bec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=BDiga=20Avsec?= Date: Tue, 26 Feb 2019 01:56:27 +0100 Subject: [PATCH 3/3] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 578f865..b2029dc 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ description='TF MOtif Discovery from Importance SCOres', long_description="""Algorithm for discovering consolidated patterns from base-pair-level importance scores""", url='https://github.com/kundajelab/tfmodisco', - version='0.5.0.0', + version='0.5.0.1', packages=find_packages(), package_data={ '': ['cluster/phenograph/louvain/*convert*', 'cluster/phenograph/louvain/*community*', 'cluster/phenograph/louvain/*hierarchy*']