From ffcb3d1cf35a43d1ddaa9199a965c041b933ef35 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Tue, 20 Oct 2015 15:45:31 -0400 Subject: [PATCH 1/3] Minor modification in the report for the duplicated samples step --- pyGenClean/run_data_clean_up.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyGenClean/run_data_clean_up.py b/pyGenClean/run_data_clean_up.py index a26cc26..e3b3e9d 100644 --- a/pyGenClean/run_data_clean_up.py +++ b/pyGenClean/run_data_clean_up.py @@ -385,9 +385,9 @@ def run_duplicated_samples(in_prefix, in_type, out_prefix, base_dir, options): if len(not_good_still) > 0: text = latex_template.textbf( - "There {} {:,d} sample{} that {} not good enough for " - "completion, but {} still selected as the best " - "duplicate (see Table~{}).".format( + "There {} {:,d} sample{} that {} not good due to low " + "completion or concordance, but {} still selected as " + "the best duplicate (see Table~{}).".format( "were" if len(not_good_still) > 1 else "was", len(not_good_still), "s" if len(not_good_still) > 1 else "", From 07c5b9cf18a8e9c2a0d2692630026585825464eb Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux Perreault Date: Mon, 26 Oct 2015 10:03:01 -0400 Subject: [PATCH 2/3] Updated the report to adjust for the number of duplicated markers --- pyGenClean/run_data_clean_up.py | 88 ++++++++++++++++++++------------- 1 file changed, 53 insertions(+), 35 deletions(-) diff --git a/pyGenClean/run_data_clean_up.py b/pyGenClean/run_data_clean_up.py index e3b3e9d..d9fa6ea 100644 --- a/pyGenClean/run_data_clean_up.py +++ b/pyGenClean/run_data_clean_up.py @@ -592,12 +592,35 @@ def run_duplicated_snps(in_prefix, in_type, out_prefix, base_dir, options): for marker_id in removed_markers: print >>o_file, marker_id + "\t" + "removed duplicate" - # Reading the markers with problem - problematic_markers = set() - if os.path.isfile(script_prefix + ".problems"): - with open(script_prefix + ".problems", "r") as i_file: - for 
markers in i_file.read().splitlines()[1:]: - problematic_markers |= set(markers.split("\t")[2].split(";")) + # Writing the summary results + total_remaining = 0 + with open(os.path.join(base_dir, "results_summary.txt"), "a") as o_file: + print >>o_file, "# {}".format(script_prefix) + rep_counter = Counter(duplicated_count.values()).most_common() + if rep_counter: + print >>o_file, "Number of replicated markers" + else: + print >>o_file, "Number of replicated markers\t0" + total_nb_removed_rep = 0 + for rep_type, rep_count in rep_counter: + nb_removed_rep = (rep_count * rep_type) - rep_count + print >>o_file, " - x{}\t{:,d}\t-{:,d}".format( + rep_type, + rep_count, + nb_removed_rep, + ) + total_nb_removed_rep += nb_removed_rep + total_remaining = total_nb_removed_rep - len(removed_markers) + print >>o_file, ( + "Number of replicated markers kept\t{nb:,d}\t+{nb:,d}".format( + nb=total_remaining, + ) + ) + print >>o_file, ("Poorly chosen replicated markers\t" + "{nb:,d}".format(nb=len(not_good_still))) + print >>o_file, ("Final number of excluded markers\t" + "{nb:,d}".format(nb=len(removed_markers))) + print >>o_file, "---" # We create a LaTeX summary latex_file = os.path.join(script_prefix + ".summary.tex") @@ -641,18 +664,37 @@ def run_duplicated_snps(in_prefix, in_type, out_prefix, base_dir, options): ) print >>o_file, latex_template.wrap_lines(text) + if total_remaining > 0: + text = latex_template.textbf( + "In total, {:,d} marker{} {} not merged for different " + "reasons (low completion rate, discordant allele, " + "discordant MAF, etc) and {} still present in the " + "dataset.".format( + total_remaining, + "s" if total_remaining > 1 else "", + "were" if total_remaining > 1 else "was", + "are" if total_remaining > 1 else "is", + ) + ) + print >>o_file, latex_template.wrap_lines(text) + if len(not_good_still) > 0: + start = "A total of" + end = " and {} still present in the final dataset.".format( + "are" if len(not_good_still) > 1 else "is", + ) + if 
total_remaining > 0: + start = "Out of these," + end = "." text = latex_template.textbf( - "There {} {:,d} marker{} that {} not good enough for " + start + " {:,d} marker{} {} not good enough for " "completion, but {} still selected as the best " - "duplicate and {} still present in the final " - "dataset.".format( - "were" if len(not_good_still) > 1 else "was", + "duplicate{}".format( len(not_good_still), "s" if len(not_good_still) > 1 else "", "were" if len(not_good_still) > 1 else "was", "were" if len(not_good_still) > 1 else "was", - "are" if len(not_good_still) > 1 else "is", + end, ) ) print >>o_file, latex_template.wrap_lines(text) @@ -661,30 +703,6 @@ def run_duplicated_snps(in_prefix, in_type, out_prefix, base_dir, options): msg = "{}: cannot write LaTeX summary".format(latex_file) raise ProgramError(msg) - # Writing the summary results - with open(os.path.join(base_dir, "results_summary.txt"), "a") as o_file: - print >>o_file, "# {}".format(script_prefix) - counter = Counter(duplicated_count.values()).most_common() - if counter: - print >>o_file, "Number of replicated markers" - else: - print >>o_file, "Number of replicated markers\t0" - for rep_type, rep_count in counter: - print >>o_file, " - x{}\t{:,d}\t-{:,d}".format( - rep_type, - rep_count, - (rep_count * rep_type) - rep_count, - ) - print >>o_file, ("Poorly chosen replicated markers\t" - "{nb:,d}\t+{nb:,d}".format(nb=len(not_good_still))) - print >>o_file, ("Problematic markers not chosen\t" - "{nb:,d}\t+{nb:,d}".format( - nb=len(problematic_markers - chosen_markers), - )) - print >>o_file, ("Final number of excluded markers\t" - "{nb:,d}".format(nb=len(removed_markers))) - print >>o_file, "---" - # We know this step does produce a new data set (tfile), so we return it return (os.path.join(out_prefix, "dup_snps.final"), "tfile", latex_file, duplicated_snps.desc, duplicated_snps.long_desc, None) From 0ea2647a96c648c3b977aa6c0f2f6891052a70f2 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Lemieux 
Perreault Date: Mon, 26 Oct 2015 10:18:47 -0400 Subject: [PATCH 3/3] Preparing for next release --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3b6aeb3..9413924 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ MAJOR = 1 MINOR = 7 -MICRO = 0 +MICRO = 1 VERSION = "{0}.{1}.{2}".format(MAJOR, MINOR, MICRO)