From ada3ea068055e3564e0e27cc71281ce5f1a73872 Mon Sep 17 00:00:00 2001 From: Sebastian Will Date: Fri, 25 May 2018 15:52:24 +0200 Subject: [PATCH] Adjust behavior of locarna and mlocarna for pairwise alignments -- do not use alifold in binaries when calculating base pairs for single sequences -- use noLP in mlocarna for calculating base pair probabilties /and/ the alignment -- increase output precision for probabilities in pp files (reducing rounding issues for the typical cutoff) --- ChangeLog | 6 +++++ configure.ac | 2 +- src/LocARNA/rna_data.cc | 25 ++++++++++++--------- src/LocARNA/rna_data_impl.hh | 1 - src/Tests/mlocarna-threads.testresult | 32 +++++++++++++-------------- src/Utils/mlocarna | 24 +++++++++++++++----- 6 files changed, 56 insertions(+), 34 deletions(-) diff --git a/ChangeLog b/ChangeLog index b39b4520..9fe34837 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,12 @@ <<<<<<< HEAD === 2018 +2.0.0RC6 + adjust behavior of locarna and mlocarna for pairwise alignments + -- do not use alifold in binaries when calculating base pairs for single sequences + -- use noLP in mlocarna for calculating base pair probabilties /and/ the alignment + -- increase output precision for probabilities in pp files + (reducing rounding issues for the typical cutoff) 2.0.0RC5 (2018-05-08) Allow penalized in global alignment diff --git a/configure.ac b/configure.ac index 8ea38477..6fd56520 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ dnl -*- Autoconf -*- dnl Process this file with autoconf to produce a configure script. AC_PREREQ(2.59) -AC_INIT([LocARNA], [2.0.0RC5], [will@informatik.uni-freiburg.de], [locarna], +AC_INIT([LocARNA], [2.0.0RC6], [will@informatik.uni-freiburg.de], [locarna], [http://www.bioinf.uni-freiburg.de/Software/LocARNA/]) dnl special dir for aux config files diff --git a/src/LocARNA/rna_data.cc b/src/LocARNA/rna_data.cc index 78a448de..6374203b 100644 --- a/src/LocARNA/rna_data.cc +++ b/src/LocARNA/rna_data.cc @@ -55,7 +55,7 @@ namespace LocARNA { // recompute all probabilities RnaEnsemble rna_ensemble( pimpl_->sequence_, pfoldparams, false, - true); // use given parameters, no in loop, use alifold + pimpl_->sequence_.num_of_rows()>1); // use given parameters, no in loop, use alifold unless single seq // initialize from RnaEnsemble; note: method is virtual init_from_rna_ensemble(rna_ensemble, pfoldparams); @@ -152,7 +152,7 @@ namespace LocARNA { // recompute all probabilities RnaEnsemble rna_ensemble( sequence(), pfoldparams, true, - true); // use given parameters, in-loop, use alifold + pimpl_->sequence_.num_of_rows()>1); // use given parameters, in-loop, use alifold unless single seq // initialize init_from_rna_ensemble(rna_ensemble, pfoldparams); @@ -1278,20 +1278,23 @@ namespace LocARNA { */ std::string format_prob(double prob) { + size_t precision = 4; + std::ostringstream outd; - outd.precision(3); + outd.precision(precision); outd << prob; - if (outd.str().length() <= 6) { - return outd.str(); - } + std::string s = outd.str(); - std::ostringstream outs; - outs.setf(std::ios::scientific, std::ios::floatfield); - outs.precision(2); - outs << prob; + if (outd.str().length() > precision+4) { + std::ostringstream outs; + outs.setf(std::ios::scientific, std::ios::floatfield); + outs.precision(precision-1); + outs << prob; + + s = outs.str(); + } - std::string s = outs.str(); size_t pos = s.find("e-0"); if (pos != std::string::npos) { s.replace(pos, 3, "e-"); diff --git a/src/LocARNA/rna_data_impl.hh b/src/LocARNA/rna_data_impl.hh index d672dec4..6028420c 100644 --- a/src/LocARNA/rna_data_impl.hh +++ b/src/LocARNA/rna_data_impl.hh @@ -11,7 +11,6 @@ namespace LocARNA { - class MultipleAlignment; class RnaEnsemble; class PFoldParams; // template class SparseVector; diff --git a/src/Tests/mlocarna-threads.testresult b/src/Tests/mlocarna-threads.testresult index 2a43a495..2fa7b03d 100644 --- a/src/Tests/mlocarna-threads.testresult +++ b/src/Tests/mlocarna-threads.testresult @@ -1,18 +1,18 @@ -CLUSTAL W --- LocARNA 2.0.0RC1 +CLUSTAL W --- LocARNA 2.0.0RC5 -X58295.1_1384-1453 UGGCGUCU-UCAUGAGGGAGGGGCCCAAAGCC----CUUGUGGGCGGACCUCCCCUGAGCCUGUCUGAGGGGCCA -D25220.1_1493-1556 CU-UGCGU-UAAUGAGAACAGAAACGAAAACUAUAA-C-CU-AGGGGUUUCUGUUGGAU---GGUU-GGCAAC-- -AY060611.1_560-627 GU-GGCGC-UUAUGACGCAGUUGUCUUAAACUCGAA-CUCG-AGCGGGCAAUUGCUGAUUACGAUU-AACCAC-- -L24896.1_600-665 CC-GGCAC-UCAUGACGGUCUGCCUGAAAACCAGCC-CGCU-GGUGGGGCAGUCCCGAGGAC--CU-GGCGUG-- -AF096875.1_5504-5568 GU-GUG-C-GAAUGAUAACUACUGACGAAAGAGCUGUCUGC-UCAGUCUGUGGUUGGAU---GUAG-UCACAC-- -AF093774.1_5851-5916 GU-GUG-C-GGAUGAUAACUACUGACGAAAGAGUCAUCGACCUCAGUUAGUGGUUGGAU---GUAG-UCACAU-- -AE003628.2_106178-106240 UU-CAA-C-UUAUGAGGAUUAUUUCUUAAAGGCC--UCUGG-CUCGGAAAUAGUCUGAA---CCUU-AUUGUA-- -AC000078.2_21139-21077 GC-CAG-A-UGAUGACGACCUGGGUGGAAACCUACCCUGUG--GGCACCCAUGUCCGAG---CC-C-CCUGGC-- -AF241527.2_359-424 GC-CGC-U-UCAUGACAGGAAGGACUGAAA-UGUCUUAGACCUGUGGUCUUUCCUCGAU---GU-U-CCUGCGGC -AF333036.1_2190-2249 CA-UGCGU-CCAUGAAGUCACUGGCC----UCAAGCCCAAGUGGUGGGCAGUGACAGA---------AGAGCUGC -X57999.1_1526-1586 AUAUUUGU-UUAUGAUGGUCACAGUGUAAA-----GUUCAC--ACAGCUGUGACUUGAUU-UUUAA---AAAU-- -X12367.1_703-764 GU-UUU-U-CCAUGACGGUGUUUCCUCUAA-----AUUUAC-AUGGAGAAACACCUGAUU-UCCAG-AAAAAU-- -X13710.1_946-1008 UU-UUCAU-CUAUGAGGGUGUUUCCUCUAA-----ACCUACGAGGGAGGAACACCUGAUC-U-UAC-AGAAAA-- -AF322071.1_1577-1642 AUGUGGUCUUUAUGAAGGCAGGUGCAGAAACUAUGCACUAGUGG-UGUC--UGUCUGAU---GUUUGGCCAU--- -AC002327.1_156204-156268 --CUCAGCAGGAUGAUGAGAAGGGCUGAAAUGCUGC-CAAACCA-GGUCCUUUUCUGAU---GGUGGCUGGG--- +X58295.1_1384-1453 --UGGCGUCUUCAUGAGGGAGGGGCCCAAAGCC----CUUGUGGGCGGACCUCCCCUGAGCCUGUCUGAGGGGCCA +AC000078.2_21139-21077 --GC-CAG-AUGAUGACGACCUGGGUGGAAACCUACCCUGUG--GGCACCCAUGUCCGA---GCCCCCU--GGC-- +AE003628.2_106178-106240 --UU-CAA-CUUAUGAGGAUUAUUUCUUAAA-GGCCUCUGGC--UCGGAAAUAGUCUGA---ACCUUAU--UGUA- +AF241527.2_359-424 --GC-CGC-UUCAUGACAGGAAGGACUGAAA-UGUCUUAGACCUGUGGUCUUUCCUCGA---UGUUCCU--GCGGC +AF096875.1_5504-5568 --GU-GUG-CGAAUGAUAACUACUGACGAAAGAGCUGUCUGC-UCAGUCUGUGGUUGGA---UG-UAGU--CACAC +AF093774.1_5851-5916 --GU-GUG-CGGAUGAUAACUACUGACGAAAGAGUCAUCGACCUCAGUUAGUGGUUGGA---UG-UAGU--CACAU +D25220.1_1493-1556 --CU-UGCGUUAAUGAGAACAGAAACGAAAACUAUAAC--CU-AGGGGUUUCUGUUGGA---U---GGUUGGCAAC +AY060611.1_560-627 --GU-GGCGCUUAUGACGCAGUUGUCUUAAACUCGAAC-UCG-AGCGGGCAAUUGCUGA---UUACGAUUAACCAC +L24896.1_600-665 --CC-GGCACUCAUGACGGUCUGCCUGAAAACCAGCCC-GCU-GGUGGGGCAGUCCCGA---GGAC--CUGGCGUG +AF333036.1_2190-2249 --CA-UGCGUCCAUGAAGUCACUGGCC----UCAAGCCCAAGUGGUGGGCAGUGACAGA---AGA------GCUGC +X12367.1_703-764 --GU-UUU-UCCAUGACGGUGUUUCCUCUAA-----AUUUAC-AUGGAGAAACACCUGA---UUUCCAG-AAAAAU +X13710.1_946-1008 --UU-UUCAUCUAUGAGGGUGUUUCCUCUAA-----ACCUACGAGGGAGGAACACCUGA---UCU-UAC-AGAAAA +X57999.1_1526-1586 --AUAUUUGUUUAUGAUGGUCACAGUGUAAA-----GUUCAC--ACAGCUGUGACUUGA---UUUUUAA---AAAU +AF322071.1_1577-1642 AUGU-GGUCUUUAUGAAGGCAGGUGCAGAAACUAUGCA--CUAGU-GGUGUCUGUCUGA------UGUUUGGCCAU +AC002327.1_156204-156268 --CU-CAGCAGGAUGAUGAGAAGGGCUGAAAUGCUGCC--AAACCAGGUCCUUUUCUGA------UGGUGGCUGGG diff --git a/src/Utils/mlocarna b/src/Utils/mlocarna index 2817f4f1..17b9b882 100755 --- a/src/Utils/mlocarna +++ b/src/Utils/mlocarna @@ -1076,7 +1076,8 @@ my %opts; ## some default values # -$opts{'noLP'}=1; +# handle noLP / LP later +# $opts{'noLP'}=1; $opts{'skip-pp'}=0; ## =1 skips the computation of pair probabilities ## for files that exist already @@ -1274,15 +1275,16 @@ if (defined($opts{'evaluate'})) { exit 0; } +# LP and noLP should never both be given on the command line if (defined($opts{'LP'}) && defined($opts{'noLP'})) { - printerr "Only one of the options --noLP and --LP can be defined at a time.\n"; - pod2usage(1); + printerr "ERROR: The flags --noLP and --LP contradict each other.\n"; + exit(-1); } +# this makes noLP the default, unless LP is given if (defined($opts{'LP'})) { $opts{'noLP'}=0; -} -if (defined($opts{'noLP'})) { +} else { $opts{'noLP'}=1; } @@ -2903,6 +2905,10 @@ sub compute_dotplot_rnafold_pp { my @fold_cmd = ( "$bindir/locarna_rnafold_pp", "-p" => $opts{'min-prob'} ); + if ($opts{'noLP'}) { + push @fold_cmd, "--noLP"; + } + if ($opts{'in-loop-robabilities'}) { push @fold_cmd, "--in-loop"; } @@ -2981,6 +2987,10 @@ sub compute_dotplot_rnafold { my @fold_cmd = ("$RNAfold", "-p2"); push @fold_cmd, @RNAfold_args; + if ($opts{'noLP'}) { + push @fold_cmd, '--noLP'; + } + my $seq_str = $seq->{seq}; ## the sequence string my $input = ">$tmpname\n$seq_str\n"; @@ -3022,6 +3032,10 @@ sub compute_dotplot_rnaplfold { "-L" => $opts{'plfold-span'}, "-W" => $opts{'plfold-winsize'}, @RNAfold_args; + if ($opts{'noLP'}) { + push @fold_cmd, '--noLP'; + } + my $slen=length($seq_str); my $win = $slen; for (my $i=0; $i < @fold_cmd; $i++) {