From 529e0a95f31fa6839bf160d3ce9a2bccd7a61628 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 23 Jul 2024 14:26:34 +0100 Subject: [PATCH] Extend the -O, --overlap option to choose the denominator with respect to the SRC or TGT. The extension is backward compatible. --- annot-tsv.1 | 8 +++--- annot-tsv.c | 50 ++++++++++++++++++++++++++----------- test/annot-tsv/out.13.1.txt | 2 ++ test/annot-tsv/out.13.2.txt | 2 ++ test/annot-tsv/out.13.3.txt | 2 ++ test/annot-tsv/out.13.4.txt | 2 ++ test/annot-tsv/src.13.txt | 2 ++ test/test.pl | 6 +++++ 8 files changed, 57 insertions(+), 17 deletions(-) create mode 100644 test/annot-tsv/out.13.1.txt create mode 100644 test/annot-tsv/out.13.2.txt create mode 100644 test/annot-tsv/out.13.3.txt create mode 100644 test/annot-tsv/out.13.4.txt create mode 100644 test/annot-tsv/src.13.txt diff --git a/annot-tsv.1 b/annot-tsv.1 index 34e1dd617..fcdec29aa 100644 --- a/annot-tsv.1 +++ b/annot-tsv.1 @@ -170,13 +170,15 @@ Ignore the headers completely and use numeric indexes even when a header exists Suppress index numbers in the printed header. If given twice, drop the entire header. .RE .PP -.BR \-O ", " \-\-overlap " FLOAT" +.BR \-O ", " \-\-overlap " FLOAT,[FLOAT]" .RS 4 -Minimum overlap as a fraction of region length in at least one of the overlapping regions. If also +Minimum overlap as a fraction of region length in SRC and TGT, respectively (with two numbers), or in +at least one of the overlapping regions (with a single number). If also .BR \-r ", " \-\-reciprocal is given, require at least .I FLOAT -overlap with respect to both regions +overlap with respect to both regions. Two identical numbers are equivalent to running with +.BR \-r ", " \-\-reciprocal .RE .PP .BR \-r ", " \-\-reciprocal diff --git a/annot-tsv.c b/annot-tsv.c index e453ede5b..494c43744 100644 --- a/annot-tsv.c +++ b/annot-tsv.c @@ -105,8 +105,8 @@ typedef struct char *core_str, *match_str, *transfer_str, *annots_str, *headers_str, *delim_str; char *temp_dir, *out_fname; BGZF *out_fp; - int allow_dups, reciprocal, max_annots, mode, no_write_hdr; - double overlap; + int allow_dups, max_annots, mode, no_write_hdr, overlap_either; + double overlap_src, overlap_dst; regidx_t *idx; regitr_t *itr; kstring_t tmp_kstr; @@ -736,18 +736,20 @@ void process_line(args_t *args, char *line, size_t size) int has_match = 0, annot_len = 0; while ( regitr_overlap(args->itr) ) { - if ( args->overlap ) + if ( args->overlap_src || args->overlap_dst ) { - double len1 = end - beg + 1; - double len2 = args->itr->end - args->itr->beg + 1; + double len_dst = end - beg + 1; + double len_src = args->itr->end - args->itr->beg + 1; double isec = (args->itr->end < end ? args->itr->end : end) - (args->itr->beg > beg ? args->itr->beg : beg) + 1; - if ( args->reciprocal ) + int pass_dst = isec/len_dst < args->overlap_dst ? 0 : 1; + int pass_src = isec/len_src < args->overlap_src ? 0 : 1; + if ( args->overlap_either ) { - if ( isec/len1 < args->overlap || isec/len2 < args->overlap ) continue; + if ( !pass_dst && !pass_src ) continue; } else { - if ( isec/len1 < args->overlap && isec/len2 < args->overlap ) continue; + if ( !pass_dst || !pass_src ) continue; } } cols_t *src_cols = regitr_payload(args->itr,cols_t*); @@ -885,8 +887,9 @@ static const char *usage_text(void) " -H, --ignore-headers Use numeric indices, ignore the headers completely\n" " -I, --no-header-idx Suppress index numbers in the printed header. If given\n" " twice, drop the entire header\n" - " -O, --overlap FLOAT Minimum required overlap (non-reciprocal, unless -r\n" - " is given)\n" + " -O, --overlap FLOAT[,FLOAT] Minimum required overlap with respect to SRC,TGT.\n" + " If single value, the bigger overlap is considered.\n" + " Identical values are equivalent to running with -r.\n" " -r, --reciprocal Apply the -O requirement to both overlapping\n" " intervals\n" " -x, --drop-overlaps Drop overlapping regions (precludes -f)\n" @@ -941,6 +944,7 @@ int main(int argc, char **argv) }; char *tmp = NULL; int c; + int reciprocal = 0; while ((c = getopt_long(argc, argv, "c:f:m:o:s:t:a:HO:rxh:Id:",loptions,NULL)) >= 0) { switch (c) @@ -960,16 +964,24 @@ int main(int argc, char **argv) case 'd': args->delim_str = optarg; break; case 'h': args->headers_str = optarg; break; case 'H': args->headers_str = "0:0"; break; - case 'r': args->reciprocal = 1; break; + case 'r': reciprocal = 1; break; case 'c': args->core_str = optarg; break; case 't': args->dst.fname = optarg; break; case 'm': args->match_str = optarg; break; case 'a': args->annots_str = optarg; break; case 'o': args->out_fname = optarg; break; case 'O': - args->overlap = strtod(optarg, &tmp); - if ( tmp==optarg || *tmp ) error("Could not parse --overlap %s\n", optarg); - if ( args->overlap<0 || args->overlap>1 ) error("Expected value from the interval [0,1]: --overlap %s\n", optarg); + args->overlap_src = strtod(optarg, &tmp); + if ( tmp==optarg || (*tmp && *tmp!=',') ) error("Could not parse --overlap %s\n", optarg); + if ( args->overlap_src<0 || args->overlap_src>1 ) error("Expected value(s) from the interval [0,1]: --overlap %s\n", optarg); + if ( *tmp ) + { + args->overlap_dst = strtod(tmp+1, &tmp); + if ( *tmp ) error("Could not parse --overlap %s\n", optarg); + if ( args->overlap_dst<0 || args->overlap_dst>1 ) error("Expected value(s) from the interval [0,1]: --overlap %s\n", optarg); + } + else + args->overlap_either = 1; break; case 's': args->src.fname = optarg; break; case 'f': args->transfer_str = optarg; break; @@ -994,6 +1006,16 @@ int main(int argc, char **argv) else args->mode = PRINT_MATCHING|PRINT_NONMATCHING; } if ( (args->transfer_str || args->annots_str) && !(args->mode & PRINT_MATCHING) ) error("The option -x cannot be combined with -f and -a\n"); + if ( reciprocal ) + { + if ( args->overlap_dst && args->overlap_src && args->overlap_dst!=args->overlap_src ) + error("The combination of --reciprocal with --overlap %f,%f makes no sense: expected single value or identical values\n",args->overlap_src,args->overlap_dst); + if ( !args->overlap_src ) + args->overlap_src = args->overlap_dst; + else + args->overlap_dst = args->overlap_src; + args->overlap_either = 0; + } init_data(args); write_header(args, &args->dst); diff --git a/test/annot-tsv/out.13.1.txt b/test/annot-tsv/out.13.1.txt new file mode 100644 index 000000000..a1bf0be68 --- /dev/null +++ b/test/annot-tsv/out.13.1.txt @@ -0,0 +1,2 @@ +1 10 20 long long,short +1 15 15 short long,short diff --git a/test/annot-tsv/out.13.2.txt b/test/annot-tsv/out.13.2.txt new file mode 100644 index 000000000..7c543b134 --- /dev/null +++ b/test/annot-tsv/out.13.2.txt @@ -0,0 +1,2 @@ +1 10 20 long long +1 15 15 short short diff --git a/test/annot-tsv/out.13.3.txt b/test/annot-tsv/out.13.3.txt new file mode 100644 index 000000000..8911afad8 --- /dev/null +++ b/test/annot-tsv/out.13.3.txt @@ -0,0 +1,2 @@ +1 10 20 long long +1 15 15 short long,short diff --git a/test/annot-tsv/out.13.4.txt b/test/annot-tsv/out.13.4.txt new file mode 100644 index 000000000..f7a0e4d88 --- /dev/null +++ b/test/annot-tsv/out.13.4.txt @@ -0,0 +1,2 @@ +1 10 20 long long,short +1 15 15 short short diff --git a/test/annot-tsv/src.13.txt b/test/annot-tsv/src.13.txt new file mode 100644 index 000000000..de3338de1 --- /dev/null +++ b/test/annot-tsv/src.13.txt @@ -0,0 +1,2 @@ +1 10 20 long +1 15 15 short diff --git a/test/test.pl b/test/test.pl index ef6a56612..b5f52bdfb 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1472,4 +1472,10 @@ sub test_annot_tsv run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.3.txt',args=>'-c chr2,beg2,end2:chr,beg,end -f smpl2:src_smpl -h 3:2 -I'); run_annot_tsv($opts,src=>'src.12.txt',dst=>'dst.12.txt',out=>'out.12.1.txt',args=>'-c 1,2,3:1,2,3 -f 4:5 -h 0:0 -d ,'); run_annot_tsv($opts,src=>'src.12.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>q[-c 1,2,3:1,2,3 -f 4:5 -h 0:0 -d $',:\t']); + run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.1.txt',args=>q[-c 1,2,3 -f 4:5]); + run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.1.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5]); + run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.2.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5 -r]); + run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.2.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5,0.5]); + run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.3.txt',args=>q[-c 1,2,3 -f 4:5 -O 0,1]); + run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.4.txt',args=>q[-c 1,2,3 -f 4:5 -O 1,0]); }