From 1cf8f41c79e5c4e23a5769dc4a1ddcc18ffeead9 Mon Sep 17 00:00:00 2001 From: Steven Leonard Date: Wed, 26 Feb 2020 11:45:37 +0000 Subject: [PATCH 1/5] minor template changes to avoid common seq_alignment failures --- Changes | 3 +++ data/vtlib/salmon_alignment.json | 4 ++-- data/vtlib/subsample.json | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Changes b/Changes index 3057c43ca..84aa130e0 100644 --- a/Changes +++ b/Changes @@ -1,6 +1,9 @@ CHANGES LOG ----------- + - check for divide by 0 when calc samtools subsample value + - drop file check, explictly assume fastq.gz files are gzipped when calc #reads for salmon + 0.26.0 - add parameters file for top-up merge - functional equivalence: enable selection of markdup method - biobambam (default), samtools or picard diff --git a/data/vtlib/salmon_alignment.json b/data/vtlib/salmon_alignment.json index 968784fa9..bb5e59e4f 100644 --- a/data/vtlib/salmon_alignment.json +++ b/data/vtlib/salmon_alignment.json @@ -99,14 +99,14 @@ "--geneMap", {"subst":"annotation_val", "required":"yes"}, "--output", {"subst":"salmon_out"}, {"subst":"b2c_mt", "ifnull":{"subst_constructor":{ "vals":[ "-p", {"subst":"b2c_mt_val"} ]}}}, "\";", - "case `file $0` in *ASCII*) PART1=`head -n 1000 $0 | wc -l`;; *compressed*) PART1=`gunzip -c $0 | head -n 1000 | wc -l`;; *empty*) PART1=0;; esac;", + "PART1=`gunzip -c $0 | head -n 1000 | wc -l`;", "if [[ $0 && ! $1 ]]; then", "SALMON_CMD+=\"-r $0\";", "if [[ $PART1 -lt 1000 ]]; then", ">&2 printf \"Not enough reads to run Salmon: fq: %s\" \"$((PART1/4))\"; exit 0; fi;", "elif [[ $0 && $1 ]]; then", "SALMON_CMD+=\"-1 $0 -2 $1\";", - "case `file $1` in *ASCII*) PART2=`head -n 1000 $1 | wc -l`;; *compressed*) PART2=`gunzip -c $1 | head -n 1000 | wc -l`;; *empty*) PART2=0;; esac;", + "PART2=`gunzip -c $1 | head -n 1000 | wc -l`;", "if [[ $PART1 -lt 1000 || $PART2 -lt 1000 ]]; then", ">&2 printf \"Not enough reads to run Salmon: fq1: %s - fq2: %s\" \"$((PART1/4))\" \"$((PART2/4))\"; exit 0; fi; fi;", "$SALMON_CMD'", diff --git a/data/vtlib/subsample.json b/data/vtlib/subsample.json index a537d1f88..271cf6ca2 100644 --- a/data/vtlib/subsample.json +++ b/data/vtlib/subsample.json @@ -42,7 +42,7 @@ "use_STDOUT": true, "cmd":[ "bash -c '", - {"subst_constructor":{"vals":["tmfs=\"", {"subst":"tag_metrics_files", "required":true}, "\""],"postproc":{"op":"concat","pad":""}}}, "; if [ ! -z \"${tmfs}\" ]; then for tag_metrics_file in ${tmfs}; do reads_count=`jq", {"subst":"jqkey", "ifnull":{"subst_constructor":{"vals":["'\"'\"'.reads_count.\"", {"subst":"s2_tag_index", "required":true}, "\"'\"'\"'"],"postproc":{"op":"concat","pad":""}}}}, "${tag_metrics_file}`; reads_count=`echo ${reads_count} | tr -cd [:digit:]`; reads_count_total=$((${reads_count_total}+${reads_count})); done; frac=`echo \"10000/${reads_count_total}\" | bc -l`; fi;", + {"subst_constructor":{"vals":["tmfs=\"", {"subst":"tag_metrics_files", "required":true}, "\""],"postproc":{"op":"concat","pad":""}}}, "; if [ ! -z \"${tmfs}\" ]; then for tag_metrics_file in ${tmfs}; do reads_count=`jq", {"subst":"jqkey", "ifnull":{"subst_constructor":{"vals":["'\"'\"'.reads_count.\"", {"subst":"s2_tag_index", "required":true}, "\"'\"'\"'"],"postproc":{"op":"concat","pad":""}}}}, "${tag_metrics_file}`; reads_count=`echo ${reads_count} | tr -cd [:digit:]`; reads_count_total=$((${reads_count_total}+${reads_count})); done; if [[ $reads_count_total -eq 0 ]]; then reads_count_total=1; fi; frac=`echo \"10000/${reads_count_total}\" | bc -l`; fi;", "if [ ! -z $frac ]; then", "samtools", "view", From daecf76ceaf6569fa128186250540e2caabf696e Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Wed, 4 Mar 2020 11:15:56 +0000 Subject: [PATCH 2/5] unconditionally remove auxtags before adapter clipping when realignment_switch is 1 (otherwise, when realigning, old clipping information can be passed through to the adapter detection+clipping step which can result in downstream clip reinsert failure, hard clipped sequences and seqchksum errors) --- data/vtlib/pre_alignment.json | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/data/vtlib/pre_alignment.json b/data/vtlib/pre_alignment.json index 61cfc1405..dd6d970d6 100644 --- a/data/vtlib/pre_alignment.json +++ b/data/vtlib/pre_alignment.json @@ -25,13 +25,12 @@ "type":"EXEC", "use_STDIN": true, "use_STDOUT": true, - "cmd":[ - "bamreset", - {"subst":"resetaux_flag","required":"no","ifnull":{"subst_constructor":{"vals":[ "resetaux", {"subst":"resetaux_val", "required":"no", "ifnull":"0"} ],"postproc":{"op":"concat","pad":"="}}}}, - {"subst":"auxfilter_flag","required":"no"}, - "level=0", - "verbose=0" - ], + "cmd":{"select":"realignment_switch", "select_range":[1], "default":0, "comment":"remove aux tags unconditionally for realignment (realignment is non-default)", + "cases":[ + ["bamreset", "resetaux=0", {"subst":"auxfilter_flag","required":"no", "comment":"auxfilter=comma separated list of aux tags to be kept if resetaux=0"}, "level=0", "verbose=0"], + ["bamreset", "level=0", "verbose=0"] + ] + }, "comment":"Alignment removal also required for bamadapterclip (at least 0.0.142)" }, { From 806ad2ba85a20ea4fffa35601d2d99e237e067fe Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Fri, 6 Mar 2020 17:51:22 +0000 Subject: [PATCH 3/5] when there are insufficient reads for salmon, it and downstream archiving steps should behave better. --- data/vtlib/salmon_alignment.json | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/data/vtlib/salmon_alignment.json b/data/vtlib/salmon_alignment.json index bb5e59e4f..61db99e0e 100644 --- a/data/vtlib/salmon_alignment.json +++ b/data/vtlib/salmon_alignment.json @@ -103,12 +103,12 @@ "if [[ $0 && ! $1 ]]; then", "SALMON_CMD+=\"-r $0\";", "if [[ $PART1 -lt 1000 ]]; then", - ">&2 printf \"Not enough reads to run Salmon: fq: %s\" \"$((PART1/4))\"; exit 0; fi;", + ">&2 printf \"Not enough reads to run Salmon: fq: %s\\n\" \"$((PART1/4))\"; echo 'NO_ARCHIVE_SE'; exit 0; fi;", "elif [[ $0 && $1 ]]; then", "SALMON_CMD+=\"-1 $0 -2 $1\";", "PART2=`gunzip -c $1 | head -n 1000 | wc -l`;", "if [[ $PART1 -lt 1000 || $PART2 -lt 1000 ]]; then", - ">&2 printf \"Not enough reads to run Salmon: fq1: %s - fq2: %s\" \"$((PART1/4))\" \"$((PART2/4))\"; exit 0; fi; fi;", + ">&2 printf \"Not enough reads to run Salmon: fq1: %s - fq2: %s\\n\" \"$((PART1/4))\" \"$((PART2/4))\"; echo 'NO_ARCHIVE_PE'; exit 0; fi; fi;", "$SALMON_CMD'", { "select":"alignment_reads_layout", @@ -118,38 +118,42 @@ "1":[{"port":"fq1", "direction":"in"}], "2":[{"port":"fq1", "direction":"in"}, {"port":"fq2", "direction":"in"}] } - } + }, + "&& echo 'DO_ARCHIVE';" ], "comment":"salmon is too fussy and requires a minimum of good reads to work or it throws a fit. wrapped in a bash script to validate fastq files" }, { "id":"zip_salmon_quant", "type":"EXEC", + "subtype":"STRINGIFY", "use_STDIN": true, "use_STDOUT": false, - "cmd":[ "zip", "-r", {"subst":"zip_target"}, {"subst":"quant"}, {"subst":"quant_genes"}, {"subst":"lib_format_counts"}, {"subst":"libparams"}, {"subst":"cmd_info"} ] + "cmd":[ + "bash -c 'ms=`cat`; if [ \"${ms}\" == \"DO_ARCHIVE\" ]; then >&2 printf \"Found archive instruction, so zipping\\n\"; zip -r $0 $1 $2 $3 $4 $5; else >&2 printf \"No such file, so no zip: %s\\n\" $1; exit 0; fi'", {"subst":"zip_target"}, {"subst":"quant"}, {"subst":"quant_genes"}, {"subst":"lib_format_counts"}, {"subst":"libparams"}, {"subst":"cmd_info"} + ] }, { - "id":"quant_genes", + "id":"archive_decision", "type":"RAFILE", - "subtype":"DUMMY", - "name":{"subst":"quant_genes"} + "stubtype":"DUMMY", + "name":{"subst":"archive_decision", "ifnull":"archive_decision.txt"} }, { "id":"cp_quant_genes", "type":"EXEC", "subtype":"STRINGIFY", - "use_STDIN": false, + "use_STDIN": true, "use_STDOUT": false, - "cmd":[ "bash -c 'if [ -e $0 ]; then cp $0 $1; else >&2 printf \"No such file: %s\" $0; exit 0; fi'", - {"port":"src_quant_genes", "direction":"in"}, {"subst":"quant_genes_target"} + "cmd":[ "bash -c 'ms=`cat`; if [[ \"${ms}\" == \"DO_ARCHIVE\" ]] && [[ -e $0 ]]; then >&2 cp -v $0 $1; else >&2 printf \"No quant_genes file: %s\\n\" $0; exit 0; fi'", + {"subst":"quant_genes"}, {"subst":"quant_genes_target"} ], - "comment":"if salmon is not run this file is not created" + "comment":"if salmon is not run the quant_genes file is not created" } ], "edges":[ - { "id":"zip_salmon_output", "from":"quant_genes", "to":"zip_salmon_quant"}, - { "id":"salmon_to_quant_genes", "from":"salmon", "to":"quant_genes" }, - { "id":"cp_quant_genes", "from":"quant_genes", "to":"cp_quant_genes:src_quant_genes" } + { "id":"salmon_to_quant_genes", "from":"salmon", "to":"archive_decision" }, + { "id":"zip_salmon_output", "from":"archive_decision", "to":"zip_salmon_quant"}, + { "id":"cp_quant_genes", "from":"archive_decision", "to":"cp_quant_genes" } ] } From 63771ed375a7b6e6c049279c0fa4ae37d808ea53 Mon Sep 17 00:00:00 2001 From: mgcam Date: Mon, 9 Mar 2020 13:57:26 +0000 Subject: [PATCH 4/5] Changes file update for release 0.27.0 --- Changes | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Changes b/Changes index 84aa130e0..939b248fb 100644 --- a/Changes +++ b/Changes @@ -1,8 +1,10 @@ CHANGES LOG ----------- +0.27.0 - check for divide by 0 when calc samtools subsample value - drop file check, explictly assume fastq.gz files are gzipped when calc #reads for salmon + - unconditionally remove auxtags before adapter clipping when realignment_switch is 1 0.26.0 - add parameters file for top-up merge From fe5e8cf23a2cc99fdc9c42757bb4604909ac6ac0 Mon Sep 17 00:00:00 2001 From: mgcam Date: Tue, 10 Mar 2020 13:04:48 +0000 Subject: [PATCH 5/5] Changes file update to reflect the latest merge pr --- Changes | 1 + 1 file changed, 1 insertion(+) diff --git a/Changes b/Changes index 939b248fb..b09ee866a 100644 --- a/Changes +++ b/Changes @@ -4,6 +4,7 @@ CHANGES LOG 0.27.0 - check for divide by 0 when calc samtools subsample value - drop file check, explictly assume fastq.gz files are gzipped when calc #reads for salmon + - make the code flow more robust in cases of insufficient reads for salmon - unconditionally remove auxtags before adapter clipping when realignment_switch is 1 0.26.0