diff --git a/README.md b/README.md index 640b4cf..8588fe6 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,11 @@ # five-dollar-genome-analysis-pipeline Workflows used for germline short variant discovery in WGS data + ### germline_single_sample_workflow : This WDL pipeline implements data pre-processing and initial variant calling (GVCF generation) according to the GATK Best Practices (June 2016) for germline SNP and Indel discovery in human whole-genome sequencing data. -Note: For those users interested in running this wdl on FireCloud (FC), the FC -version has been provided as fc_germline_single_sample_workflow.wdl. Please visit the -FC featured methods and workspaces for more GATK Best Practices pipelines. - #### Requirements/expectations - Human whole-genome paired-end sequencing data in unmapped BAM (uBAM) format - One or more read groups, one per uBAM file, all belonging to a single sample (SM) @@ -18,6 +15,7 @@ FC featured methods and workspaces for more GATK Best Practices pipelines. - - reads are provided in query-sorted order - - all reads must have an RG tag - Reference genome must be Hg38 with ALT contigs + #### Outputs - Cram, cram index, and cram md5 - GVCF and its gvcf index @@ -25,6 +23,27 @@ FC featured methods and workspaces for more GATK Best Practices pipelines. - Several Summary Metrics ### Software version requirements : -Cromwell version support -- Successfully tested on v30.2 -- Does not work on versions < v23 due to output syntax +- GATK 4.0.10.1 +- Picard 2.16.0-SNAPSHOT +- Samtools 1.3.1 +- Python 2.7 +- Cromwell version support + - Successfully tested on v37 + - Does not work on versions < v23 due to output syntax + +### Important Note : +- The provided JSON is meant to be a ready-to-use example JSON template of the workflow. It is the user’s responsibility to correctly set the reference and resource input variables using the [GATK Tool and Tutorial Documentation](https://software.broadinstitute.org/gatk/documentation/). +- Relevant reference and resource bundles can be accessed in the [Resource Bundle](https://software.broadinstitute.org/gatk/download/bundle). +- Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +- For help running workflows on the Google Cloud Platform or locally, please view the following tutorial [(How to) Execute Workflows from the gatk-workflows Git Organization](https://software.broadinstitute.org/gatk/documentation/article?id=12521). +- The following material is provided by the GATK Team. Please post any questions or concerns to one of our forum sites: [GATK](https://gatkforums.broadinstitute.org/gatk/categories/ask-the-team/), [FireCloud](https://gatkforums.broadinstitute.org/firecloud/categories/ask-the-firecloud-team), [Terra](https://broadinstitute.zendesk.com/hc/en-us/community/topics/360000500432-General-Discussion), or [WDL/Cromwell](https://gatkforums.broadinstitute.org/wdl/categories/ask-the-wdl-team). +- Please visit the [User Guide](https://software.broadinstitute.org/gatk/documentation/) site for further documentation on our workflows and tools. + +### LICENSING : +Copyright Broad Institute, 2019 | BSD-3 +This script is released under the WDL open source code license (BSD-3) (full license text at https://github.com/openwdl/wdl/blob/master/LICENSE). Note however that the programs it calls may be subject to different licenses. Users are responsible for checking that they are authorized to run all programs before running this script. 
+- [GATK](https://software.broadinstitute.org/gatk/download/licensing.php) +- [BWA](http://bio-bwa.sourceforge.net/bwa.shtml#13) +- [Picard](https://broadinstitute.github.io/picard/) +- [Samtools](http://www.htslib.org/terms/) diff --git a/WholeGenomeGermlineSingleSample.hg38.inputs.json b/WholeGenomeGermlineSingleSample.hg38.inputs.json new file mode 100644 index 0000000..f191577 --- /dev/null +++ b/WholeGenomeGermlineSingleSample.hg38.inputs.json @@ -0,0 +1,53 @@ +{ + "WholeGenomeGermlineSingleSample.sample_and_unmapped_bams": { + "sample_name": "NA12878 PLUMBING", + "base_file_name": "NA12878_PLUMBING", + "flowcell_unmapped_bams": [ + "gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", + "gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", + "gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" + ], + "final_gvcf_base_name": "NA12878_PLUMBING", + "unmapped_bam_suffix": ".bam" + }, + + "WholeGenomeGermlineSingleSample.references": { + "fingerprint_genotypes_file": "gs://dsde-data-na12878-public/NA12878.hg38.reference.fingerprint.vcf", + "fingerprint_genotypes_index": "gs://dsde-data-na12878-public/NA12878.hg38.reference.fingerprint.vcf.idx", + "contamination_sites_ud": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.contam.UD", + "contamination_sites_bed": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.contam.bed", + "contamination_sites_mu": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.contam.mu", + "calling_interval_list": "gs://broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", + "haplotype_scatter_count": 10, + "break_bands_at_multiples_of": 100000, + "reference_fasta" : { + "ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ref_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", + "ref_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", + "ref_alt": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt", + "ref_sa": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa", + "ref_amb": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb", + "ref_bwt": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt", + "ref_ann": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann", + "ref_pac": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac" + }, + "known_indels_sites_vcfs": [ + "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", + "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz" + ], + "known_indels_sites_indices": [ + "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", + "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi" + ], + "dbsnp_vcf": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", + "dbsnp_vcf_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", + "evaluation_interval_list": "gs://broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list" + }, + + "WholeGenomeGermlineSingleSample.wgs_coverage_interval_list": "gs://broad-references/hg38/v0/wgs_coverage_regions.hg38.interval_list", + + "WholeGenomeGermlineSingleSample.papi_settings": { + "preemptible_tries": 3, + "agg_preemptible_tries": 3 + } +} diff --git 
a/WholeGenomeGermlineSingleSample.wdl b/WholeGenomeGermlineSingleSample.wdl new file mode 100644 index 0000000..80da7ba --- /dev/null +++ b/WholeGenomeGermlineSingleSample.wdl @@ -0,0 +1,219 @@ +version 1.0 + +## Copyright Broad Institute, 2018 +## +## This WDL pipeline implements data pre-processing and initial variant calling (GVCF +## generation) according to the GATK Best Practices (June 2016) for germline SNP and +## Indel discovery in human whole-genome sequencing data. +## +## Requirements/expectations : +## - Human whole-genome paired-end sequencing data in unmapped BAM (uBAM) format +## - One or more read groups, one per uBAM file, all belonging to a single sample (SM) +## - Input uBAM files must additionally comply with the following requirements: +## - - filenames all have the same suffix (we use ".unmapped.bam") +## - - files must pass validation by ValidateSamFile +## - - reads are provided in query-sorted order +## - - all reads must have an RG tag +## - GVCF output names must end in ".g.vcf.gz" +## - Reference genome must be Hg38 with ALT contigs +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# Local import +#import "../../../../pipelines/dna_seq/UnmappedBamToAlignedBam.wdl" as ToBam +#import "../../../../tasks/AggregatedBamQC.wdl" as AggregatedQC +#import "../../../../tasks/GermlineVariantDiscovery.wdl" as Calling +#import "../../../../tasks/Qc.wdl" as QC +#import "../../../../tasks/Utilities.wdl" as Utils +#import "../../../../tasks/BamToCram.wdl" as ToCram +#import "../../../../tasks/VariantCalling.wdl" as ToGvcf +#import "../../../../structs/dna_seq/germline/GermlineStructs.wdl" + +# Git URL import +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/UnmappedBamToAlignedBam.wdl" as ToBam +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/AggregatedBamQC.wdl" as AggregatedQC +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/GermlineVariantDiscovery.wdl" as Calling +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/BamToCram.wdl" as ToCram +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/VariantCalling.wdl" as ToGvcf +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl" + +# WORKFLOW DEFINITION +workflow WholeGenomeGermlineSingleSample { + input { + SampleAndUnmappedBams sample_and_unmapped_bams + GermlineSingleSampleReferences references + PapiSettings papi_settings + File 
wgs_coverage_interval_list + + File? haplotype_database_file + Boolean provide_bam_output = false + Boolean use_gatk3_haplotype_caller = true + } + + # Not overridable: + Int read_length = 250 + Float lod_threshold = -20.0 + String cross_check_fingerprints_by = "READGROUP" + String recalibrated_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.duplicates_marked.recalibrated" + + call ToBam.UnmappedBamToAlignedBam { + input: + sample_and_unmapped_bams = sample_and_unmapped_bams, + references = references, + papi_settings = papi_settings, + + cross_check_fingerprints_by = cross_check_fingerprints_by, + haplotype_database_file = haplotype_database_file, + lod_threshold = lod_threshold, + recalibrated_bam_basename = recalibrated_bam_basename + } + + call AggregatedQC.AggregatedBamQC { + input: + base_recalibrated_bam = UnmappedBamToAlignedBam.output_bam, + base_recalibrated_bam_index = UnmappedBamToAlignedBam.output_bam_index, + base_name = sample_and_unmapped_bams.base_file_name, + sample_name = sample_and_unmapped_bams.sample_name, + recalibrated_bam_base_name = recalibrated_bam_basename, + haplotype_database_file = haplotype_database_file, + references = references, + papi_settings = papi_settings + } + + call ToCram.BamToCram as BamToCram { + input: + input_bam = UnmappedBamToAlignedBam.output_bam, + ref_fasta = references.reference_fasta.ref_fasta, + ref_fasta_index = references.reference_fasta.ref_fasta_index, + ref_dict = references.reference_fasta.ref_dict, + duplication_metrics = UnmappedBamToAlignedBam.duplicate_metrics, + chimerism_metrics = AggregatedBamQC.agg_alignment_summary_metrics, + base_file_name = sample_and_unmapped_bams.base_file_name, + agg_preemptible_tries = papi_settings.agg_preemptible_tries + } + + # QC the sample WGS metrics (stringent thresholds) + call QC.CollectWgsMetrics as CollectWgsMetrics { + input: + input_bam = UnmappedBamToAlignedBam.output_bam, + input_bam_index = UnmappedBamToAlignedBam.output_bam_index, + metrics_filename = sample_and_unmapped_bams.base_file_name + ".wgs_metrics", + ref_fasta = references.reference_fasta.ref_fasta, + ref_fasta_index = references.reference_fasta.ref_fasta_index, + wgs_coverage_interval_list = wgs_coverage_interval_list, + read_length = read_length, + preemptible_tries = papi_settings.agg_preemptible_tries + } + + # QC the sample raw WGS metrics (common thresholds) + call QC.CollectRawWgsMetrics as CollectRawWgsMetrics { + input: + input_bam = UnmappedBamToAlignedBam.output_bam, + input_bam_index = UnmappedBamToAlignedBam.output_bam_index, + metrics_filename = sample_and_unmapped_bams.base_file_name + ".raw_wgs_metrics", + ref_fasta = references.reference_fasta.ref_fasta, + ref_fasta_index = references.reference_fasta.ref_fasta_index, + wgs_coverage_interval_list = wgs_coverage_interval_list, + read_length = read_length, + preemptible_tries = papi_settings.agg_preemptible_tries + } + + call ToGvcf.VariantCalling as BamToGvcf { + input: + calling_interval_list = references.calling_interval_list, + evaluation_interval_list = references.evaluation_interval_list, + haplotype_scatter_count = references.haplotype_scatter_count, + break_bands_at_multiples_of = references.break_bands_at_multiples_of, + contamination = UnmappedBamToAlignedBam.contamination, + input_bam = UnmappedBamToAlignedBam.output_bam, + ref_fasta = references.reference_fasta.ref_fasta, + ref_fasta_index = references.reference_fasta.ref_fasta_index, + ref_dict = references.reference_fasta.ref_dict, + dbsnp_vcf = references.dbsnp_vcf, + 
dbsnp_vcf_index = references.dbsnp_vcf_index, + base_file_name = sample_and_unmapped_bams.base_file_name, + final_vcf_base_name = sample_and_unmapped_bams.final_gvcf_base_name, + agg_preemptible_tries = papi_settings.agg_preemptible_tries, + use_gatk3_haplotype_caller = use_gatk3_haplotype_caller + } + + if (provide_bam_output) { + File provided_output_bam = UnmappedBamToAlignedBam.output_bam + File provided_output_bam_index = UnmappedBamToAlignedBam.output_bam_index + } + + # Outputs that will be retained when execution is complete + output { + Array[File] quality_yield_metrics = UnmappedBamToAlignedBam.quality_yield_metrics + + Array[File] unsorted_read_group_base_distribution_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_pdf + Array[File] unsorted_read_group_base_distribution_by_cycle_metrics = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_metrics + Array[File] unsorted_read_group_insert_size_histogram_pdf = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_histogram_pdf + Array[File] unsorted_read_group_insert_size_metrics = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_metrics + Array[File] unsorted_read_group_quality_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_pdf + Array[File] unsorted_read_group_quality_by_cycle_metrics = UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_metrics + Array[File] unsorted_read_group_quality_distribution_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_pdf + Array[File] unsorted_read_group_quality_distribution_metrics = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_metrics + + File read_group_alignment_summary_metrics = AggregatedBamQC.read_group_alignment_summary_metrics + File read_group_gc_bias_detail_metrics = AggregatedBamQC.read_group_gc_bias_detail_metrics + File read_group_gc_bias_pdf = AggregatedBamQC.read_group_gc_bias_pdf + File read_group_gc_bias_summary_metrics = AggregatedBamQC.read_group_gc_bias_summary_metrics + + File? cross_check_fingerprints_metrics = UnmappedBamToAlignedBam.cross_check_fingerprints_metrics + + File selfSM = UnmappedBamToAlignedBam.selfSM + Float contamination = UnmappedBamToAlignedBam.contamination + + File calculate_read_group_checksum_md5 = AggregatedBamQC.calculate_read_group_checksum_md5 + + File agg_alignment_summary_metrics = AggregatedBamQC.agg_alignment_summary_metrics + File agg_bait_bias_detail_metrics = AggregatedBamQC.agg_bait_bias_detail_metrics + File agg_bait_bias_summary_metrics = AggregatedBamQC.agg_bait_bias_summary_metrics + File agg_gc_bias_detail_metrics = AggregatedBamQC.agg_gc_bias_detail_metrics + File agg_gc_bias_pdf = AggregatedBamQC.agg_gc_bias_pdf + File agg_gc_bias_summary_metrics = AggregatedBamQC.agg_gc_bias_summary_metrics + File agg_insert_size_histogram_pdf = AggregatedBamQC.agg_insert_size_histogram_pdf + File agg_insert_size_metrics = AggregatedBamQC.agg_insert_size_metrics + File agg_pre_adapter_detail_metrics = AggregatedBamQC.agg_pre_adapter_detail_metrics + File agg_pre_adapter_summary_metrics = AggregatedBamQC.agg_pre_adapter_summary_metrics + File agg_quality_distribution_pdf = AggregatedBamQC.agg_quality_distribution_pdf + File agg_quality_distribution_metrics = AggregatedBamQC.agg_quality_distribution_metrics + File agg_error_summary_metrics = AggregatedBamQC.agg_error_summary_metrics + + File? fingerprint_summary_metrics = AggregatedBamQC.fingerprint_summary_metrics + File? 
fingerprint_detail_metrics = AggregatedBamQC.fingerprint_detail_metrics + + File wgs_metrics = CollectWgsMetrics.metrics + File raw_wgs_metrics = CollectRawWgsMetrics.metrics + + File duplicate_metrics = UnmappedBamToAlignedBam.duplicate_metrics + File output_bqsr_reports = UnmappedBamToAlignedBam.output_bqsr_reports + + File gvcf_summary_metrics = BamToGvcf.vcf_summary_metrics + File gvcf_detail_metrics = BamToGvcf.vcf_detail_metrics + + File? output_bam = provided_output_bam + File? output_bam_index = provided_output_bam_index + + File output_cram = BamToCram.output_cram + File output_cram_index = BamToCram.output_cram_index + File output_cram_md5 = BamToCram.output_cram_md5 + + File validate_cram_file_report = BamToCram.validate_cram_file_report + + File output_vcf = BamToGvcf.output_vcf + File output_vcf_index = BamToGvcf.output_vcf_index + } +} diff --git a/fc_germline_single_sample_workflow.wdl b/fc_germline_single_sample_workflow.wdl deleted file mode 100644 index ad461b4..0000000 --- a/fc_germline_single_sample_workflow.wdl +++ /dev/null @@ -1,521 +0,0 @@ -## Copyright Broad Institute, 2018 -## -## This WDL pipeline implements data pre-processing and initial variant calling (GVCF -## generation) according to the GATK Best Practices (June 2016) for germline SNP and -## Indel discovery in human whole-genome sequencing data. -## -## Requirements/expectations : -## - Human whole-genome pair-end sequencing data in unmapped BAM (uBAM) format -## - One or more read groups, one per uBAM file, all belonging to a single sample (SM) -## - Input uBAM files must additionally comply with the following requirements: -## - - filenames all have the same suffix (we use ".unmapped.bam") -## - - files must pass validation by ValidateSamFile -## - - reads are provided in query-sorted order -## - - all reads must have an RG tag -## - GVCF output names must end in ".g.vcf.gz" -## - Reference genome must be Hg38 with ALT contigs -## -## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. -## For program versions, see docker containers. -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. Please see the docker -## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed -## licensing information pertaining to the included programs. 
- -import "https://api.firecloud.org/ga4gh/v1/tools/gatk:alignment/versions/4/plain-WDL/descriptor" as Alignment -import "https://api.firecloud.org/ga4gh/v1/tools/gatk:split-large-readgroup/versions/4/plain-WDL/descriptor" as SplitRG -import "https://api.firecloud.org/ga4gh/v1/tools/gatk:quality-control/versions/2/plain-WDL/descriptor" as QC -import "https://api.firecloud.org/ga4gh/v1/tools/gatk:bam-processing/versions/4/plain-WDL/descriptor" as Processing -import "https://api.firecloud.org/ga4gh/v1/tools/gatk:germline-variant-discovery/versions/3/plain-WDL/descriptor" as Calling -import "https://api.firecloud.org/ga4gh/v1/tools/gatk:utilities/versions/2/plain-WDL/descriptor" as Utils - -# WORKFLOW DEFINITION -workflow germline_single_sample_workflow { - - File flowcell_unmapped_bams_fofn - Array[File] flowcell_unmapped_bams = read_lines(flowcell_unmapped_bams_fofn) - - File contamination_sites_ud - File contamination_sites_bed - File contamination_sites_mu - File wgs_evaluation_interval_list - File wgs_coverage_interval_list - - String sample_name - String base_file_name - String final_vcf_base_name - String unmapped_bam_suffix - - File wgs_calling_interval_list - Int haplotype_scatter_count - Int break_bands_at_multiples_of - Int read_length = 250 - - File ref_fasta - File ref_fasta_index - File ref_dict - File ref_alt - File ref_bwt - File ref_sa - File ref_amb - File ref_ann - File ref_pac - - File dbSNP_vcf - File dbSNP_vcf_index - Array[File] known_indels_sites_VCFs - Array[File] known_indels_sites_indices - - Int preemptible_tries - Int agg_preemptible_tries - - Boolean skip_QC - Boolean make_gatk4_single_sample_vcf - Boolean use_gatk4_haplotype_caller - - Float cutoff_for_large_rg_in_gb = 20.0 - - String bwa_commandline="bwa mem -K 100000000 -p -v 3 -t 16 -Y $bash_ref_fasta" - - String recalibrated_bam_basename = base_file_name + ".aligned.duplicates_marked.recalibrated" - - Int compression_level = 2 - - # Get the version of BWA to include in the PG record in the header of the BAM produced - # by MergeBamAlignment. 
- call Alignment.GetBwaVersion - - # Align flowcell-level unmapped input bams in parallel - scatter (unmapped_bam in flowcell_unmapped_bams) { - - Float unmapped_bam_size = size(unmapped_bam, "GB") - - String unmapped_bam_basename = basename(unmapped_bam, unmapped_bam_suffix) - - if (!skip_QC) { - # QC the unmapped BAM - call QC.CollectQualityYieldMetrics as CollectQualityYieldMetrics { - input: - input_bam = unmapped_bam, - metrics_filename = unmapped_bam_basename + ".unmapped.quality_yield_metrics", - preemptible_tries = preemptible_tries - } - } - - if (unmapped_bam_size > cutoff_for_large_rg_in_gb) { - # Split bam into multiple smaller bams, - # map reads to reference and recombine into one bam - call SplitRG.split_large_readgroup as SplitRG { - input: - input_bam = unmapped_bam, - bwa_commandline = bwa_commandline, - bwa_version = GetBwaVersion.version, - output_bam_basename = unmapped_bam_basename + ".aligned.unsorted", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - ref_alt = ref_alt, - ref_amb = ref_amb, - ref_ann = ref_ann, - ref_bwt = ref_bwt, - ref_pac = ref_pac, - ref_sa = ref_sa, - compression_level = compression_level, - preemptible_tries = preemptible_tries - } - } - - if (unmapped_bam_size <= cutoff_for_large_rg_in_gb) { - # Map reads to reference - call Alignment.SamToFastqAndBwaMemAndMba as SamToFastqAndBwaMemAndMba { - input: - input_bam = unmapped_bam, - bwa_commandline = bwa_commandline, - output_bam_basename = unmapped_bam_basename + ".aligned.unsorted", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - ref_alt = ref_alt, - ref_bwt = ref_bwt, - ref_amb = ref_amb, - ref_ann = ref_ann, - ref_pac = ref_pac, - ref_sa = ref_sa, - bwa_version = GetBwaVersion.version, - compression_level = compression_level, - preemptible_tries = preemptible_tries - } - } - - File output_aligned_bam = select_first([SamToFastqAndBwaMemAndMba.output_bam, SplitRG.aligned_bam]) - - Float mapped_bam_size = size(output_aligned_bam, "GB") - - if (!skip_QC) { - # QC the aligned but unsorted readgroup BAM - # no reference as the input here is unsorted, providing a reference would cause an error - call QC.CollectUnsortedReadgroupBamQualityMetrics as CollectUnsortedReadgroupBamQualityMetrics { - input: - input_bam = output_aligned_bam, - output_bam_prefix = unmapped_bam_basename + ".readgroup", - preemptible_tries = preemptible_tries - } - } - } - - # Sum the read group bam sizes to approximate the aggregated bam size - call Utils.SumFloats as SumFloats { - input: - sizes = mapped_bam_size, - preemptible_tries = preemptible_tries - } - - # Aggregate aligned+merged flowcell BAM files and mark duplicates - # We take advantage of the tool's ability to take multiple BAM inputs and write out a single output - # to avoid having to spend time just merging BAM files. 
- call Processing.MarkDuplicates as MarkDuplicates { - input: - input_bams = output_aligned_bam, - output_bam_basename = base_file_name + ".aligned.unsorted.duplicates_marked", - metrics_filename = base_file_name + ".duplicate_metrics", - total_input_size = SumFloats.total_size, - compression_level = compression_level, - preemptible_tries = agg_preemptible_tries - } - - # Sort aggregated+deduped BAM file - call Processing.SortSam as SortSampleBam { - input: - input_bam = MarkDuplicates.output_bam, - output_bam_basename = base_file_name + ".aligned.duplicate_marked.sorted", - compression_level = compression_level, - preemptible_tries = agg_preemptible_tries - } - - # Create list of sequences for scatter-gather parallelization - call Utils.CreateSequenceGroupingTSV as CreateSequenceGroupingTSV { - input: - ref_dict = ref_dict, - preemptible_tries = preemptible_tries - } - - # Estimate level of cross-sample contamination - call Processing.CheckContamination as CheckContamination { - input: - input_bam = SortSampleBam.output_bam, - input_bam_index = SortSampleBam.output_bam_index, - contamination_sites_ud = contamination_sites_ud, - contamination_sites_bed = contamination_sites_bed, - contamination_sites_mu = contamination_sites_mu, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - output_prefix = base_file_name + ".preBqsr", - preemptible_tries = agg_preemptible_tries, - contamination_underestimation_factor = 0.75 - } - - # We need disk to localize the sharded input and output due to the scatter for BQSR. - # If we take the number we are scattering by and reduce by 3 we will have enough disk space - # to account for the fact that the data is not split evenly. - Int num_of_bqsr_scatters = length(CreateSequenceGroupingTSV.sequence_grouping) - Int potential_bqsr_divisor = num_of_bqsr_scatters - 10 - Int bqsr_divisor = if potential_bqsr_divisor > 1 then potential_bqsr_divisor else 1 - - # Perform Base Quality Score Recalibration (BQSR) on the sorted BAM in parallel - scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping) { - # Generate the recalibration model by interval - call Processing.BaseRecalibrator as BaseRecalibrator { - input: - input_bam = SortSampleBam.output_bam, - recalibration_report_filename = base_file_name + ".recal_data.csv", - sequence_group_interval = subgroup, - dbSNP_vcf = dbSNP_vcf, - dbSNP_vcf_index = dbSNP_vcf_index, - known_indels_sites_VCFs = known_indels_sites_VCFs, - known_indels_sites_indices = known_indels_sites_indices, - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - bqsr_scatter = bqsr_divisor, - preemptible_tries = agg_preemptible_tries - } - } - - # Merge the recalibration reports resulting from by-interval recalibration - # The reports are always the same size - call Processing.GatherBqsrReports as GatherBqsrReports { - input: - input_bqsr_reports = BaseRecalibrator.recalibration_report, - output_report_filename = base_file_name + ".recal_data.csv", - preemptible_tries = preemptible_tries - } - - scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping_with_unmapped) { - # Apply the recalibration model by interval - call Processing.ApplyBQSR as ApplyBQSR { - input: - input_bam = SortSampleBam.output_bam, - output_bam_basename = recalibrated_bam_basename, - recalibration_report = GatherBqsrReports.output_bqsr_report, - sequence_group_interval = subgroup, - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - bqsr_scatter = bqsr_divisor, - compression_level = 
compression_level, - preemptible_tries = agg_preemptible_tries - } - } - - Float agg_bam_size = size(SortSampleBam.output_bam, "GB") - - # Merge the recalibrated BAM files resulting from by-interval recalibration - call Processing.GatherSortedBamFiles as GatherBamFiles { - input: - input_bams = ApplyBQSR.recalibrated_bam, - output_bam_basename = base_file_name, - total_input_size = agg_bam_size, - compression_level = compression_level, - preemptible_tries = agg_preemptible_tries - } - - #BQSR bins the qualities which makes a significantly smaller bam - Float binned_qual_bam_size = size(GatherBamFiles.output_bam, "GB") - - File? chimerism_metrics - - if (!skip_QC) { - # QC the final BAM (consolidated after scattered BQSR) - call QC.CollectReadgroupBamQualityMetrics as CollectReadgroupBamQualityMetrics { - input: - input_bam = GatherBamFiles.output_bam, - input_bam_index = GatherBamFiles.output_bam_index, - output_bam_prefix = base_file_name + ".readgroup", - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - preemptible_tries = agg_preemptible_tries - } - - # QC the final BAM some more (no such thing as too much QC) - call QC.CollectAggregationMetrics as CollectAggregationMetrics { - input: - input_bam = GatherBamFiles.output_bam, - input_bam_index = GatherBamFiles.output_bam_index, - output_bam_prefix = base_file_name, - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - preemptible_tries = agg_preemptible_tries - } - - # QC the sample WGS metrics (stringent thresholds) - call QC.CollectWgsMetrics as CollectWgsMetrics { - input: - input_bam = GatherBamFiles.output_bam, - input_bam_index = GatherBamFiles.output_bam_index, - metrics_filename = base_file_name + ".wgs_metrics", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - wgs_coverage_interval_list = wgs_coverage_interval_list, - read_length = read_length, - preemptible_tries = agg_preemptible_tries - } - - # QC the sample raw WGS metrics (common thresholds) - call QC.CollectRawWgsMetrics as CollectRawWgsMetrics { - input: - input_bam = GatherBamFiles.output_bam, - input_bam_index = GatherBamFiles.output_bam_index, - metrics_filename = base_file_name + ".raw_wgs_metrics", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - wgs_coverage_interval_list = wgs_coverage_interval_list, - read_length = read_length, - preemptible_tries = agg_preemptible_tries - } - - # Generate a checksum per readgroup in the final BAM - call QC.CalculateReadGroupChecksum as CalculateReadGroupChecksum { - input: - input_bam = GatherBamFiles.output_bam, - input_bam_index = GatherBamFiles.output_bam_index, - read_group_md5_filename = recalibrated_bam_basename + ".bam.read_group_md5", - preemptible_tries = agg_preemptible_tries - } - } - - # ValidateSamFile runs out of memory in mate validation on crazy edge case data, so we want to skip the mate validation - # in those cases. These values set the thresholds for what is considered outside the normal realm of "reasonable" data. 
- Float max_duplication_in_reasonable_sample = 0.30 - Float max_chimerism_in_reasonable_sample = 0.15 - - # Convert the final merged recalibrated BAM file to CRAM format - call Utils.ConvertToCram as ConvertToCram { - input: - input_bam = GatherBamFiles.output_bam, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - output_basename = base_file_name, - preemptible_tries = agg_preemptible_tries - } - - if (!skip_QC) { - # Check whether the data has massively high duplication or chimerism rates - call QC.CheckPreValidation as CheckPreValidation { - input: - duplication_metrics = MarkDuplicates.duplicate_metrics, - chimerism_metrics = select_first([CollectAggregationMetrics.alignment_summary_metrics, ""]), - max_duplication_in_reasonable_sample = max_duplication_in_reasonable_sample, - max_chimerism_in_reasonable_sample = max_chimerism_in_reasonable_sample, - preemptible_tries = agg_preemptible_tries - } - } - - Boolean is_outlier_data = select_first([CheckPreValidation.is_outlier_data, false]) - - # Validate the CRAM file - call QC.ValidateSamFile as ValidateCram { - input: - input_bam = ConvertToCram.output_cram, - input_bam_index = ConvertToCram.output_cram_index, - report_filename = base_file_name + ".cram.validation_report", - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ignore = ["MISSING_TAG_NM"], - max_output = 1000000000, - is_outlier_data = is_outlier_data, - preemptible_tries = agg_preemptible_tries - } - - # Break the calling interval_list into sub-intervals - # Perform variant calling on the sub-intervals, and then gather the results - call Utils.ScatterIntervalList as ScatterIntervalList { - input: - interval_list = wgs_calling_interval_list, - scatter_count = haplotype_scatter_count, - break_bands_at_multiples_of = break_bands_at_multiples_of - } - - # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. - # If we take the number we are scattering by and reduce by 20 we will have enough disk space - # to account for the fact that the data is quite uneven across the shards. 
- Int potential_hc_divisor = ScatterIntervalList.interval_count - 20 - Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 - - # Call variants in parallel over WGS calling intervals - scatter (index in range(ScatterIntervalList.interval_count)) { - - if (make_gatk4_single_sample_vcf || use_gatk4_haplotype_caller) { - call Calling.HaplotypeCaller_GATK4_VCF as HaplotypeCaller4 { - input: - contamination = CheckContamination.contamination, - input_bam = GatherBamFiles.output_bam, - interval_list = ScatterIntervalList.out[index], - make_gvcf = !make_gatk4_single_sample_vcf, - vcf_basename = base_file_name, - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - hc_scatter = hc_divisor, - preemptible_tries = agg_preemptible_tries - } - - if (make_gatk4_single_sample_vcf) { - call Calling.HardFilterVcf as FilterVcf { - input: - input_vcf = HaplotypeCaller4.output_vcf, - input_vcf_index = HaplotypeCaller4.output_vcf_index, - vcf_basename = base_file_name, - interval_list = ScatterIntervalList.out[index], - preemptible_tries = preemptible_tries - } - } - } - if (!make_gatk4_single_sample_vcf && !use_gatk4_haplotype_caller) { - call Calling.HaplotypeCaller_GATK35_GVCF as HaplotypeCaller3 { - input: - contamination = CheckContamination.contamination, - input_bam = GatherBamFiles.output_bam, - interval_list = ScatterIntervalList.out[index], - gvcf_basename = base_file_name, - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - hc_scatter = hc_divisor, - preemptible_tries = agg_preemptible_tries - } - } - - File merge_input = select_first([FilterVcf.output_vcf, HaplotypeCaller4.output_vcf, HaplotypeCaller3.output_gvcf]) - File merge_input_index = select_first([FilterVcf.output_vcf_index, HaplotypeCaller4.output_vcf_index, HaplotypeCaller3.output_gvcf_index]) - } - - String name_token = if make_gatk4_single_sample_vcf then ".filtered" else ".g" - - # Combine by-interval VCFs into a single sample VCF file - call Calling.MergeVCFs as MergeVCFs { - input: - input_vcfs = merge_input, - input_vcfs_indexes = merge_input_index, - output_vcf_name = final_vcf_base_name + name_token + ".vcf.gz", - preemptible_tries = agg_preemptible_tries - } - - # Outputs that will be retained when execution is complete - output { - Array[File?] quality_yield_metrics = CollectQualityYieldMetrics.quality_yield_metrics - - Array[File?] unsorted_read_group_base_distribution_by_cycle_pdf = CollectUnsortedReadgroupBamQualityMetrics.base_distribution_by_cycle_pdf - Array[File?] unsorted_read_group_base_distribution_by_cycle_metrics = CollectUnsortedReadgroupBamQualityMetrics.base_distribution_by_cycle_metrics - Array[File?] unsorted_read_group_insert_size_histogram_pdf = CollectUnsortedReadgroupBamQualityMetrics.insert_size_histogram_pdf - Array[File?] unsorted_read_group_insert_size_metrics = CollectUnsortedReadgroupBamQualityMetrics.insert_size_metrics - Array[File?] unsorted_read_group_quality_by_cycle_pdf = CollectUnsortedReadgroupBamQualityMetrics.quality_by_cycle_pdf - Array[File?] unsorted_read_group_quality_by_cycle_metrics = CollectUnsortedReadgroupBamQualityMetrics.quality_by_cycle_metrics - Array[File?] unsorted_read_group_quality_distribution_pdf = CollectUnsortedReadgroupBamQualityMetrics.quality_distribution_pdf - Array[File?] unsorted_read_group_quality_distribution_metrics = CollectUnsortedReadgroupBamQualityMetrics.quality_distribution_metrics - - File? 
read_group_alignment_summary_metrics = CollectReadgroupBamQualityMetrics.alignment_summary_metrics - File? read_group_gc_bias_detail_metrics = CollectReadgroupBamQualityMetrics.gc_bias_detail_metrics - File? read_group_gc_bias_pdf = CollectReadgroupBamQualityMetrics.gc_bias_pdf - File? read_group_gc_bias_summary_metrics = CollectReadgroupBamQualityMetrics.gc_bias_summary_metrics - - File selfSM = CheckContamination.selfSM - Float contamination = CheckContamination.contamination - - File? calculate_read_group_checksum_md5 = CalculateReadGroupChecksum.md5_file - - File? agg_alignment_summary_metrics = CollectAggregationMetrics.alignment_summary_metrics - File? agg_bait_bias_detail_metrics = CollectAggregationMetrics.bait_bias_detail_metrics - File? agg_bait_bias_summary_metrics = CollectAggregationMetrics.bait_bias_summary_metrics - File? agg_gc_bias_detail_metrics = CollectAggregationMetrics.gc_bias_detail_metrics - File? agg_gc_bias_pdf = CollectAggregationMetrics.gc_bias_pdf - File? agg_gc_bias_summary_metrics = CollectAggregationMetrics.gc_bias_summary_metrics - File? agg_insert_size_histogram_pdf = CollectAggregationMetrics.insert_size_histogram_pdf - File? agg_insert_size_metrics = CollectAggregationMetrics.insert_size_metrics - File? agg_pre_adapter_detail_metrics = CollectAggregationMetrics.pre_adapter_detail_metrics - File? agg_pre_adapter_summary_metrics = CollectAggregationMetrics.pre_adapter_summary_metrics - File? agg_quality_distribution_pdf = CollectAggregationMetrics.quality_distribution_pdf - File? agg_quality_distribution_metrics = CollectAggregationMetrics.quality_distribution_metrics - - File? wgs_metrics = CollectWgsMetrics.metrics - File? raw_wgs_metrics = CollectRawWgsMetrics.metrics - - File duplicate_metrics = MarkDuplicates.duplicate_metrics - File output_bqsr_reports = GatherBqsrReports.output_bqsr_report - - File output_cram = ConvertToCram.output_cram - File output_cram_index = ConvertToCram.output_cram_index - File output_cram_md5 = ConvertToCram.output_cram_md5 - - File validate_cram_file_report = ValidateCram.report - - File output_vcf = MergeVCFs.output_vcf - File output_vcf_index = MergeVCFs.output_vcf_index - } -} diff --git a/germline_single_sample_workflow.hg38.inputs.json b/germline_single_sample_workflow.hg38.inputs.json deleted file mode 100644 index c8a885d..0000000 --- a/germline_single_sample_workflow.hg38.inputs.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "##_COMMENT1": "Take note of the .64 extensions on the reference files, issues between 32 and 64 bit OS", - - "##_COMMENT2": "SAMPLE NAME AND UNMAPPED BAMS - read the README to find other examples.", - "germline_single_sample_workflow.sample_name": "NA12878", - "germline_single_sample_workflow.base_file_name": "NA12878", - "germline_single_sample_workflow.flowcell_unmapped_bams": ["gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", - "gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", - "gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06JUADXX130110.1.ATCACGAT.20k_reads.bam"], - "germline_single_sample_workflow.final_gvcf_base_name": "NA12878", - "germline_single_sample_workflow.unmapped_bam_suffix": ".bam", - - "##_COMMENT3": "REFERENCES", - "germline_single_sample_workflow.fingerprint_genotypes_file": "gs://dsde-data-na12878-public/NA12878.hg38.reference.fingerprint.vcf", - "germline_single_sample_workflow.contamination_sites_ud": 
"gs://broad-references/hg38/v0/Homo_sapiens_assembly38.contam.UD", - "germline_single_sample_workflow.contamination_sites_bed": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.contam.bed", - "germline_single_sample_workflow.contamination_sites_mu": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.contam.mu", - "germline_single_sample_workflow.wgs_calling_interval_list": "gs://broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", - "germline_single_sample_workflow.ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", - "germline_single_sample_workflow.ref_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", - "germline_single_sample_workflow.ref_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "germline_single_sample_workflow.ref_alt": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt", - "germline_single_sample_workflow.ref_sa": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa", - "germline_single_sample_workflow.ref_amb": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb", - "germline_single_sample_workflow.ref_bwt": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt", - "germline_single_sample_workflow.ref_ann": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann", - "germline_single_sample_workflow.ref_pac": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac", - "germline_single_sample_workflow.known_indels_sites_VCFs": [ - "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", - "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz" - ], - "germline_single_sample_workflow.known_indels_sites_indices": [ - "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", - "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi" - ], - "germline_single_sample_workflow.dbSNP_vcf": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", - "germline_single_sample_workflow.dbSNP_vcf_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", - "germline_single_sample_workflow.wgs_coverage_interval_list": "gs://broad-references/hg38/v0/wgs_coverage_regions.hg38.interval_list", - "germline_single_sample_workflow.wgs_evaluation_interval_list": "gs://broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list", - "##germline_single_sample_workflow.haplotype_database_file": "File (optional)", - - "##_COMMENT5": "Runtime Attributes", - "germline_single_sample_workflow.flowcell_small_disk": 100, - "germline_single_sample_workflow.flowcell_medium_disk": 200, - "germline_single_sample_workflow.agg_small_disk": 200, - "germline_single_sample_workflow.agg_medium_disk": 300, - "germline_single_sample_workflow.agg_large_disk": 400, - "germline_single_sample_workflow.preemptible_tries": 3, - "germline_single_sample_workflow.agg_preemptible_tries": 3, - - "##_COMMENT6": "MISC", - "germline_single_sample_workflow.break_bands_at_multiples_of": 1000000, - "germline_single_sample_workflow.haplotype_scatter_count": 50 -} - diff --git a/germline_single_sample_workflow.wdl b/germline_single_sample_workflow.wdl deleted file mode 100644 index 2ba8400..0000000 --- a/germline_single_sample_workflow.wdl +++ /dev/null @@ -1,275 +0,0 @@ -## Copyright Broad Institute, 2018 -## -## This WDL pipeline implements data pre-processing and initial variant calling (GVCF -## generation) according to 
the GATK Best Practices (June 2016) for germline SNP and -## Indel discovery in human whole-genome. -## -## Requirements/expectations : -## - Human whole-genome pair-end sequencing data in unmapped BAM (uBAM) format -## - One or more read groups, one per uBAM file, all belonging to a single sample (SM) -## - Input uBAM files must additionally comply with the following requirements: -## - - filenames all have the same suffix (we use ".unmapped.bam") -## - - files must pass validation by ValidateSamFile -## - - reads are provided in query-sorted order -## - - all reads must have an RG tag -## - GVCF output names must end in ".g.vcf.gz" -## - Reference genome must be Hg38 with ALT contigs -## -## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. -## For program versions, see docker containers. -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. Please see the docker -## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed -## licensing information pertaining to the included programs. - -import "./tasks_pipelines/unmapped_bam_to_aligned_bam.wdl" as ToBam -import "./tasks_pipelines/germline_variant_discovery.wdl" as Calling -import "./tasks_pipelines/qc.wdl" as QC -import "./tasks_pipelines/utilities.wdl" as Utils - -# WORKFLOW DEFINITION -workflow germline_single_sample_workflow { - - File contamination_sites_ud - File contamination_sites_bed - File contamination_sites_mu - File? fingerprint_genotypes_file - File? fingerprint_genotypes_index - File? 
haplotype_database_file - File wgs_evaluation_interval_list - File wgs_coverage_interval_list - - String sample_name - String base_file_name - String final_gvcf_base_name - Array[File] flowcell_unmapped_bams - String unmapped_bam_suffix - - File wgs_calling_interval_list - Int haplotype_scatter_count - Int break_bands_at_multiples_of - Int read_length = 250 - - File ref_fasta - File ref_fasta_index - File ref_dict - File ref_alt - File ref_bwt - File ref_sa - File ref_amb - File ref_ann - File ref_pac - - File dbSNP_vcf - File dbSNP_vcf_index - Array[File] known_indels_sites_VCFs - Array[File] known_indels_sites_indices - - Int preemptible_tries - Int agg_preemptible_tries - - call ToBam.to_bam_workflow { - input: - contamination_sites_ud = contamination_sites_ud, - contamination_sites_bed = contamination_sites_bed, - contamination_sites_mu = contamination_sites_mu, - fingerprint_genotypes_file = fingerprint_genotypes_file, - fingerprint_genotypes_index = fingerprint_genotypes_index, - haplotype_database_file = haplotype_database_file, - wgs_coverage_interval_list = wgs_coverage_interval_list, - sample_name = sample_name, - base_file_name = base_file_name, - flowcell_unmapped_bams = flowcell_unmapped_bams, - unmapped_bam_suffix = unmapped_bam_suffix, - read_length = read_length, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - ref_alt = ref_alt, - ref_bwt = ref_bwt, - ref_sa = ref_sa, - ref_amb = ref_amb, - ref_ann = ref_ann, - ref_pac = ref_pac, - dbSNP_vcf = dbSNP_vcf, - dbSNP_vcf_index = dbSNP_vcf_index, - known_indels_sites_VCFs = known_indels_sites_VCFs, - known_indels_sites_indices = known_indels_sites_indices, - preemptible_tries = preemptible_tries, - agg_preemptible_tries = agg_preemptible_tries - } - - # ValidateSamFile runs out of memory in mate validation on crazy edge case data, so we want to skip the mate validation - # in those cases. These values set the thresholds for what is considered outside the normal realm of "reasonable" data. 
- Float max_duplication_in_reasonable_sample = 0.30 - Float max_chimerism_in_reasonable_sample = 0.15 - - # Convert the final merged recalibrated BAM file to CRAM format - call Utils.ConvertToCram as ConvertToCram { - input: - input_bam = to_bam_workflow.output_bam, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - output_basename = base_file_name, - preemptible_tries = agg_preemptible_tries - } - - # Check whether the data has massively high duplication or chimerism rates - call QC.CheckPreValidation as CheckPreValidation { - input: - duplication_metrics = to_bam_workflow.duplicate_metrics, - chimerism_metrics = to_bam_workflow.agg_alignment_summary_metrics, - max_duplication_in_reasonable_sample = max_duplication_in_reasonable_sample, - max_chimerism_in_reasonable_sample = max_chimerism_in_reasonable_sample, - preemptible_tries = agg_preemptible_tries - } - - # Validate the CRAM file - call QC.ValidateSamFile as ValidateCram { - input: - input_bam = ConvertToCram.output_cram, - input_bam_index = ConvertToCram.output_cram_index, - report_filename = base_file_name + ".cram.validation_report", - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ignore = ["MISSING_TAG_NM"], - max_output = 1000000000, - is_outlier_data = CheckPreValidation.is_outlier_data, - preemptible_tries = agg_preemptible_tries - } - - # Break the calling interval_list into sub-intervals - # Perform variant calling on the sub-intervals, and then gather the results - call Utils.ScatterIntervalList as ScatterIntervalList { - input: - interval_list = wgs_calling_interval_list, - scatter_count = haplotype_scatter_count, - break_bands_at_multiples_of = break_bands_at_multiples_of - } - - # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. - # If we take the number we are scattering by and reduce by 20 we will have enough disk space - # to account for the fact that the data is quite uneven across the shards. 
- Int potential_hc_divisor = ScatterIntervalList.interval_count - 20 - Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 - - # Call variants in parallel over WGS calling intervals - scatter (index in range(ScatterIntervalList.interval_count)) { - # Generate GVCF by interval - call Calling.HaplotypeCaller_GATK35_GVCF as HaplotypeCaller { - input: - contamination = to_bam_workflow.contamination, - input_bam = to_bam_workflow.output_bam, - interval_list = ScatterIntervalList.out[index], - gvcf_basename = base_file_name, - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - hc_scatter = hc_divisor, - preemptible_tries = agg_preemptible_tries - } - } - - # Combine by-interval GVCFs into a single sample GVCF file - call Calling.MergeVCFs as MergeVCFs { - input: - input_vcfs = HaplotypeCaller.output_gvcf, - input_vcfs_indexes = HaplotypeCaller.output_gvcf_index, - output_vcf_name = final_gvcf_base_name + ".g.vcf.gz", - preemptible_tries = agg_preemptible_tries - } - - Float gvcf_size = size(MergeVCFs.output_vcf, "GB") - - # Validate the GVCF output of HaplotypeCaller - call QC.ValidateGVCF as ValidateGVCF { - input: - input_vcf = MergeVCFs.output_vcf, - input_vcf_index = MergeVCFs.output_vcf_index, - dbSNP_vcf = dbSNP_vcf, - dbSNP_vcf_index = dbSNP_vcf_index, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - wgs_calling_interval_list = wgs_calling_interval_list, - preemptible_tries = agg_preemptible_tries - } - - # QC the GVCF - call QC.CollectGvcfCallingMetrics as CollectGvcfCallingMetrics { - input: - input_vcf = MergeVCFs.output_vcf, - input_vcf_index = MergeVCFs.output_vcf_index, - metrics_basename = final_gvcf_base_name, - dbSNP_vcf = dbSNP_vcf, - dbSNP_vcf_index = dbSNP_vcf_index, - ref_dict = ref_dict, - wgs_evaluation_interval_list = wgs_evaluation_interval_list, - preemptible_tries = agg_preemptible_tries - } - - # Outputs that will be retained when execution is complete - output { - Array[File] quality_yield_metrics = to_bam_workflow.quality_yield_metrics - - Array[File] unsorted_read_group_base_distribution_by_cycle_pdf = to_bam_workflow.unsorted_read_group_base_distribution_by_cycle_pdf - Array[File] unsorted_read_group_base_distribution_by_cycle_metrics = to_bam_workflow.unsorted_read_group_base_distribution_by_cycle_metrics - Array[File] unsorted_read_group_insert_size_histogram_pdf = to_bam_workflow.unsorted_read_group_insert_size_histogram_pdf - Array[File] unsorted_read_group_insert_size_metrics = to_bam_workflow.unsorted_read_group_insert_size_metrics - Array[File] unsorted_read_group_quality_by_cycle_pdf = to_bam_workflow.unsorted_read_group_quality_by_cycle_pdf - Array[File] unsorted_read_group_quality_by_cycle_metrics = to_bam_workflow.unsorted_read_group_quality_by_cycle_metrics - Array[File] unsorted_read_group_quality_distribution_pdf = to_bam_workflow.unsorted_read_group_quality_distribution_pdf - Array[File] unsorted_read_group_quality_distribution_metrics = to_bam_workflow.unsorted_read_group_quality_distribution_metrics - - File read_group_alignment_summary_metrics = to_bam_workflow.read_group_alignment_summary_metrics - File read_group_gc_bias_detail_metrics = to_bam_workflow.read_group_gc_bias_detail_metrics - File read_group_gc_bias_pdf = to_bam_workflow.read_group_gc_bias_pdf - File read_group_gc_bias_summary_metrics = to_bam_workflow.read_group_gc_bias_summary_metrics - - File? 
cross_check_fingerprints_metrics = to_bam_workflow.cross_check_fingerprints_metrics - - File selfSM = to_bam_workflow.selfSM - Float contamination = to_bam_workflow.contamination - - File calculate_read_group_checksum_md5 = to_bam_workflow.calculate_read_group_checksum_md5 - - File agg_alignment_summary_metrics = to_bam_workflow.agg_alignment_summary_metrics - File agg_bait_bias_detail_metrics = to_bam_workflow.agg_bait_bias_detail_metrics - File agg_bait_bias_summary_metrics = to_bam_workflow.agg_bait_bias_summary_metrics - File agg_gc_bias_detail_metrics = to_bam_workflow.agg_gc_bias_detail_metrics - File agg_gc_bias_pdf = to_bam_workflow.agg_gc_bias_pdf - File agg_gc_bias_summary_metrics = to_bam_workflow.agg_gc_bias_summary_metrics - File agg_insert_size_histogram_pdf = to_bam_workflow.agg_insert_size_histogram_pdf - File agg_insert_size_metrics = to_bam_workflow.agg_insert_size_metrics - File agg_pre_adapter_detail_metrics = to_bam_workflow.agg_pre_adapter_detail_metrics - File agg_pre_adapter_summary_metrics = to_bam_workflow.agg_pre_adapter_summary_metrics - File agg_quality_distribution_pdf = to_bam_workflow.agg_quality_distribution_pdf - File agg_quality_distribution_metrics = to_bam_workflow.agg_quality_distribution_metrics - - File? fingerprint_summary_metrics = to_bam_workflow.fingerprint_summary_metrics - File? fingerprint_detail_metrics = to_bam_workflow.fingerprint_detail_metrics - - File wgs_metrics = to_bam_workflow.wgs_metrics - File raw_wgs_metrics = to_bam_workflow.raw_wgs_metrics - - File duplicate_metrics = to_bam_workflow.duplicate_metrics - File output_bqsr_reports = to_bam_workflow.output_bqsr_reports - - File gvcf_summary_metrics = CollectGvcfCallingMetrics.summary_metrics - File gvcf_detail_metrics = CollectGvcfCallingMetrics.detail_metrics - - File output_cram = ConvertToCram.output_cram - File output_cram_index = ConvertToCram.output_cram_index - File output_cram_md5 = ConvertToCram.output_cram_md5 - - File validate_cram_file_report = ValidateCram.report - - File output_vcf = MergeVCFs.output_vcf - File output_vcf_index = MergeVCFs.output_vcf_index - } -} diff --git a/structs/GermlineStructs.wdl b/structs/GermlineStructs.wdl new file mode 100644 index 0000000..485e383 --- /dev/null +++ b/structs/GermlineStructs.wdl @@ -0,0 +1,72 @@ +version 1.0 + +struct SampleAndUnmappedBams { + String base_file_name + String final_gvcf_base_name + Array[File] flowcell_unmapped_bams + String sample_name + String unmapped_bam_suffix +} + +struct ReferenceFasta { + File ref_dict + File ref_fasta + File ref_fasta_index + File ref_alt + File ref_sa + File ref_amb + File ref_bwt + File ref_ann + File ref_pac +} + +struct GermlineSingleSampleReferences { + File? fingerprint_genotypes_file + File? 
fingerprint_genotypes_index + + File contamination_sites_ud + File contamination_sites_bed + File contamination_sites_mu + File calling_interval_list + + Int haplotype_scatter_count + Int break_bands_at_multiples_of + + ReferenceFasta reference_fasta + + Array[File] known_indels_sites_vcfs + Array[File] known_indels_sites_indices + + File dbsnp_vcf + File dbsnp_vcf_index + + File evaluation_interval_list +} + +struct ExomeGermlineSingleSampleOligos { + File target_interval_list + File bait_interval_list + String bait_set_name +} + +struct CrossSpeciesContaminationReferences { + File filter_bwa_image + File kmer_file + File meats_bwa_image + File meats_fasta + File meats_fasta_dict + File meats_taxonomy_file + File microbe_bwa_image + File microbe_fasta + File microbe_fasta_dict + File microbe_taxonomy_file + File normalization_file + File metrics_script_file + Float score_min_identity + Int reads_after_downsampling +} + +struct PapiSettings { + Int preemptible_tries + Int agg_preemptible_tries +} diff --git a/tasks/AggregatedBamQC.wdl b/tasks/AggregatedBamQC.wdl new file mode 100644 index 0000000..acd5445 --- /dev/null +++ b/tasks/AggregatedBamQC.wdl @@ -0,0 +1,112 @@ +version 1.0 +## Copyright Broad Institute, 2018 +## +## This WDL pipeline implements data processing according to the GATK Best Practices (June 2016) +## for human whole-genome and exome sequencing data. +## +## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# Local import +#import "./Qc.wdl" as QC +#import "../structs/GermlineStructs.wdl" + +# Git URL import +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl" + +# WORKFLOW DEFINITION +workflow AggregatedBamQC { +input { + File base_recalibrated_bam + File base_recalibrated_bam_index + String base_name + String sample_name + String recalibrated_bam_base_name + File? 
haplotype_database_file + GermlineSingleSampleReferences references + PapiSettings papi_settings + } + + # QC the final BAM (consolidated after scattered BQSR) + call QC.CollectReadgroupBamQualityMetrics as CollectReadgroupBamQualityMetrics { + input: + input_bam = base_recalibrated_bam, + input_bam_index = base_recalibrated_bam_index, + output_bam_prefix = base_name + ".readgroup", + ref_dict = references.reference_fasta.ref_dict, + ref_fasta = references.reference_fasta.ref_fasta, + ref_fasta_index = references.reference_fasta.ref_fasta_index, + preemptible_tries = papi_settings.agg_preemptible_tries + } + + # QC the final BAM some more (no such thing as too much QC) + call QC.CollectAggregationMetrics as CollectAggregationMetrics { + input: + input_bam = base_recalibrated_bam, + input_bam_index = base_recalibrated_bam_index, + output_bam_prefix = base_name, + ref_dict = references.reference_fasta.ref_dict, + ref_fasta = references.reference_fasta.ref_fasta, + ref_fasta_index = references.reference_fasta.ref_fasta_index, + preemptible_tries = papi_settings.agg_preemptible_tries + } + + if (defined(haplotype_database_file) && defined(references.fingerprint_genotypes_file)) { + # Check the sample BAM fingerprint against the sample array + call QC.CheckFingerprint as CheckFingerprint { + input: + input_bam = base_recalibrated_bam, + input_bam_index = base_recalibrated_bam_index, + haplotype_database_file = haplotype_database_file, + genotypes = references.fingerprint_genotypes_file, + genotypes_index = references.fingerprint_genotypes_index, + output_basename = base_name, + sample = sample_name, + preemptible_tries = papi_settings.agg_preemptible_tries + } + } + + # Generate a checksum per readgroup in the final BAM + call QC.CalculateReadGroupChecksum as CalculateReadGroupChecksum { + input: + input_bam = base_recalibrated_bam, + input_bam_index = base_recalibrated_bam_index, + read_group_md5_filename = recalibrated_bam_base_name + ".bam.read_group_md5", + preemptible_tries = papi_settings.agg_preemptible_tries + } + + output { + File read_group_alignment_summary_metrics = CollectReadgroupBamQualityMetrics.alignment_summary_metrics + File read_group_gc_bias_detail_metrics = CollectReadgroupBamQualityMetrics.gc_bias_detail_metrics + File read_group_gc_bias_pdf = CollectReadgroupBamQualityMetrics.gc_bias_pdf + File read_group_gc_bias_summary_metrics = CollectReadgroupBamQualityMetrics.gc_bias_summary_metrics + + File calculate_read_group_checksum_md5 = CalculateReadGroupChecksum.md5_file + + File agg_alignment_summary_metrics = CollectAggregationMetrics.alignment_summary_metrics + File agg_bait_bias_detail_metrics = CollectAggregationMetrics.bait_bias_detail_metrics + File agg_bait_bias_summary_metrics = CollectAggregationMetrics.bait_bias_summary_metrics + File agg_gc_bias_detail_metrics = CollectAggregationMetrics.gc_bias_detail_metrics + File agg_gc_bias_pdf = CollectAggregationMetrics.gc_bias_pdf + File agg_gc_bias_summary_metrics = CollectAggregationMetrics.gc_bias_summary_metrics + File agg_insert_size_histogram_pdf = CollectAggregationMetrics.insert_size_histogram_pdf + File agg_insert_size_metrics = CollectAggregationMetrics.insert_size_metrics + File agg_pre_adapter_detail_metrics = CollectAggregationMetrics.pre_adapter_detail_metrics + File agg_pre_adapter_summary_metrics = CollectAggregationMetrics.pre_adapter_summary_metrics + File agg_quality_distribution_pdf = CollectAggregationMetrics.quality_distribution_pdf + File agg_quality_distribution_metrics = 
CollectAggregationMetrics.quality_distribution_metrics + File agg_error_summary_metrics = CollectAggregationMetrics.error_summary_metrics + + File? fingerprint_summary_metrics = CheckFingerprint.summary_metrics + File? fingerprint_detail_metrics = CheckFingerprint.detail_metrics + } +} diff --git a/tasks_pipelines/alignment.wdl b/tasks/Alignment.wdl similarity index 57% rename from tasks_pipelines/alignment.wdl rename to tasks/Alignment.wdl index 0ba0334..dad2882 100644 --- a/tasks_pipelines/alignment.wdl +++ b/tasks/Alignment.wdl @@ -1,3 +1,4 @@ +version 1.0 ## Copyright Broad Institute, 2018 ## ## This WDL defines tasks used for alignment of human whole-genome or exome sequencing data. @@ -13,6 +14,12 @@ ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed ## licensing information pertaining to the included programs. +# Local Import +#import "../structs/GermlineStructs.wdl" + +# Git URL Import +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl" + # Get version of BWA task GetBwaVersion { command { @@ -23,39 +30,34 @@ task GetBwaVersion { sed 's/Version: //' } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - memory: "1 GB" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + memory: "1 GiB" } output { - String version = read_string(stdout()) + String bwa_version = read_string(stdout()) } } # Read unmapped BAM, convert on-the-fly to FASTQ and stream to BWA MEM for alignment, then stream to MergeBamAlignment task SamToFastqAndBwaMemAndMba { - File input_bam - String bwa_commandline - String bwa_version - String output_bam_basename - File ref_fasta - File ref_fasta_index - File ref_dict - - # This is the .alt file from bwa-kit (https://github.com/lh3/bwa/tree/master/bwakit), - # listing the reference contigs that are "alternative". - File ref_alt - - File ref_amb - File ref_ann - File ref_bwt - File ref_pac - File ref_sa - Int compression_level - Int preemptible_tries - - Float unmapped_bam_size = size(input_bam, "GB") - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Float bwa_ref_size = ref_size + size(ref_alt, "GB") + size(ref_amb, "GB") + size(ref_ann, "GB") + size(ref_bwt, "GB") + size(ref_pac, "GB") + size(ref_sa, "GB") + input { + File input_bam + String bwa_commandline + String bwa_version + String output_bam_basename + + # reference_fasta.ref_alt is the .alt file from bwa-kit + # (https://github.com/lh3/bwa/tree/master/bwakit), + # listing the reference contigs that are "alternative". + ReferenceFasta reference_fasta + + Int compression_level + Int preemptible_tries + } + + Float unmapped_bam_size = size(input_bam, "GiB") + Float ref_size = size(reference_fasta.ref_fasta, "GiB") + size(reference_fasta.ref_fasta_index, "GiB") + size(reference_fasta.ref_dict, "GiB") + Float bwa_ref_size = ref_size + size(reference_fasta.ref_alt, "GiB") + size(reference_fasta.ref_amb, "GiB") + size(reference_fasta.ref_ann, "GiB") + size(reference_fasta.ref_bwt, "GiB") + size(reference_fasta.ref_pac, "GiB") + size(reference_fasta.ref_sa, "GiB") # Sometimes the output is larger than the input, or a task can spill to disk. # In these cases we need to account for the input (1) and the output (1.5) or the input(1), the output(1), and spillage (.5). 
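  # As a rough worked example of that accounting: with the 2.5 multiplier below, a 40 GiB uBAM reserves roughly 100 GiB for the input, the aligned output, and spill, on top of the BWA reference files.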
Float disk_multiplier = 2.5 @@ -66,17 +68,17 @@ task SamToFastqAndBwaMemAndMba { set -e # set the bash variable needed for the command-line - bash_ref_fasta=${ref_fasta} - # if ref_alt has data in it, - if [ -s ${ref_alt} ]; then - java -Xms5000m -jar /usr/gitc/picard.jar \ + bash_ref_fasta=~{reference_fasta.ref_fasta} + # if reference_fasta.ref_alt has data in it, + if [ -s ~{reference_fasta.ref_alt} ]; then + java -Xms1000m -Xmx1000m -jar /usr/gitc/picard.jar \ SamToFastq \ - INPUT=${input_bam} \ + INPUT=~{input_bam} \ FASTQ=/dev/stdout \ INTERLEAVE=true \ NON_PF=true | \ - /usr/gitc/${bwa_commandline} /dev/stdin - 2> >(tee ${output_bam_basename}.bwa.stderr.log >&2) | \ - java -Dsamjdk.compression_level=${compression_level} -Xms3000m -jar /usr/gitc/picard.jar \ + /usr/gitc/~{bwa_commandline} /dev/stdin - 2> >(tee ~{output_bam_basename}.bwa.stderr.log >&2) | \ + java -Dsamjdk.compression_level=~{compression_level} -Xms1000m -Xmx1000m -jar /usr/gitc/picard.jar \ MergeBamAlignment \ VALIDATION_STRINGENCY=SILENT \ EXPECTED_ORIENTATIONS=FR \ @@ -84,9 +86,9 @@ task SamToFastqAndBwaMemAndMba { ATTRIBUTES_TO_REMOVE=NM \ ATTRIBUTES_TO_REMOVE=MD \ ALIGNED_BAM=/dev/stdin \ - UNMAPPED_BAM=${input_bam} \ - OUTPUT=${output_bam_basename}.bam \ - REFERENCE_SEQUENCE=${ref_fasta} \ + UNMAPPED_BAM=~{input_bam} \ + OUTPUT=~{output_bam_basename}.bam \ + REFERENCE_SEQUENCE=~{reference_fasta.ref_fasta} \ PAIRED_RUN=true \ SORT_ORDER="unsorted" \ IS_BISULFITE_SEQUENCE=false \ @@ -97,42 +99,44 @@ task SamToFastqAndBwaMemAndMba { MAX_INSERTIONS_OR_DELETIONS=-1 \ PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ PROGRAM_RECORD_ID="bwamem" \ - PROGRAM_GROUP_VERSION="${bwa_version}" \ - PROGRAM_GROUP_COMMAND_LINE="${bwa_commandline}" \ + PROGRAM_GROUP_VERSION="~{bwa_version}" \ + PROGRAM_GROUP_COMMAND_LINE="~{bwa_commandline}" \ PROGRAM_GROUP_NAME="bwamem" \ UNMAPPED_READ_STRATEGY=COPY_TO_TAG \ ALIGNER_PROPER_PAIR_FLAGS=true \ UNMAP_CONTAMINANT_READS=true \ ADD_PG_TAG_TO_READS=false - grep -m1 "read .* ALT contigs" ${output_bam_basename}.bwa.stderr.log | \ + grep -m1 "read .* ALT contigs" ~{output_bam_basename}.bwa.stderr.log | \ grep -v "read 0 ALT contigs" - # else ref_alt is empty or could not be found + # else reference_fasta.ref_alt is empty or could not be found else exit 1; fi >>> runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" preemptible: preemptible_tries - memory: "14 GB" + memory: "14 GiB" cpu: "16" disks: "local-disk " + disk_size + " HDD" } output { - File output_bam = "${output_bam_basename}.bam" - File bwa_stderr_log = "${output_bam_basename}.bwa.stderr.log" + File output_bam = "~{output_bam_basename}.bam" + File bwa_stderr_log = "~{output_bam_basename}.bwa.stderr.log" } } task SamSplitter { - File input_bam - Int n_reads - Int preemptible_tries - Int compression_level + input { + File input_bam + Int n_reads + Int preemptible_tries + Int compression_level + } - Float unmapped_bam_size = size(input_bam, "GB") + Float unmapped_bam_size = size(input_bam, "GiB") # Since the output bams are less compressed than the input bam we need a disk multiplier that's larger than 2. 
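  # e.g. with the 2.5 multiplier below, a 100 GiB input BAM requests ceil(2.5 * 100 + 20) = 270 GiB of local disk for the split output.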
Float disk_multiplier = 2.5 Int disk_size = ceil(disk_multiplier * unmapped_bam_size + 20) @@ -141,21 +145,21 @@ task SamSplitter { set -e mkdir output_dir - total_reads=$(samtools view -c ${input_bam}) + total_reads=$(samtools view -c ~{input_bam}) - java -Dsamjdk.compression_level=${compression_level} -Xms3000m -jar /usr/gitc/picard.jar SplitSamByNumberOfReads \ - INPUT=${input_bam} \ + java -Dsamjdk.compression_level=~{compression_level} -Xms3000m -jar /usr/gitc/picard.jar SplitSamByNumberOfReads \ + INPUT=~{input_bam} \ OUTPUT=output_dir \ - SPLIT_TO_N_READS=${n_reads} \ + SPLIT_TO_N_READS=~{n_reads} \ TOTAL_READS_IN_INPUT=$total_reads } output { Array[File] split_bams = glob("output_dir/*.bam") } runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" preemptible: preemptible_tries - memory: "3.75 GB" + memory: "3.75 GiB" disks: "local-disk " + disk_size + " HDD" } } diff --git a/tasks/BamProcessing.wdl b/tasks/BamProcessing.wdl new file mode 100644 index 0000000..a006c2f --- /dev/null +++ b/tasks/BamProcessing.wdl @@ -0,0 +1,490 @@ +version 1.0 + +## Copyright Broad Institute, 2018 +## +## This WDL defines tasks used for BAM file processing of human whole-genome or exome sequencing data. +## +## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# Sort BAM file by coordinate order +task SortSam { + input { + File input_bam + String output_bam_basename + Int preemptible_tries + Int compression_level + } + # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs + # more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier + Float sort_sam_disk_multiplier = 3.25 + Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20 + + command { + java -Dsamjdk.compression_level=~{compression_level} -Xms4000m -jar /usr/gitc/picard.jar \ + SortSam \ + INPUT=~{input_bam} \ + OUTPUT=~{output_bam_basename}.bam \ + SORT_ORDER="coordinate" \ + CREATE_INDEX=true \ + CREATE_MD5_FILE=true \ + MAX_RECORDS_IN_RAM=300000 + + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + disks: "local-disk " + disk_size + " HDD" + cpu: "1" + memory: "5000 MiB" + preemptible: preemptible_tries + } + output { + File output_bam = "~{output_bam_basename}.bam" + File output_bam_index = "~{output_bam_basename}.bai" + File output_bam_md5 = "~{output_bam_basename}.bam.md5" + } +} + +# Sort BAM file by coordinate order -- using Spark! 
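+# SortSamSpark is the GATK4 Spark counterpart of Picard SortSam; as configured below it runs Spark in local mode ('local[16]') on a single 16-core machine rather than on a cluster.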
+task SortSamSpark { + input { + File input_bam + String output_bam_basename + Int preemptible_tries + Int compression_level + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" + } + # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs + # more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier + Float sort_sam_disk_multiplier = 3.25 + Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20 + + command { + set -e + + gatk --java-options "-Dsamjdk.compression_level=~{compression_level} -Xms100g -Xmx100g" \ + SortSamSpark \ + -I ~{input_bam} \ + -O ~{output_bam_basename}.bam \ + -- --conf spark.local.dir=. --spark-master 'local[16]' --conf 'spark.kryo.referenceTracking=false' + + samtools index ~{output_bam_basename}.bam ~{output_bam_basename}.bai + } + runtime { + docker: gatk_docker + disks: "local-disk " + disk_size + " HDD" + bootDiskSizeGb: "15" + cpu: "16" + memory: "102 GiB" + preemptible: preemptible_tries + } + output { + File output_bam = "~{output_bam_basename}.bam" + File output_bam_index = "~{output_bam_basename}.bai" + } +} + +# Mark duplicate reads to avoid counting non-independent observations +task MarkDuplicates { + input { + Array[File] input_bams + String output_bam_basename + String metrics_filename + Float total_input_size + Int compression_level + Int preemptible_tries + + # The program default for READ_NAME_REGEX is appropriate in nearly every case. + # Sometimes we wish to supply "null" in order to turn off optical duplicate detection + # This can be desirable if you don't mind the estimated library size being wrong and optical duplicate detection is taking >7 days and failing + String? read_name_regex + Int memory_multiplier = 1 + } + + # The merged bam will be smaller than the sum of the parts so we need to account for the unmerged inputs and the merged output. + # Mark Duplicates takes in as input readgroup bams and outputs a slightly smaller aggregated bam. Giving .25 as wiggleroom + Float md_disk_multiplier = 3 + Int disk_size = ceil(md_disk_multiplier * total_input_size) + 20 + + Int memory_size = ceil(8 * memory_multiplier) + Int java_memory_size = (memory_size - 2) + + # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly + # This works because the output of BWA is query-grouped and therefore, so is the output of MergeBamAlignment. 
+ # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname" + + command { + java -Dsamjdk.compression_level=~{compression_level} -Xms~{java_memory_size}g -jar /usr/gitc/picard.jar \ + MarkDuplicates \ + INPUT=~{sep=' INPUT=' input_bams} \ + OUTPUT=~{output_bam_basename}.bam \ + METRICS_FILE=~{metrics_filename} \ + VALIDATION_STRINGENCY=SILENT \ + ~{"READ_NAME_REGEX=" + read_name_regex} \ + OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ + ASSUME_SORT_ORDER="queryname" \ + CLEAR_DT="false" \ + ADD_PG_TAG_TO_READS=false + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + preemptible: preemptible_tries + memory: "~{memory_size} GiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bam = "~{output_bam_basename}.bam" + File duplicate_metrics = "~{metrics_filename}" + } +} + +task MarkDuplicatesSpark { + input { + Array[File] input_bams + String output_bam_basename + String metrics_filename + Float total_input_size + Int compression_level + Int preemptible_tries + + String? read_name_regex + Int memory_multiplier = 3 + Int cpu_size = 6 + } + + # The merged bam will be smaller than the sum of the parts so we need to account for the unmerged inputs and the merged output. + # Mark Duplicates takes in as input readgroup bams and outputs a slightly smaller aggregated bam. Giving 2.5 as wiggleroom + Float md_disk_multiplier = 2.5 + Int disk_size = ceil(md_disk_multiplier * total_input_size) + 20 + + Int memory_size = ceil(16 * memory_multiplier) + Int java_memory_size = (memory_size - 6) + + String output_bam_location = "~{output_bam_basename}.bam" + + # Removed options ASSUME_SORT_ORDER, CLEAR_DT, and ADD_PG_TAG_TO_READS as it seems like they are a) not implemented + # in MarkDuplicatesSpark, and/or b) are set to "false" aka "don't do" anyhow. 
+ # MarkDuplicatesSpark requires PAPIv2 + command <<< + set -e + export GATK_LOCAL_JAR=/root/gatk.jar + gatk --java-options "-Dsamjdk.compression_level=~{compression_level} -Xmx~{java_memory_size}g" \ + MarkDuplicatesSpark \ + --input ~{sep=' --input ' input_bams} \ + --output ~{output_bam_location} \ + --metrics-file ~{metrics_filename} \ + --read-validation-stringency SILENT \ + ~{"--read-name-regex " + read_name_regex} \ + --optical-duplicate-pixel-distance 2500 \ + --treat-unsorted-as-querygroup-ordered \ + --create-output-bam-index false \ + -- --conf spark.local.dir=/mnt/tmp --spark-master 'local[16]' --conf 'spark.kryo.referenceTracking=false' + >>> + + runtime { + docker: "jamesemery/gatknightly:gatkMasterSnapshot44ca2e9e84a" + disks: "/mnt/tmp " + ceil(2.1 * total_input_size) + " LOCAL, local-disk " + disk_size + " HDD" + bootDiskSizeGb: "50" + cpu: cpu_size + memory: "~{memory_size} GiB" + preemptible: preemptible_tries + } + + output { + File output_bam = output_bam_location + File duplicate_metrics = metrics_filename + } +} + +# Generate Base Quality Score Recalibration (BQSR) model +task BaseRecalibrator { + input { + File input_bam + String recalibration_report_filename + Array[String] sequence_group_interval + File dbsnp_vcf + File dbsnp_vcf_index + Array[File] known_indels_sites_vcfs + Array[File] known_indels_sites_indices + File ref_dict + File ref_fasta + File ref_fasta_index + Int bqsr_scatter + Int preemptible_tries + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" + } + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") + Float dbsnp_size = size(dbsnp_vcf, "GiB") + Int disk_size = ceil((size(input_bam, "GiB") / bqsr_scatter) + ref_size + dbsnp_size) + 20 + + parameter_meta { + input_bam: { + localization_optional: true + } + } + + command { + gatk --java-options "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal \ + -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCDetails \ + -Xloggc:gc_log.log -Xms4000m" \ + BaseRecalibrator \ + -R ~{ref_fasta} \ + -I ~{input_bam} \ + --use-original-qualities \ + -O ~{recalibration_report_filename} \ + --known-sites ~{dbsnp_vcf} \ + --known-sites ~{sep=" -known-sites " known_indels_sites_vcfs} \ + -L ~{sep=" -L " sequence_group_interval} + } + runtime { + docker: gatk_docker + preemptible: preemptible_tries + memory: "6 GiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File recalibration_report = "~{recalibration_report_filename}" + } +} + +# Apply Base Quality Score Recalibration (BQSR) model +task ApplyBQSR { + input { + File input_bam + String output_bam_basename + File recalibration_report + Array[String] sequence_group_interval + File ref_dict + File ref_fasta + File ref_fasta_index + Int compression_level + Int bqsr_scatter + Int preemptible_tries + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" + Int memory_multiplier = 1 + } + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") + Int disk_size = ceil((size(input_bam, "GiB") * 3 / bqsr_scatter) + ref_size) + 20 + + Int memory_size = ceil(3500 * memory_multiplier) + + parameter_meta { + input_bam: { + localization_optional: true + } + } + + command { + gatk --java-options "-XX:+PrintFlagsFinal -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps \ + -XX:+PrintGCDetails -Xloggc:gc_log.log \ + -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Dsamjdk.compression_level=~{compression_level} -Xms3000m" \ + ApplyBQSR \ + --create-output-bam-md5 \ + 
--add-output-sam-program-record \ + -R ~{ref_fasta} \ + -I ~{input_bam} \ + --use-original-qualities \ + -O ~{output_bam_basename}.bam \ + -bqsr ~{recalibration_report} \ + --static-quantized-quals 10 \ + --static-quantized-quals 20 \ + --static-quantized-quals 30 \ + -L ~{sep=" -L " sequence_group_interval} + } + runtime { + docker: gatk_docker + preemptible: preemptible_tries + memory: "~{memory_size} MiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File recalibrated_bam = "~{output_bam_basename}.bam" + File recalibrated_bam_checksum = "~{output_bam_basename}.bam.md5" + } +} + +# Combine multiple recalibration tables from scattered BaseRecalibrator runs +task GatherBqsrReports { + input { + Array[File] input_bqsr_reports + String output_report_filename + Int preemptible_tries + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" + } + + command { + gatk --java-options "-Xms3000m" \ + GatherBQSRReports \ + -I ~{sep=' -I ' input_bqsr_reports} \ + -O ~{output_report_filename} + } + runtime { + docker: gatk_docker + preemptible: preemptible_tries + memory: "3500 MiB" + disks: "local-disk 20 HDD" + } + output { + File output_bqsr_report = "~{output_report_filename}" + } +} + +# Combine multiple *sorted* BAM files +task GatherSortedBamFiles { + input { + Array[File] input_bams + String output_bam_basename + Float total_input_size + Int compression_level + Int preemptible_tries + } + + # Multiply the input bam size by two to account for the input and output + Int disk_size = ceil(2 * total_input_size) + 20 + + command { + java -Dsamjdk.compression_level=~{compression_level} -Xms2000m -jar /usr/gitc/picard.jar \ + GatherBamFiles \ + INPUT=~{sep=' INPUT=' input_bams} \ + OUTPUT=~{output_bam_basename}.bam \ + CREATE_INDEX=true \ + CREATE_MD5_FILE=true + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + preemptible: preemptible_tries + memory: "3 GiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bam = "~{output_bam_basename}.bam" + File output_bam_index = "~{output_bam_basename}.bai" + File output_bam_md5 = "~{output_bam_basename}.bam.md5" + } +} + +# Combine multiple *unsorted* BAM files +# Note that if/when WDL supports optional outputs, we should merge this task with the sorted version +task GatherUnsortedBamFiles { + input { + Array[File] input_bams + String output_bam_basename + Float total_input_size + Int compression_level + Int preemptible_tries + } + + # Multiply the input bam size by two to account for the input and output + Int disk_size = ceil(2 * total_input_size) + 20 + + command { + java -Dsamjdk.compression_level=~{compression_level} -Xms2000m -jar /usr/gitc/picard.jar \ + GatherBamFiles \ + INPUT=~{sep=' INPUT=' input_bams} \ + OUTPUT=~{output_bam_basename}.bam \ + CREATE_INDEX=false \ + CREATE_MD5_FILE=false + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + preemptible: preemptible_tries + memory: "3 GiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bam = "~{output_bam_basename}.bam" + } +} + +# Notes on the contamination estimate: +# The contamination value is read from the FREEMIX field of the selfSM file output by verifyBamId +# +# In Zamboni production, this value is stored directly in METRICS.AGGREGATION_CONTAM +# +# Contamination is also stored in GVCF_CALLING and thereby passed to HAPLOTYPE_CALLER +# But first, it is divided by an underestimation factor thusly: +# float(FREEMIX) / 
ContaminationUnderestimationFactor +# where the denominator is hardcoded in Zamboni: +# val ContaminationUnderestimationFactor = 0.75f +# +# Here, I am handling this by returning both the original selfSM file for reporting, and the adjusted +# contamination estimate for use in variant calling +task CheckContamination { + input { + File input_bam + File input_bam_index + File contamination_sites_ud + File contamination_sites_bed + File contamination_sites_mu + File ref_fasta + File ref_fasta_index + String output_prefix + Int preemptible_tries + Float contamination_underestimation_factor + Boolean disable_sanity_check = false + } + + Int disk_size = ceil(size(input_bam, "GiB") + size(ref_fasta, "GiB")) + 30 + + command <<< + set -e + + # creates a ~{output_prefix}.selfSM file, a TSV file with 2 rows, 19 columns. + # First row are the keys (e.g., SEQ_SM, RG, FREEMIX), second row are the associated values + /usr/gitc/VerifyBamID \ + --Verbose \ + --NumPC 4 \ + --Output ~{output_prefix} \ + --BamFile ~{input_bam} \ + --Reference ~{ref_fasta} \ + --UDPath ~{contamination_sites_ud} \ + --MeanPath ~{contamination_sites_mu} \ + --BedPath ~{contamination_sites_bed} \ + ~{true="--DisableSanityCheck" false="" disable_sanity_check} \ + 1>/dev/null + + # used to read from the selfSM file and calculate contamination, which gets printed out + python3 <<CODE + import csv + with open('~{output_prefix}.selfSM') as selfSM: + reader = csv.DictReader(selfSM, delimiter='\t') + for row in reader: + # adjusted contamination = FREEMIX / contamination_underestimation_factor (see the note above) + print(float(row["FREEMIX"]) / ~{contamination_underestimation_factor}) + CODE + >>> + runtime { + preemptible: preemptible_tries + memory: "4 GiB" + disks: "local-disk " + disk_size + " HDD" + docker: "us.gcr.io/broad-gotc-prod/verify-bam-id:c1cba76e979904eb69c31520a0d7f5be63c72253-1553018888" + cpu: "2" + } + output { + File selfSM = "~{output_prefix}.selfSM" + Float contamination = read_float(stdout()) + } +} diff --git a/tasks/BamToCram.wdl b/tasks/BamToCram.wdl new file mode 100644 index 0000000..4be2b66 --- /dev/null +++ b/tasks/BamToCram.wdl @@ -0,0 +1,72 @@ +version 1.0 + +# Local Import +#import "Utilities.wdl" as Utils +#import "Qc.wdl" as QC + +# Git URL Import +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC + +workflow BamToCram { + + input { + File input_bam + File ref_fasta + File ref_fasta_index + File ref_dict + File duplication_metrics + File chimerism_metrics + String base_file_name + Int agg_preemptible_tries + } + + + # ValidateSamFile runs out of memory in mate validation on crazy edge case data, so we want to skip the mate validation + # in those cases. These values set the thresholds for what is considered outside the normal realm of "reasonable" data.
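+ # e.g. with the defaults below (30% duplication, 15% chimerism), a sample showing 35% duplication is flagged as outlier data and mate validation is skipped during CRAM validation.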
+ Float max_duplication_in_reasonable_sample = 0.30 + Float max_chimerism_in_reasonable_sample = 0.15 + + # Convert the final merged recalibrated BAM file to CRAM format + call Utils.ConvertToCram as ConvertToCram { + input: + input_bam = input_bam, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + output_basename = base_file_name, + preemptible_tries = agg_preemptible_tries + } + + # Check whether the data has massively high duplication or chimerism rates + call QC.CheckPreValidation as CheckPreValidation { + input: + duplication_metrics = duplication_metrics, + chimerism_metrics = chimerism_metrics, + max_duplication_in_reasonable_sample = max_duplication_in_reasonable_sample, + max_chimerism_in_reasonable_sample = max_chimerism_in_reasonable_sample, + preemptible_tries = agg_preemptible_tries + } + + # Validate the CRAM file + call QC.ValidateSamFile as ValidateCram { + input: + input_bam = ConvertToCram.output_cram, + input_bam_index = ConvertToCram.output_cram_index, + report_filename = base_file_name + ".cram.validation_report", + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ignore = ["MISSING_TAG_NM"], + max_output = 1000000000, + is_outlier_data = CheckPreValidation.is_outlier_data, + preemptible_tries = agg_preemptible_tries + } + + output { + File output_cram = ConvertToCram.output_cram + File output_cram_index = ConvertToCram.output_cram_index + File output_cram_md5 = ConvertToCram.output_cram_md5 + File validate_cram_file_report = ValidateCram.report + } +} + diff --git a/tasks/GermlineVariantDiscovery.wdl b/tasks/GermlineVariantDiscovery.wdl new file mode 100644 index 0000000..71779d0 --- /dev/null +++ b/tasks/GermlineVariantDiscovery.wdl @@ -0,0 +1,314 @@ +version 1.0 + +## Copyright Broad Institute, 2018 +## +## This WDL defines tasks used for germline variant discovery of human whole-genome or exome sequencing data. +## +## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +task HaplotypeCaller_GATK35_GVCF { + input { + File input_bam + File interval_list + String gvcf_basename + File ref_dict + File ref_fasta + File ref_fasta_index + Float? contamination + Int preemptible_tries + Int hc_scatter + } + + parameter_meta { + input_bam: { + localization_optional: true + } + } + + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20 + + # We use interval_padding 500 below to make sure that the HaplotypeCaller has context on both sides around + # the interval because the assembly uses them. + # + # Using PrintReads is a temporary solution until we update HaploypeCaller to use GATK4. Once that is done, + # HaplotypeCaller can stream the required intervals directly from the cloud. 
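+ # The optional contamination fraction used below is typically the adjusted FREEMIX estimate produced by CheckContamination; when it is not supplied, the command falls back to 0.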
+ command { + /usr/gitc/gatk4/gatk-launch --javaOptions "-Xms2g" \ + PrintReads \ + -I ~{input_bam} \ + --interval_padding 500 \ + -L ~{interval_list} \ + -O local.sharded.bam \ + && \ + java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms8000m \ + -jar /usr/gitc/GATK35.jar \ + -T HaplotypeCaller \ + -R ~{ref_fasta} \ + -o ~{gvcf_basename}.vcf.gz \ + -I local.sharded.bam \ + -L ~{interval_list} \ + -ERC GVCF \ + --max_alternate_alleles 3 \ + -variant_index_parameter 128000 \ + -variant_index_type LINEAR \ + -contamination ~{default=0 contamination} \ + --read_filter OverclippedRead + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" + preemptible: preemptible_tries + memory: "10 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_gvcf = "~{gvcf_basename}.vcf.gz" + File output_gvcf_index = "~{gvcf_basename}.vcf.gz.tbi" + } +} + +task HaplotypeCaller_GATK4_VCF { + input { + File input_bam + File interval_list + String vcf_basename + File ref_dict + File ref_fasta + File ref_fasta_index + Float? contamination + Boolean make_gvcf + Boolean make_bamout + Int preemptible_tries + Int hc_scatter + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" + } + + String output_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" + String output_file_name = vcf_basename + output_suffix + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") + Int disk_size = ceil(((size(input_bam, "GiB") + 30) / hc_scatter) + ref_size) + 20 + + String bamout_arg = if make_bamout then "-bamout ~{vcf_basename}.bamout.bam" else "" + + parameter_meta { + input_bam: { + localization_optional: true + } + } + + command <<< + set -e + gatk --java-options "-Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ + HaplotypeCaller \ + -R ~{ref_fasta} \ + -I ~{input_bam} \ + -L ~{interval_list} \ + -O ~{output_file_name} \ + -contamination ~{default=0 contamination} \ + -G StandardAnnotation -G StandardHCAnnotation ~{true="-G AS_StandardAnnotation" false="" make_gvcf} \ + -new-qual \ + -GQB 10 -GQB 20 -GQB 30 -GQB 40 -GQB 50 -GQB 60 -GQB 70 -GQB 80 -GQB 90 \ + ~{true="-ERC GVCF" false="" make_gvcf} \ + ~{bamout_arg} + + # Cromwell doesn't like optional task outputs, so we have to touch this file. 
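+ # (When make_bamout is false, no -bamout argument is passed and the touched placeholder stays empty.)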
+ touch ~{vcf_basename}.bamout.bam + >>> + + runtime { + docker: gatk_docker + preemptible: preemptible_tries + memory: "6.5 GiB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + } + + output { + File output_vcf = "~{output_file_name}" + File output_vcf_index = "~{output_file_name}.tbi" + File bamout = "~{vcf_basename}.bamout.bam" + } +} + +# Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs +task MergeVCFs { + input { + Array[File] input_vcfs + Array[File] input_vcfs_indexes + String output_vcf_name + Int preemptible_tries + } + + Int disk_size = ceil(size(input_vcfs, "GiB") * 2.5) + 10 + + # Using MergeVcfs instead of GatherVcfs so we can create indices + # See https://github.com/broadinstitute/picard/issues/789 for relevant GatherVcfs ticket + command { + java -Xms2000m -jar /usr/gitc/picard.jar \ + MergeVcfs \ + INPUT=~{sep=' INPUT=' input_vcfs} \ + OUTPUT=~{output_vcf_name} + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + preemptible: preemptible_tries + memory: "3 GiB" + disks: "local-disk ~{disk_size} HDD" + } + output { + File output_vcf = "~{output_vcf_name}" + File output_vcf_index = "~{output_vcf_name}.tbi" + } +} + +task HardFilterVcf { + input { + File input_vcf + File input_vcf_index + String vcf_basename + File interval_list + Int preemptible_tries + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" + } + + Int disk_size = ceil(2 * size(input_vcf, "GiB")) + 20 + String output_vcf_name = vcf_basename + ".filtered.vcf.gz" + + command { + gatk --java-options "-Xms3000m" \ + VariantFiltration \ + -V ~{input_vcf} \ + -L ~{interval_list} \ + --filter-expression "QD < 2.0 || FS > 30.0 || SOR > 3.0 || MQ < 40.0 || MQRankSum < -3.0 || ReadPosRankSum < -3.0" \ + --filter-name "HardFiltered" \ + -O ~{output_vcf_name} + } + output { + File output_vcf = "~{output_vcf_name}" + File output_vcf_index = "~{output_vcf_name}.tbi" + } + runtime { + docker: gatk_docker + preemptible: preemptible_tries + memory: "3 GiB" + disks: "local-disk " + disk_size + " HDD" + } +} + +task CNNScoreVariants { + + input { + File? bamout + File? 
bamout_index + File input_vcf + File input_vcf_index + String vcf_basename + File ref_fasta + File ref_fasta_index + File ref_dict + Int preemptible_tries + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.0.0" + } + + Int disk_size = ceil(size(bamout, "GiB") + size(ref_fasta, "GiB") + (size(input_vcf, "GiB") * 2)) + + String base_vcf = basename(input_vcf) + Boolean is_compressed = basename(base_vcf, "gz") != base_vcf + String vcf_suffix = if is_compressed then ".vcf.gz" else ".vcf" + String vcf_index_suffix = if is_compressed then ".tbi" else ".idx" + String output_vcf = base_vcf + ".scored" + vcf_suffix + String output_vcf_index = output_vcf + vcf_index_suffix + + String bamout_param = if defined(bamout) then "-I ~{bamout}" else "" + String tensor_type = if defined(bamout) then "read-tensor" else "reference" + + command { + gatk --java-options -Xmx10g CNNScoreVariants \ + -V ~{input_vcf} \ + -R ~{ref_fasta} \ + -O ~{output_vcf} \ + ~{bamout_param} \ + -tensor-type ~{tensor_type} + } + + output { + File scored_vcf = "~{output_vcf}" + File scored_vcf_index = "~{output_vcf_index}" + } + + runtime { + docker: gatk_docker + preemptible: preemptible_tries + memory: "15 GiB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + } +} + +task FilterVariantTranches { + + input { + File input_vcf + File input_vcf_index + String vcf_basename + Array[String] snp_tranches + Array[String] indel_tranches + File hapmap_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf + File one_thousand_genomes_resource_vcf_index + File dbsnp_resource_vcf + File dbsnp_resource_vcf_index + String info_key + Int preemptible_tries + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.0.0" + } + + Int disk_size = ceil(size(hapmap_resource_vcf, "GiB") + + size(omni_resource_vcf, "GiB") + + size(one_thousand_genomes_resource_vcf, "GiB") + + size(dbsnp_resource_vcf, "GiB") + + (size(input_vcf, "GiB") * 2) + ) + 20 + + command { + + gatk --java-options -Xmx6g FilterVariantTranches \ + -V ~{input_vcf} \ + -O ~{vcf_basename}.filtered.vcf.gz \ + ~{sep=" " prefix("--snp-tranche ", snp_tranches)} \ + ~{sep=" " prefix("--indel-tranche ", indel_tranches)} \ + --resource ~{hapmap_resource_vcf} \ + --resource ~{omni_resource_vcf} \ + --resource ~{one_thousand_genomes_resource_vcf} \ + --resource ~{dbsnp_resource_vcf} \ + --info-key ~{info_key} \ + --create-output-variant-index true + } + + output { + File filtered_vcf = "~{vcf_basename}.filtered.vcf.gz" + File filtered_vcf_index = "~{vcf_basename}.filtered.vcf.gz.tbi" + } + + runtime { + memory: "7 GiB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } +} diff --git a/tasks/Qc.wdl b/tasks/Qc.wdl new file mode 100644 index 0000000..e7c221b --- /dev/null +++ b/tasks/Qc.wdl @@ -0,0 +1,609 @@ +version 1.0 + +## Copyright Broad Institute, 2018 +## +## This WDL defines tasks used for QC of human whole-genome or exome sequencing data. +## +## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. 
Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# Collect sequencing yield quality metrics +task CollectQualityYieldMetrics { + input { + File input_bam + String metrics_filename + Int preemptible_tries + } + + Int disk_size = ceil(size(input_bam, "GiB")) + 20 + + command { + java -Xms2000m -jar /usr/gitc/picard.jar \ + CollectQualityYieldMetrics \ + INPUT=~{input_bam} \ + OQ=true \ + OUTPUT=~{metrics_filename} + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + disks: "local-disk " + disk_size + " HDD" + memory: "3 GiB" + preemptible: preemptible_tries + } + output { + File quality_yield_metrics = "~{metrics_filename}" + } +} + +# Collect base quality and insert size metrics +task CollectUnsortedReadgroupBamQualityMetrics { + input { + File input_bam + String output_bam_prefix + Int preemptible_tries + } + + Int disk_size = ceil(size(input_bam, "GiB")) + 20 + + command { + java -Xms5000m -jar /usr/gitc/picard.jar \ + CollectMultipleMetrics \ + INPUT=~{input_bam} \ + OUTPUT=~{output_bam_prefix} \ + ASSUME_SORTED=true \ + PROGRAM=null \ + PROGRAM=CollectBaseDistributionByCycle \ + PROGRAM=CollectInsertSizeMetrics \ + PROGRAM=MeanQualityByCycle \ + PROGRAM=QualityScoreDistribution \ + METRIC_ACCUMULATION_LEVEL=null \ + METRIC_ACCUMULATION_LEVEL=ALL_READS + + touch ~{output_bam_prefix}.insert_size_metrics + touch ~{output_bam_prefix}.insert_size_histogram.pdf + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + memory: "7 GiB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File base_distribution_by_cycle_pdf = "~{output_bam_prefix}.base_distribution_by_cycle.pdf" + File base_distribution_by_cycle_metrics = "~{output_bam_prefix}.base_distribution_by_cycle_metrics" + File insert_size_histogram_pdf = "~{output_bam_prefix}.insert_size_histogram.pdf" + File insert_size_metrics = "~{output_bam_prefix}.insert_size_metrics" + File quality_by_cycle_pdf = "~{output_bam_prefix}.quality_by_cycle.pdf" + File quality_by_cycle_metrics = "~{output_bam_prefix}.quality_by_cycle_metrics" + File quality_distribution_pdf = "~{output_bam_prefix}.quality_distribution.pdf" + File quality_distribution_metrics = "~{output_bam_prefix}.quality_distribution_metrics" + } +} + +# Collect alignment summary and GC bias quality metrics +task CollectReadgroupBamQualityMetrics { + input { + File input_bam + File input_bam_index + String output_bam_prefix + File ref_dict + File ref_fasta + File ref_fasta_index + Boolean collect_gc_bias_metrics = true + Int preemptible_tries + } + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") + Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 + + command { + # These are optionally generated, but need to exist for Cromwell's sake + touch ~{output_bam_prefix}.gc_bias.detail_metrics \ + ~{output_bam_prefix}.gc_bias.pdf \ + ~{output_bam_prefix}.gc_bias.summary_metrics + + java -Xms5000m -jar /usr/gitc/picard.jar \ + CollectMultipleMetrics \ + INPUT=~{input_bam} \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + OUTPUT=~{output_bam_prefix} \ + ASSUME_SORTED=true \ + PROGRAM=null \ + PROGRAM=CollectAlignmentSummaryMetrics \ + ~{true='PROGRAM="CollectGcBiasMetrics"' false="" collect_gc_bias_metrics} \ + METRIC_ACCUMULATION_LEVEL=null \ + METRIC_ACCUMULATION_LEVEL=READ_GROUP + } + runtime { + docker: 
"us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + memory: "7 GiB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File alignment_summary_metrics = "~{output_bam_prefix}.alignment_summary_metrics" + File gc_bias_detail_metrics = "~{output_bam_prefix}.gc_bias.detail_metrics" + File gc_bias_pdf = "~{output_bam_prefix}.gc_bias.pdf" + File gc_bias_summary_metrics = "~{output_bam_prefix}.gc_bias.summary_metrics" + } +} + +# Collect quality metrics from the aggregated bam +task CollectAggregationMetrics { + input { + File input_bam + File input_bam_index + String output_bam_prefix + File ref_dict + File ref_fasta + File ref_fasta_index + Boolean collect_gc_bias_metrics = true + Int preemptible_tries + } + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") + Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 + + command { + # These are optionally generated, but need to exist for Cromwell's sake + touch ~{output_bam_prefix}.gc_bias.detail_metrics \ + ~{output_bam_prefix}.gc_bias.pdf \ + ~{output_bam_prefix}.gc_bias.summary_metrics \ + ~{output_bam_prefix}.insert_size_metrics \ + ~{output_bam_prefix}.insert_size_histogram.pdf + + java -Xms5000m -jar /usr/gitc/picard.jar \ + CollectMultipleMetrics \ + INPUT=~{input_bam} \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + OUTPUT=~{output_bam_prefix} \ + ASSUME_SORTED=true \ + PROGRAM=null \ + PROGRAM=CollectAlignmentSummaryMetrics \ + PROGRAM=CollectInsertSizeMetrics \ + PROGRAM=CollectSequencingArtifactMetrics \ + PROGRAM=QualityScoreDistribution \ + ~{true='PROGRAM="CollectGcBiasMetrics"' false="" collect_gc_bias_metrics} \ + METRIC_ACCUMULATION_LEVEL=null \ + METRIC_ACCUMULATION_LEVEL=SAMPLE \ + METRIC_ACCUMULATION_LEVEL=LIBRARY + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + memory: "7 GiB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File alignment_summary_metrics = "~{output_bam_prefix}.alignment_summary_metrics" + File bait_bias_detail_metrics = "~{output_bam_prefix}.bait_bias_detail_metrics" + File bait_bias_summary_metrics = "~{output_bam_prefix}.bait_bias_summary_metrics" + File gc_bias_detail_metrics = "~{output_bam_prefix}.gc_bias.detail_metrics" + File gc_bias_pdf = "~{output_bam_prefix}.gc_bias.pdf" + File gc_bias_summary_metrics = "~{output_bam_prefix}.gc_bias.summary_metrics" + File insert_size_histogram_pdf = "~{output_bam_prefix}.insert_size_histogram.pdf" + File insert_size_metrics = "~{output_bam_prefix}.insert_size_metrics" + File pre_adapter_detail_metrics = "~{output_bam_prefix}.pre_adapter_detail_metrics" + File pre_adapter_summary_metrics = "~{output_bam_prefix}.pre_adapter_summary_metrics" + File quality_distribution_pdf = "~{output_bam_prefix}.quality_distribution.pdf" + File quality_distribution_metrics = "~{output_bam_prefix}.quality_distribution_metrics" + File error_summary_metrics = "~{output_bam_prefix}.error_summary_metrics" + } +} + +# Check that the fingerprints of separate readgroups all match +task CrossCheckFingerprints { + input { + Array[File] input_bams + Array[File] input_bam_indexes + File? 
haplotype_database_file + String metrics_filename + Float total_input_size + Int preemptible_tries + Float lod_threshold + String cross_check_by + } + + Int disk_size = ceil(total_input_size) + 20 + + command <<< + java -Dsamjdk.buffer_size=131072 \ + -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms2000m \ + -jar /usr/gitc/picard.jar \ + CrosscheckFingerprints \ + OUTPUT=~{metrics_filename} \ + HAPLOTYPE_MAP=~{haplotype_database_file} \ + EXPECT_ALL_GROUPS_TO_MATCH=true \ + INPUT=~{sep=' INPUT=' input_bams} \ + LOD_THRESHOLD=~{lod_threshold} \ + CROSSCHECK_BY=~{cross_check_by} + >>> + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + preemptible: preemptible_tries + memory: "2 GiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File cross_check_fingerprints_metrics = "~{metrics_filename}" + } +} + +# Check that the fingerprint of the sample BAM matches the sample array +task CheckFingerprint { + input { + File input_bam + File input_bam_index + String output_basename + File? haplotype_database_file + File? genotypes + File? genotypes_index + String sample + Int preemptible_tries + } + + Int disk_size = ceil(size(input_bam, "GiB")) + 20 + # Picard has different behavior depending on whether or not the OUTPUT parameter ends with a '.', so we are explicitly + # passing in where we want the two metrics files to go to avoid any potential confusion. + String summary_metrics_location = "~{output_basename}.fingerprinting_summary_metrics" + String detail_metrics_location = "~{output_basename}.fingerprinting_detail_metrics" + + command <<< + java -Dsamjdk.buffer_size=131072 \ + -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms2g \ + -jar /usr/gitc/picard.jar \ + CheckFingerprint \ + INPUT=~{input_bam} \ + SUMMARY_OUTPUT=~{summary_metrics_location} \ + DETAIL_OUTPUT=~{detail_metrics_location} \ + GENOTYPES=~{genotypes} \ + HAPLOTYPE_MAP=~{haplotype_database_file} \ + SAMPLE_ALIAS="~{sample}" \ + IGNORE_READ_GROUPS=true + + >>> + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + preemptible: preemptible_tries + memory: "3 GiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File summary_metrics = summary_metrics_location + File detail_metrics = detail_metrics_location + } +} + +task CheckPreValidation { + input { + File duplication_metrics + File chimerism_metrics + Float max_duplication_in_reasonable_sample + Float max_chimerism_in_reasonable_sample + Int preemptible_tries + } + + command <<< + set -o pipefail + set -e + + grep -A 1 PERCENT_DUPLICATION ~{duplication_metrics} > duplication.csv + grep -A 3 PCT_CHIMERAS ~{chimerism_metrics} | grep -v OF_PAIR > chimerism.csv + + # pull the PERCENT_DUPLICATION and PCT_CHIMERAS values out of the grep'd rows and write them to single-value files + python <<CODE + import csv + with open('duplication.csv') as dupfile: + reader = csv.DictReader(dupfile, delimiter='\t') + for row in reader: + with open('duplication_value.txt', 'w') as outfile: + outfile.write(row['PERCENT_DUPLICATION']) + with open('chimerism.csv') as chimfile: + reader = csv.DictReader(chimfile, delimiter='\t') + for row in reader: + with open('chimerism_value.txt', 'w') as outfile: + outfile.write(row['PCT_CHIMERAS']) + CODE + >>> + runtime { + preemptible: preemptible_tries + docker: "us.gcr.io/broad-gotc-prod/python:2.7" + memory: "2 GiB" + } + output { + Float duplication_rate = read_float("duplication_value.txt") + Float chimerism_rate = read_float("chimerism_value.txt") + Boolean is_outlier_data = duplication_rate > max_duplication_in_reasonable_sample || chimerism_rate > max_chimerism_in_reasonable_sample + } +} + +task ValidateSamFile { + input { + File input_bam + File? input_bam_index + String report_filename + File ref_dict + File ref_fasta + File ref_fasta_index + Int? max_output + Array[String]? ignore + Boolean? 
is_outlier_data + Int preemptible_tries + Int memory_multiplier = 1 + } + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") + Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 + + Int memory_size = ceil(7 * memory_multiplier) + Int java_memory_size = (memory_size - 1) * 1000 + + command { + java -Xms~{java_memory_size}m -jar /usr/gitc/picard.jar \ + ValidateSamFile \ + INPUT=~{input_bam} \ + OUTPUT=~{report_filename} \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + ~{"MAX_OUTPUT=" + max_output} \ + IGNORE=~{default="null" sep=" IGNORE=" ignore} \ + MODE=VERBOSE \ + ~{default='SKIP_MATE_VALIDATION=false' true='SKIP_MATE_VALIDATION=true' false='SKIP_MATE_VALIDATION=false' is_outlier_data} \ + IS_BISULFITE_SEQUENCED=false + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + preemptible: preemptible_tries + memory: "~{memory_size} GiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File report = "~{report_filename}" + } +} + +# Note these tasks will break if the read lengths in the bam are greater than 250. +task CollectWgsMetrics { + input { + File input_bam + File input_bam_index + String metrics_filename + File wgs_coverage_interval_list + File ref_fasta + File ref_fasta_index + Int read_length + Int preemptible_tries + } + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 + + command { + java -Xms2000m -jar /usr/gitc/picard.jar \ + CollectWgsMetrics \ + INPUT=~{input_bam} \ + VALIDATION_STRINGENCY=SILENT \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + INCLUDE_BQ_HISTOGRAM=true \ + INTERVALS=~{wgs_coverage_interval_list} \ + OUTPUT=~{metrics_filename} \ + USE_FAST_ALGORITHM=true \ + READ_LENGTH=~{read_length} + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + preemptible: preemptible_tries + memory: "3 GiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File metrics = "~{metrics_filename}" + } +} + +# Collect raw WGS metrics (commonly used QC thresholds) +task CollectRawWgsMetrics { + input { + File input_bam + File input_bam_index + String metrics_filename + File wgs_coverage_interval_list + File ref_fasta + File ref_fasta_index + Int read_length + Int preemptible_tries + Int memory_multiplier = 1 + } + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 + + Int memory_size = ceil((if (disk_size < 110) then 5 else 7) * memory_multiplier) + String java_memory_size = (memory_size - 1) * 1000 + + command { + java -Xms~{java_memory_size}m -jar /usr/gitc/picard.jar \ + CollectRawWgsMetrics \ + INPUT=~{input_bam} \ + VALIDATION_STRINGENCY=SILENT \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + INCLUDE_BQ_HISTOGRAM=true \ + INTERVALS=~{wgs_coverage_interval_list} \ + OUTPUT=~{metrics_filename} \ + USE_FAST_ALGORITHM=true \ + READ_LENGTH=~{read_length} + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + preemptible: preemptible_tries + memory: "~{memory_size} GiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File metrics = "~{metrics_filename}" + } +} + +task CollectHsMetrics { + input { + File input_bam + File input_bam_index + File ref_fasta + File ref_fasta_index + String metrics_filename + File target_interval_list + File bait_interval_list + Int preemptible_tries + Int memory_multiplier = 1 + } + + Float ref_size = 
size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 + # Try to fit the input bam into memory, within reason. + Int rounded_bam_size = ceil(size(input_bam, "GiB") + 0.5) + Int rounded_memory_size = ceil((if (rounded_bam_size > 10) then 10 else rounded_bam_size) * memory_multiplier) + Int memory_size = if rounded_memory_size < 7 then 7 else rounded_memory_size + Int java_memory_size = (memory_size - 1) * 1000 + + # There are probably more metrics we want to generate with this tool + command { + java -Xms~{java_memory_size}m -jar /usr/gitc/picard.jar \ + CollectHsMetrics \ + INPUT=~{input_bam} \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + VALIDATION_STRINGENCY=SILENT \ + TARGET_INTERVALS=~{target_interval_list} \ + BAIT_INTERVALS=~{bait_interval_list} \ + METRIC_ACCUMULATION_LEVEL=null \ + METRIC_ACCUMULATION_LEVEL=SAMPLE \ + METRIC_ACCUMULATION_LEVEL=LIBRARY \ + OUTPUT=~{metrics_filename} + } + + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + preemptible: preemptible_tries + memory: "~{memory_size} GiB" + disks: "local-disk " + disk_size + " HDD" + } + + output { + File metrics = metrics_filename + } +} + +# Generate a checksum per readgroup +task CalculateReadGroupChecksum { + input { + File input_bam + File input_bam_index + String read_group_md5_filename + Int preemptible_tries + } + + Int disk_size = ceil(size(input_bam, "GiB")) + 20 + + command { + java -Xms1000m -jar /usr/gitc/picard.jar \ + CalculateReadGroupChecksum \ + INPUT=~{input_bam} \ + OUTPUT=~{read_group_md5_filename} + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + preemptible: preemptible_tries + memory: "2 GiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File md5_file = "~{read_group_md5_filename}" + } +} + +# Validate a (g)VCF with -gvcf specific validation +task ValidateVCF { + input { + File input_vcf + File input_vcf_index + File ref_fasta + File ref_fasta_index + File ref_dict + File dbsnp_vcf + File dbsnp_vcf_index + File calling_interval_list + Int preemptible_tries + Boolean is_gvcf = true + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1" + } + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") + Int disk_size = ceil(size(input_vcf, "GiB") + size(dbsnp_vcf, "GiB") + ref_size) + 20 + + command { + gatk --java-options -Xms6000m \ + ValidateVariants \ + -V ~{input_vcf} \ + -R ~{ref_fasta} \ + -L ~{calling_interval_list} \ + ~{true="-gvcf" false="" is_gvcf} \ + --validation-type-to-exclude ALLELES \ + --dbsnp ~{dbsnp_vcf} + } + runtime { + docker: gatk_docker + preemptible: preemptible_tries + memory: "7000 MiB" + disks: "local-disk " + disk_size + " HDD" + } +} + +# Collect variant calling metrics from GVCF output +task CollectVariantCallingMetrics { + input { + File input_vcf + File input_vcf_index + String metrics_basename + File dbsnp_vcf + File dbsnp_vcf_index + File ref_dict + File evaluation_interval_list + Boolean is_gvcf = true + Int preemptible_tries + } + + Int disk_size = ceil(size(input_vcf, "GiB") + size(dbsnp_vcf, "GiB")) + 20 + + command { + java -Xms2000m -jar /usr/gitc/picard.jar \ + CollectVariantCallingMetrics \ + INPUT=~{input_vcf} \ + OUTPUT=~{metrics_basename} \ + DBSNP=~{dbsnp_vcf} \ + SEQUENCE_DICTIONARY=~{ref_dict} \ + TARGET_INTERVALS=~{evaluation_interval_list} \ + ~{true="GVCF_INPUT=true" false="" is_gvcf} + } + runtime { + docker: 
"us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" + preemptible: preemptible_tries + memory: "3 GiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File summary_metrics = "~{metrics_basename}.variant_calling_summary_metrics" + File detail_metrics = "~{metrics_basename}.variant_calling_detail_metrics" + } +} diff --git a/tasks_pipelines/split_large_readgroup.wdl b/tasks/SplitLargeReadGroup.wdl similarity index 65% rename from tasks_pipelines/split_large_readgroup.wdl rename to tasks/SplitLargeReadGroup.wdl index a3044d2..ca73468 100644 --- a/tasks_pipelines/split_large_readgroup.wdl +++ b/tasks/SplitLargeReadGroup.wdl @@ -1,3 +1,5 @@ +version 1.0 + ## Copyright Broad Institute, 2018 ## ## This WDL pipeline implements a split of large readgroups for human whole-genome and exome sequencing data. @@ -13,31 +15,28 @@ ## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed ## licensing information pertaining to the included programs. -import "./tasks_pipelines/alignment.wdl" as Alignment -import "./tasks_pipelines/bam_processing.wdl" as Processing -import "./tasks_pipelines/utilities.wdl" as Utils +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Alignment.wdl" as Alignment +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/BamProcessing.wdl" as Processing +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl" as Structs + +workflow SplitLargeReadGroup { + input { + File input_bam -workflow split_large_readgroup { - File input_bam + String bwa_commandline + String bwa_version + String output_bam_basename - String bwa_commandline - String bwa_version - String output_bam_basename - File ref_fasta - File ref_fasta_index - File ref_dict + # reference_fasta.ref_alt is the .alt file from bwa-kit + # (https://github.com/lh3/bwa/tree/master/bwakit), + # listing the reference contigs that are "alternative". + ReferenceFasta reference_fasta - # This is the .alt file from bwa-kit (https://github.com/lh3/bwa/tree/master/bwakit), - # listing the reference contigs that are "alternative". 
- File ref_alt - File ref_amb - File ref_ann - File ref_bwt - File ref_pac - File ref_sa - Int compression_level - Int preemptible_tries - Int reads_per_file = 48000000 + Int compression_level + Int preemptible_tries + Int reads_per_file = 48000000 + } call Alignment.SamSplitter as SamSplitter { input : @@ -48,7 +47,7 @@ workflow split_large_readgroup { } scatter(unmapped_bam in SamSplitter.split_bams) { - Float current_unmapped_bam_size = size(unmapped_bam, "GB") + Float current_unmapped_bam_size = size(unmapped_bam, "GiB") String current_name = basename(unmapped_bam, ".bam") call Alignment.SamToFastqAndBwaMemAndMba as SamToFastqAndBwaMemAndMba { @@ -56,21 +55,13 @@ workflow split_large_readgroup { input_bam = unmapped_bam, bwa_commandline = bwa_commandline, output_bam_basename = current_name, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - ref_alt = ref_alt, - ref_bwt = ref_bwt, - ref_amb = ref_amb, - ref_ann = ref_ann, - ref_pac = ref_pac, - ref_sa = ref_sa, + reference_fasta = reference_fasta, bwa_version = bwa_version, compression_level = compression_level, preemptible_tries = preemptible_tries } - Float current_mapped_size = size(SamToFastqAndBwaMemAndMba.output_bam, "GB") + Float current_mapped_size = size(SamToFastqAndBwaMemAndMba.output_bam, "GiB") } call Utils.SumFloats as SumSplitAlignedSizes { diff --git a/tasks/UnmappedBamToAlignedBam.wdl b/tasks/UnmappedBamToAlignedBam.wdl new file mode 100644 index 0000000..944fc2d --- /dev/null +++ b/tasks/UnmappedBamToAlignedBam.wdl @@ -0,0 +1,268 @@ +version 1.0 + +## Copyright Broad Institute, 2018 +## +## This WDL pipeline implements data processing according to the GATK Best Practices (June 2016) +## for human whole-genome and exome sequencing data. +## +## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Alignment.wdl" as Alignment +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/SplitLargeReadGroup.wdl" as SplitRG +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/BamProcessing.wdl" as Processing +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl" as Structs + +# WORKFLOW DEFINITION +workflow UnmappedBamToAlignedBam { + input { + SampleAndUnmappedBams sample_and_unmapped_bams + GermlineSingleSampleReferences references + PapiSettings papi_settings + + String cross_check_fingerprints_by + File? 
haplotype_database_file + Float lod_threshold + String recalibrated_bam_basename + } + + Float cutoff_for_large_rg_in_gb = 20.0 + + String bwa_commandline = "bwa mem -K 100000000 -p -v 3 -t 16 -Y $bash_ref_fasta" + + Int compression_level = 2 + + # Get the version of BWA to include in the PG record in the header of the BAM produced + # by MergeBamAlignment. + call Alignment.GetBwaVersion + + # Get the size of the standard reference files as well as the additional reference files needed for BWA + + # Align flowcell-level unmapped input bams in parallel + scatter (unmapped_bam in sample_and_unmapped_bams.flowcell_unmapped_bams) { + + Float unmapped_bam_size = size(unmapped_bam, "GiB") + + String unmapped_bam_basename = basename(unmapped_bam, sample_and_unmapped_bams.unmapped_bam_suffix) + + # QC the unmapped BAM + call QC.CollectQualityYieldMetrics as CollectQualityYieldMetrics { + input: + input_bam = unmapped_bam, + metrics_filename = unmapped_bam_basename + ".unmapped.quality_yield_metrics", + preemptible_tries = papi_settings.preemptible_tries + } + + if (unmapped_bam_size > cutoff_for_large_rg_in_gb) { + # Split bam into multiple smaller bams, + # map reads to reference and recombine into one bam + call SplitRG.SplitLargeReadGroup as SplitRG { + input: + input_bam = unmapped_bam, + bwa_commandline = bwa_commandline, + bwa_version = GetBwaVersion.bwa_version, + output_bam_basename = unmapped_bam_basename + ".aligned.unsorted", + reference_fasta = references.reference_fasta, + compression_level = compression_level, + preemptible_tries = papi_settings.preemptible_tries + } + } + + if (unmapped_bam_size <= cutoff_for_large_rg_in_gb) { + # Map reads to reference + call Alignment.SamToFastqAndBwaMemAndMba as SamToFastqAndBwaMemAndMba { + input: + input_bam = unmapped_bam, + bwa_commandline = bwa_commandline, + output_bam_basename = unmapped_bam_basename + ".aligned.unsorted", + reference_fasta = references.reference_fasta, + bwa_version = GetBwaVersion.bwa_version, + compression_level = compression_level, + preemptible_tries = papi_settings.preemptible_tries + } + } + + File output_aligned_bam = select_first([SamToFastqAndBwaMemAndMba.output_bam, SplitRG.aligned_bam]) + + Float mapped_bam_size = size(output_aligned_bam, "GiB") + + # QC the aligned but unsorted readgroup BAM + # no reference as the input here is unsorted, providing a reference would cause an error + call QC.CollectUnsortedReadgroupBamQualityMetrics as CollectUnsortedReadgroupBamQualityMetrics { + input: + input_bam = output_aligned_bam, + output_bam_prefix = unmapped_bam_basename + ".readgroup", + preemptible_tries = papi_settings.preemptible_tries + } + } + + # Sum the read group bam sizes to approximate the aggregated bam size + call Utils.SumFloats as SumFloats { + input: + sizes = mapped_bam_size, + preemptible_tries = papi_settings.preemptible_tries + } + + # MarkDuplicates and SortSam currently take too long for preemptibles if the input data is too large + Float gb_size_cutoff_for_preemptibles = 110.0 + Boolean data_too_large_for_preemptibles = SumFloats.total_size > gb_size_cutoff_for_preemptibles + + # Aggregate aligned+merged flowcell BAM files and mark duplicates + # We take advantage of the tool's ability to take multiple BAM inputs and write out a single output + # to avoid having to spend time just merging BAM files. 
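Before the MarkDuplicates call below, it may help to see the size-based preemptible decision above written out in plain Python. This is an illustrative sketch only (the function name and the GiB figures in the example are invented); the WDL itself makes the same comparison against gb_size_cutoff_for_preemptibles:

```python
# Illustrative sketch (not part of the WDL): how the workflow decides whether the
# aggregation steps that follow (MarkDuplicates, SortSam) may use preemptible VMs.
def preemptible_tries_for_aggregation(read_group_bam_sizes_gb, agg_preemptible_tries,
                                      gb_size_cutoff_for_preemptibles=110.0):
    # Approximate the aggregated BAM size by summing per-readgroup BAM sizes,
    # as Utils.SumFloats does above.
    total_size = sum(read_group_bam_sizes_gb)
    data_too_large_for_preemptibles = total_size > gb_size_cutoff_for_preemptibles
    # Very large inputs get zero preemptible tries so the long-running jobs are not restarted.
    return 0 if data_too_large_for_preemptibles else agg_preemptible_tries

# Example: three ~40 GiB read groups exceed the 110 GiB cutoff, so preemptibles are disabled.
print(preemptible_tries_for_aggregation([40.2, 39.8, 41.0], agg_preemptible_tries=3))  # -> 0
```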
+ call Processing.MarkDuplicates as MarkDuplicates { + input: + input_bams = output_aligned_bam, + output_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.unsorted.duplicates_marked", + metrics_filename = sample_and_unmapped_bams.base_file_name + ".duplicate_metrics", + total_input_size = SumFloats.total_size, + compression_level = compression_level, + preemptible_tries = if data_too_large_for_preemptibles then 0 else papi_settings.agg_preemptible_tries + } + + # Sort aggregated+deduped BAM file and fix tags + call Processing.SortSam as SortSampleBam { + input: + input_bam = MarkDuplicates.output_bam, + output_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.duplicate_marked.sorted", + compression_level = compression_level, + preemptible_tries = if data_too_large_for_preemptibles then 0 else papi_settings.agg_preemptible_tries + } + + Float agg_bam_size = size(SortSampleBam.output_bam, "GiB") + + if (defined(haplotype_database_file)) { + # Check identity of fingerprints across readgroups + call QC.CrossCheckFingerprints as CrossCheckFingerprints { + input: + input_bams = [ SortSampleBam.output_bam ], + input_bam_indexes = [SortSampleBam.output_bam_index], + haplotype_database_file = haplotype_database_file, + metrics_filename = sample_and_unmapped_bams.base_file_name + ".crosscheck", + total_input_size = agg_bam_size, + lod_threshold = lod_threshold, + cross_check_by = cross_check_fingerprints_by, + preemptible_tries = papi_settings.agg_preemptible_tries + } + } + + # Create list of sequences for scatter-gather parallelization + call Utils.CreateSequenceGroupingTSV as CreateSequenceGroupingTSV { + input: + ref_dict = references.reference_fasta.ref_dict, + preemptible_tries = papi_settings.preemptible_tries + } + + # Estimate level of cross-sample contamination + call Processing.CheckContamination as CheckContamination { + input: + input_bam = SortSampleBam.output_bam, + input_bam_index = SortSampleBam.output_bam_index, + contamination_sites_ud = references.contamination_sites_ud, + contamination_sites_bed = references.contamination_sites_bed, + contamination_sites_mu = references.contamination_sites_mu, + ref_fasta = references.reference_fasta.ref_fasta, + ref_fasta_index = references.reference_fasta.ref_fasta_index, + output_prefix = sample_and_unmapped_bams.base_file_name + ".preBqsr", + preemptible_tries = papi_settings.agg_preemptible_tries, + contamination_underestimation_factor = 0.75 + } + + # We need disk to localize the sharded input and output due to the scatter for BQSR. + # If we take the number we are scattering by and reduce by 3 we will have enough disk space + # to account for the fact that the data is not split evenly. 
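The next few lines compute that divisor. A minimal Python sketch of the clamping, and of the per-shard disk estimate it feeds (names and example numbers are illustrative; the disk formula mirrors the shape of the BaseRecalibrator disk estimate shown later in this diff, so treat it as an approximation):

```python
import math

# Illustrative sketch (not part of the WDL): the BQSR scatter divisor is the number of
# sequence groupings minus a safety margin, clamped so we never divide by less than 1.
def bqsr_divisor(num_of_bqsr_scatters, safety_margin=10):
    potential = num_of_bqsr_scatters - safety_margin
    return potential if potential > 1 else 1

# Rough per-shard disk request: only a fraction of the BAM is localized per shard,
# plus the full reference and dbSNP resources and a fixed padding.
def per_shard_disk_gb(input_bam_gb, ref_gb, dbsnp_gb, divisor, padding_gb=20):
    return math.ceil(input_bam_gb / divisor + ref_gb + dbsnp_gb) + padding_gb

divisor = bqsr_divisor(num_of_bqsr_scatters=18)       # -> 8
print(per_shard_disk_gb(150.0, 3.2, 10.5, divisor))   # -> 53 (GB of local HDD per shard)
```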
+ Int num_of_bqsr_scatters = length(CreateSequenceGroupingTSV.sequence_grouping) + Int potential_bqsr_divisor = num_of_bqsr_scatters - 10 + Int bqsr_divisor = if potential_bqsr_divisor > 1 then potential_bqsr_divisor else 1 + + # Perform Base Quality Score Recalibration (BQSR) on the sorted BAM in parallel + scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping) { + # Generate the recalibration model by interval + call Processing.BaseRecalibrator as BaseRecalibrator { + input: + input_bam = SortSampleBam.output_bam, + recalibration_report_filename = sample_and_unmapped_bams.base_file_name + ".recal_data.csv", + sequence_group_interval = subgroup, + dbsnp_vcf = references.dbsnp_vcf, + dbsnp_vcf_index = references.dbsnp_vcf_index, + known_indels_sites_vcfs = references.known_indels_sites_vcfs, + known_indels_sites_indices = references.known_indels_sites_indices, + ref_dict = references.reference_fasta.ref_dict, + ref_fasta = references.reference_fasta.ref_fasta, + ref_fasta_index = references.reference_fasta.ref_fasta_index, + bqsr_scatter = bqsr_divisor, + preemptible_tries = papi_settings.agg_preemptible_tries + } + } + + # Merge the recalibration reports resulting from by-interval recalibration + # The reports are always the same size + call Processing.GatherBqsrReports as GatherBqsrReports { + input: + input_bqsr_reports = BaseRecalibrator.recalibration_report, + output_report_filename = sample_and_unmapped_bams.base_file_name + ".recal_data.csv", + preemptible_tries = papi_settings.preemptible_tries + } + + scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping_with_unmapped) { + # Apply the recalibration model by interval + call Processing.ApplyBQSR as ApplyBQSR { + input: + input_bam = SortSampleBam.output_bam, + output_bam_basename = recalibrated_bam_basename, + recalibration_report = GatherBqsrReports.output_bqsr_report, + sequence_group_interval = subgroup, + ref_dict = references.reference_fasta.ref_dict, + ref_fasta = references.reference_fasta.ref_fasta, + ref_fasta_index = references.reference_fasta.ref_fasta_index, + bqsr_scatter = bqsr_divisor, + compression_level = compression_level, + preemptible_tries = papi_settings.agg_preemptible_tries + } + } + + # Merge the recalibrated BAM files resulting from by-interval recalibration + call Processing.GatherSortedBamFiles as GatherBamFiles { + input: + input_bams = ApplyBQSR.recalibrated_bam, + output_bam_basename = sample_and_unmapped_bams.base_file_name, + total_input_size = agg_bam_size, + compression_level = compression_level, + preemptible_tries = papi_settings.agg_preemptible_tries + } + + # Outputs that will be retained when execution is complete + output { + Array[File] quality_yield_metrics = CollectQualityYieldMetrics.quality_yield_metrics + + Array[File] unsorted_read_group_base_distribution_by_cycle_pdf = CollectUnsortedReadgroupBamQualityMetrics.base_distribution_by_cycle_pdf + Array[File] unsorted_read_group_base_distribution_by_cycle_metrics = CollectUnsortedReadgroupBamQualityMetrics.base_distribution_by_cycle_metrics + Array[File] unsorted_read_group_insert_size_histogram_pdf = CollectUnsortedReadgroupBamQualityMetrics.insert_size_histogram_pdf + Array[File] unsorted_read_group_insert_size_metrics = CollectUnsortedReadgroupBamQualityMetrics.insert_size_metrics + Array[File] unsorted_read_group_quality_by_cycle_pdf = CollectUnsortedReadgroupBamQualityMetrics.quality_by_cycle_pdf + Array[File] unsorted_read_group_quality_by_cycle_metrics = 
CollectUnsortedReadgroupBamQualityMetrics.quality_by_cycle_metrics + Array[File] unsorted_read_group_quality_distribution_pdf = CollectUnsortedReadgroupBamQualityMetrics.quality_distribution_pdf + Array[File] unsorted_read_group_quality_distribution_metrics = CollectUnsortedReadgroupBamQualityMetrics.quality_distribution_metrics + + File? cross_check_fingerprints_metrics = CrossCheckFingerprints.cross_check_fingerprints_metrics + + File selfSM = CheckContamination.selfSM + Float contamination = CheckContamination.contamination + + File duplicate_metrics = MarkDuplicates.duplicate_metrics + File output_bqsr_reports = GatherBqsrReports.output_bqsr_report + + File output_bam = GatherBamFiles.output_bam + File output_bam_index = GatherBamFiles.output_bam_index + } +} diff --git a/tasks_pipelines/utilities.wdl b/tasks/Utilities.wdl similarity index 72% rename from tasks_pipelines/utilities.wdl rename to tasks/Utilities.wdl index 7845821..a2075ec 100644 --- a/tasks_pipelines/utilities.wdl +++ b/tasks/Utilities.wdl @@ -1,3 +1,5 @@ +version 1.0 + ## Copyright Broad Institute, 2018 ## ## This WDL defines utility tasks used for processing of sequencing data. @@ -15,15 +17,16 @@ # Generate sets of intervals for scatter-gathering over chromosomes task CreateSequenceGroupingTSV { - File ref_dict - Int preemptible_tries - + input { + File ref_dict + Int preemptible_tries + } # Use python to create the Sequencing Groupings used for BQSR and PrintReads Scatter. # It outputs to stdout where it is parsed into a wdl Array[Array[String]] # e.g. [["1"], ["2"], ["3", "4"], ["5"], ["6", "7", "8"]] command <<< python <>> runtime { preemptible: preemptible_tries - docker: "python:2.7" - memory: "2 GB" + docker: "us.gcr.io/broad-gotc-prod/python:2.7" + memory: "2 GiB" } output { Array[Array[String]] sequence_grouping = read_tsv("sequence_grouping.txt") @@ -72,21 +75,23 @@ task CreateSequenceGroupingTSV { # Note that the number of sub interval lists may not be exactly equal to scatter_count. There may be slightly more or less. # Thus we have the block of python to count the number of generated sub interval lists. task ScatterIntervalList { - File interval_list - Int scatter_count - Int break_bands_at_multiples_of + input { + File interval_list + Int scatter_count + Int break_bands_at_multiples_of + } command <<< set -e mkdir out java -Xms1g -jar /usr/gitc/picard.jar \ IntervalListTools \ - SCATTER_COUNT=${scatter_count} \ + SCATTER_COUNT=~{scatter_count} \ SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \ UNIQUE=true \ SORT=true \ - BREAK_BANDS_AT_MULTIPLES_OF=${break_bands_at_multiples_of} \ - INPUT=${interval_list} \ + BREAK_BANDS_AT_MULTIPLES_OF=~{break_bands_at_multiples_of} \ + INPUT=~{interval_list} \ OUTPUT=out python3 < ${output_basename}.cram.md5 + samtools view -C -T ~{ref_fasta} ~{input_bam} | \ + tee ~{output_basename}.cram | \ + md5sum | awk '{print $1}' > ~{output_basename}.cram.md5 # Create REF_CACHE. 
Used when indexing a CRAM - seq_cache_populate.pl -root ./ref/cache ${ref_fasta} + seq_cache_populate.pl -root ./ref/cache ~{ref_fasta} export REF_PATH=: export REF_CACHE=./ref/cache/%2s/%2s/%s - samtools index ${output_basename}.cram + samtools index ~{output_basename}.cram >>> runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" preemptible: preemptible_tries - memory: "3 GB" + memory: "3 GiB" cpu: "1" disks: "local-disk " + disk_size + " HDD" } output { - File output_cram = "${output_basename}.cram" - File output_cram_index = "${output_basename}.cram.crai" - File output_cram_md5 = "${output_basename}.cram.md5" + File output_cram = "~{output_basename}.cram" + File output_cram_index = "~{output_basename}.cram.crai" + File output_cram_md5 = "~{output_basename}.cram.md5" } } # Convert CRAM file to BAM format task ConvertToBam { - File input_cram - File ref_fasta - File ref_fasta_index - String output_basename + input { + File input_cram + File ref_fasta + File ref_fasta_index + String output_basename + } command <<< set -e set -o pipefail - samtools view -b -o ${output_basename}.bam -T ${ref_fasta} ${input_cram} + samtools view -b -o ~{output_basename}.bam -T ~{ref_fasta} ~{input_cram} - samtools index ${output_basename}.bam + samtools index ~{output_basename}.bam >>> runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" + docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856" preemptible: 3 - memory: "3 GB" + memory: "3 GiB" cpu: "1" disks: "local-disk 200 HDD" } output { - File output_bam = "${output_basename}.bam" - File output_bam_index = "${output_basename}.bam.bai" + File output_bam = "~{output_basename}.bam" + File output_bam_index = "~{output_basename}.bam.bai" } } # Calculates sum of a list of floats task SumFloats { - Array[Float] sizes - Int preemptible_tries + input { + Array[Float] sizes + Int preemptible_tries + } command <<< - python -c "print ${sep="+" sizes}" + python -c "print ~{sep="+" sizes}" >>> output { Float total_size = read_float(stdout()) } runtime { - docker: "python:2.7" + docker: "us.gcr.io/broad-gotc-prod/python:2.7" preemptible: preemptible_tries } } diff --git a/tasks/VariantCalling.wdl b/tasks/VariantCalling.wdl new file mode 100644 index 0000000..f60f648 --- /dev/null +++ b/tasks/VariantCalling.wdl @@ -0,0 +1,193 @@ +version 1.0 + +# Local Import +#import "../tasks/GermlineVariantDiscovery.wdl" as Calling +#import "../tasks/Qc.wdl" as QC +#import "../tasks/Utilities.wdl" as Utils +#import "../tasks/BamProcessing.wdl" as BamProcessing + +# Git URL Import +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/GermlineVariantDiscovery.wdl" as Calling +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils +import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/BamProcessing.wdl" as BamProcessing + +workflow VariantCalling { + + input { + File calling_interval_list + File evaluation_interval_list + Int haplotype_scatter_count + Int break_bands_at_multiples_of + Float? 
contamination + File input_bam + File ref_fasta + File ref_fasta_index + File ref_dict + File dbsnp_vcf + File dbsnp_vcf_index + String base_file_name + String final_vcf_base_name + Int agg_preemptible_tries + Boolean make_gvcf = true + Boolean make_bamout = false + Boolean use_gatk3_haplotype_caller = false + } + + parameter_meta { + make_bamout: "For CNNScoreVariants to run with a 2D model, a bamout must be created by HaplotypeCaller. The bamout is a bam containing information on how HaplotypeCaller remapped reads while it was calling variants. See https://gatkforums.broadinstitute.org/gatk/discussion/5484/howto-generate-a-bamout-file-showing-how-haplotypecaller-has-remapped-sequence-reads for more details." + } + + # Break the calling interval_list into sub-intervals + # Perform variant calling on the sub-intervals, and then gather the results + call Utils.ScatterIntervalList as ScatterIntervalList { + input: + interval_list = calling_interval_list, + scatter_count = haplotype_scatter_count, + break_bands_at_multiples_of = break_bands_at_multiples_of + } + + # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. + # If we take the number we are scattering by and reduce by 20 we will have enough disk space + # to account for the fact that the data is quite uneven across the shards. + Int potential_hc_divisor = ScatterIntervalList.interval_count - 20 + Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 + + # Call variants in parallel over WGS calling intervals + scatter (scattered_interval_list in ScatterIntervalList.out) { + + if (use_gatk3_haplotype_caller) { + call Calling.HaplotypeCaller_GATK35_GVCF as HaplotypeCallerGATK3 { + input: + input_bam = input_bam, + interval_list = scattered_interval_list, + gvcf_basename = base_file_name, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + contamination = contamination, + preemptible_tries = agg_preemptible_tries, + hc_scatter = hc_divisor + } + } + + if (!use_gatk3_haplotype_caller) { + + # Generate GVCF by interval + call Calling.HaplotypeCaller_GATK4_VCF as HaplotypeCallerGATK4 { + input: + contamination = contamination, + input_bam = input_bam, + interval_list = scattered_interval_list, + vcf_basename = base_file_name, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + hc_scatter = hc_divisor, + make_gvcf = make_gvcf, + make_bamout = make_bamout, + preemptible_tries = agg_preemptible_tries + } + + # If bamout files were created, we need to sort and gather them into one bamout + if (make_bamout) { + call BamProcessing.SortSam as SortBamout { + input: + input_bam = HaplotypeCallerGATK4.bamout, + output_bam_basename = final_vcf_base_name, + preemptible_tries = agg_preemptible_tries, + compression_level = 2 + } + } + } + + File vcfs_to_merge = select_first([HaplotypeCallerGATK3.output_gvcf, HaplotypeCallerGATK4.output_vcf]) + File vcf_indices_to_merge = select_first([HaplotypeCallerGATK3.output_gvcf_index, HaplotypeCallerGATK4.output_vcf_index]) + } + + # Combine by-interval (g)VCFs into a single sample (g)VCF file + String merge_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" + call Calling.MergeVCFs as MergeVCFs { + input: + input_vcfs = vcfs_to_merge, + input_vcfs_indexes = vcf_indices_to_merge, + output_vcf_name = final_vcf_base_name + merge_suffix, + preemptible_tries = agg_preemptible_tries + } + + if (make_bamout) { + call MergeBamouts { + input: + bams = select_all(SortBamout.output_bam), 
+ output_base_name = final_vcf_base_name + } + } + + # Validate the (g)VCF output of HaplotypeCaller + call QC.ValidateVCF as ValidateVCF { + input: + input_vcf = MergeVCFs.output_vcf, + input_vcf_index = MergeVCFs.output_vcf_index, + dbsnp_vcf = dbsnp_vcf, + dbsnp_vcf_index = dbsnp_vcf_index, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + calling_interval_list = calling_interval_list, + is_gvcf = make_gvcf, + preemptible_tries = agg_preemptible_tries + } + + # QC the (g)VCF + call QC.CollectVariantCallingMetrics as CollectVariantCallingMetrics { + input: + input_vcf = MergeVCFs.output_vcf, + input_vcf_index = MergeVCFs.output_vcf_index, + metrics_basename = final_vcf_base_name, + dbsnp_vcf = dbsnp_vcf, + dbsnp_vcf_index = dbsnp_vcf_index, + ref_dict = ref_dict, + evaluation_interval_list = evaluation_interval_list, + is_gvcf = make_gvcf, + preemptible_tries = agg_preemptible_tries + } + + output { + File vcf_summary_metrics = CollectVariantCallingMetrics.summary_metrics + File vcf_detail_metrics = CollectVariantCallingMetrics.detail_metrics + File output_vcf = MergeVCFs.output_vcf + File output_vcf_index = MergeVCFs.output_vcf_index + File? bamout = MergeBamouts.output_bam + File? bamout_index = MergeBamouts.output_bam_index + } +} + +# This task is here because merging bamout files using Picard produces an error. +task MergeBamouts { + + input { + Array[File] bams + String output_base_name + } + + Int disk_size = ceil(size(bams, "GiB") * 2) + 10 + + command { + samtools merge ~{output_base_name}.bam ~{sep=" " bams} + samtools index ~{output_base_name}.bam + mv ~{output_base_name}.bam.bai ~{output_base_name}.bai + } + + output { + File output_bam = "~{output_base_name}.bam" + File output_bam_index = "~{output_base_name}.bai" + } + + runtime { + docker: "biocontainers/samtools:1.3.1" + memory: "4 GiB" + disks: "local-disk ~{disk_size} HDD" + preemptible: 3 + cpu: 1 + } +} diff --git a/tasks_pipelines/bam_processing.wdl b/tasks_pipelines/bam_processing.wdl deleted file mode 100644 index b97c4d5..0000000 --- a/tasks_pipelines/bam_processing.wdl +++ /dev/null @@ -1,386 +0,0 @@ -## Copyright Broad Institute, 2018 -## -## This WDL defines tasks used for BAM file processing of human whole-genome or exome sequencing data. -## -## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. -## For program versions, see docker containers. -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. Please see the docker -## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed -## licensing information pertaining to the included programs. - -# Sort BAM file by coordinate order -task SortSam { - File input_bam - String output_bam_basename - Int preemptible_tries - Int compression_level - - # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs - # more disk space. 
Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier - Float sort_sam_disk_multiplier = 3.25 - Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GB")) + 20 - - command { - java -Dsamjdk.compression_level=${compression_level} -Xms4000m -jar /usr/gitc/picard.jar \ - SortSam \ - INPUT=${input_bam} \ - OUTPUT=${output_bam_basename}.bam \ - SORT_ORDER="coordinate" \ - CREATE_INDEX=true \ - CREATE_MD5_FILE=true \ - MAX_RECORDS_IN_RAM=300000 - - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - disks: "local-disk " + disk_size + " HDD" - cpu: "1" - memory: "5000 MB" - preemptible: preemptible_tries - } - output { - File output_bam = "${output_bam_basename}.bam" - File output_bam_index = "${output_bam_basename}.bai" - File output_bam_md5 = "${output_bam_basename}.bam.md5" - } -} - -# Sort BAM file by coordinate order -- using Spark! -task SortSamSpark { - File input_bam - String output_bam_basename - Int preemptible_tries - Int compression_level - - # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs - # more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier - Float sort_sam_disk_multiplier = 3.25 - Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GB")) + 20 - - command { - set -e - export GATK_LOCAL_JAR=/root/gatk.jar - - gatk --java-options "-Dsamjdk.compression_level=${compression_level} -Xms100g -Xmx100g" \ - SortSamSpark \ - -I ${input_bam} \ - -O ${output_bam_basename}.bam \ - -- --conf spark.local.dir=. --spark-master 'local[16]' --conf 'spark.kryo.referenceTracking=false' - - samtools index ${output_bam_basename}.bam ${output_bam_basename}.bai - } - runtime { - docker: "us.gcr.io/broad-gatk/gatk:4.0.2.1" - disks: "local-disk " + disk_size + " HDD" - bootDiskSizeGb: "15" - cpu: "16" - memory: "102 GB" - preemptible: preemptible_tries - } - output { - File output_bam = "${output_bam_basename}.bam" - File output_bam_index = "${output_bam_basename}.bai" - } -} - -# Mark duplicate reads to avoid counting non-independent observations -task MarkDuplicates { - Array[File] input_bams - String output_bam_basename - String metrics_filename - Float total_input_size - Int compression_level - Int preemptible_tries - - # The merged bam will be smaller than the sum of the parts so we need to account for the unmerged inputs and the merged output. - # Mark Duplicates takes in as input readgroup bams and outputs a slightly smaller aggregated bam. Giving .25 as wiggleroom - Float md_disk_multiplier = 2.25 - Int disk_size = ceil(md_disk_multiplier * total_input_size) + 20 - - # The program default for READ_NAME_REGEX is appropriate in nearly every case. - # Sometimes we wish to supply "null" in order to turn off optical duplicate detection - # This can be desirable if you don't mind the estimated library size being wrong and optical duplicate detection is taking >7 days and failing - String? read_name_regex - - # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly - # This works because the output of BWA is query-grouped and therefore, so is the output of MergeBamAlignment. 
- # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname" - command { - java -Dsamjdk.compression_level=${compression_level} -Xms4000m -jar /usr/gitc/picard.jar \ - MarkDuplicates \ - INPUT=${sep=' INPUT=' input_bams} \ - OUTPUT=${output_bam_basename}.bam \ - METRICS_FILE=${metrics_filename} \ - VALIDATION_STRINGENCY=SILENT \ - ${"READ_NAME_REGEX=" + read_name_regex} \ - OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ - ASSUME_SORT_ORDER="queryname" \ - CLEAR_DT="false" \ - ADD_PG_TAG_TO_READS=false - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - memory: "7 GB" - disks: "local-disk " + sub(disk_size, "\\..*", "") + " HDD" - } - output { - File output_bam = "${output_bam_basename}.bam" - File duplicate_metrics = "${metrics_filename}" - } -} - -# Generate Base Quality Score Recalibration (BQSR) model -task BaseRecalibrator { - String input_bam - String recalibration_report_filename - Array[String] sequence_group_interval - File dbSNP_vcf - File dbSNP_vcf_index - Array[File] known_indels_sites_VCFs - Array[File] known_indels_sites_indices - File ref_dict - File ref_fasta - File ref_fasta_index - Int bqsr_scatter - Int preemptible_tries - - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Float dbsnp_size = size(dbSNP_vcf, "GB") - Int disk_size = ceil((size(input_bam, "GB") / bqsr_scatter) + ref_size + dbsnp_size) + 20 - - command { - /usr/gitc/gatk4/gatk-launch --javaOptions "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal \ - -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCDetails \ - -Xloggc:gc_log.log -Xms4000m" \ - BaseRecalibrator \ - -R ${ref_fasta} \ - -I ${input_bam} \ - --useOriginalQualities \ - -O ${recalibration_report_filename} \ - -knownSites ${dbSNP_vcf} \ - -knownSites ${sep=" -knownSites " known_indels_sites_VCFs} \ - -L ${sep=" -L " sequence_group_interval} - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - memory: "6 GB" - disks: "local-disk " + disk_size + " HDD" - } - output { - File recalibration_report = "${recalibration_report_filename}" - } -} - -# Apply Base Quality Score Recalibration (BQSR) model -task ApplyBQSR { - String input_bam - String output_bam_basename - File recalibration_report - Array[String] sequence_group_interval - File ref_dict - File ref_fasta - File ref_fasta_index - Int compression_level - Int bqsr_scatter - Int preemptible_tries - - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Int disk_size = ceil((size(input_bam, "GB") * 3 / bqsr_scatter) + ref_size) + 20 - - command { - /usr/gitc/gatk4/gatk-launch --javaOptions "-XX:+PrintFlagsFinal -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps \ - -XX:+PrintGCDetails -Xloggc:gc_log.log \ - -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Dsamjdk.compression_level=${compression_level} -Xms3000m" \ - ApplyBQSR \ - --createOutputBamMD5 \ - --addOutputSAMProgramRecord \ - -R ${ref_fasta} \ - -I ${input_bam} \ - --useOriginalQualities \ - -O ${output_bam_basename}.bam \ - -bqsr ${recalibration_report} \ - -SQQ 10 -SQQ 20 -SQQ 30 \ - -L ${sep=" -L " sequence_group_interval} - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - memory: "3500 MB" - disks: "local-disk " + disk_size + " HDD" - } - output { - File 
recalibrated_bam = "${output_bam_basename}.bam" - File recalibrated_bam_checksum = "${output_bam_basename}.bam.md5" - } -} - -# Combine multiple recalibration tables from scattered BaseRecalibrator runs -task GatherBqsrReports { - Array[File] input_bqsr_reports - String output_report_filename - Int preemptible_tries - - command { - /usr/gitc/gatk4/gatk-launch --javaOptions "-Xms3000m" \ - GatherBQSRReports \ - -I ${sep=' -I ' input_bqsr_reports} \ - -O ${output_report_filename} - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - memory: "3500 MB" - disks: "local-disk 20 HDD" - } - output { - File output_bqsr_report = "${output_report_filename}" - } -} - -# Combine multiple *sorted* BAM files -task GatherSortedBamFiles { - Array[File] input_bams - String output_bam_basename - Float total_input_size - Int compression_level - Int preemptible_tries - - # Multiply the input bam size by two to account for the input and output - Int disk_size = ceil(2 * total_input_size) + 20 - - command { - java -Dsamjdk.compression_level=${compression_level} -Xms2000m -jar /usr/gitc/picard.jar \ - GatherBamFiles \ - INPUT=${sep=' INPUT=' input_bams} \ - OUTPUT=${output_bam_basename}.bam \ - CREATE_INDEX=true \ - CREATE_MD5_FILE=true - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - memory: "3 GB" - disks: "local-disk " + disk_size + " HDD" - } - output { - File output_bam = "${output_bam_basename}.bam" - File output_bam_index = "${output_bam_basename}.bai" - File output_bam_md5 = "${output_bam_basename}.bam.md5" - } -} - -# Combine multiple *unsorted* BAM files -# Note that if/when WDL supports optional outputs, we should merge this task with the sorted version -task GatherUnsortedBamFiles { - Array[File] input_bams - String output_bam_basename - Float total_input_size - Int compression_level - Int preemptible_tries - - # Multiply the input bam size by two to account for the input and output - Int disk_size = ceil(2 * total_input_size) + 20 - - command { - java -Dsamjdk.compression_level=${compression_level} -Xms2000m -jar /usr/gitc/picard.jar \ - GatherBamFiles \ - INPUT=${sep=' INPUT=' input_bams} \ - OUTPUT=${output_bam_basename}.bam \ - CREATE_INDEX=false \ - CREATE_MD5_FILE=false - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - memory: "3 GB" - disks: "local-disk " + disk_size + " HDD" - } - output { - File output_bam = "${output_bam_basename}.bam" - } -} - -# Notes on the contamination estimate: -# The contamination value is read from the FREEMIX field of the selfSM file output by verifyBamId -# -# In Zamboni production, this value is stored directly in METRICS.AGGREGATION_CONTAM -# -# Contamination is also stored in GVCF_CALLING and thereby passed to HAPLOTYPE_CALLER -# But first, it is divided by an underestimation factor thusly: -# float(FREEMIX) / ContaminationUnderestimationFactor -# where the denominator is hardcoded in Zamboni: -# val ContaminationUnderestimationFactor = 0.75f -# -# Here, I am handling this by returning both the original selfSM file for reporting, and the adjusted -# contamination estimate for use in variant calling -task CheckContamination { - File input_bam - File input_bam_index - File contamination_sites_ud - File contamination_sites_bed - File contamination_sites_mu - File ref_fasta - File ref_fasta_index - String output_prefix - Int preemptible_tries 
- Float contamination_underestimation_factor - - Int disk_size = ceil(size(input_bam, "GB") + size(ref_fasta, "GB")) + 30 - - command <<< - set -e - - # creates a ${output_prefix}.selfSM file, a TSV file with 2 rows, 19 columns. - # First row are the keys (e.g., SEQ_SM, RG, FREEMIX), second row are the associated values - /usr/gitc/VerifyBamID \ - --Verbose \ - --NumPC 4 \ - --Output ${output_prefix} \ - --BamFile ${input_bam} \ - --Reference ${ref_fasta} \ - --UDPath ${contamination_sites_ud} \ - --MeanPath ${contamination_sites_mu} \ - --BedPath ${contamination_sites_bed} \ - 1>/dev/null - - # used to read from the selfSM file and calculate contamination, which gets printed out - python3 <>> - runtime { - preemptible: preemptible_tries - memory: "2 GB" - disks: "local-disk " + disk_size + " HDD" - docker: "us.gcr.io/broad-gotc-prod/verify-bam-id:c8a66425c312e5f8be46ab0c41f8d7a1942b6e16-1500298351" - } - output { - File selfSM = "${output_prefix}.selfSM" - Float contamination = read_float(stdout()) - } -} diff --git a/tasks_pipelines/germline_variant_discovery.wdl b/tasks_pipelines/germline_variant_discovery.wdl deleted file mode 100644 index 1a5ce3a..0000000 --- a/tasks_pipelines/germline_variant_discovery.wdl +++ /dev/null @@ -1,168 +0,0 @@ -## Copyright Broad Institute, 2018 -## -## This WDL defines tasks used for germline variant discovery of human whole-genome or exome sequencing data. -## -## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. -## For program versions, see docker containers. -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. Please see the docker -## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed -## licensing information pertaining to the included programs. - -# Call variants on a single sample with HaplotypeCaller to produce a GVCF -task HaplotypeCaller_GATK35_GVCF { - String input_bam - File interval_list - String gvcf_basename - File ref_dict - File ref_fasta - File ref_fasta_index - Float? contamination - Int preemptible_tries - Int hc_scatter - - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20 - - # We use interval_padding 500 below to make sure that the HaplotypeCaller has context on both sides around - # the interval because the assembly uses them. - # - # Using PrintReads is a temporary solution until we update HaploypeCaller to use GATK4. Once that is done, - # HaplotypeCaller can stream the required intervals directly from the cloud. 
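For context, the 500 bp padding mentioned above simply widens each scattered interval on both sides so the local assembly has flanking reads to work with; a hedged sketch (hypothetical helper, not part of the pipeline):

```python
# Illustrative sketch: the effect of --interval_padding 500 on a single calling interval.
def pad_interval(contig, start, end, padding=500):
    # Widen the interval on both sides; positions below 1 are clamped for simplicity.
    return contig, max(1, start - padding), end + padding

print(pad_interval("chr20", 100000, 150000))  # -> ('chr20', 99500, 150500)
```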
- command { - /usr/gitc/gatk4/gatk-launch --javaOptions "-Xms2g" \ - PrintReads \ - -I ${input_bam} \ - --interval_padding 500 \ - -L ${interval_list} \ - -O local.sharded.bam \ - && \ - java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms8000m \ - -jar /usr/gitc/GATK35.jar \ - -T HaplotypeCaller \ - -R ${ref_fasta} \ - -o ${gvcf_basename}.vcf.gz \ - -I local.sharded.bam \ - -L ${interval_list} \ - -ERC GVCF \ - --max_alternate_alleles 3 \ - -variant_index_parameter 128000 \ - -variant_index_type LINEAR \ - -contamination ${default=0 contamination} \ - --read_filter OverclippedRead - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - memory: "10 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - } - output { - File output_gvcf = "${gvcf_basename}.vcf.gz" - File output_gvcf_index = "${gvcf_basename}.vcf.gz.tbi" - } -} - -task HaplotypeCaller_GATK4_VCF { - String input_bam - File interval_list - String vcf_basename - File ref_dict - File ref_fasta - File ref_fasta_index - Float contamination - Boolean make_gvcf - Int preemptible_tries - Int hc_scatter - - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20 - - command <<< - - set -e - export GATK_LOCAL_JAR=/root/gatk.jar - - gatk --java-options "-Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ - HaplotypeCaller \ - -R ${ref_fasta} \ - -I ${input_bam} \ - -L ${interval_list} \ - -O ${vcf_basename}.vcf.gz \ - -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf} - >>> - runtime { - docker: "us.gcr.io/broad-gatk/gatk:4.0.2.1" - preemptible: preemptible_tries - memory: "6.5 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - } - output { - File output_vcf = "${vcf_basename}.vcf.gz" - File output_vcf_index = "${vcf_basename}.vcf.gz.tbi" - } -} - -# Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs -task MergeVCFs { - Array[File] input_vcfs - Array[File] input_vcfs_indexes - String output_vcf_name - Int preemptible_tries - - # Using MergeVcfs instead of GatherVcfs so we can create indices - # See https://github.com/broadinstitute/picard/issues/789 for relevant GatherVcfs ticket - command { - java -Xms2000m -jar /usr/gitc/picard.jar \ - MergeVcfs \ - INPUT=${sep=' INPUT=' input_vcfs} \ - OUTPUT=${output_vcf_name} - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - memory: "3 GB" - disks: "local-disk 30 HDD" - } - output { - File output_vcf = "${output_vcf_name}" - File output_vcf_index = "${output_vcf_name}.tbi" - } -} - -task HardFilterVcf { - File input_vcf - File input_vcf_index - String vcf_basename - File interval_list - Int preemptible_tries - - Int disk_size = ceil(2 * size(input_vcf, "GB")) + 20 - String output_vcf_name = vcf_basename + ".filtered.vcf.gz" - - command { - /usr/gitc/gatk4/gatk-launch --javaOptions "-Xms3000m" \ - VariantFiltration \ - -V ${input_vcf} \ - -L ${interval_list} \ - --filterExpression "QD < 2.0 || FS > 30.0 || SOR > 3.0 || MQ < 40.0 || MQRankSum < -3.0 || ReadPosRankSum < -3.0" \ - --filterName "HardFiltered" \ - -O ${output_vcf_name} - } - output { - File output_vcf = "${output_vcf_name}" - File output_vcf_index = "${output_vcf_name}.tbi" - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" - preemptible: preemptible_tries - memory: "3 GB" 
- disks: "local-disk " + disk_size + " HDD" - } -} diff --git a/tasks_pipelines/qc.wdl b/tasks_pipelines/qc.wdl deleted file mode 100644 index 62225c7..0000000 --- a/tasks_pipelines/qc.wdl +++ /dev/null @@ -1,502 +0,0 @@ -## Copyright Broad Institute, 2018 -## -## This WDL defines tasks used for QC of human whole-genome or exome sequencing data. -## -## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. -## For program versions, see docker containers. -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. Please see the docker -## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed -## licensing information pertaining to the included programs. - -# Collect sequencing yield quality metrics -task CollectQualityYieldMetrics { - File input_bam - String metrics_filename - Int preemptible_tries - - Int disk_size = ceil(size(input_bam, "GB")) + 20 - - command { - java -Xms2000m -jar /usr/gitc/picard.jar \ - CollectQualityYieldMetrics \ - INPUT=${input_bam} \ - OQ=true \ - OUTPUT=${metrics_filename} - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - disks: "local-disk " + disk_size + " HDD" - memory: "3 GB" - preemptible: preemptible_tries - } - output { - File quality_yield_metrics = "${metrics_filename}" - } -} - -# Collect base quality and insert size metrics -task CollectUnsortedReadgroupBamQualityMetrics { - File input_bam - String output_bam_prefix - Int preemptible_tries - - Int disk_size = ceil(size(input_bam, "GB")) + 20 - - command { - java -Xms5000m -jar /usr/gitc/picard.jar \ - CollectMultipleMetrics \ - INPUT=${input_bam} \ - OUTPUT=${output_bam_prefix} \ - ASSUME_SORTED=true \ - PROGRAM="null" \ - PROGRAM="CollectBaseDistributionByCycle" \ - PROGRAM="CollectInsertSizeMetrics" \ - PROGRAM="MeanQualityByCycle" \ - PROGRAM="QualityScoreDistribution" \ - METRIC_ACCUMULATION_LEVEL="null" \ - METRIC_ACCUMULATION_LEVEL="ALL_READS" - - touch ${output_bam_prefix}.insert_size_metrics - touch ${output_bam_prefix}.insert_size_histogram.pdf - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - memory: "7 GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File base_distribution_by_cycle_pdf = "${output_bam_prefix}.base_distribution_by_cycle.pdf" - File base_distribution_by_cycle_metrics = "${output_bam_prefix}.base_distribution_by_cycle_metrics" - File insert_size_histogram_pdf = "${output_bam_prefix}.insert_size_histogram.pdf" - File insert_size_metrics = "${output_bam_prefix}.insert_size_metrics" - File quality_by_cycle_pdf = "${output_bam_prefix}.quality_by_cycle.pdf" - File quality_by_cycle_metrics = "${output_bam_prefix}.quality_by_cycle_metrics" - File quality_distribution_pdf = "${output_bam_prefix}.quality_distribution.pdf" - File quality_distribution_metrics = "${output_bam_prefix}.quality_distribution_metrics" - } -} - -# Collect alignment summary and GC bias quality metrics -task CollectReadgroupBamQualityMetrics { - File input_bam - File input_bam_index - String output_bam_prefix - File ref_dict - File ref_fasta - File ref_fasta_index - Int preemptible_tries - - Float ref_size = size(ref_fasta, "GB") + 
size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20 - - command { - java -Xms5000m -jar /usr/gitc/picard.jar \ - CollectMultipleMetrics \ - INPUT=${input_bam} \ - REFERENCE_SEQUENCE=${ref_fasta} \ - OUTPUT=${output_bam_prefix} \ - ASSUME_SORTED=true \ - PROGRAM="null" \ - PROGRAM="CollectAlignmentSummaryMetrics" \ - PROGRAM="CollectGcBiasMetrics" \ - METRIC_ACCUMULATION_LEVEL="null" \ - METRIC_ACCUMULATION_LEVEL="READ_GROUP" - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - memory: "7 GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File alignment_summary_metrics = "${output_bam_prefix}.alignment_summary_metrics" - File gc_bias_detail_metrics = "${output_bam_prefix}.gc_bias.detail_metrics" - File gc_bias_pdf = "${output_bam_prefix}.gc_bias.pdf" - File gc_bias_summary_metrics = "${output_bam_prefix}.gc_bias.summary_metrics" - } -} - -# Collect quality metrics from the aggregated bam -task CollectAggregationMetrics { - File input_bam - File input_bam_index - String output_bam_prefix - File ref_dict - File ref_fasta - File ref_fasta_index - Int preemptible_tries - - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20 - - command { - java -Xms5000m -jar /usr/gitc/picard.jar \ - CollectMultipleMetrics \ - INPUT=${input_bam} \ - REFERENCE_SEQUENCE=${ref_fasta} \ - OUTPUT=${output_bam_prefix} \ - ASSUME_SORTED=true \ - PROGRAM="null" \ - PROGRAM="CollectAlignmentSummaryMetrics" \ - PROGRAM="CollectInsertSizeMetrics" \ - PROGRAM="CollectSequencingArtifactMetrics" \ - PROGRAM="CollectGcBiasMetrics" \ - PROGRAM="QualityScoreDistribution" \ - METRIC_ACCUMULATION_LEVEL="null" \ - METRIC_ACCUMULATION_LEVEL="SAMPLE" \ - METRIC_ACCUMULATION_LEVEL="LIBRARY" - - touch ${output_bam_prefix}.insert_size_metrics - touch ${output_bam_prefix}.insert_size_histogram.pdf - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - memory: "7 GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File alignment_summary_metrics = "${output_bam_prefix}.alignment_summary_metrics" - File bait_bias_detail_metrics = "${output_bam_prefix}.bait_bias_detail_metrics" - File bait_bias_summary_metrics = "${output_bam_prefix}.bait_bias_summary_metrics" - File gc_bias_detail_metrics = "${output_bam_prefix}.gc_bias.detail_metrics" - File gc_bias_pdf = "${output_bam_prefix}.gc_bias.pdf" - File gc_bias_summary_metrics = "${output_bam_prefix}.gc_bias.summary_metrics" - File insert_size_histogram_pdf = "${output_bam_prefix}.insert_size_histogram.pdf" - File insert_size_metrics = "${output_bam_prefix}.insert_size_metrics" - File pre_adapter_detail_metrics = "${output_bam_prefix}.pre_adapter_detail_metrics" - File pre_adapter_summary_metrics = "${output_bam_prefix}.pre_adapter_summary_metrics" - File quality_distribution_pdf = "${output_bam_prefix}.quality_distribution.pdf" - File quality_distribution_metrics = "${output_bam_prefix}.quality_distribution_metrics" - } -} - -# Check that the fingerprints of separate readgroups all match -task CrossCheckFingerprints { - Array[File] input_bams - Array[File] input_bam_indexes - File? 
haplotype_database_file - String metrics_filename - Float total_input_size - Int preemptible_tries - - Int disk_size = ceil(total_input_size) + 20 - - command <<< - java -Dsamjdk.buffer_size=131072 \ - -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms2000m \ - -jar /usr/gitc/picard.jar \ - CrosscheckReadGroupFingerprints \ - OUTPUT=${metrics_filename} \ - HAPLOTYPE_MAP=${haplotype_database_file} \ - EXPECT_ALL_READ_GROUPS_TO_MATCH=true \ - INPUT=${sep=' INPUT=' input_bams} \ - LOD_THRESHOLD=-20.0 - >>> - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - memory: "2 GB" - disks: "local-disk " + disk_size + " HDD" - } - output { - File cross_check_fingerprints_metrics = "${metrics_filename}" - } -} - -# Check that the fingerprint of the sample BAM matches the sample array -task CheckFingerprint { - File input_bam - File input_bam_index - String output_basename - File? haplotype_database_file - File? genotypes - File? genotypes_index - String sample - Int preemptible_tries - - Int disk_size = ceil(size(input_bam, "GB")) + 20 - - command <<< - java -Dsamjdk.buffer_size=131072 \ - -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms1024m \ - -jar /usr/gitc/picard.jar \ - CheckFingerprint \ - INPUT=${input_bam} \ - OUTPUT=${output_basename} \ - GENOTYPES=${genotypes} \ - HAPLOTYPE_MAP=${haplotype_database_file} \ - SAMPLE_ALIAS="${sample}" \ - IGNORE_READ_GROUPS=true - - >>> - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - memory: "1 GB" - disks: "local-disk " + disk_size + " HDD" - } - output { - File summary_metrics = "${output_basename}.fingerprinting_summary_metrics" - File detail_metrics = "${output_basename}.fingerprinting_detail_metrics" - } -} - -task CheckPreValidation { - File duplication_metrics - File chimerism_metrics - Float max_duplication_in_reasonable_sample - Float max_chimerism_in_reasonable_sample - Int preemptible_tries - - command <<< - set -o pipefail - set -e - - grep -A 1 PERCENT_DUPLICATION ${duplication_metrics} > duplication.csv - grep -A 3 PCT_CHIMERAS ${chimerism_metrics} | grep -v OF_PAIR > chimerism.csv - - python <>> - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - docker: "python:2.7" - memory: "2 GB" - } - output { - Float duplication_rate = read_float("duplication_value.txt") - Float chimerism_rate = read_float("chimerism_value.txt") - Boolean is_outlier_data = duplication_rate > max_duplication_in_reasonable_sample || chimerism_rate > max_chimerism_in_reasonable_sample - } -} - -task ValidateSamFile { - File input_bam - File? input_bam_index - String report_filename - File ref_dict - File ref_fasta - File ref_fasta_index - Int? max_output - Array[String]? ignore - Boolean? 
is_outlier_data - Int preemptible_tries - - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20 - - command { - java -Xms6000m -jar /usr/gitc/picard.jar \ - ValidateSamFile \ - INPUT=${input_bam} \ - OUTPUT=${report_filename} \ - REFERENCE_SEQUENCE=${ref_fasta} \ - ${"MAX_OUTPUT=" + max_output} \ - IGNORE=${default="null" sep=" IGNORE=" ignore} \ - MODE=VERBOSE \ - ${default='SKIP_MATE_VALIDATION=false' true='SKIP_MATE_VALIDATION=true' false='SKIP_MATE_VALIDATION=false' is_outlier_data} \ - IS_BISULFITE_SEQUENCED=false - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - memory: "7 GB" - disks: "local-disk " + disk_size + " HDD" - } - output { - File report = "${report_filename}" - } -} - -# Note these tasks will break if the read lengths in the bam are greater than 250. -task CollectWgsMetrics { - File input_bam - File input_bam_index - String metrics_filename - File wgs_coverage_interval_list - File ref_fasta - File ref_fasta_index - Int read_length - Int preemptible_tries - - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") - Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20 - - command { - java -Xms2000m -jar /usr/gitc/picard.jar \ - CollectWgsMetrics \ - INPUT=${input_bam} \ - VALIDATION_STRINGENCY=SILENT \ - REFERENCE_SEQUENCE=${ref_fasta} \ - INCLUDE_BQ_HISTOGRAM=true \ - INTERVALS=${wgs_coverage_interval_list} \ - OUTPUT=${metrics_filename} \ - USE_FAST_ALGORITHM=true \ - READ_LENGTH=${read_length} - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - memory: "3 GB" - disks: "local-disk " + disk_size + " HDD" - } - output { - File metrics = "${metrics_filename}" - } -} - -# Collect raw WGS metrics (commonly used QC thresholds) -task CollectRawWgsMetrics { - File input_bam - File input_bam_index - String metrics_filename - File wgs_coverage_interval_list - File ref_fasta - File ref_fasta_index - Int read_length - Int preemptible_tries - - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") - Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20 - - command { - java -Xms2000m -jar /usr/gitc/picard.jar \ - CollectRawWgsMetrics \ - INPUT=${input_bam} \ - VALIDATION_STRINGENCY=SILENT \ - REFERENCE_SEQUENCE=${ref_fasta} \ - INCLUDE_BQ_HISTOGRAM=true \ - INTERVALS=${wgs_coverage_interval_list} \ - OUTPUT=${metrics_filename} \ - USE_FAST_ALGORITHM=true \ - READ_LENGTH=${read_length} - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - memory: "3 GB" - disks: "local-disk " + disk_size + " HDD" - } - output { - File metrics = "${metrics_filename}" - } -} - -# Generate a checksum per readgroup -task CalculateReadGroupChecksum { - File input_bam - File input_bam_index - String read_group_md5_filename - Int preemptible_tries - - Int disk_size = ceil(size(input_bam, "GB")) + 20 - - command { - java -Xms1000m -jar /usr/gitc/picard.jar \ - CalculateReadGroupChecksum \ - INPUT=${input_bam} \ - OUTPUT=${read_group_md5_filename} - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - memory: "2 GB" - disks: "local-disk " + disk_size + " HDD" - } - output { - File md5_file = "${read_group_md5_filename}" - } -} - -# Validate a GVCF with -gvcf specific validation 
-task ValidateGVCF { - File input_vcf - File input_vcf_index - File ref_fasta - File ref_fasta_index - File ref_dict - File dbSNP_vcf - File dbSNP_vcf_index - File wgs_calling_interval_list - Int preemptible_tries - - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Int disk_size = ceil(size(input_vcf, "GB") + size(dbSNP_vcf, "GB") + ref_size) + 20 - - command { - /usr/gitc/gatk4/gatk-launch --javaOptions "-Xms3000m" \ - ValidateVariants \ - -V ${input_vcf} \ - -R ${ref_fasta} \ - -L ${wgs_calling_interval_list} \ - -gvcf \ - --validationTypeToExclude ALLELES \ - --dbsnp ${dbSNP_vcf} - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - memory: "3500 MB" - disks: "local-disk " + disk_size + " HDD" - } -} - -# Collect variant calling metrics from GVCF output -task CollectGvcfCallingMetrics { - File input_vcf - File input_vcf_index - String metrics_basename - File dbSNP_vcf - File dbSNP_vcf_index - File ref_dict - File wgs_evaluation_interval_list - Int preemptible_tries - - Int disk_size = ceil(size(input_vcf, "GB") + size(dbSNP_vcf, "GB")) + 20 - - command { - java -Xms2000m -jar /usr/gitc/picard.jar \ - CollectVariantCallingMetrics \ - INPUT=${input_vcf} \ - OUTPUT=${metrics_basename} \ - DBSNP=${dbSNP_vcf} \ - SEQUENCE_DICTIONARY=${ref_dict} \ - TARGET_INTERVALS=${wgs_evaluation_interval_list} \ - GVCF_INPUT=true - } - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135" - preemptible: preemptible_tries - memory: "3 GB" - disks: "local-disk " + disk_size + " HDD" - } - output { - File summary_metrics = "${metrics_basename}.variant_calling_summary_metrics" - File detail_metrics = "${metrics_basename}.variant_calling_detail_metrics" - } -} diff --git a/tasks_pipelines/unmapped_bam_to_aligned_bam.wdl b/tasks_pipelines/unmapped_bam_to_aligned_bam.wdl deleted file mode 100644 index 6bb25cd..0000000 --- a/tasks_pipelines/unmapped_bam_to_aligned_bam.wdl +++ /dev/null @@ -1,400 +0,0 @@ -## Copyright Broad Institute, 2018 -## -## This WDL pipeline implements data processing according to the GATK Best Practices (June 2016) -## for human whole-genome and exome sequencing data. -## -## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. -## For program versions, see docker containers. -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. Please see the docker -## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed -## licensing information pertaining to the included programs. - -import "./tasks_pipelines/alignment.wdl" as Alignment -import "./tasks_pipelines/split_large_readgroup.wdl" as SplitRG -import "./tasks_pipelines/qc.wdl" as QC -import "./tasks_pipelines/bam_processing.wdl" as Processing -import "./tasks_pipelines/utilities.wdl" as Utils - -# WORKFLOW DEFINITION -workflow to_bam_workflow { - - File contamination_sites_ud - File contamination_sites_bed - File contamination_sites_mu - File? fingerprint_genotypes_file - File? fingerprint_genotypes_index - File? 
haplotype_database_file - File wgs_coverage_interval_list - - String sample_name - String base_file_name - Array[File] flowcell_unmapped_bams - String unmapped_bam_suffix - - Int read_length = 250 - - File ref_fasta - File ref_fasta_index - File ref_dict - File ref_alt - File ref_bwt - File ref_sa - File ref_amb - File ref_ann - File ref_pac - - File dbSNP_vcf - File dbSNP_vcf_index - Array[File] known_indels_sites_VCFs - Array[File] known_indels_sites_indices - - Int preemptible_tries - Int agg_preemptible_tries - - Float cutoff_for_large_rg_in_gb = 20.0 - - String bwa_commandline = "bwa mem -K 100000000 -p -v 3 -t 16 -Y $bash_ref_fasta" - - String recalibrated_bam_basename = base_file_name + ".aligned.duplicates_marked.recalibrated" - - Int compression_level = 2 - - # Get the version of BWA to include in the PG record in the header of the BAM produced - # by MergeBamAlignment. - call Alignment.GetBwaVersion - - # Get the size of the standard reference files as well as the additional reference files needed for BWA - - # Align flowcell-level unmapped input bams in parallel - scatter (unmapped_bam in flowcell_unmapped_bams) { - - Float unmapped_bam_size = size(unmapped_bam, "GB") - - String unmapped_bam_basename = basename(unmapped_bam, unmapped_bam_suffix) - - # QC the unmapped BAM - call QC.CollectQualityYieldMetrics as CollectQualityYieldMetrics { - input: - input_bam = unmapped_bam, - metrics_filename = unmapped_bam_basename + ".unmapped.quality_yield_metrics", - preemptible_tries = preemptible_tries - } - - if (unmapped_bam_size > cutoff_for_large_rg_in_gb) { - # Split bam into multiple smaller bams, - # map reads to reference and recombine into one bam - call SplitRG.split_large_readgroup as SplitRG { - input: - input_bam = unmapped_bam, - bwa_commandline = bwa_commandline, - bwa_version = GetBwaVersion.version, - output_bam_basename = unmapped_bam_basename + ".aligned.unsorted", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - ref_alt = ref_alt, - ref_amb = ref_amb, - ref_ann = ref_ann, - ref_bwt = ref_bwt, - ref_pac = ref_pac, - ref_sa = ref_sa, - compression_level = compression_level, - preemptible_tries = preemptible_tries - } - } - - if (unmapped_bam_size <= cutoff_for_large_rg_in_gb) { - # Map reads to reference - call Alignment.SamToFastqAndBwaMemAndMba as SamToFastqAndBwaMemAndMba { - input: - input_bam = unmapped_bam, - bwa_commandline = bwa_commandline, - output_bam_basename = unmapped_bam_basename + ".aligned.unsorted", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - ref_alt = ref_alt, - ref_bwt = ref_bwt, - ref_amb = ref_amb, - ref_ann = ref_ann, - ref_pac = ref_pac, - ref_sa = ref_sa, - bwa_version = GetBwaVersion.version, - compression_level = compression_level, - preemptible_tries = preemptible_tries - } - } - - File output_aligned_bam = select_first([SamToFastqAndBwaMemAndMba.output_bam, SplitRG.aligned_bam]) - - Float mapped_bam_size = size(output_aligned_bam, "GB") - - # QC the aligned but unsorted readgroup BAM - # no reference as the input here is unsorted, providing a reference would cause an error - call QC.CollectUnsortedReadgroupBamQualityMetrics as CollectUnsortedReadgroupBamQualityMetrics { - input: - input_bam = output_aligned_bam, - output_bam_prefix = unmapped_bam_basename + ".readgroup", - preemptible_tries = preemptible_tries - } - } - - # Sum the read group bam sizes to approximate the aggregated bam size - call Utils.SumFloats as SumFloats { - input: - sizes = mapped_bam_size, 
-      preemptible_tries = preemptible_tries
-  }
-
-  # Aggregate aligned+merged flowcell BAM files and mark duplicates
-  # We take advantage of the tool's ability to take multiple BAM inputs and write out a single output
-  # to avoid having to spend time just merging BAM files.
-  call Processing.MarkDuplicates as MarkDuplicates {
-    input:
-      input_bams = output_aligned_bam,
-      output_bam_basename = base_file_name + ".aligned.unsorted.duplicates_marked",
-      metrics_filename = base_file_name + ".duplicate_metrics",
-      total_input_size = SumFloats.total_size,
-      compression_level = compression_level,
-      preemptible_tries = agg_preemptible_tries
-  }
-
-  # Sort aggregated+deduped BAM file and fix tags
-  call Processing.SortSam as SortSampleBam {
-    input:
-      input_bam = MarkDuplicates.output_bam,
-      output_bam_basename = base_file_name + ".aligned.duplicate_marked.sorted",
-      compression_level = compression_level,
-      preemptible_tries = agg_preemptible_tries
-  }
-
-  Float agg_bam_size = size(SortSampleBam.output_bam, "GB")
-
-  if (defined(haplotype_database_file)) {
-    # Check identity of fingerprints across readgroups
-    call QC.CrossCheckFingerprints as CrossCheckFingerprints {
-      input:
-        input_bams = SortSampleBam.output_bam,
-        input_bam_indexes = SortSampleBam.output_bam_index,
-        haplotype_database_file = haplotype_database_file,
-        metrics_filename = base_file_name + ".crosscheck",
-        total_input_size = agg_bam_size,
-        preemptible_tries = agg_preemptible_tries
-    }
-  }
-
-  # Create list of sequences for scatter-gather parallelization
-  call Utils.CreateSequenceGroupingTSV as CreateSequenceGroupingTSV {
-    input:
-      ref_dict = ref_dict,
-      preemptible_tries = preemptible_tries
-  }
-
-  # Estimate level of cross-sample contamination
-  call Processing.CheckContamination as CheckContamination {
-    input:
-      input_bam = SortSampleBam.output_bam,
-      input_bam_index = SortSampleBam.output_bam_index,
-      contamination_sites_ud = contamination_sites_ud,
-      contamination_sites_bed = contamination_sites_bed,
-      contamination_sites_mu = contamination_sites_mu,
-      ref_fasta = ref_fasta,
-      ref_fasta_index = ref_fasta_index,
-      output_prefix = base_file_name + ".preBqsr",
-      preemptible_tries = agg_preemptible_tries,
-      contamination_underestimation_factor = 0.75
-  }
-
-  # We need disk to localize the sharded input and output due to the scatter for BQSR.
-  # If we take the number we are scattering by and reduce it by 10, we will have enough disk space
-  # to account for the fact that the data is not split evenly.
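# Illustrative note with a hypothetical group count (not part of the original file):
# if CreateSequenceGroupingTSV produced 24 sequence groups, the divisor computed below
# would be 24 - 10 = 14, so a recalibration shard can budget disk for roughly 1/14th of
# the aggregated BAM rather than the whole file; with 10 or fewer groups it falls back
# to 1 and each shard budgets for the full input. The value is handed to the Processing
# tasks as bqsr_scatter, which presumably apply it when computing their own disk_size.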
- Int num_of_bqsr_scatters = length(CreateSequenceGroupingTSV.sequence_grouping) - Int potential_bqsr_divisor = num_of_bqsr_scatters - 10 - Int bqsr_divisor = if potential_bqsr_divisor > 1 then potential_bqsr_divisor else 1 - - # Perform Base Quality Score Recalibration (BQSR) on the sorted BAM in parallel - scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping) { - # Generate the recalibration model by interval - call Processing.BaseRecalibrator as BaseRecalibrator { - input: - input_bam = SortSampleBam.output_bam, - recalibration_report_filename = base_file_name + ".recal_data.csv", - sequence_group_interval = subgroup, - dbSNP_vcf = dbSNP_vcf, - dbSNP_vcf_index = dbSNP_vcf_index, - known_indels_sites_VCFs = known_indels_sites_VCFs, - known_indels_sites_indices = known_indels_sites_indices, - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - bqsr_scatter = bqsr_divisor, - preemptible_tries = agg_preemptible_tries - } - } - - # Merge the recalibration reports resulting from by-interval recalibration - # The reports are always the same size - call Processing.GatherBqsrReports as GatherBqsrReports { - input: - input_bqsr_reports = BaseRecalibrator.recalibration_report, - output_report_filename = base_file_name + ".recal_data.csv", - preemptible_tries = preemptible_tries - } - - scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping_with_unmapped) { - # Apply the recalibration model by interval - call Processing.ApplyBQSR as ApplyBQSR { - input: - input_bam = SortSampleBam.output_bam, - output_bam_basename = recalibrated_bam_basename, - recalibration_report = GatherBqsrReports.output_bqsr_report, - sequence_group_interval = subgroup, - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - bqsr_scatter = bqsr_divisor, - compression_level = compression_level, - preemptible_tries = agg_preemptible_tries - } - } - - # Merge the recalibrated BAM files resulting from by-interval recalibration - call Processing.GatherSortedBamFiles as GatherBamFiles { - input: - input_bams = ApplyBQSR.recalibrated_bam, - output_bam_basename = base_file_name, - total_input_size = agg_bam_size, - compression_level = compression_level, - preemptible_tries = agg_preemptible_tries - } - - # QC the final BAM (consolidated after scattered BQSR) - call QC.CollectReadgroupBamQualityMetrics as CollectReadgroupBamQualityMetrics { - input: - input_bam = GatherBamFiles.output_bam, - input_bam_index = GatherBamFiles.output_bam_index, - output_bam_prefix = base_file_name + ".readgroup", - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - preemptible_tries = agg_preemptible_tries - } - - # QC the final BAM some more (no such thing as too much QC) - call QC.CollectAggregationMetrics as CollectAggregationMetrics { - input: - input_bam = GatherBamFiles.output_bam, - input_bam_index = GatherBamFiles.output_bam_index, - output_bam_prefix = base_file_name, - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - preemptible_tries = agg_preemptible_tries - } - - if (defined(haplotype_database_file) && defined(fingerprint_genotypes_file)) { - # Check the sample BAM fingerprint against the sample array - call QC.CheckFingerprint as CheckFingerprint { - input: - input_bam = GatherBamFiles.output_bam, - input_bam_index = GatherBamFiles.output_bam_index, - haplotype_database_file = haplotype_database_file, - genotypes = fingerprint_genotypes_file, - genotypes_index = 
fingerprint_genotypes_index, - output_basename = base_file_name, - sample = sample_name, - preemptible_tries = agg_preemptible_tries - } - } - - # QC the sample WGS metrics (stringent thresholds) - call QC.CollectWgsMetrics as CollectWgsMetrics { - input: - input_bam = GatherBamFiles.output_bam, - input_bam_index = GatherBamFiles.output_bam_index, - metrics_filename = base_file_name + ".wgs_metrics", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - wgs_coverage_interval_list = wgs_coverage_interval_list, - read_length = read_length, - preemptible_tries = agg_preemptible_tries - } - - # QC the sample raw WGS metrics (common thresholds) - call QC.CollectRawWgsMetrics as CollectRawWgsMetrics { - input: - input_bam = GatherBamFiles.output_bam, - input_bam_index = GatherBamFiles.output_bam_index, - metrics_filename = base_file_name + ".raw_wgs_metrics", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - wgs_coverage_interval_list = wgs_coverage_interval_list, - read_length = read_length, - preemptible_tries = agg_preemptible_tries - } - - # Generate a checksum per readgroup in the final BAM - call QC.CalculateReadGroupChecksum as CalculateReadGroupChecksum { - input: - input_bam = GatherBamFiles.output_bam, - input_bam_index = GatherBamFiles.output_bam_index, - read_group_md5_filename = recalibrated_bam_basename + ".bam.read_group_md5", - preemptible_tries = agg_preemptible_tries - } - - # Outputs that will be retained when execution is complete - output { - Array[File] quality_yield_metrics = CollectQualityYieldMetrics.quality_yield_metrics - - Array[File] unsorted_read_group_base_distribution_by_cycle_pdf = CollectUnsortedReadgroupBamQualityMetrics.base_distribution_by_cycle_pdf - Array[File] unsorted_read_group_base_distribution_by_cycle_metrics = CollectUnsortedReadgroupBamQualityMetrics.base_distribution_by_cycle_metrics - Array[File] unsorted_read_group_insert_size_histogram_pdf = CollectUnsortedReadgroupBamQualityMetrics.insert_size_histogram_pdf - Array[File] unsorted_read_group_insert_size_metrics = CollectUnsortedReadgroupBamQualityMetrics.insert_size_metrics - Array[File] unsorted_read_group_quality_by_cycle_pdf = CollectUnsortedReadgroupBamQualityMetrics.quality_by_cycle_pdf - Array[File] unsorted_read_group_quality_by_cycle_metrics = CollectUnsortedReadgroupBamQualityMetrics.quality_by_cycle_metrics - Array[File] unsorted_read_group_quality_distribution_pdf = CollectUnsortedReadgroupBamQualityMetrics.quality_distribution_pdf - Array[File] unsorted_read_group_quality_distribution_metrics = CollectUnsortedReadgroupBamQualityMetrics.quality_distribution_metrics - - File read_group_alignment_summary_metrics = CollectReadgroupBamQualityMetrics.alignment_summary_metrics - File read_group_gc_bias_detail_metrics = CollectReadgroupBamQualityMetrics.gc_bias_detail_metrics - File read_group_gc_bias_pdf = CollectReadgroupBamQualityMetrics.gc_bias_pdf - File read_group_gc_bias_summary_metrics = CollectReadgroupBamQualityMetrics.gc_bias_summary_metrics - - File? 
cross_check_fingerprints_metrics = CrossCheckFingerprints.cross_check_fingerprints_metrics
-
-    File selfSM = CheckContamination.selfSM
-    Float contamination = CheckContamination.contamination
-
-    File calculate_read_group_checksum_md5 = CalculateReadGroupChecksum.md5_file
-
-    File agg_alignment_summary_metrics = CollectAggregationMetrics.alignment_summary_metrics
-    File agg_bait_bias_detail_metrics = CollectAggregationMetrics.bait_bias_detail_metrics
-    File agg_bait_bias_summary_metrics = CollectAggregationMetrics.bait_bias_summary_metrics
-    File agg_gc_bias_summary_metrics = CollectAggregationMetrics.gc_bias_summary_metrics
-    File agg_gc_bias_detail_metrics = CollectAggregationMetrics.gc_bias_detail_metrics
-    File agg_gc_bias_pdf = CollectAggregationMetrics.gc_bias_pdf
-    File agg_insert_size_histogram_pdf = CollectAggregationMetrics.insert_size_histogram_pdf
-    File agg_insert_size_metrics = CollectAggregationMetrics.insert_size_metrics
-    File agg_pre_adapter_detail_metrics = CollectAggregationMetrics.pre_adapter_detail_metrics
-    File agg_pre_adapter_summary_metrics = CollectAggregationMetrics.pre_adapter_summary_metrics
-    File agg_quality_distribution_pdf = CollectAggregationMetrics.quality_distribution_pdf
-    File agg_quality_distribution_metrics = CollectAggregationMetrics.quality_distribution_metrics
-
-    File? fingerprint_summary_metrics = CheckFingerprint.summary_metrics
-    File? fingerprint_detail_metrics = CheckFingerprint.detail_metrics
-
-    File duplicate_metrics = MarkDuplicates.duplicate_metrics
-    File output_bqsr_reports = GatherBqsrReports.output_bqsr_report
-
-    File raw_wgs_metrics = CollectRawWgsMetrics.metrics
-    File wgs_metrics = CollectWgsMetrics.metrics
-
-    File output_bam = GatherBamFiles.output_bam
-    File? output_bam_index = GatherBamFiles.output_bam_index
-  }
-}
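Because to_bam_workflow is written to be imported rather than run on its own, a caller only needs an import statement and a call block. The sketch below is hypothetical and heavily trimmed: the caller workflow name, the ToBam alias, and the ".unmapped.bam" suffix are illustrative, and most of the sub-workflow's required inputs are deliberately left unwired; Cromwell (draft-2) will then expect them in the inputs JSON under keys such as CallerSketch.to_bam_workflow.ref_fasta.

import "./tasks_pipelines/unmapped_bam_to_aligned_bam.wdl" as ToBam

workflow CallerSketch {
  String sample_name
  String base_file_name
  Array[File] flowcell_unmapped_bams

  call ToBam.to_bam_workflow {
    input:
      sample_name = sample_name,
      base_file_name = base_file_name,
      flowcell_unmapped_bams = flowcell_unmapped_bams,
      unmapped_bam_suffix = ".unmapped.bam"
  }

  output {
    File analysis_ready_bam = to_bam_workflow.output_bam
    File? analysis_ready_bam_index = to_bam_workflow.output_bam_index
    File duplicate_metrics = to_bam_workflow.duplicate_metrics
  }
}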