Merge branch 'dev'

gatk-workflows · Aug 2, 2019 · 498c549 · 498c549
2 parents 2f353ae + dcd8214
commit 498c549
Show file tree

Hide file tree

Showing 21 changed files with 2,567 additions and 2,447 deletions.
diff --git a/README.md b/README.md
@@ -1,14 +1,11 @@
 # five-dollar-genome-analysis-pipeline
 Workflows used for germline short variant discovery in WGS data
+
 ### germline_single_sample_workflow :
 This WDL pipeline implements data pre-processing and initial variant calling (GVCF
 generation) according to the GATK Best Practices (June 2016) for germline SNP and
 Indel discovery in human whole-genome sequencing data.
 
-Note: For those users interested in running this wdl on FireCloud (FC), the FC
-version has been provided as fc_germline_single_sample_workflow.wdl. Please visit the 
-FC featured methods and workspaces for more GATK Best Practices pipelines.
-
 #### Requirements/expectations
 - Human whole-genome paired-end sequencing data in unmapped BAM (uBAM) format
 - One or more read groups, one per uBAM file, all belonging to a single sample (SM)
@@ -18,13 +15,35 @@ FC featured methods and workspaces for more GATK Best Practices pipelines.
 - - reads are provided in query-sorted order
 - - all reads must have an RG tag
 - Reference genome must be Hg38 with ALT contigs
+
 #### Outputs 
 - Cram, cram index, and cram md5 
 - GVCF and its gvcf index 
 - BQSR Report
 - Several Summary Metrics 
 
 ### Software version requirements :
-Cromwell version support 
-- Successfully tested on v30.2
-- Does not work on versions < v23 due to output syntax
+- GATK 4.0.10.1
+- Picard 2.16.0-SNAPSHOT
+- Samtools 1.3.1
+- Python 2.7
+- Cromwell version support 
+  - Successfully tested on v37
+  - Does not work on versions < v23 due to output syntax
+
+### Important Note :
+- The provided JSON is meant to be a ready-to-use example JSON template of the workflow. It is the user’s responsibility to correctly set the reference and resource input variables using the [GATK Tool and Tutorial Documentation](https://software.broadinstitute.org/gatk/documentation/).
+- Relevant reference and resource bundles can be accessed from the [Resource Bundle](https://software.broadinstitute.org/gatk/download/bundle).
+- Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
+- For help running workflows on the Google Cloud Platform or locally please
+view the following tutorial [(How to) Execute Workflows from the gatk-workflows Git Organization](https://software.broadinstitute.org/gatk/documentation/article?id=12521).
+- The following material is provided by the GATK Team. Please post any questions or concerns to one of our forum sites: [GATK](https://gatkforums.broadinstitute.org/gatk/categories/ask-the-team/), [FireCloud](https://gatkforums.broadinstitute.org/firecloud/categories/ask-the-firecloud-team) or [Terra](https://broadinstitute.zendesk.com/hc/en-us/community/topics/360000500432-General-Discussion), or [WDL/Cromwell](https://gatkforums.broadinstitute.org/wdl/categories/ask-the-wdl-team).
+- Please visit the [User Guide](https://software.broadinstitute.org/gatk/documentation/) site for further documentation on our workflows and tools.
+
+### LICENSING :
+Copyright Broad Institute, 2019 | BSD-3
+This script is released under the WDL open source code license (BSD-3) (full license text at https://github.com/openwdl/wdl/blob/master/LICENSE). Note however that the programs it calls may be subject to different licenses. Users are responsible for checking that they are authorized to run all programs before running this script.
+- [GATK](https://software.broadinstitute.org/gatk/download/licensing.php)
+- [BWA](http://bio-bwa.sourceforge.net/bwa.shtml#13)
+- [Picard](https://broadinstitute.github.io/picard/)
+- [Samtools](http://www.htslib.org/terms/)
diff --git a/WholeGenomeGermlineSingleSample.hg38.inputs.json b/WholeGenomeGermlineSingleSample.hg38.inputs.json
@@ -0,0 +1,53 @@
+{
+  "WholeGenomeGermlineSingleSample.sample_and_unmapped_bams": {
+    "sample_name": "NA12878 PLUMBING",
+    "base_file_name": "NA12878_PLUMBING",
+    "flowcell_unmapped_bams": [
+    "gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06HDADXX130110.1.ATCACGAT.20k_reads.bam",
+    "gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06HDADXX130110.2.ATCACGAT.20k_reads.bam",
+    "gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06JUADXX130110.1.ATCACGAT.20k_reads.bam"
+    ],
+    "final_gvcf_base_name": "NA12878_PLUMBING",
+    "unmapped_bam_suffix": ".bam"
+  },
+
+  "WholeGenomeGermlineSingleSample.references": {
+    "fingerprint_genotypes_file": "gs://dsde-data-na12878-public/NA12878.hg38.reference.fingerprint.vcf",
+    "fingerprint_genotypes_index": "gs://dsde-data-na12878-public/NA12878.hg38.reference.fingerprint.vcf.idx",
+    "contamination_sites_ud": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.contam.UD",
+    "contamination_sites_bed": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.contam.bed",
+    "contamination_sites_mu": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.contam.mu",
+    "calling_interval_list": "gs://broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list",
+    "haplotype_scatter_count": 10,
+    "break_bands_at_multiples_of": 100000,
+    "reference_fasta" : {
+        "ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict",
+        "ref_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta",
+        "ref_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai",
+        "ref_alt": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt",
+        "ref_sa": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa",
+        "ref_amb": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb",
+        "ref_bwt": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt",
+        "ref_ann": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann",
+        "ref_pac": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac"
+    },
+    "known_indels_sites_vcfs": [
+      "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
+      "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz"
+    ],
+    "known_indels_sites_indices": [
+      "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi",
+      "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi"
+    ],
+    "dbsnp_vcf": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf",
+    "dbsnp_vcf_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx",
+    "evaluation_interval_list": "gs://broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list"
+  },
+
+  "WholeGenomeGermlineSingleSample.wgs_coverage_interval_list": "gs://broad-references/hg38/v0/wgs_coverage_regions.hg38.interval_list",
+
+  "WholeGenomeGermlineSingleSample.papi_settings": {
+    "preemptible_tries": 3,
+    "agg_preemptible_tries": 3
+  }
+}
diff --git a/WholeGenomeGermlineSingleSample.wdl b/WholeGenomeGermlineSingleSample.wdl
@@ -0,0 +1,219 @@
+version 1.0
+
+## Copyright Broad Institute, 2018
+##
+## This WDL pipeline implements data pre-processing and initial variant calling (GVCF
+## generation) according to the GATK Best Practices (June 2016) for germline SNP and
+## Indel discovery in human whole-genome data.
+##
+## Requirements/expectations :
+## - Human whole-genome paired-end sequencing data in unmapped BAM (uBAM) format
+## - One or more read groups, one per uBAM file, all belonging to a single sample (SM)
+## - Input uBAM files must additionally comply with the following requirements:
+## - - filenames all have the same suffix (we use ".unmapped.bam")
+## - - files must pass validation by ValidateSamFile
+## - - reads are provided in query-sorted order
+## - - all reads must have an RG tag
+## - GVCF output names must end in ".g.vcf.gz"
+## - Reference genome must be Hg38 with ALT contigs
+##
+## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
+## For program versions, see docker containers.
+##
+## LICENSING :
+## This script is released under the WDL source code license (BSD-3) (see LICENSE in
+## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
+## be subject to different licenses. Users are responsible for checking that they are
+## authorized to run all programs before running this script. Please see the docker
+## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
+## licensing information pertaining to the included programs.
+
+# Local import
+#import "../../../../pipelines/dna_seq/UnmappedBamToAlignedBam.wdl" as ToBam
+#import "../../../../tasks/AggregatedBamQC.wdl" as AggregatedQC
+#import "../../../../tasks/GermlineVariantDiscovery.wdl" as Calling
+#import "../../../../tasks/Qc.wdl" as QC
+#import "../../../../tasks/Utilities.wdl" as Utils
+#import "../../../../tasks/BamToCram.wdl" as ToCram
+#import "../../../../tasks/VariantCalling.wdl" as ToGvcf
+#import "../../../../structs/dna_seq/germline/GermlineStructs.wdl"
+
+# Git URL import
+import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/UnmappedBamToAlignedBam.wdl" as ToBam
+import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/AggregatedBamQC.wdl" as AggregatedQC
+import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/GermlineVariantDiscovery.wdl" as Calling
+import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Qc.wdl" as QC
+import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/Utilities.wdl" as Utils
+import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/BamToCram.wdl" as ToCram
+import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/tasks/VariantCalling.wdl" as ToGvcf
+import "https://raw.githubusercontent.com/gatk-workflows/five-dollar-genome-analysis-pipeline/1.2.0/structs/GermlineStructs.wdl"
+
+# WORKFLOW DEFINITION: pre-processes one sample's unmapped BAMs, QCs the resulting BAM, converts it to CRAM and calls a GVCF
+workflow WholeGenomeGermlineSingleSample {
+  input {
+    SampleAndUnmappedBams sample_and_unmapped_bams  # struct type; presumably declared in the imported GermlineStructs.wdl (TODO confirm)
+    GermlineSingleSampleReferences references  # struct type; presumably declared in the imported GermlineStructs.wdl (TODO confirm)
+    PapiSettings papi_settings  # agg_preemptible_tries is read from this below; the whole struct is also handed to the sub-workflows
+    File wgs_coverage_interval_list  # passed to both CollectWgsMetrics and CollectRawWgsMetrics
+
+    File? haplotype_database_file  # optional; forwarded unchanged to UnmappedBamToAlignedBam and AggregatedBamQC
+    Boolean provide_bam_output = false  # when true, the BAM and its index are also exposed as workflow outputs
+    Boolean use_gatk3_haplotype_caller = true  # forwarded to the VariantCalling call
+  }
+
+  # Not overridable: declared outside the input block, so these values are fixed by the workflow rather than set from the inputs JSON
+  Int read_length = 250  # passed to CollectWgsMetrics and CollectRawWgsMetrics
+  Float lod_threshold = -20.0  # passed to UnmappedBamToAlignedBam
+  String cross_check_fingerprints_by = "READGROUP"  # passed to UnmappedBamToAlignedBam
+  String recalibrated_bam_basename = sample_and_unmapped_bams.base_file_name + ".aligned.duplicates_marked.recalibrated"  # shared by UnmappedBamToAlignedBam and AggregatedBamQC
+
+  call ToBam.UnmappedBamToAlignedBam {  # its output_bam is the input to every call below
+    input:
+      sample_and_unmapped_bams    = sample_and_unmapped_bams,
+      references                  = references,
+      papi_settings               = papi_settings,
+
+      cross_check_fingerprints_by = cross_check_fingerprints_by,
+      haplotype_database_file     = haplotype_database_file,
+      lod_threshold               = lod_threshold,
+      recalibrated_bam_basename   = recalibrated_bam_basename
+  }
+
+  call AggregatedQC.AggregatedBamQC {  # aggregate QC over the BAM and index produced by UnmappedBamToAlignedBam
+    input:
+      base_recalibrated_bam = UnmappedBamToAlignedBam.output_bam,
+      base_recalibrated_bam_index = UnmappedBamToAlignedBam.output_bam_index,
+      base_name = sample_and_unmapped_bams.base_file_name,
+      sample_name = sample_and_unmapped_bams.sample_name,
+      recalibrated_bam_base_name = recalibrated_bam_basename,
+      haplotype_database_file = haplotype_database_file,
+      references = references,
+      papi_settings = papi_settings
+  }
+
+  call ToCram.BamToCram as BamToCram {  # yields the CRAM, its index, its md5 and a CRAM validation report (see outputs)
+    input:
+      input_bam = UnmappedBamToAlignedBam.output_bam,
+      ref_fasta = references.reference_fasta.ref_fasta,
+      ref_fasta_index = references.reference_fasta.ref_fasta_index,
+      ref_dict = references.reference_fasta.ref_dict,
+      duplication_metrics = UnmappedBamToAlignedBam.duplicate_metrics,
+      chimerism_metrics = AggregatedBamQC.agg_alignment_summary_metrics,  # chimerism input is taken from the aggregate alignment summary
+      base_file_name = sample_and_unmapped_bams.base_file_name,
+      agg_preemptible_tries = papi_settings.agg_preemptible_tries
+  }
+
+  # QC the sample WGS metrics (stringent thresholds)
+  call QC.CollectWgsMetrics as CollectWgsMetrics {
+    input:
+      input_bam = UnmappedBamToAlignedBam.output_bam,
+      input_bam_index = UnmappedBamToAlignedBam.output_bam_index,
+      metrics_filename = sample_and_unmapped_bams.base_file_name + ".wgs_metrics",
+      ref_fasta = references.reference_fasta.ref_fasta,
+      ref_fasta_index = references.reference_fasta.ref_fasta_index,
+      wgs_coverage_interval_list = wgs_coverage_interval_list,
+      read_length = read_length,
+      preemptible_tries = papi_settings.agg_preemptible_tries  # note: the agg_ setting is what is supplied here
+  }
+
+  # QC the sample raw WGS metrics (common thresholds)
+  call QC.CollectRawWgsMetrics as CollectRawWgsMetrics {
+    input:
+      input_bam = UnmappedBamToAlignedBam.output_bam,
+      input_bam_index = UnmappedBamToAlignedBam.output_bam_index,
+      metrics_filename = sample_and_unmapped_bams.base_file_name + ".raw_wgs_metrics",
+      ref_fasta = references.reference_fasta.ref_fasta,
+      ref_fasta_index = references.reference_fasta.ref_fasta_index,
+      wgs_coverage_interval_list = wgs_coverage_interval_list,
+      read_length = read_length,
+      preemptible_tries = papi_settings.agg_preemptible_tries  # note: the agg_ setting is what is supplied here
+  }
+
+  call ToGvcf.VariantCalling as BamToGvcf {  # yields output_vcf, its index and the VCF summary/detail metrics (see outputs)
+    input:
+      calling_interval_list = references.calling_interval_list,
+      evaluation_interval_list = references.evaluation_interval_list,
+      haplotype_scatter_count = references.haplotype_scatter_count,
+      break_bands_at_multiples_of = references.break_bands_at_multiples_of,
+      contamination = UnmappedBamToAlignedBam.contamination,
+      input_bam = UnmappedBamToAlignedBam.output_bam,
+      ref_fasta = references.reference_fasta.ref_fasta,
+      ref_fasta_index = references.reference_fasta.ref_fasta_index,
+      ref_dict = references.reference_fasta.ref_dict,
+      dbsnp_vcf = references.dbsnp_vcf,
+      dbsnp_vcf_index = references.dbsnp_vcf_index,
+      base_file_name = sample_and_unmapped_bams.base_file_name,
+      final_vcf_base_name = sample_and_unmapped_bams.final_gvcf_base_name,
+      agg_preemptible_tries = papi_settings.agg_preemptible_tries,
+      use_gatk3_haplotype_caller = use_gatk3_haplotype_caller
+  }
+
+  if (provide_bam_output) {  # values declared in this block are optional (File?) outside it and stay unset when the flag is false
+    File provided_output_bam = UnmappedBamToAlignedBam.output_bam
+    File provided_output_bam_index = UnmappedBamToAlignedBam.output_bam_index
+  }
+
+  # Outputs that will be retained when execution is complete
+  output {
+    Array[File] quality_yield_metrics = UnmappedBamToAlignedBam.quality_yield_metrics
+
+    Array[File] unsorted_read_group_base_distribution_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_pdf
+    Array[File] unsorted_read_group_base_distribution_by_cycle_metrics = UnmappedBamToAlignedBam.unsorted_read_group_base_distribution_by_cycle_metrics
+    Array[File] unsorted_read_group_insert_size_histogram_pdf = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_histogram_pdf
+    Array[File] unsorted_read_group_insert_size_metrics = UnmappedBamToAlignedBam.unsorted_read_group_insert_size_metrics
+    Array[File] unsorted_read_group_quality_by_cycle_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_pdf
+    Array[File] unsorted_read_group_quality_by_cycle_metrics = UnmappedBamToAlignedBam.unsorted_read_group_quality_by_cycle_metrics
+    Array[File] unsorted_read_group_quality_distribution_pdf = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_pdf
+    Array[File] unsorted_read_group_quality_distribution_metrics = UnmappedBamToAlignedBam.unsorted_read_group_quality_distribution_metrics
+
+    File read_group_alignment_summary_metrics = AggregatedBamQC.read_group_alignment_summary_metrics
+    File read_group_gc_bias_detail_metrics = AggregatedBamQC.read_group_gc_bias_detail_metrics
+    File read_group_gc_bias_pdf = AggregatedBamQC.read_group_gc_bias_pdf
+    File read_group_gc_bias_summary_metrics = AggregatedBamQC.read_group_gc_bias_summary_metrics
+
+    File? cross_check_fingerprints_metrics = UnmappedBamToAlignedBam.cross_check_fingerprints_metrics
+
+    File selfSM = UnmappedBamToAlignedBam.selfSM
+    Float contamination = UnmappedBamToAlignedBam.contamination  # same value that is fed to the BamToGvcf call
+
+    File calculate_read_group_checksum_md5 = AggregatedBamQC.calculate_read_group_checksum_md5
+
+    File agg_alignment_summary_metrics = AggregatedBamQC.agg_alignment_summary_metrics
+    File agg_bait_bias_detail_metrics = AggregatedBamQC.agg_bait_bias_detail_metrics
+    File agg_bait_bias_summary_metrics = AggregatedBamQC.agg_bait_bias_summary_metrics
+    File agg_gc_bias_detail_metrics = AggregatedBamQC.agg_gc_bias_detail_metrics
+    File agg_gc_bias_pdf = AggregatedBamQC.agg_gc_bias_pdf
+    File agg_gc_bias_summary_metrics = AggregatedBamQC.agg_gc_bias_summary_metrics
+    File agg_insert_size_histogram_pdf = AggregatedBamQC.agg_insert_size_histogram_pdf
+    File agg_insert_size_metrics = AggregatedBamQC.agg_insert_size_metrics
+    File agg_pre_adapter_detail_metrics = AggregatedBamQC.agg_pre_adapter_detail_metrics
+    File agg_pre_adapter_summary_metrics = AggregatedBamQC.agg_pre_adapter_summary_metrics
+    File agg_quality_distribution_pdf = AggregatedBamQC.agg_quality_distribution_pdf
+    File agg_quality_distribution_metrics = AggregatedBamQC.agg_quality_distribution_metrics
+    File agg_error_summary_metrics = AggregatedBamQC.agg_error_summary_metrics
+
+    File? fingerprint_summary_metrics = AggregatedBamQC.fingerprint_summary_metrics
+    File? fingerprint_detail_metrics = AggregatedBamQC.fingerprint_detail_metrics
+
+    File wgs_metrics = CollectWgsMetrics.metrics
+    File raw_wgs_metrics = CollectRawWgsMetrics.metrics
+
+    File duplicate_metrics = UnmappedBamToAlignedBam.duplicate_metrics
+    File output_bqsr_reports = UnmappedBamToAlignedBam.output_bqsr_reports
+
+    File gvcf_summary_metrics = BamToGvcf.vcf_summary_metrics
+    File gvcf_detail_metrics = BamToGvcf.vcf_detail_metrics
+
+    File? output_bam = provided_output_bam  # defined only when provide_bam_output is true
+    File? output_bam_index = provided_output_bam_index  # defined only when provide_bam_output is true
+
+    File output_cram = BamToCram.output_cram
+    File output_cram_index = BamToCram.output_cram_index
+    File output_cram_md5 = BamToCram.output_cram_md5
+
+    File validate_cram_file_report = BamToCram.validate_cram_file_report
+
+    File output_vcf = BamToGvcf.output_vcf
+    File output_vcf_index = BamToGvcf.output_vcf_index
+  }
+}