-
Notifications
You must be signed in to change notification settings - Fork 2
/
cnv_common_tasks.wdl
291 lines (241 loc) · 8.56 KB
/
cnv_common_tasks.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
#tag gatk4.0.1.2 release
# Runs GATK PreprocessIntervals to produce a padded/binned interval list.
#
# Inputs:
#   intervals          - optional interval list passed via -L; when absent the
#                        output is named with the "wgs" prefix instead
#   ref_fasta/_dict    - reference and its sequence dictionary (passed to the tool)
#   ref_fasta_fai      - declared but not referenced in the command
#                        (presumably localized next to ref_fasta - confirm)
#   padding            - passed to --padding (default 250)
#   bin_length         - passed to --bin-length (default 1000)
#   gatk4_jar_override - optional jar exported as GATK_LOCAL_JAR
#                        (default /root/gatk.jar)
# Output:
#   preprocessed_intervals - "<base_filename>.preprocessed.interval_list"
task PreprocessIntervals {
    File? intervals
    File ref_fasta
    File ref_fasta_fai
    File ref_fasta_dict
    Int? padding
    Int? bin_length
    File? gatk4_jar_override

    # Runtime parameters
    String gatk_docker
    Int? mem_gb
    Int? disk_space_gb
    Boolean use_ssd = false
    Int? cpu
    Int? preemptible_attempts

    # Machine memory defaults to 2 GB; the JVM heap is kept 500 MB below it.
    Int machine_mem_mb = select_first([mem_gb, 2]) * 1000
    Int command_mem_mb = machine_mem_mb - 500

    # Output name prefix: the intervals file name without ".interval_list",
    # or "wgs" when no intervals file was supplied.
    String filename = select_first([intervals, "wgs"])
    String base_filename = basename(filename, ".interval_list")

    command <<<
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}

        gatk --java-options "-Xmx${command_mem_mb}m" PreprocessIntervals \
            ${"-L " + intervals} \
            --sequence-dictionary ${ref_fasta_dict} \
            --reference ${ref_fasta} \
            --padding ${default="250" padding} \
            --bin-length ${default="1000" bin_length} \
            --interval-merging-rule OVERLAPPING_ONLY \
            --output ${base_filename}.preprocessed.interval_list
    >>>

    runtime {
        docker: gatk_docker
        memory: "${machine_mem_mb} MB"
        disks: "local-disk " + select_first([disk_space_gb, 40]) + (if use_ssd then " SSD" else " HDD")
        cpu: select_first([cpu, 1])
        preemptible: select_first([preemptible_attempts, 5])
    }

    output {
        File preprocessed_intervals = base_filename + ".preprocessed.interval_list"
    }
}
# Runs GATK AnnotateIntervals on an interval list against a reference.
#
# Inputs:
#   intervals          - interval list passed via -L
#   ref_fasta          - reference passed via --reference
#   ref_fasta_fai/dict - declared but not referenced in the command
#                        (presumably localized next to ref_fasta - confirm)
#   gatk4_jar_override - optional jar exported as GATK_LOCAL_JAR
#                        (default /root/gatk.jar)
# Output:
#   annotated_intervals - the fixed-name file "annotated_intervals.tsv"
task AnnotateIntervals {
    File intervals
    File ref_fasta
    File ref_fasta_fai
    File ref_fasta_dict
    File? gatk4_jar_override

    # Runtime parameters
    String gatk_docker
    Int? mem_gb
    Int? disk_space_gb
    Boolean use_ssd = false
    Int? cpu
    Int? preemptible_attempts

    # Machine memory defaults to 2 GB; the JVM heap is kept 500 MB below it.
    Int machine_mem_mb = select_first([mem_gb, 2]) * 1000
    Int command_mem_mb = machine_mem_mb - 500

    command <<<
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}

        gatk --java-options "-Xmx${command_mem_mb}m" AnnotateIntervals \
            -L ${intervals} \
            --reference ${ref_fasta} \
            --interval-merging-rule OVERLAPPING_ONLY \
            --output annotated_intervals.tsv
    >>>

    runtime {
        docker: gatk_docker
        memory: "${machine_mem_mb} MB"
        # Default disk: reference size rounded up to whole GB, plus 50 GB.
        disks: "local-disk " + select_first([disk_space_gb, ceil(size(ref_fasta, "GB")) + 50]) + (if use_ssd then " SSD" else " HDD")
        cpu: select_first([cpu, 1])
        preemptible: select_first([preemptible_attempts, 5])
    }

    output {
        File annotated_intervals = "annotated_intervals.tsv"
    }
}
# Runs GATK CollectFragmentCounts on a BAM over the given intervals.
#
# Inputs:
#   intervals          - interval list passed via -L
#   bam                - input BAM; its name without ".bam" becomes entity_id
#   bam_idx            - declared but not referenced in the command
#                        (presumably localized next to the BAM - confirm)
#   ref_fasta          - reference passed via --reference
#   ref_fasta_fai/dict - declared but not referenced in the command
#   format             - value for --format; "HDF5" is used when unset
#   gatk4_jar_override - optional jar exported as GATK_LOCAL_JAR
#                        (default /root/gatk.jar)
# Outputs:
#   entity_id - BAM base name
#   counts    - "<entity_id>.counts.hdf5" when the effective format is HDF5,
#               otherwise "<entity_id>.counts.tsv"
task CollectCounts {
    File intervals
    File bam
    File bam_idx
    File ref_fasta
    File ref_fasta_fai
    File ref_fasta_dict
    String? format
    File? gatk4_jar_override

    # Runtime parameters
    String gatk_docker
    Int? mem_gb
    Int? disk_space_gb
    Boolean use_ssd = false
    Int? cpu
    Int? preemptible_attempts

    # Machine memory defaults to 7 GB; the JVM heap is kept 1000 MB below it.
    Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
    Int command_mem_mb = machine_mem_mb - 1000

    # Sample name is derived from the bam filename
    String base_filename = basename(bam, ".bam")

    # Choose the extension from the *effective* format (the same default the
    # command uses), not from whether the input was supplied: an explicit
    # format = "HDF5" must still yield a ".hdf5" file name.
    String counts_filename = if (select_first([format, "HDF5"]) == "HDF5") then "${base_filename}.counts.hdf5" else "${base_filename}.counts.tsv"

    command <<<
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}

        gatk --java-options "-Xmx${command_mem_mb}m" CollectFragmentCounts \
            -L ${intervals} \
            --input ${bam} \
            --reference ${ref_fasta} \
            --format ${default="HDF5" format} \
            --interval-merging-rule OVERLAPPING_ONLY \
            --output ${counts_filename}
    >>>

    runtime {
        docker: "${gatk_docker}"
        memory: machine_mem_mb + " MB"
        # Default disk: BAM size rounded up to whole GB, plus 50 GB.
        disks: "local-disk " + select_first([disk_space_gb, ceil(size(bam, "GB")) + 50]) + if use_ssd then " SSD" else " HDD"
        cpu: select_first([cpu, 1])
        preemptible: select_first([preemptible_attempts, 5])
    }

    output {
        String entity_id = base_filename
        File counts = counts_filename
    }
}
# Runs GATK CollectAllelicCounts on a BAM at the given common sites.
#
# Inputs:
#   common_sites         - sites file passed via -L
#   bam                  - input BAM; its name without ".bam" becomes entity_id
#   bam_idx              - declared but not referenced in the command
#                          (presumably localized next to the BAM - confirm)
#   ref_fasta            - reference passed via --reference
#   ref_fasta_fai/dict   - declared but not referenced in the command
#   minimum_base_quality - passed to --minimum-base-quality (default 20)
#   gatk4_jar_override   - optional jar exported as GATK_LOCAL_JAR
#                          (default /root/gatk.jar)
# Outputs:
#   entity_id      - BAM base name
#   allelic_counts - "<entity_id>.allelicCounts.tsv"
task CollectAllelicCounts {
    File common_sites
    File bam
    File bam_idx
    File ref_fasta
    File ref_fasta_fai
    File ref_fasta_dict
    Int? minimum_base_quality
    File? gatk4_jar_override

    # Runtime parameters
    String gatk_docker
    Int? mem_gb
    Int? disk_space_gb
    Boolean use_ssd = false
    Int? cpu
    Int? preemptible_attempts

    # Machine memory defaults to 13 GB; the JVM heap is kept 1000 MB below it.
    Int machine_mem_mb = select_first([mem_gb, 13]) * 1000
    Int command_mem_mb = machine_mem_mb - 1000

    # The BAM file name (minus ".bam") doubles as the sample name.
    String base_filename = basename(bam, ".bam")
    String allelic_counts_filename = base_filename + ".allelicCounts.tsv"

    command <<<
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}

        gatk --java-options "-Xmx${command_mem_mb}m" CollectAllelicCounts \
            -L ${common_sites} \
            --input ${bam} \
            --reference ${ref_fasta} \
            --minimum-base-quality ${default="20" minimum_base_quality} \
            --output ${allelic_counts_filename}
    >>>

    runtime {
        docker: gatk_docker
        memory: "${machine_mem_mb} MB"
        # Default disk: BAM size rounded up to whole GB, plus 50 GB.
        disks: "local-disk " + select_first([disk_space_gb, ceil(size(bam, "GB")) + 50]) + (if use_ssd then " SSD" else " HDD")
        cpu: select_first([cpu, 1])
        preemptible: select_first([preemptible_attempts, 5])
    }

    output {
        String entity_id = base_filename
        File allelic_counts = allelic_counts_filename
    }
}
# Splits an interval list into shards of at most num_intervals_per_scatter
# intervals each, copying the original header onto every shard.
#
# Inputs:
#   interval_list             - interval list to split; its name without
#                               ".interval_list" prefixes the shard names
#   num_intervals_per_scatter - maximum number of interval lines per shard
# Output:
#   scattered_interval_lists - every "<base_filename>.scattered.*.interval_list"
#                              file produced by the command
task ScatterIntervals {
    File interval_list
    Int num_intervals_per_scatter

    # Runtime parameters
    String gatk_docker
    Int? mem_gb
    Int? disk_space_gb
    Boolean use_ssd = false
    Int? cpu
    Int? preemptible_attempts

    # Machine memory defaults to 2 GB.
    Int machine_mem_mb = select_first([mem_gb, 2]) * 1000

    String base_filename = basename(interval_list, ".interval_list")

    command <<<
        set -e

        # Header lines are the ones that START with "@". The patterns are
        # anchored so an interval record that merely contains "@" (e.g. in its
        # name column) stays with the intervals instead of leaking into the
        # header and vanishing from the shards.
        grep '^@' ${interval_list} > header.txt
        grep -v '^@' ${interval_list} > all_intervals.txt

        split -l ${num_intervals_per_scatter} --numeric-suffixes all_intervals.txt ${base_filename}.scattered.

        # Prepend the shared header to each shard. The glob is expanded once,
        # before the loop starts, so freshly written outputs are not revisited.
        for i in ${base_filename}.scattered.*; do cat header.txt "$i" > "$i.interval_list"; done
    >>>

    runtime {
        docker: "${gatk_docker}"
        memory: machine_mem_mb + " MB"
        disks: "local-disk " + select_first([disk_space_gb, 40]) + if use_ssd then " SSD" else " HDD"
        cpu: select_first([cpu, 1])
        preemptible: select_first([preemptible_attempts, 5])
    }

    output {
        Array[File] scattered_interval_lists = glob("${base_filename}.scattered.*.interval_list")
    }
}
# Unpacks per-chunk tarballs and runs GATK PostprocessGermlineCNVCalls for one
# sample, writing a gzipped VCF.
#
# Inputs:
#   entity_id          - names the output: "<entity_id>.vcf.gz"
#   chunk_path_tars    - tarballs, each extracted into its own CHUNK_<n>
#                        directory (n = position in the array, starting at 0)
#   sample_index       - used to build the sample directory "SAMPLE_<sample_index>"
#   gatk4_jar_override - optional jar exported as GATK_LOCAL_JAR
#                        (default /root/gatk.jar)
# Output:
#   vcf - "<entity_id>.vcf.gz"
task PostprocessGermlineCNVCalls {
    String entity_id
    Array[File] chunk_path_tars
    String sample_index
    File? gatk4_jar_override

    # Runtime parameters
    String gatk_docker
    Int? mem_gb
    Int? disk_space_gb
    Boolean use_ssd = false
    Int? cpu
    Int? preemptible_attempts

    # Machine memory defaults to 7 GB; the JVM heap is kept 1000 MB below it.
    Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
    Int command_mem_mb = machine_mem_mb - 1000

    String sample_directory = "SAMPLE_${sample_index}"   #this is a hardcoded convention in gcnvkernel
    String vcf_filename = entity_id + ".vcf.gz"
    String dollar = "$"     #WDL workaround for using array[@], see https://github.com/broadinstitute/cromwell/issues/1819

    command <<<
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}

        # Extract every tarball into CHUNK_0, CHUNK_1, ... and accumulate the
        # matching "--chunk-path CHUNK_<n>" arguments for the gatk call below.
        tarballs=(${sep=" " chunk_path_tars})
        chunk_args=""
        for idx in ${dollar}{!tarballs[@]}; do
            tarball=${dollar}{tarballs[$idx]}
            mkdir CHUNK_$idx
            tar xzf $tarball -C CHUNK_$idx
            chunk_args="$chunk_args --chunk-path CHUNK_$idx"
        done

        gatk --java-options "-Xmx${command_mem_mb}m" PostprocessGermlineCNVCalls \
            $chunk_args \
            --sample-directory ${sample_directory} \
            --output ${vcf_filename}
    >>>

    runtime {
        docker: gatk_docker
        memory: "${machine_mem_mb} MB"
        disks: "local-disk " + select_first([disk_space_gb, 40]) + (if use_ssd then " SSD" else " HDD")
        cpu: select_first([cpu, 1])
        preemptible: select_first([preemptible_attempts, 5])
    }

    output {
        File vcf = vcf_filename
    }
}