#!/usr/bin/env nextflow
/*
* @authors
* Ferriol Calvet <ferriol.calvet@crg.eu>
* Emilio Righi <emilio.righi@crg.eu>
*/
/*
 * Input parameters: genome, protein evidence, parameter file,
 * and additional values for the generation of the parameter file.
 * Params are stored in the params.config file.
 *
 * The configuration is in the nextflow.config file.
 */
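/*
 * For orientation, a minimal params sketch is shown below. This is an
 * illustrative assumption, not the actual params.config: the param names are
 * taken from how they are used in this script, while the values and column
 * names are hypothetical.
 *
 * params {
 *     tsv                    = 'assemblies.tsv'
 *     outdir                 = 'results'
 *     assemblies_dir         = 'assemblies'
 *     selenoprofile          = 'machinery'          // hypothetical profile set
 *     column_id_value        = 'assembly_accession' // hypothetical column names
 *     column_taxid_value     = 'taxid'
 *     column_scientific_name = 'scientific_name'
 *     column_path_value      = 'fasta_path'
 * }
 */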
nextflow.enable.dsl=2
process PARSE_FASTA_HEADER {
    tag "${id}"

    input:
    tuple val(id), val(taxid), val(scientific_name), path(fasta)

    output:
    tuple val(id), val(taxid), val(scientific_name), path(out_fasta)

    script:
    out_fasta = "parsed_${fasta.baseName}.fa"
    """
    #!/usr/bin/env python3
    # Rewrite ENA-style headers ('>ENA|accession|accession.version ...') so
    # that only the versioned accession is kept; copy everything else as-is.
    with open("${fasta}", 'r') as infile, open("${out_fasta}", 'w') as outfile:
        for line in infile:
            if line.startswith('>'):
                header = line.strip()
                if 'ENA' in header:
                    parts = header.split('|')
                    if len(parts) > 2:
                        new_header = f'>{parts[-1].split()[0]}\\n'
                        outfile.write(new_header)
                    else:
                        outfile.write(header + '\\n')
                else:
                    outfile.write(header + '\\n')
            else:
                outfile.write(line)
    """
}
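/*
 * Illustrative example of the rewrite above (hypothetical accession):
 *   >ENA|OU123456|OU123456.1 Homo sapiens chromosome 1   becomes   >OU123456.1
 * Headers without the ENA '|' layout, and all sequence lines, pass through
 * unchanged.
 */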
process UNZIP_FASTA {
    tag "${id}"

    input:
    tuple val(id), val(taxid), val(scientific_name), path(genome)

    output:
    tuple val(id), val(taxid), val(scientific_name), path(unzipped_genome)

    script:
    unzipped_genome = "${id}_unzipped.fa"
    """
    gunzip -c ${genome} > ${unzipped_genome}
    """
}
process RUN_SELENOPROFILES {
    container "maxtico/container_selenoprofiles:latest"
    publishDir "${params.outdir}", mode: 'copy'
    tag "${id}"

    input:
    tuple val(id), val(taxid), val(scientific_name), path(genome)

    output:
    path("${taxid}/*", type: 'dir')

    script:
    """
    selenoprofiles -o ./${taxid} -t ${genome} -s "${scientific_name}" -p ${params.selenoprofile}
    """
}
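/*
 * Because of publishDir, each per-taxid selenoprofiles output directory is
 * copied into ${params.outdir}; e.g. (hypothetical) results/9606/ for a human
 * assembly when outdir is 'results'.
 */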
/*
 * Note: this process is declared but never invoked in the workflow below.
 * The output path assumes the standard layout of an NCBI `datasets` archive
 * (ncbi_dataset/data/<accession>/genomic.gff) after unzipping.
 */
process DOWNLOAD_NCBI_ANNOTATION {
    container "datasets"
    tag "${id}"

    input:
    tuple val(id), val(taxid), val(scientific_name), path(fasta)

    output:
    tuple val(id), path(gff_path)

    script:
    gff_path = "ncbi_dataset/data/${id}/genomic.gff"
    """
    # assumes the `datasets` CLI is on the container's PATH
    datasets download genome accession ${id} --include gff3 --filename ${id}.zip
    unzip -o ${id}.zip
    """
}
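/*
 * The workflow below expects a tab-separated samplesheet; the column names
 * are not fixed here but resolved through params.column_*. A sketch with
 * assumed column names and hypothetical values:
 *
 *   assembly_accession   taxid   scientific_name   fasta_path
 *   GCA_000001405.29     9606    Homo sapiens      /data/GRCh38.fa.gz
 */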
workflow {
    if (!params.tsv) {
        exit 1, 'TSV samplesheet not specified!'
    }

    // Stage each assembly as '<id>-<taxid>-<scientific name>-.fa.gz' so that
    // the metadata can be recovered from the file name, then separate valid
    // from invalid (empty) FASTA files.
    tsv = channel.fromPath(params.tsv)
        .splitCsv(sep: '\t', header: true)
        .collectFile(storeDir: params.assemblies_dir) { row ->
            ["${row[params.column_id_value]}-${row[params.column_taxid_value]}-${row[params.column_scientific_name]}-.fa.gz",
             file(row[params.column_path_value])]
        }
        .map { it ->
            def elements = it.baseName.tokenize('-')
            tuple(elements[0], elements[1], elements[2], it)
        }
        .branch {
            validFasta: it[3].countFasta() > 0
            invalidFasta: true
        }

    // A channel's count() returns a channel, not a number, so it cannot be
    // used as an `if` condition; filtering the count instead only emits the
    // message when at least one invalid assembly exists.
    invalid_fasta = tsv.invalidFasta
    invalid_fasta.count()
        .filter { it > 0 }
        .view { it -> "A total of ${it} invalid assemblies have been found" }
    invalid_fasta
        .collectFile(storeDir: params.outdir) { item ->
            ["INVALID_ASSEMBLIES.txt", "${item[0]}\t${item[1]}\t${item[2]}" + '\n']
        }
        .subscribe {
            println "Invalid assemblies have been saved in: $it"
        }

    assemblies = tsv.validFasta
    results = UNZIP_FASTA(assemblies) | PARSE_FASTA_HEADER | RUN_SELENOPROFILES
}
workflow.onComplete {
    println("\nDone!\n")
}
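/*
 * A hypothetical invocation, assuming the params sketched at the top of this
 * file:
 *   nextflow run main.nf --tsv assemblies.tsv --outdir results
 */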