-
Notifications
You must be signed in to change notification settings - Fork 3
/
getSeqENA.py
executable file
·138 lines (102 loc) · 6.77 KB
/
getSeqENA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
getSeqENA.py - Get fastq files from ENA/SRA using Accession IDs
<https://github.com/B-UMMI/getSeqENA/>
Copyright (C) 2017 Miguel Machado <mpmachado@medicina.ulisboa.pt>
Last modified: December 06, 2017
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import argparse
import os
import download
import sys
import utils
import time
# Program version: shown by the --version option and passed to
# utils.general_information() when a run starts.
version = '1.3'
def requiredPrograms(args):
    """
    Check that the external programs needed for this run are available.

    A requirements table is assembled from the parsed options and handed to
    utils.checkPrograms(). Each entry maps a program name to a three-item
    list: version flag, comparison operator and reference version
    (presumably interpreted by utils.checkPrograms -- not visible here).

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line options; asperaKey, downloadCramBam, SRA and
        SRAopt are read.

    Raises
    ------
    SystemExit
        When utils.checkPrograms() returns a non-empty collection. The exit
        message is 'Errors:' followed by the returned entries, one per line.
    """
    # wget and gzip are requested for every run
    requirements = {
        'wget': ['--version', '>=', '1.12'],
        'gzip': ['--version', '>=', '1.6'],
    }

    # Aspera client is only needed when a private-key file was supplied
    if args.asperaKey is not None:
        requirements['ascp'] = ['--version', '>=', '3.6.1']

    # samtools is pinned to one exact version (note the '==' operator)
    if args.downloadCramBam:
        requirements['samtools'] = ['--version', '==', '1.3.1']

    # NOTE(review): the source indentation was lost; gawk is assumed to belong
    # to the SRA-only requirements together with prefetch and fastq-dump --
    # confirm against the upstream file.
    if args.SRA or args.SRAopt:
        requirements.update({
            'prefetch': ['--version', '>=', '2.8.2'],
            'fastq-dump': ['--version', '>=', '2.8.2'],
            'gawk': ['--version', '>=', '3.0.4'],
        })

    problems = utils.checkPrograms(requirements)
    if len(problems) == 0:
        return

    sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(problems))
def runGetSeqENA(args):
    """
    Download the reads for each listed accession ID and write a report.

    For every ID returned by utils.getListIDs() a sub-directory named after
    the ID is created inside the output directory, download.run_download()
    is called, and one tab-separated line is appended to
    'getSeqENA.report.txt'. The directory of a failed download is removed.
    When --maximumSamples is given, processing stops once that many IDs have
    been downloaded successfully.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line options (see main()).

    Raises
    ------
    SystemExit
        If no ID was downloaded successfully (or, via requiredPrograms(),
        if the program check fails).
    """
    start_time = time.time()

    listENA_IDs = utils.getListIDs(os.path.abspath(args.listENAids.name))

    outdir = os.path.abspath(args.outdir)
    utils.check_create_directory(outdir)

    # Only the path of the Aspera key is passed on, not the open file object
    asperaKey = None if args.asperaKey is None else os.path.abspath(args.asperaKey.name)

    # Start the logger and record general run information
    logfile = utils.start_logger(outdir)
    utils.general_information(logfile, version)

    # Check programs
    requiredPrograms(args)

    report_columns = ['run_accession', 'instrument_platform', 'instrument_model', 'library_layout', 'library_source', 'extra_run_accession', 'nominal_length', 'read_count', 'base_count', 'date_download']
    sample_limit = args.maximumSamples  # None means "no limit"
    downloaded = 0

    report_path = os.path.join(outdir, 'getSeqENA.report.txt')
    with open(report_path, 'wt') as report:
        report.write('\t'.join(['#sample'] + report_columns) + '\n')

        for ena_id in listENA_IDs:
            # Stop as soon as the requested number of successful downloads is reached
            if sample_limit is not None and downloaded >= sample_limit:
                break

            # Single-argument parenthesized print behaves the same as the
            # Python 2 print statement.
            print('\n' + 'Download ENA_ID ' + ena_id)

            sample_dir = os.path.join(outdir, ena_id)
            utils.check_create_directory(sample_dir)

            # run_download returns four values; only the success flag and the
            # sequencing information are used here
            _, succeeded, _, run_info = download.run_download(ena_id, args.downloadLibrariesType, asperaKey, sample_dir, args.downloadCramBam, args.threads, args.downloadInstrumentPlatform, args.SRA, args.SRAopt)

            if succeeded:
                downloaded += 1
            else:
                utils.removeDirectory(sample_dir)
                print(ena_id + ' was not downloaded')

            # A report line is written for failed downloads as well
            row = [ena_id] + [str(run_info[column]) for column in report_columns]
            report.write('\t'.join(row) + '\n')

    # Called for its effect only; the returned value is not used
    utils.runTime(start_time)

    if downloaded == 0:
        sys.exit('No ENA_IDs were successfully downloaded!')
def main():
    """
    Parse the command line and run the selected action.

    Builds the argparse parser, parses sys.argv and calls the function stored
    in args.func (runGetSeqENA by default).

    Changes with respect to the previous version:
      * the --downloadInstrumentPlatform help text described library layout
        (copied from --downloadLibrariesType); it now describes the
        instrument platform;
      * the files opened by argparse.FileType for --listENAids and
        --asperaKey are closed once the run ends. They are kept as file
        objects because runGetSeqENA reads their .name attribute.
    """
    parser = argparse.ArgumentParser(
        prog='getSeqENA.py',
        description="Get fastq files from ENA/SRA using Accession IDs",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information', action='version',
                        version='%(prog)s v' + version)

    parser_required = parser.add_argument_group('Required options')
    parser_required.add_argument('-l', '--listENAids', type=argparse.FileType('r'),
                                 metavar='/path/to/list/ENA_IDs.txt',
                                 help='Path to list containing the ENA_IDs to be downloaded',
                                 required=True)

    parser_optional = parser.add_argument_group('Facultative options')
    parser_optional.add_argument('-o', '--outdir', type=str, metavar='/output/directory/',
                                 help='Path for output directory', required=False, default='.')
    parser_optional.add_argument('-j', '--threads', type=int, metavar='N',
                                 help='Number of threads for cram/bam to fastq conversion',
                                 required=False, default=1)
    parser_optional.add_argument('-a', '--asperaKey', type=argparse.FileType('r'),
                                 metavar='/path/to/asperaweb_id_dsa.openssh',
                                 help='Tells getSeqENA.py to download fastq files from ENA using Aspera Connect. With this option, the path to Private-key file asperaweb_id_dsa.openssh is provided (normaly found in ~/.aspera/connect/etc/asperaweb_id_dsa.openssh).',
                                 required=False)
    parser_optional.add_argument('--downloadLibrariesType', type=str, metavar='PAIRED',
                                 help='Tells getSeqENA.py to download files with specific library layout',
                                 choices=['PAIRED', 'SINGLE', 'BOTH'], required=False, default='BOTH')
    parser_optional.add_argument('--downloadCramBam', action='store_true',
                                 help='Tells getSeqENA.py to also download cram/bam files and convert them to fastq files')
    # Help text fixed: this option selects the instrument platform, not the library layout
    parser_optional.add_argument('--downloadInstrumentPlatform', type=str, metavar='ILLUMINA',
                                 help='Tells getSeqENA.py to download files from a specific instrument platform',
                                 choices=['ILLUMINA', 'ALL'], required=False, default='ILLUMINA')
    parser_optional.add_argument('--maximumSamples', type=int, metavar='N',
                                 help='Tells getSeqENA.py to only download files for N samples',
                                 required=False)

    # --SRA and --SRAopt cannot be combined
    parser_optional_SRA = parser.add_mutually_exclusive_group()
    parser_optional_SRA.add_argument('--SRA', action='store_true',
                                     help='Tells getSeqENA.py to download reads in fastq format only from NCBI SRA database (not recommended)')
    parser_optional_SRA.add_argument('--SRAopt', action='store_true',
                                     help='Tells getSeqENA.py to download reads from NCBI SRA if the download from ENA fails')

    parser.set_defaults(func=runGetSeqENA)

    args = parser.parse_args()

    try:
        args.func(args)
    finally:
        # argparse.FileType opened these files while parsing; release them
        # even when the run ends through sys.exit() or an exception.
        for handle in (args.listENAids, args.asperaKey):
            if handle is not None:
                handle.close()
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()