-
Notifications
You must be signed in to change notification settings - Fork 1
/
contig-puller.py
executable file
·180 lines (165 loc) · 6.38 KB
/
contig-puller.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/env python3
# Script by JK
# Extracts targeted contigs from a multi-FASTA file eg. draft genome assembly
import subprocess
import os
import sys
import shutil
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Seq import UnknownSeq
import csv
# Usage
import argparse
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description='Extracts contigs containing a target sequence (eg. gene) from several multi-FASTA assemblies \n and aligns the contigs at the target sequence',
usage='\n %(prog)s --db DBFILE --out OUTFILE --anno ANNOFILE [OPTIONS] FASTA-1 FASTA-2 ... FASTA-N')
parser.add_argument('--db', metavar='DBFILE', required=True, help='target sequence (FASTA) to search for (REQUIRED to create BLAST database)')
parser.add_argument('--id', metavar='ID', default='100', help='percentage identity cutoff (default=100)')
parser.add_argument('--out', metavar='OUTFILE', default='contigs.gbk', help='output file (default=contigs.gbk)')
parser.add_argument('--anno', metavar='ANNOFILE', help='reference proteins.faa file to annotate from (optional | requires Prokka to be installed)')
parser.add_argument('--cpus', metavar='CPUS', default='8', help='number of cpus to use (default=8)')
#parser.add_argument('--log', metavar='LOGFILE', default='tempdb/temp.log', help='Log file')
parser.add_argument('assembly', metavar='FASTA', nargs='+', help='FASTA assemblies to search')
parser.add_argument('--version', action='version', version='v2.0')
args = parser.parse_args()
# Functions
def msg(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
def err(*args, **kwargs):
msg(*args, **kwargs)
if os.path.isdir('tempdb'):
shutil.rmtree('tempdb')
sys.exit(1);
def check_file(f):
if os.path.isfile(f) == False:
err('ERROR: Cannot find "{}". Check CFML output files exist in this directory.'.format(f))
def progexit():
shutil.rmtree('./tempdb')
banner()
msg('Done. Output written to {}.'.format(outfile))
banner()
sys.exit(0)
def banner():
msg('--------------------------')
return()
# Check output file
outfile = args.out
tmpout = 'tempdb/tmpout'
if (os.path.isfile(outfile)):
msg('ERROR: {} already exists. Please specify another output file.'.format(outfile))
sys.exit(1)
# Create local BLAST database
print('Creating BLAST database ...')
if os.path.isdir('tempdb'):
msg('ERROR: "tempdb" already exists. Please remove/rename this directory first.')
sys.exit(1)
subprocess.call(['mkdir', 'tempdb'])
subprocess.call(['makeblastdb', '-in', args.db, '-parse_seqids', '-dbtype', 'nucl', '-out', 'tempdb/db'])
banner()
# Run BLAST and find lengths of contigs and orientation/position of target gene
seqlist = []
seqplus = []
seqminus = []
contigs = []
genestart = []
geneend = []
for f in args.assembly:
check_file(f)
seqlist.append(f)
blastn_for = NcbiblastnCommandline(query=f, db='tempdb/db', word_size=32, strand='plus', dust='no', perc_identity=args.id, evalue=1E-99, outfmt=6, out='tempdb/bplus.txt')
blastn_for()
blastn_rev = NcbiblastnCommandline(query=f, db='tempdb/db', word_size=32, strand='minus', dust='no', perc_identity=args.id, evalue=1E-99, outfmt=6, out='tempdb/bminus.txt')
blastn_rev()
if os.stat('tempdb/bplus.txt').st_size > 0:
msg('Searching {} ...'.format(f))
bplus = list(csv.reader(open('tempdb/bplus.txt', 'r'), delimiter='\t'))
bplus.insert(0, f)
seqplus.append(bplus)
cont = str(bplus[1][0])
contigs.append(cont)
start = int(bplus[1][6])
end = int(bplus[1][7])
for s in SeqIO.parse(f, 'fasta'):
if s.id == cont:
msg("Found '{}' (+) ...".format(cont))
genestart.append(start - 1)
geneend.append(len(s) - end)
if os.stat('tempdb/bminus.txt').st_size > 0:
msg('Searching {} ...'.format(f))
bminus = list(csv.reader(open('tempdb/bminus.txt', 'r'), delimiter='\t'))
bminus.insert(0, f)
seqminus.append(bminus)
cont = str(bminus[1][0])
contigs.append(cont)
start = int(bminus[1][6])
end = int(bminus[1][7])
for s in SeqIO.parse(f, 'fasta'):
if s.id == cont:
msg("Found '{}' (-) ...".format(cont))
genestart.append(len(s) - end)
geneend.append(start - 1)
if not genestart or not geneend:
err('ERROR: No BLAST hits found. Try adjusting --id')
else:
maxstart = max(genestart)
maxend = max(geneend)
banner()
# Write target contigs to FASTA file
msg('Writing forward sequence matches ... ')
for row in seqplus:
seqname = row[0]
cont = row[1][0]
begin = int(row[1][6])
finish = int(row[1][7])
for s in SeqIO.parse(seqname, 'fasta'):
if s.id == cont:
b1len = maxstart - (begin - 1) # Determine start buffer
b2len = maxend - (len(s) - finish) # Determine end buffer
buff1 = UnknownSeq(b1len, character = 'N')
buff2 = UnknownSeq(b2len, character = 'N')
newseq = str(buff1 + s.seq + buff2) # Buffer sandwich
record = SeqRecord(Seq(newseq), id=seqname, description='')
with open(tmpout, 'a') as f:
SeqIO.write(record, f, 'fasta') # Write new sequence to file
msg('Writing reverse sequence matches (reverse complement) ...')
for row in seqminus:
seqname = row[0]
cont = row[1][0]
begin = int(row[1][6])
finish = int(row[1][7])
for s in SeqIO.parse(seqname, 'fasta'):
if s.id == cont:
b1len = maxstart - (len(s) - finish) # Determine start buffer
b2len = maxend - (begin - 1) # Determine end buffer
buff1 = UnknownSeq(b1len, character = 'N')
buff2 = UnknownSeq(b2len, character = 'N')
rc = s.reverse_complement()
newseq = str(buff1 + rc.seq + buff2) # Reverse complement
record = SeqRecord(Seq(newseq), id=seqname, description='')
with open(tmpout, 'a') as f:
SeqIO.write(record, f, 'fasta') # Write new sequence to file
# Optional annotation
if args.anno == None:
shutil.move(tmpout, outfile)
progexit()
else:
for seq in SeqIO.parse(tmpout, 'fasta'):
seq.id = os.path.splitext(os.path.basename(seq.id))[0]
strname = str(seq.id)
splitfile = 'tempdb/strname'
banner()
msg('Annotating {} ...'.format(strname))
banner()
SeqIO.write(seq, splitfile, 'fasta')
subprocess.call(['prokka', '--outdir', 'tempdb/split', '--force', '--prefix', strname, '--locustag', strname, '--compliant', '--proteins', args.anno, '--cpus', args.cpus, splitfile])
import glob
read_files = glob.glob('tempdb/split/*.gbk')
with open(outfile, 'w') as write_file:
for f in read_files:
with open(f, 'r') as infile:
write_file.write(infile.read())
progexit()