-
Notifications
You must be signed in to change notification settings - Fork 0
/
disambiguate.py
executable file
·351 lines (322 loc) · 16.7 KB
/
disambiguate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
#!/usr/bin/env python
"""
This is the main function to call for disambiguating between BAM files
from two species that have alignments from the same source of fastq files.
It is part of the explant RNA/DNA-Seq workflow where an informatics
approach is used to distinguish between e.g. human and mouse or rat RNA/DNA reads.
For reads that have aligned to both organisms, the functionality is based on
comparing quality scores from either Tophat, Hisat2, STAR or BWA. Read
name is used to collect all alignments for both mates (_1 and _2) and
compared between the alignments from the two species.
For Tophat (default, can be changed using option -a) and Hisat2, the sum of the flags XO,
NM and NH is evaluated and the lowest sum wins the paired end reads. For equal
scores, the reads are assigned as ambiguous.
The alternative algorithm (STAR, bwa) disambiguates (for aligned reads) by tags
AS (alignment score, higher better), followed by NM (edit distance, lower
better).
Code by Miika Ahdesmaki July-August 2013, based on original Perl implementation
for Tophat by Zhongwu Lai.
"""
from __future__ import print_function
import sys, re, pysam
from array import array
from os import path, makedirs
from argparse import ArgumentParser, RawTextHelpFormatter
# "natural comparison" for strings
def nat_cmp(a, b):
convert = lambda text: int(text) if text.isdigit() else text # lambda function to convert text to int if number present
alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] # split string to piecewise strings and string numbers
#return cmp(alphanum_key(a), alphanum_key(b)) # use internal cmp to compare piecewise strings and numbers
return (alphanum_key(a) > alphanum_key(b))-(alphanum_key(a) < alphanum_key(b))
# read reads into a list object for as long as the read qname is constant (sorted file). Return the first read with new qname or None
def read_next_reads(fileobject, listobject):
qnamediff = False
while not qnamediff:
try:
myRead=fileobject.next()
except StopIteration:
#print("5")
return None # return None as the name of the new reads (i.e. no more new reads)
if nat_cmp(myRead.qname, listobject[0].qname)==0:
listobject.append(myRead)
else:
qnamediff = True
return myRead # this is the first read with a new qname
# disambiguate between two lists of reads
def disambiguate(humanlist, mouselist, disambalgo):
if disambalgo in ['tophat','hisat2']:
dv = 2**13 # a high quality score to replace missing quality scores (no real quality score should be this high)
sa = array('i',(dv for i in range(0,4))) # score array, with [human_1_QS, human_2_QS, mouse_1_QS, mouse_2_QS]
for read in humanlist:
if 0x4&read.flag: # flag 0x4 means unaligned
continue
QScore = read.opt('XO') + read.opt('NM') + read.opt('NH')
# directionality (_1 or _2)
d12 = 0 if 0x40&read.flag else 1
if sa[d12]>QScore:
sa[d12]=QScore # update to lowest (i.e. 'best') quality score
for read in mouselist:
if 0x4&read.flag: # flag 0x4 means unaligned
continue
QScore = read.opt('XO') + read.opt('NM') + read.opt('NH')
# directionality (_1 or _2)
d12 = 2 if 0x40&read.flag else 3
if sa[d12]>QScore:
sa[d12]=QScore # update to lowest (i.e. 'best') quality score
if min(sa[0:2])==min(sa[2:4]) and max(sa[0:2])==max(sa[2:4]): # ambiguous
return 0
elif min(sa[0:2]) < min(sa[2:4]) or min(sa[0:2]) == min(sa[2:4]) and max(sa[0:2]) < max(sa[2:4]):
# assign to human
return 1
else:
# assign to mouse
return -1
elif disambalgo.lower() in ('bwa', 'star', 'bowtie2'):
dv = -2^13 # default value, low
bwatags = ['AS', 'NM']# ,'XS'] # in order of importance (compared sequentially, not as a sum as for tophat)
bwatagsigns = [1, -1]#,1] # for AS and XS higher is better. for NM lower is better, thus multiply by -1
AS = list()
for x in range(0, len(bwatagsigns)):
AS.append(array('i',(dv for i in range(0,4)))) # alignment score array, with [human_1_Score, human_2_Score, mouse_1_Score, mouse_2_Score]
#
for read in humanlist:
if 0x4&read.flag: # flag 0x4 means unaligned
continue
# directionality (_1 or _2)
d12 = 0 if 0x40&read.flag else 1
for x in range(0, len(bwatagsigns)):
try:
QScore = bwatagsigns[x]*read.opt(bwatags[x])
except KeyError:
if bwatags[x] == 'NM':
bwatags[x] = 'nM' # oddity of STAR
elif bwatags[x] == 'AS':
continue # this can happen for e.g. hg38 ALT-alignments (missing AS)
QScore = bwatagsigns[x]*read.opt(bwatags[x])
if AS[x][d12]<QScore:
AS[x][d12]=QScore # update to highest (i.e. 'best') quality score
#
for read in mouselist:
if 0x4&read.flag: # flag 0x4 means unaligned
continue
# directionality (_1 or _2)
d12 = 2 if 0x40&read.flag else 3
for x in range(0, len(bwatagsigns)):
try:
QScore = bwatagsigns[x]*read.opt(bwatags[x])
except KeyError:
if bwatags[x] == 'NM':
bwatags[x] = 'nM' # oddity of STAR
elif bwatags[x] == 'AS':
continue # this can happen for e.g. hg38 ALT-alignments (missing AS)
QScore = bwatagsigns[x]*read.opt(bwatags[x])
if AS[x][d12]<QScore:
AS[x][d12]=QScore # update to highest (i.e. 'best') quality score
#
for x in range(0, len(bwatagsigns)):
if max(AS[x][0:2]) > max(AS[x][2:4]) or max(AS[x][0:2]) == max(AS[x][2:4]) and min(AS[x][0:2]) > min(AS[x][2:4]):
# assign to human
return 1
elif max(AS[x][0:2]) < max(AS[x][2:4]) or max(AS[x][0:2]) == max(AS[x][2:4]) and min(AS[x][0:2]) < min(AS[x][2:4]):
# assign to mouse
return -1
return 0 # ambiguous
else:
print("Not implemented yet")
sys.exit(2)
def main():
description = """
disambiguate.py disambiguates between two organisms that have alignments
from the same source of fastq files. An example where this might be
useful is as part of an explant RNA/DNA-Seq workflow where an informatics
approach is used to distinguish between human and mouse RNA/DNA reads.
For reads that have aligned to both organisms, the functionality is based on
comparing quality scores from either Tophat of BWA. Read
name is used to collect all alignments for both mates (_1 and _2) and
compared between human and mouse alignments.
For Tophat (default, can be changed using option -a) and Hisat2, the sum of the tags XO,
NM and NH is evaluated and the lowest sum wins the paired end reads. For equal
scores (both mates, both species), the reads are assigned as ambiguous.
The alternative algorithm (STAR, bwa) disambiguates (for aligned reads) by tags
AS (alignment score, higher better), followed by NM (edit distance, lower
better).
The output directory will contain four files:\n
...disambiguatedSpeciesA.bam: Reads that could be assigned to species A
...disambiguatedSpeciesB.bam: Reads that could be assigned to species B
...ambiguousSpeciesA.bam: Reads aligned to species A that also aligned \n\tto B but could not be uniquely assigned to either
...ambiguousSpeciesB.bam: Reads aligned to species B that also aligned \n\tto A but could not be uniquely assigned to either
..._summary.txt: A summary of unique read names assigned to species A, B \n\tand ambiguous.
Examples:
disambiguate.py test/human.bam test/mouse.bam
disambiguate.py -s mysample1 test/human.bam test/mouse.bam
"""
parser = ArgumentParser(description=description, formatter_class=RawTextHelpFormatter)
parser.add_argument('A', help='Input BAM file for species A.')
parser.add_argument('B', help='Input BAM file for species B.')
parser.add_argument('-o', '--output-dir', default="disambres",
help='Output directory.')
parser.add_argument('-i', '--intermediate-dir', default="intermfiles",
help='Location to store intermediate files')
parser.add_argument('-d', '--no-sort', action='store_true', default=False,
help='Disable BAM file sorting. Use this option if the '
'files have already been name sorted.')
parser.add_argument('-s', '--prefix', default='',
help='A prefix (e.g. sample name) to use for the output '
'BAM files. If not provided, the input BAM file prefix '
'will be used. Do not include .bam in the prefix.')
parser.add_argument('-a', '--aligner', default='tophat',
choices=('tophat', 'hisat2', 'bwa', 'star', 'bowtie2'),
help='The aligner used to generate these reads. Some '
'aligners set different tags.')
args = parser.parse_args()
#code
numhum = nummou = numamb = 0
#starttime = time.clock()
# parse inputs
humanfilename = args.A
mousefilename = args.B
samplenameprefix = args.prefix
outputdir = args.output_dir
intermdir = args.intermediate_dir
disablesort = args.no_sort
disambalgo = args.aligner
supportedalgorithms = set(['tophat', 'hisat2', 'bwa', 'star', 'bowtie2'])
# check existence of input BAM files
if not (file_exists(humanfilename) and file_exists(mousefilename)):
sys.stderr.write("\nERROR in disambiguate.py: Two existing input BAM files "
"must be specified as positional arguments\n")
sys.exit(2)
if len(samplenameprefix) < 1:
humanprefix = path.basename(humanfilename.replace(".bam",""))
mouseprefix = path.basename(mousefilename.replace(".bam",""))
else:
if samplenameprefix.endswith(".bam"):
samplenameprefix = samplenameprefix[0:samplenameprefix.rfind(".bam")] # the above if is not stricly necessary for this to work
humanprefix = samplenameprefix
mouseprefix = samplenameprefix
samplenameprefix = None # clear variable
if disambalgo.lower() not in supportedalgorithms:
print(disambalgo+" is not a supported disambiguation scheme at the moment.")
sys.exit(2)
if disablesort:
humanfilenamesorted = humanfilename # assumed to be sorted externally...
mousefilenamesorted = mousefilename # assumed to be sorted externally...
else:
if not path.isdir(intermdir):
makedirs(intermdir)
humanfilenamesorted = path.join(intermdir,humanprefix+".speciesA.namesorted.bam")
mousefilenamesorted = path.join(intermdir,mouseprefix+".speciesB.namesorted.bam")
if not path.isfile(humanfilenamesorted):
pysam.sort("-n","-m","2000000000","-o",humanfilenamesorted,humanfilename)
if not path.isfile(mousefilenamesorted):
pysam.sort("-n","-m","2000000000","-o",mousefilenamesorted,mousefilename)
# read in human reads and form a dictionary
myHumanFile = pysam.Samfile(humanfilenamesorted, "rb" )
myMouseFile = pysam.Samfile(mousefilenamesorted, "rb" )
if not path.isdir(outputdir):
makedirs(outputdir)
myHumanUniqueFile = pysam.Samfile(path.join(outputdir, humanprefix+".disambiguatedSpeciesA.bam"), "wb", template=myHumanFile)
myHumanAmbiguousFile = pysam.Samfile(path.join(outputdir, humanprefix+".ambiguousSpeciesA.bam"), "wb", template=myHumanFile)
myMouseUniqueFile = pysam.Samfile(path.join(outputdir, mouseprefix+".disambiguatedSpeciesB.bam"), "wb", template=myMouseFile)
myMouseAmbiguousFile = pysam.Samfile(path.join(outputdir, mouseprefix+".ambiguousSpeciesB.bam"), "wb", template=myMouseFile)
summaryFile = open(path.join(outputdir,humanprefix+'_summary.txt'),'w')
#initialise
# try:
# nexthumread=myHumanFile.next()
# nextmouread=myMouseFile.next()
# except StopIteration:
# print("No reads in one or either of the input files")
# sys.exit(2)
EOFmouse = EOFhuman = False
prevHumID = '-+=RANDOMSTRING=+-'
prevMouID = '-+=RANDOMSTRING=+-'
while not EOFmouse&EOFhuman:
while not (nat_cmp(nexthumread.qname,nextmouread.qname) == 0):
# check order between current human and mouse qname (find a point where they're identical, i.e. in sync)
while nat_cmp(nexthumread.qname,nextmouread.qname) > 0 and not EOFmouse: # mouse is "behind" human, output to mouse disambiguous
myMouseUniqueFile.write(nextmouread)
if not nextmouread.qname == prevMouID:
nummou+=1 # increment mouse counter for unique only
prevMouID = nextmouread.qname
try:
nextmouread=myMouseFile.next()
except StopIteration:
EOFmouse=True
while nat_cmp(nexthumread.qname,nextmouread.qname) < 0 and not EOFhuman: # human is "behind" mouse, output to human disambiguous
myHumanUniqueFile.write(nexthumread)
if not nexthumread.qname == prevHumID:
numhum+=1 # increment human counter for unique only
prevHumID = nexthumread.qname
try:
nexthumread=myHumanFile.next()
except StopIteration:
EOFhuman=True
if EOFhuman or EOFmouse:
break
# at this point the read qnames are identical and/or we've reached EOF
humlist = list()
moulist = list()
if nat_cmp(nexthumread.qname,nextmouread.qname) == 0:
humlist.append(nexthumread)
nexthumread = read_next_reads(myHumanFile, humlist) # read more reads with same qname (the function modifies humlist directly)
if not nexthumread:
EOFhuman = True
moulist.append(nextmouread)
nextmouread = read_next_reads(myMouseFile, moulist) # read more reads with same qname (the function modifies moulist directly)
if not nextmouread:
EOFmouse = True
# perform comparison to check mouse, human or ambiguous
if len(moulist) > 0 and len(humlist) > 0:
myAmbiguousness = disambiguate(humlist, moulist, disambalgo)
if myAmbiguousness < 0: # mouse
nummou+=1 # increment mouse counter
for myRead in moulist:
myMouseUniqueFile.write(myRead)
elif myAmbiguousness > 0: # human
numhum+=1 # increment human counter
for myRead in humlist:
myHumanUniqueFile.write(myRead)
else: # ambiguous
numamb+=1 # increment ambiguous counter
for myRead in moulist:
myMouseAmbiguousFile.write(myRead)
for myRead in humlist:
myHumanAmbiguousFile.write(myRead)
if EOFhuman:
#flush the rest of the mouse reads
while not EOFmouse:
myMouseUniqueFile.write(nextmouread)
if not nextmouread.qname == prevMouID:
nummou+=1 # increment mouse counter for unique only
prevMouID = nextmouread.qname
try:
nextmouread=myMouseFile.next()
except StopIteration:
#print("3")
EOFmouse=True
if EOFmouse:
#flush the rest of the human reads
while not EOFhuman:
myHumanUniqueFile.write(nexthumread)
if not nexthumread.qname == prevHumID:
numhum+=1 # increment human counter for unique only
prevHumID = nexthumread.qname
try:
nexthumread=myHumanFile.next()
except StopIteration:
EOFhuman=True
summaryFile.write("sample\tunique species A pairs\tunique species B pairs\tambiguous pairs\n")
summaryFile.write(humanprefix+"\t"+str(numhum)+"\t"+str(nummou)+"\t"+str(numamb)+"\n")
summaryFile.close()
myHumanFile.close()
myMouseFile.close()
myHumanUniqueFile.close()
myHumanAmbiguousFile.close()
myMouseUniqueFile.close()
myMouseAmbiguousFile.close()
def file_exists(fname):
"""Check if a file exists and is non-empty.
"""
return path.exists(fname) and path.getsize(fname) > 0
if __name__ == "__main__":
main()