-
Notifications
You must be signed in to change notification settings - Fork 0
/
add_realtrons_pogigwasc_intronless.py
executable file
·286 lines (256 loc) · 14.7 KB
/
add_realtrons_pogigwasc_intronless.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
#!/usr/bin/env python3
import re
from collections import defaultdict
import argparse
import json
from pogigwasc_utils_shared import *
def update_feature_coords_realtrons(features, introns):
# Adjust coordinates of features with introns added
# Record features that contain introns
# features object is changed in-place
intron_parents = defaultdict( # seqid
lambda: defaultdict( # feature id
list))
for seqid in introns:
try:
# sort features ascending by start coordinate
features_list = [ (f, features[seqid][f]) for f in features[seqid] ]
features_list = sorted(features_list, key=lambda f: f[1]['start'])
feature_ids_sorted = [f[0] for f in features_list]
# sort introns ascending by start coordinate
introns_list = [ (i, introns[seqid][i]) for i in introns[seqid]]
introns_list = sorted(introns_list, key=lambda i: i[1]['start'])
intron_ids_sorted = [i[0] for i in introns_list]
# Adjust coordinates
for intron_id in intron_ids_sorted:
intron_len = introns[seqid][intron_id]['end'] - introns[seqid][intron_id]['start']
for feature_id in feature_ids_sorted:
# sanity check
orig_feature_length = features[seqid][feature_id]['end'] - features[seqid][feature_id]['start']
# sequentially iterate through features and update coordinates
# assume that no two genes share the same start coordinate
if features[seqid][feature_id]['start'] > introns[seqid][intron_id]['start']:
# feature to right of intron: push whole feature forward
features[seqid][feature_id]['start'] += intron_len
features[seqid][feature_id]['end'] += intron_len
# in edge case where feature start == intron start, the
# resulting feature will have correct length, but the
# first feature segment will have zero length
# however we want to record the corresponding gene as parent
# to the intron
else:
if features[seqid][feature_id]['end'] >= introns[seqid][intron_id]['start']:
# feature start to left of intron, feature start to right of intron
# i.e. intron contained in feature, need to adjust:
# keep start coordinate, push end coordinate forward
features[seqid][feature_id]['end'] += intron_len
intron_parents[seqid][feature_id].append(intron_id)
# sanity check
if features[seqid][feature_id]['end'] - features[seqid][feature_id]['start'] != orig_feature_length + intron_len:
print("feature length does not match after adding introns")
else:
# entire feature to left of intron
pass
except KeyError:
print(f"seqid {seqid} in intron gff not found in feature gff")
pass
return(intron_parents)
def check_strand_conflict_orphan_introns(features, introns, intron_parents):
# Report adjusted coordinates
# TODO: what about features that encompass both conflicting and non-conflicting introns?
orphan_introns = defaultdict(list)
conflict_features = defaultdict( # seqid
lambda: defaultdict( # feature id
list))
for seqid in features:
# sort introns ascending by start coordinate
introns_list = [ (i, introns[seqid][i]) for i in introns[seqid] ]
introns_list = sorted(introns_list, key=lambda i: i[1]['start'])
intron_ids_sorted = [i[0] for i in introns_list]
# Check for mismatched strands in intron vs. CDS
for feature_id in intron_parents[seqid]:
parent_strand = features[seqid][feature_id]['strand']
for intron_id in intron_parents[seqid][feature_id]:
if introns[seqid][intron_id]['strand'] != parent_strand:
# feature is a 'conflicted' CDS
conflict_features[seqid][feature_id].append(intron_id)
# Check for intron orphans
contained_introns = []
for i in intron_parents[seqid].values():
contained_introns.extend(i)
orphan_introns[seqid].extend([i for i in intron_ids_sorted if i not in set(contained_introns)])
return(orphan_introns, conflict_features)
def create_parent_genes(features):
# Associate stop_codons with adjacent attached CDSs in parent gene objects
# add attributes field to the feature table
# assumes that:
# - every CDS is adjacent to a stop codon,
# - strand of CDS and adjacent stop codon always the same
# - that a gene is composed of no more than one CDS+stop codon
geneid_counter = defaultdict(int)
genes = defaultdict( # seqid
lambda: defaultdict( # geneid
dict)) # dict of features keyed by ID
for seqid in features:
# sort features by start coord
features_list = [ (f, features[seqid][f]) for f in features[seqid] ]
features_list = sorted(features_list, key=lambda f: f[1]['start'])
feature_ids_sorted = [f[0] for f in features_list]
last_feature = None # record previous feature
for feature_id in feature_ids_sorted:
if last_feature:
# adjacent features
if int(features[seqid][last_feature]['end']) == int(features[seqid][feature_id]['start']):
# either: CDS followed by stop codon and strands both positive
# or: stop codon followed by CDS, and strands both negative
if (features[seqid][last_feature]['type'] == 'CDS' \
and features[seqid][feature_id]['type'] == 'stop_codon' \
and features[seqid][last_feature]['strand'] == '+' \
and features[seqid][feature_id]['strand'] == '+'):
# extend CDS to encompass stop codon limits
features[seqid][last_feature]['end'] = features[seqid][feature_id]['end']
# update gene counter
geneid_counter[seqid] += 1
geneid = seqid + '.g' + str(geneid_counter[seqid])
# add feature ID and Parent gene id to attributes field
genes[seqid][geneid] = { feature_id : 1,
last_feature: 1}
elif (features[seqid][last_feature]['type'] == 'stop_codon' \
and features[seqid][feature_id]['type'] == 'CDS' \
and features[seqid][last_feature]['strand'] == '-' \
and features[seqid][feature_id]['strand'] == '-'):
# extend CDS to encompass stop codon limits
features[seqid][feature_id]['start'] = features[seqid][last_feature]['start']
# update gene counter
geneid_counter[seqid] += 1
geneid = seqid + '.g' + str(geneid_counter[seqid])
# add feature ID and Parent gene id to attributes field
genes[seqid][geneid] = { feature_id : 1,
last_feature: 1}
else:
print("adjacent features but apparently unrelated")
# Update previous feature
last_feature = feature_id
else:
# first entry
last_feature = feature_id
return(genes)
def segment_cds_with_introns(genes, features, introns, intron_parents, orphan_introns, conflict_features):
# Changes `features` and `introns` in place
for seqid in genes:
# Split CDS features into segments and correct phase
# pogigwasc in intronless mode produces CDS features with phase 0 (check!)
# so the first segment will always have phase 0
# Report features in GFF format
for geneid in genes[seqid]:
for feature_id in genes[seqid][geneid]:
# Features that contain introns
if feature_id in intron_parents[seqid]:
# strandedness of CDS and intron features conflict
if feature_id in conflict_features[seqid]:
# Note that strand is conflicting
features[seqid][feature_id]['Note'] = 'strand_conflict'
# report overlapping introns as 'intron_orphan' features
for conflict_intron in conflict_features[seqid][feature_id]:
introns[seqid][conflict_intron]['Note'] = 'orphan'
else:
# check if CDS feature (or start_codon or stop_codon) and split if necessary
if features[seqid][feature_id]['type'] in ['CDS', 'stop_codon']:
# get coordinates of the split CDS feature segments,
# interrupted by introns
seg_coords = [features[seqid][feature_id]['start']]
for intron_id in intron_parents[seqid][feature_id]:
# the introns should be present in coordinate sort order
seg_coords.append(introns[seqid][intron_id]['start'])
seg_coords.append(introns[seqid][intron_id]['end'])
# add gene as parent to this intron
introns[seqid][intron_id]['Parent'] = geneid
seg_coords.append(features[seqid][feature_id]['end'])
# start and end coords as a list for each segment
starts = seg_coords[::2]
ends = seg_coords[1::2]
segs = []
# forward strand
if features[seqid][feature_id]['strand'] == '+':
segs = list(zip(starts,ends))
# reverse strand: work backwards
elif features[seqid][feature_id]['strand'] == '-':
segs = reversed(list(zip(starts,ends)))
# replace feature 'start', 'end', and 'phase' fields with lists
features[seqid][feature_id]['start'] = starts
features[seqid][feature_id]['end'] = ends
features[seqid][feature_id]['phase'] = [0] * len(starts)
else:
print(f"Non CDS feature {feature_id} overlaps with empirical intron")
# this should never happen because update_feature_coords_realtrons()
# is called with pogigwasc feature table which has only CDS and stop_codons
else:
# intronless feature, report as-is, change phase from '.' to 0 if CDS
if features[seqid][feature_id]['type'] == 'CDS':
features[seqid][feature_id]['phase'] = 0
# add orphan introns
for orphan_id in orphan_introns[seqid]:
introns[seqid][orphan_id]['Note'] = 'orphan'
if __name__ == '__main__':
# parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("--introns", action='store', type=str,
help="Realtrons GFF3 file")
parser.add_argument("--features", action='store', type=str,
help="""
Pogigwasc intronless-mode gene predictions, GFF3 file.
Assumes no overlapping features, only `CDS` and `stop_codon` features,
and stop codons not contained in adjacent CDS features.
If the prediction was performed on contigs split from scaffolds, the
coordinates should have been corrected with recombine_contigs.py
""")
parser.add_argument("--output", action='store', type=str,
help="Prefix for output files. GFF file will be suffixed `.gff3`")
parser.add_argument("--dump", action='store_true',
help="""
Dump intermediate files to JSON for troubleshooting. Dump files will be
suffixed `.dump.*.json`
""")
args = parser.parse_args()
# import data
features = read_pogigwasc_gff(args.features)
introns = read_realtrons_gff(args.introns)
# create parent gene features
genes = create_parent_genes(features)
# update coordinates of intron-minus gene predictions after introns re-inserted
intron_parents = update_feature_coords_realtrons(features, introns)
orphan_introns, conflict_features = check_strand_conflict_orphan_introns(features, introns, intron_parents)
# segment features that are interrupted by introns
segment_cds_with_introns(genes, features, introns, intron_parents, orphan_introns, conflict_features)
# write data in GFF format
out = []
for seqid in genes:
for geneid in genes[seqid]:
gene_start, gene_end, gene_strand = find_gene_limits(genes, seqid, geneid, features)
gene_start, gene_end = py2gff_coords(gene_start, gene_end)
out.append([seqid, 'predicted', 'gene',
gene_start, gene_end, '.', gene_strand, '.',
f'ID={geneid}'])
for feature_id in genes[seqid][geneid]:
out.extend(dict2gff(features[seqid][feature_id], feature_id, geneid))
for seqid in introns:
for intron_id in introns[seqid]:
out.extend(dict2gff(introns[seqid][intron_id], intron_id))
# sort by coordinates
# negative sort by end coordinates so that enclosing features are listed before their components
out = sorted(out, key=lambda x: (x[0], int(x[3]), -int(x[4])))
with open(args.output + ".gff3", "w") as fh:
for line in out:
fh.write("\t".join([str(i) for i in line]))
fh.write("\n")
if args.dump:
with open(args.output + '.dump.genes.json', "w") as fh:
fh.write(json.dumps(genes, indent=2))
with open(args.output + '.dump.introns.json', "w") as fh:
fh.write(json.dumps(introns, indent=2))
with open(args.output + '.dump.intron_parents.json', "w") as fh:
fh.write(json.dumps(intron_parents, indent=2))
with open(args.output + '.dump.orphan_introns.json', "w") as fh:
fh.write(json.dumps(orphan_introns, indent=2))
with open(args.output + '.dump.conflict_features.json', "w") as fh:
fh.write(json.dumps(conflict_features, indent=2))