-
Notifications
You must be signed in to change notification settings - Fork 0
/
add_attributes_pogigwasc.py
executable file
·98 lines (90 loc) · 4.33 KB
/
add_attributes_pogigwasc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python
import re
import json
import argparse
from collections import defaultdict
from pogigwasc_utils_shared import *
def create_parent_genes(features):
# Create parent genes object and change features object in place
geneid_counter = defaultdict(int)
genes = defaultdict( #seqid
lambda: defaultdict( # geneid
dict
))
for seqid in features:
features_list = [(f, features[seqid][f]) for f in features[seqid] if
features[seqid][f]['type'] in ['CDS','stop_codon','intron']]
features_list = sorted(features_list, key=lambda f: f[1]['start'])
feature_ids_sorted = [f[0] for f in features_list]
last_cluster = []
for i in range(1, len(feature_ids_sorted)):
curr_feature = feature_ids_sorted[i]
last_feature = feature_ids_sorted[i-1]
if features[seqid][last_feature]['end'] == features[seqid][curr_feature]['start'] \
and features[seqid][last_feature]['strand'] == features[seqid][curr_feature]['strand']:
feature_strand = features[seqid][curr_feature]['strand']
# Adjacent features belong to same gene
# Check for CDS adjacent to stop codon, adjust CDS coords
# to encompass stop codon
if features_list[i][1]['type'] == 'CDS' \
and features_list[i-1][1]['type'] == 'stop_codon' \
and feature_strand == '-' :
# stop codon followed by CDS
features_list[i][1]['start'] = features_list[i-1][1]['start']
elif features_list[i][1]['type'] == 'stop_codon' \
and features_list[i-1][1]['type'] == 'CDS' \
and feature_strand == '+' :
# CDS followed by stop codon
features_list[i-1][1]['end'] = features_list[i][1]['end']
last_cluster.append(last_feature)
else:
# close current cluster and start a new one
last_cluster.append(last_feature)
geneid_counter[seqid] += 1
geneid = seqid + ".g" + str(geneid_counter[seqid])
genes[seqid][geneid] = {f:1 for f in last_cluster}
last_cluster = []
# close the last cluster
last_cluster.append(curr_feature)
geneid_counter[seqid] += 1
geneid = seqid + ".g" + str(geneid_counter[seqid])
genes[seqid][geneid] = {f:1 for f in last_cluster}
return(genes)
if __name__ == "__main__":
# command line args
parser = argparse.ArgumentParser(
description="""
Create parent gene objects and attributes for Pogigwasc GFF output.
Add attributes (ID, Parent), creates gene objects for adjacent
CDS/intron/stop_codon features, adjust CDS coordinates to include
adjacent stop codons, and add phase field to GFF for CDS features.
""")
parser.add_argument("-i", "--input", action='store', help="""
Input GFF file from Pogigwasc
""")
parser.add_argument("-o", "--output", action='store', help="""
Output GFF file
""")
args = parser.parse_args()
# main script
features = read_pogigwasc_gff(args.input)
genes = create_parent_genes(features)
out = []
for seqid in genes:
for geneid in genes[seqid]:
gene_start, gene_end, gene_strand = find_gene_limits(genes, seqid, geneid, features)
gene_start, gene_end = py2gff_coords(gene_start, gene_end)
out.append([seqid, 'predicted','gene',gene_start, gene_end, '.', gene_strand, '.', f'ID={geneid}'])
for feature_id in genes[seqid][geneid]:
new_feature_id = feature_id
# Renumber CDSs, because multiple CDS segments belonging to the
# same gene should all have the same CDS ID although we earlier
# used distinct identifiers to have a unique handle
if features[seqid][feature_id]['type'] == 'CDS':
new_feature_id = seqid + '.c' + geneid.split(".")[-1][1:]
out.extend(dict2gff(features[seqid][feature_id], new_feature_id, geneid))
# write new file
with open(args.output, 'w') as fh:
for line in out:
fh.write("\t".join([str(i) for i in line]))
fh.write("\n")