-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgff3_compare.py
155 lines (136 loc) · 6.26 KB
/
gff3_compare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# gff3_compare.py
# compare two different gff3 gene annotation files on the same reference
from argparse import ArgumentParser
import csv
def parse_attrs(attrs_str, info_field_types):
""" Parse in a string of attrs from a gff3 file. """
attrs = {}
attrs_list = attrs_str.split(';')
for attr_pair in attrs_list:
field_name = attr_pair.split('=')[0]
field_val = attr_pair.split('=')[1]
attrs[field_name] = field_val
if field_name not in info_field_types:
info_field_types.append(field_name)
return attrs
def read_gff3_file(gff3_file_name):
""" Read in a gff3 file. """
all_feature_types = ['region', 'ncRNA_gene', 'lnc_RNA', 'exon', 'miRNA', 'pseudogene', 'pseudogenic_transcript', 'gene', 'mRNA', 'three_prime_UTR', 'CDS', 'five_prime_UTR', 'snRNA', 'ncRNA', 'scRNA', 'snoRNA', 'unconfirmed_transcript', 'V_gene_segment', 'C_gene_segment', 'J_gene_segment', 'rRNA', 'D_gene_segment', 'start_codon', 'stop_codon', 'intron', 'transcript']
gene_types = ['ncRNA_gene', 'pseudogene', 'gene']
# ebi tx types
# tx_types = ['lnc_RNA', 'miRNA', 'pseudogenic_transcript', 'mRNA', 'snRNA', 'ncRNA', 'scRNA', 'snoRNA', 'unconfirmed_transcript', 'V_gene_segment', 'C_gene_segment', 'J_gene_segment', 'rRNA', 'D_gene_segment']
tx_types = ['transcript', 'lnc_RNA', 'miRNA', 'pseudogenic_transcript', 'mRNA', 'snRNA', 'ncRNA', 'scRNA', 'snoRNA', 'unconfirmed_transcript', 'V_gene_segment', 'C_gene_segment', 'J_gene_segment', 'rRNA', 'D_gene_segment']
# Other field types: region, exon, three_prime_UTR, five_prime_UTR, CDS
all_info_field_types = []
regions = []
gene_ids = []
parent_gene_ids = []
tx_ids = []
parent_tx_ids = []
gene_locs = {}
tx_locs = {}
with open(gff3_file_name) as in_gff3:
r = csv.reader(in_gff3, delimiter='\t')
for row in r:
if not row[0].startswith('#'):
feature_type = row[2]
if feature_type not in all_feature_types:
all_feature_types.append(feature_type)
print("Warning: feature type: ", feature_type, "is not present in the code.")
if feature_type == "region":
regions.append((row[0], int(row[3]), int(row[4])))
elif feature_type in gene_types:
attrs_str = row[8]
gene_attrs = parse_attrs(attrs_str, all_info_field_types)
# print(gene_attrs)
try:
gene_ids.append(gene_attrs['gene_id'])
except:
print("Error, no key 'gene_id'. Gene attrs: ", gene_attrs)
try:
parent_gene_id = gene_attrs['parent_gene']
except:
parent_gene_id = gene_attrs['source_gene']
# print("Error, no key 'parent_gene'. Gene attrs: ", gene_attrs)
parent_gene_ids.append(parent_gene_id)
gene_locs[parent_gene_id] = (row[0], int(row[3]), int(row[4]))
elif feature_type in tx_types:
attrs_str = row[8]
tx_attrs = parse_attrs(attrs_str, all_info_field_types)
# print(tx_attrs)
try:
tx_ids.append(tx_attrs['transcript_id'])
except:
print("Error, no key 'transcript_id'. Tx attrs: ", tx_attrs)
try:
parent_tx_id = tx_attrs['parent_transcript']
except:
parent_tx_id = tx_attrs['source_transcript']
# print("Error, no key 'parent_transcript'. Tx attrs: ", tx_attrs)
parent_tx_ids.append(parent_tx_id)
tx_locs[parent_tx_id] = (row[0], int(row[3]), int(row[4]))
else:
# print(row)
# break
continue
# print("regions: ", regions)
# print("all info field types: ", all_info_field_types)
print("\t# genes: ", len(gene_ids))
print("\t# transcripts: ", len(tx_ids))
# print(all_feature_types)
return gene_locs, tx_locs
def compare_feature_locs(ref_locs, query_locs, ref_out_file_name, query_out_file_name, diff_locs_file_name):
""" Compare the locations for two sets of features """
num_overlapping_ids = 0
num_ref_ids_only = 0
num_query_ids_only = 0
num_different_chroms = 0
num_same_chrom_diff_loc = 0
num_same_loc = 0
with open(ref_out_file_name, 'w') as ref_out, open(diff_locs_file_name, 'w') as diffs_out:
for feature_id, loc in ref_locs.items():
ref_chrom, ref_start, ref_stop = loc[0], loc[1], loc[2]
if feature_id in query_locs:
num_overlapping_ids += 1
query_loc = query_locs[feature_id]
query_chrom, query_start, query_stop = query_loc[0], query_loc[1], query_loc[2]
if query_chrom == ref_chrom:
if (query_start >= ref_start and query_start <= ref_stop) or (ref_start >= query_start and ref_start <= query_stop):
# Is this sufficient to check for overlapping regions?
num_same_loc += 1
else:
num_same_chrom_diff_loc += 1
print(feature_id, ref_chrom, ref_start, ref_stop, query_chrom, query_start, query_stop, file=diffs_out, sep="\t")
else:
num_different_chroms += 1
print(feature_id, ref_chrom, ref_start, ref_stop, query_chrom, query_start, query_stop, file=diffs_out, sep="\t")
else:
num_ref_ids_only += 1
print(feature_id, ref_chrom, ref_start, ref_stop, file=ref_out, sep="\t")
with open(query_out_file_name, 'w') as query_out:
for feature_id, loc in query_locs.items():
query_chrom, query_start, query_stop = loc[0], loc[1], loc[2]
if feature_id not in ref_locs:
num_query_ids_only += 1
print(feature_id, query_chrom, query_start, query_stop, file=query_out, sep="\t")
print("\tNumber of features in both files: ", num_overlapping_ids)
print("\t\tSame locations: ", num_same_loc)
print("\t\tSame chroms but different locations: ", num_same_chrom_diff_loc)
print("\t\tDifferent chroms: ", num_different_chroms)
print("\tNumber of features in reference only: ", num_ref_ids_only)
print("\tNumber of features in query only: ", num_query_ids_only)
if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument('ref_gff3_file')
parser.add_argument('query_gff3_file')
opts = parser.parse_args()
print("> Reading in reference file...")
ref_gene_locs, ref_tx_locs = read_gff3_file(opts.ref_gff3_file)
print("> Reading in query file...")
query_gene_locs, query_tx_locs = read_gff3_file(opts.query_gff3_file)
print("> Comparing genes for both files...")
compare_feature_locs(ref_gene_locs, query_gene_locs, "genes_only_ref.txt", "genes_only_query.txt", "gene_loc_diffs.txt")
print("> Comparing transcripts for both files...")
compare_feature_locs(ref_tx_locs, query_tx_locs, "txs_only_ref.txt", "txs_only_query.txt", "tx_loc_diffs.txt")
# Sample command:
# python3 gff3_compare.py GCA_018472595.1_genes.gff3 HG00438.1.chm13.gff3