-
Notifications
You must be signed in to change notification settings - Fork 0
/
gene_report.py
157 lines (140 loc) · 7.02 KB
/
gene_report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
__author__ = 'dexter'
import ndex_access
import term2gene_mapper
import json
class GeneReport():
def __init__(self, name):
self.gene_network_pairs = {}
self.name = name
self.fields = [
"Gene Symbol",
"Entrez Gene Id",
"Network Id",
"Network Name",
"Network Sets"
]
def add_gene_network_pair(self, gene_id, gene_symbol, network_id, network_name, e_set_name):
key = str(gene_id) + "_" + network_id
pair = self.gene_network_pairs.get(key)
if not pair:
pair = {
"Gene Symbol": gene_symbol,
"Entrez Gene Id": gene_id,
"Network Id": network_id,
"Network Name": network_name,
"e_sets": [e_set_name]
}
self.gene_network_pairs[key] = pair
else:
e_sets = pair["e_sets"]
if e_set_name not in e_sets:
e_sets.append(e_set_name)
pair["e_sets"] = e_sets
def get_rows(self):
s = ", "
for pair in self.gene_network_pairs.values():
pair["Network Sets"] = s.join(pair.get("e_sets"))
return self.gene_network_pairs.values()
def get_network_summary(self):
networks = {}
rows = self.get_rows()
for row in rows:
network_id = row.get("Network Id")
network = networks.get(network_id)
new_symbol = row.get('Gene Symbol')
new_id = row.get('Entrez Gene Id')
if not network:
network = {'Network Id': row.get('Network Id'),
'Gene Symbols': [new_symbol],
'Entez Gene Ids': [new_id],
'Network Name' : row.get('Network Name'),
}
networks[network_id] = network
network['Gene Symbols'].append(new_symbol)
network['Entez Gene Ids'].append(new_id)
return networks
class report_generator():
def __init__(self):
# a map to hold the term to gene mapping to speed the query.
self.term_mapper = term2gene_mapper.Term2gene_mapper()
# for each e_set in the configuration:
# for each network specified in the e_set:
# get the network from the specified NDEx
# analyze the identifiers in each network and normalize to genes using the mygeneinfo service
# add each gene-network pair to the report
# output the report to a file in the gene_reports directory when all networks have been processed
# (the name of the file is <configuration name>_gene_report.txt
def create_gene_report(self, config):
report = GeneReport(config.name)
term_to_gene_map = {}
for e_set_config in config.e_set_configs:
self.process_e_set_for_report(e_set_config, report, term_to_gene_map)
return report
def process_e_set_for_report(self, e_set_config, report, term_to_gene_map):
ndex = ndex_access.NdexAccess(e_set_config.ndex, username=e_set_config.username, password=e_set_config.password)
# Find the networks
networks = ndex.find_networks(e_set_config)
print "Found " + str(len(networks)) + " networks for gene report"
for network in networks:
self.process_network_for_report(network, e_set_config, report, ndex, term_to_gene_map)
def process_network_for_report(self, network, e_set_config, report, ndex, term_to_gene_map):
network_id = network.get("externalId")
network_name = network.get("name")
print " - " + network_name + " : " + network_id
# genes = ndex.get_genes(network_id, term_to_gene_map)
node_table = ndex.get_nodes_from_cx(network_id)
self.term_mapper.add_network_nodes(node_table)
all_found = []
all_not_found = []
for node_id, node in node_table.items():
found = False
if "represents" in node:
represents_id = node["represents"]
gene = self.term_mapper.get_gene_from_identifier(represents_id)
if gene != None:
report.add_gene_network_pair(gene.id, gene.symbol, network_id, network_name, e_set_config.name)
found = True
all_found.append({"node_id": node_id, "symbol": gene.symbol, "gene_id":gene.id, "input":represents_id, "type":"represents"})
continue
# otherwise check aliases
if "alias" in node:
alias_ids = node.get('alias')
for alias_id in alias_ids:
gene = self.term_mapper.get_gene_from_identifier(alias_id)
if gene != None:
report.add_gene_network_pair(gene.id, gene.symbol, network_id, network_name, e_set_config.name)
found = True
all_found.append({"node_id": node_id, "symbol": gene.symbol, "gene_id":gene.id, "input":alias_id, "type":"alias"})
break
if found:
continue
# then, try using name
if "name" in node:
names = node.get("name")
for node_name in names:
if len(node_name) < 40:
gene = self.term_mapper.get_gene_from_identifier(node_name)
if gene != None:
report.add_gene_network_pair(gene.id, gene.symbol, network_id, network_name, e_set_config.name)
found = True
all_found.append({"node_id": node_id, "symbol": gene.symbol, "gene_id":gene.id, "input":node_name, "type":"name"})
break
if found:
continue
if "functionTerm" in node:
genes = self.genes_from_function_term(node["functionTerm"], network_id, network_name, e_set_config.name, report, all_found, node_id)
if not found:
all_not_found.append({"node_id":node_id, "names": list(node.get("name"))})
print "Found genes for " + str(len(all_found)) + " nodes"
print "Did not find genes for " + str(len(all_not_found)) + " nodes"
print json.dumps(all_not_found, indent=4)
def genes_from_function_term(self, function_term, network_id, network_name, e_set_name, report, all_found, node_id):
# if it is a function term, process all genes mentioned
for parameter in function_term['args']:
if type(parameter) == 'str':
gene = self.term_mapper.get_gene_from_identifier(parameter)
if gene != None:
report.add_gene_network_pair(gene.id, gene.symbol, network_id, network_name, e_set_name)
all_found.append({"node_id": node_id, "symbol": gene.symbol, "gene_id":gene.id, "input":parameter, "type":"function_term"})
else:
self.genes_from_function_term(parameter, network_id, network_name, e_set_name, report, all_found, node_id)