-
Notifications
You must be signed in to change notification settings - Fork 4
/
13_reorganize_mutations.py
executable file
·210 lines (169 loc) · 7.24 KB
/
13_reorganize_mutations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#! /usr/bin/python3
#
# This source code is part of icgc, an ICGC processing pipeline.
#
# Icgc is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Icgc is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see<http://www.gnu.org/licenses/>.
#
# Contact: ivana.mihalek@gmail.com
#
import time
from config import Config
from icgc_utils.common_queries import *
from icgc_utils.processes import *
# 'aa_mutation', 'consequence_type', and 'pathogenic_estimate' will be filled separately
mutation_columns = ['icgc_mutation_id', 'start_position', 'end_position', 'assembly',
'mutation_type', 'mutated_from_allele', 'mutated_to_allele', 'reference_genome_allele']
################################################################
# stop_retained: A sequence variant where at least one base in the terminator codon is changed, but the terminator remains
consequence_vocab = ['stop_lost', 'synonymous', 'inframe_deletion', 'inframe_insertion', 'stop_gained',
'5_prime_UTR_premature_start_codon_gain',
'start_lost', 'start_gained', 'frameshift', 'disruptive_inframe_deletion', 'stop_retained',
'exon_loss', 'disruptive_inframe_insertion', 'missense']
# location_vocab[1:4] is gene-relative
# location_vocab[1:4] is transcript-relative
location_vocab = ['intergenic_region', 'intragenic', 'upstream', 'downstream',
'5_prime_UTR', 'exon', 'coding_sequence', 'initiator_codon',
'splice_acceptor', 'splice_region', 'splice_donor',
'intron', '3_prime_UTR', ]
# this is set literal
pathogenic = {'stop_lost', 'inframe_deletion', 'inframe_insertion', 'stop_gained',
'5_prime_UTR_premature_start_codon_gain',
'start_lost', 'start_gained', 'frameshift', 'disruptive_inframe_deletion',
'exon_loss', 'disruptive_inframe_insertion', 'missense',
'splice_acceptor', 'splice_region', 'splice_donor',
'inframe' # there is no way we can know at this level whether an inframe change is nondsisruptive
# more likely it is than not
}
#########################################
def insert (cursor, table, columns, values):
nonempty_values = []
corresponding_columns = []
for i in range(len(values)):
if not values[i] or values[i] == "": continue
nonempty_values.append(values[i])
corresponding_columns.append(columns[i])
qry = "insert into %s (%s) " %(table, ",".join(corresponding_columns))
qry += "values (%s) " % ",".join(nonempty_values)
search_db(cursor, qry)
#########################################
# profile decorator is for the use with kernprof (a line profiler):
# ./icgc_utils/kernprof.py -l 13_reorganize_variants.py
# followed by
# python -m line_profiler 13_reorganize_variants.py.lprof
# see here https://github.com/rkern/line_profiler#line-profiler
# the reason I am using local kernprof.py is that I don't know where pip
# installed its version (if anywhere)
# @profile
def reorganize_mutations(cursor, chromosome, somatic_temp_table, columns):
mutation_table = "mutations_chrom_{}".format(chromosome)
new_somatic_table = somatic_temp_table.replace("_temp","")
mutations = get_mutations(cursor, new_somatic_table, chromosome=chromosome)
totmut = len(mutations)
print("\t\t\t total mutations on chrom {} in {}: {}".format(chromosome, somatic_temp_table, totmut))
ct = 0
time0 = time.time()
for mutation in mutations:
ct += 1
if ct%10000 == 0:
print("\t\t\t chrom %s %10s %6d %d%% %ds" % (chromosome, somatic_temp_table, ct, float(ct) / totmut * 100, time.time() - time0))
time0 = time.time()
skip = False
conseqs = set([])
aa_mutations = set([])
mutation_values = None
qry = "select * from %s where icgc_mutation_id='%s' " % (somatic_temp_table, mutation)
qry += "and gene_affected is not null and gene_affected !='' "
ret = search_db (cursor, qry)
if not ret: continue
for fields in ret:
named_field = dict(list(zip(columns,fields)))
if not mutation_values: # we need to set this only once
mutation_values = [quotify(named_field[name]) for name in mutation_columns]
if entry_exists(cursor, "icgc", mutation_table, "icgc_mutation_id", quotify(mutation)):
skip = True
continue
# aa_mutation
aa = named_field['aa_mutation']
if aa and aa!="":
transcript = named_field['transcript_affected']
if not transcript: transcript="unk"
aa_mutations.add("{}:{}".format(transcript,aa))
# consequences
csq = named_field['consequence_type']
if csq in consequence_vocab:
conseqs.add(csq)
elif csq in location_vocab:
if "splice" in csq.lower():
conseqs.add(csq)
elif csq == "":
pass
else:
print("unrecognized consequence field:", csq)
exit()
if skip: continue
if not mutation_values:
print("mutation values not assigned for %s (!?)" % mutation)
exit()
if not chromosome:
print("chromosome not assigned for %s (!?)" % mutation)
exit()
mutation_values.append(quotify(";".join(list(aa_mutations))))
mutation_values.append(quotify(";".join(list(conseqs))))
if len(conseqs&pathogenic)>0:
mutation_values.append("1")
else:
mutation_values.append("0")
# now we are ready to store
insert(cursor, mutation_table, mutation_columns + ['aa_mutation','consequence', 'pathogenicity_estimate'], mutation_values)
#########################################
def reorganize(chromosomes, other_args):
db = connect_to_mysql(Config.mysql_conf_file)
cursor = db.cursor()
switch_to_db(cursor,"icgc")
somatic_temp_tables = other_args[0]
columns = other_args[1]
for chrom in chromosomes:
time0 = time.time()
print("====================")
print("reorganizing mutations on chromosome ", chrom, os.getpid())
for somatic_temp_table in somatic_temp_tables:
reorganize_mutations(cursor, chrom, somatic_temp_table, columns)
time1 = time.time()
print(("\t\t chromosome %s done in %.3f mins" % (chrom, float(time1-time0)/60)), os.getpid())
cursor.close()
db.close()
return
#########################################
#########################################
def main():
print("disabled")
exit()
db = connect_to_mysql(Config.mysql_conf_file)
cursor = db.cursor()
#########################
# which temp somatic tables do we have
qry = "select table_name from information_schema.tables "
qry += "where table_schema='icgc' and table_name like '%simple_somatic_temp'"
tables = [field[0] for field in search_db(cursor,qry)]
# the tables should all have the same columns
qry = "select column_name from information_schema.columns where table_name='%s'"%tables[0]
columns = [field[0] for field in search_db(cursor,qry)]
cursor.close()
db.close()
chromosomes = [str(i) for i in range(1,13)] + ["Y"] + [str(i) for i in range(22,12,-1)] + ["X"]
number_of_chunks = 12 # myISAM does not deadlock
parallelize(number_of_chunks, reorganize, chromosomes, [tables,columns], round_robin=True)
#########################################
if __name__ == '__main__':
main()