-
Notifications
You must be signed in to change notification settings - Fork 4
/
05_find_max_field_length.py
executable file
·87 lines (72 loc) · 2.65 KB
/
05_find_max_field_length.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#! /usr/bin/python3
#
# This source code is part of icgc, an ICGC processing pipeline.
#
# Icgc is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Icgc is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Contact: ivana.mihalek@gmail.com
#
# Note: this is really slow, but it is assumed to be run only once, during the installation.
# For ICGC v27, the search for the longest entry in each field group comes up with the following:
# alleles 200
# genotypes 401
# gene_names 15
# aa_mutation 59
# cds_mutation 12
# The longest genotype I see in this set is 401.
# I assume that the allele length cutoff is 200.
# I'll make the allele columns varchar(210) and the genotype columns varchar(430).
import os
from config import Config
#########################################
def get_simple_somatic_tsv_files(data_home_local):
    """Return the paths of all simple-somatic TSV files below a directory.

    The tree rooted at data_home_local is walked recursively with os.walk.
    A file is selected when its name ends with ".tsv" and contains the
    substring 'simple_somatic'. Each returned path is the walked directory
    joined with the file name, in the order os.walk yields them.
    """
    return [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(data_home_local)
        for filename in filenames
        if filename.endswith(".tsv") and 'simple_somatic' in filename
    ]
#########################################
def _update_max_lengths(tsv_path, field_groups, max_length):
    """Scan one TSV file and raise the per-group maxima in max_length in place.

    tsv_path:     path of a tab-separated file whose first line is the header.
    field_groups: dict mapping a group name to the list of column names in it.
    max_length:   dict mapping each group name to the longest value seen so far;
                  updated in place.

    Raises KeyError if a requested column name is not present in the header.
    """
    # the context manager closes the file even if a row raises
    with open(tsv_path, 'r') as infile:
        header_line = next(infile, None)
        if header_line is None:
            return  # completely empty file: no header, nothing to measure
        headers = header_line.rstrip('\n').split('\t')
        # if a header name repeats, the last occurrence wins
        column_of = {name: idx for idx, name in enumerate(headers)}
        # resolve the column positions once per file rather than
        # building a header->value dict for every single row
        columns = [
            (group, column_of[name])
            for group, names in field_groups.items()
            for name in names
        ]
        for line in infile:
            fields = line.rstrip('\n').split('\t')
            n_fields = len(fields)
            for group, idx in columns:
                # blank or truncated rows simply lack some columns; skip those
                if idx < n_fields and len(fields[idx]) > max_length[group]:
                    max_length[group] = len(fields[idx])


def main():
    """Report the longest value found in each group of simple-somatic TSV columns.

    Every file returned by get_simple_somatic_tsv_files(Config.data_home_local)
    is scanned; the file path is printed as the scan of that file starts.
    After all files are processed, one line per field group is printed with the
    group name and the maximum string length seen among the columns of that group.
    """
    tsv_files = get_simple_somatic_tsv_files(Config.data_home_local)
    field_groups = {
        'alleles': ['reference_genome_allele', 'mutated_from_allele', 'mutated_to_allele'],
        # 'control_genotype' used to be listed twice here; the repeat only
        # re-measured the same column and could not change the result
        'genotypes': ['control_genotype', 'tumour_genotype'],
        'gene_names': ['gene_affected', 'transcript_affected'],
        'aa_mutation': ['aa_mutation'],
        'cds_mutation': ['cds_mutation'],
    }
    max_length = dict.fromkeys(field_groups, 0)

    for tf in tsv_files:
        print(tf)
        _update_max_lengths(tf, field_groups, max_length)

    # same order as field_groups, since max_length was created from its keys
    for field_group, longest in max_length.items():
        print(field_group, longest)
#########################################
# Run the scan only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()