-
Notifications
You must be signed in to change notification settings - Fork 4
/
choco_summary.py
executable file
·128 lines (107 loc) · 5.52 KB
/
choco_summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python
import sys
import collections
import utils
import pyphlan as ppa
try:
import argparse as ap
import bz2
except ImportError:
sys.stderr.write( "argparse not found" )
sys.exit(-1)
def read_params( args ):
p = ap.ArgumentParser(description='Profile ChocoPhlAn genes\n')
p.add_argument( '--cscores', required = True, default=None, type=str )
p.add_argument( '--fwmarkers', required = True, default=None, type=str )
p.add_argument( '--maps', required = True, default=None, type=str )
p.add_argument( '--taxonomy', required = True, default=None, type=str )
p.add_argument('--g2t', metavar="Mapping file from genes to taxa", default=None, type=str )
p.add_argument('--t2g', metavar="Mapping file from taxa to genes", default=None, type=str )
p.add_argument('--g2c', metavar="Mapping file from genomes to contigs", required = True, default=None, type=str )
p.add_argument('out', nargs='?', default=None, type=str,
help= "the output summary file\n"
"[stdout if not present]")
return vars( p.parse_args() )
if __name__ == "__main__":
args = read_params( sys.argv )
cscores = collections.defaultdict( set )
fwmarkers = {}
maps = collections.defaultdict( set )
tree = ppa.PpaTree( args['taxonomy'] )
clades2terms = ppa.clades2terms( tree.tree )
clades2taxa = dict([(clade.full_name,set([-int(taxon.name[4:]) for taxon in taxa])) for clade,taxa in clades2terms.items()])
for l in open( args['cscores'] ):
gene_seed, clade, n, n_tot, coreness = l.strip().split('\t')
gene_seed, clade, n, n_tot, coreness = int(gene_seed), clade, int(n), int(n_tot), float(coreness)
cscores[gene_seed].add( (clade, n, n_tot, coreness) )
for i,l in enumerate(open( args['fwmarkers'] )):
taxa_id, gene_seed, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness, ext_taxa = l.strip().split('\t')
taxa_id, gene_seed, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness = int(taxa_id), int(gene_seed), int(n), int(n_tot), float(coreness), int(n_ext_seeds), int(n_ext_taxa), float(uniqueness)
fwmarkers[gene_seed] = (taxa_id, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness)
for l in open( args['maps'] ):
line = l.strip().split('\t')
if len(line) < 2:
continue
fr,to = line
fr,to = int(fr.split("_")[0]), int(to) # can be improved!!!!
maps[fr].add( to )
g2t,g2c,c2g = {},{},{}
if args['g2t']:
with open( args['g2t'] ) as inp:
g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp))
elif args['t2g']:
with open( args['t2g'] ) as inp:
for ll in (l.strip().split('\t') for l in inp):
for g in ll[1:]:
g2t[int(g)] = int(ll[0])
genomes = set([g2t[g] for g in cscores])
with open( args['g2c'] ) as inp:
for l in inp:
line = list(l.strip().split('\t'))
#if int(line[0]) not in genomes:
# continue
vals = [int(a) for a in line if utils.is_number(a)]
if len(vals) > 1:
g2c[vals[0]] = vals[1:]
for g,c in g2c.items():
for cc in c:
c2g[cc] = g
with utils.openw( args['out'] ) as out:
for gene_seed,cscores_t in cscores.items():
taxa = g2t[gene_seed]
for clade, n, n_tot, coreness in cscores_t:
out.write( "\t".join(["CSCORE",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness)]) +"\n" )
# anche sotto ???
if gene_seed in fwmarkers:
taxa_id, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness = fwmarkers[gene_seed]
if uniqueness < 0.01:
out.write( "\t".join(["FWMARKER",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness),
str(n_ext_seeds), str(n_ext_taxa), str(uniqueness)]) +"\n" )
if gene_seed in maps:
ext_tax = set([(c2g[s] if s in c2g else 0) for s in maps[gene_seed]])
ext_tax_ok = ext_tax & clades2taxa[clade]
ext_tax_ko = ext_tax - clades2taxa[clade]
ext_tax_okl = ":".join([str(s) for s in ext_tax_ok])
ext_tax_kol = ":".join([str(s) for s in ext_tax_ko])
out.write( "\t".join(["MARKER",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness),
str(n_ext_seeds), str(n_ext_taxa), str(uniqueness),
str(len(ext_tax)), str(len(ext_tax_ok)), str(len(ext_tax_ko)),
ext_tax_okl, ext_tax_kol]) +"\n" )
"""
gid2cores = collections.defaultdict( set )
with utils.openr(args['cg']) as inp:
for line in (l.split('\t') for l in inp):
if int(line[0]) > 0:
gid,clade,ncore,ngenomes,pv = line[:5]
else:
gid,clade,ncore,ngenomes,pv = line[1:6]
gid2cores[gid].add( (clade,ncore,ngenomes,pv) )
clades2cores = collections.defaultdict( set )
for k,v in gid2cores.items():
if len(v) > 1:
continue
clades2cores[list(v)[0][0]].add( k )
with utils.openw(args['cgs']) as out:
for k,v in sorted(clades2cores.items(),key=lambda x:-len(x[1])):
out.write( "\t".join([k,str(len(v))]) +"\n" )
"""