-
Notifications
You must be signed in to change notification settings - Fork 0
/
spliced_triplexating.py
executable file
·118 lines (106 loc) · 3.57 KB
/
spliced_triplexating.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/python3
from collections import defaultdict
import subprocess
ncRNA = {'Mt_rRNA',
'Mt_tRNA',
'miRNA',
'misc_RNA',
'rRNA',
'scRNA',
'snRNA',
'snoRNA',
'ribozyme',
'sRNA',
'scaRNA'}
lncRNA = {'lincRNA'}
pr_coding = {'protein_coding',
'IG_X',
'TR_X'}
def load(name):
with open(name, 'r') as f:
data = f.readlines()
result = []
rev = {}
for i in range(0, len(data), 2):
rev[data[i][1:].strip()] = len(result)
result.append((data[i][1:].strip(), data[i + 1].strip()))
return result, rev
rna, rna_rev = load('spliced_rnas.fa')
dna, dna_rev = load('.work/dnas.fa')
res_ncRNA = open('ncRNAs.stats', 'w')
res_lncRNA = open('lncRNAs.stats', 'w')
res_prot_c = open('prot_c.stats', 'w')
res_others = open('others.stats', 'w')
res_none = open('none.stats', 'w')
graph = defaultdict(list)
values = defaultdict(float)
g_num = defaultdict(int)
total = 0
with open('.work/GSM2396700_mESCgraph.bed', 'r') as f:
for line in f.readlines():
r, d, value, g = line.strip().split(' ')
r = int(r) - 1
d = int(d) - 1
graph[r].append(d)
values[(r,d)] = float(value)
g_num[(r,d)] = int(g)
total += 1
count = 0
for r in graph:
if rna[r][0] != "":
tmp = rna[r][0].strip().split('|');
rna_name = tmp[1]
rna_type = tmp[2]
if rna_type in ncRNA:
result = res_ncRNA
elif rna_type in lncRNA:
result = res_lncRNA
elif rna_type in pr_coding:
result = res_prot_c
else:
result = res_others
with open('.work3/rna.temp', 'w') as f:
print(">%s" % rna[r][0], file = f)
print(rna[r][1], file = f)
with open('.work3/dna.temp', 'w') as f:
for d in graph[r]:
print(">%s" % dna[d][0], file = f)
print("%s" % dna[d][1], file = f)
count += 1
subprocess.run(["./triplexator-1.3.2-Linux/bin/triplexator", "-rm", "2", "-p", "4", "-l", "12", "-o", ".work3/res.temp", "-ss", ".work3/rna.temp", "-ds", ".work3/dna.temp"])
with open(".work3/res.temp.summary") as f:
data = f.readlines()
triplexes = 0
norm_res1 = 1
norm_res2 = 1
norm_res3 = 1
grid_norm_res1 = 1
grid_norm_res2 = 1
grid_norm_res3 = 1
for line in data[1:]: # skip headers
tokens = line.split()
# chr1:158318000-158318999 chrX:159198334-159217024 22 4.02e-08 0 0 0 0 22 4.02e-08
d = dna_rev[tokens[0]]
r = rna_rev[tokens[1]]
gr_num = g_num[(r,d)]
if gr_num == 1:
grid_norm_res1 *= 1 - values[(r, d)] * float(tokens[3])
norm_res1 *= 1 - float(tokens[3])
elif gr_num == 2:
grid_norm_res2 *= 1 - values[(r, d)] * float(tokens[3])
norm_res2 *= 1 - float(tokens[3])
if gr_num == 3:
grid_norm_res3 *= 1 - values[(r, d)] * float(tokens[3])
norm_res3 *= 1 - float(tokens[3])
triplexes += int(tokens[2])
print("%d %s %.10e %.10e | %.10e %.10e | %.10e %.10e" % (r + 1, rna_name,
1 - grid_norm_res3, 1 - norm_res3,
1 - grid_norm_res2, 1 - norm_res2,
1 - grid_norm_res1, 1 - norm_res1), file = result)
print("%d/%d" % (count, total))
else:
print(r, file = res_none)
res_ncRNA.close()
res_lncRNA.close()
res_prot_c.close()
res_others.close()