-
Notifications
You must be signed in to change notification settings - Fork 0
/
rMATS_preptable.py
159 lines (125 loc) · 6.33 KB
/
rMATS_preptable.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#Usage: python rMATS_preptable.py <rMATSoutputtable>
import pandas as pd
import os
import sys
from Bio import SeqIO
import gzip
def preptable(table):
#Want to do a little bit of prep on a rMATS output table. Add in some columns, split others.
#Read in table
df = pd.read_table(table)
#Split replicate psi values so they have their own columns
#This splits the column delimited replicate values so that each one has its own column
#df = df.join(df['IncLevel1'].str.split(',', expand = True).rename(columns = {0: 'S1.Rep1', 1: 'S1.Rep2', 2: 'S1.Rep3'}))
#df = df.join(df['IncLevel2'].str.split(',', expand = True).rename(columns = {0: 'S2.Rep1', 1: 'S2.Rep2', 2: 'S2.Rep3'}))
#IDs are not stable from run to run so we have to make a new one
eventIDs = []
for index, row in df.iterrows():
chrm = row['chr']
strand = row['strand']
exonstart = str(row['exonStart_0base'])
exonend = str(row['exonEnd'])
upstreamexonstart = str(row['upstreamES'])
upstreamexonend = str(row['upstreamEE'])
downstreamexonstart = str(row['downstreamES'])
downstreamexonend = str(row['downstreamEE'])
eventID = chrm + ':' + strand + ':' + upstreamexonstart + '-' + upstreamexonend + ':' + exonstart + '-' + exonend + ':' + downstreamexonstart + '-' + downstreamexonend
eventIDs.append(eventID)
df['eventID'] = eventIDs
#Add in the total number of reads (inc isoform + exc isoform) for each replicate
#Then add a column that says whether or not each replicate passes a read filter threshold
s1readcounts = []
s2readcounts = []
readcountpass = []
for index, row in df.iterrows():
s1Inc = row['IJC_SAMPLE_1'].split(',')
s1Sk = row['SJC_SAMPLE_1'].split(',')
s2Inc = row['IJC_SAMPLE_2'].split(',')
s2Sk = row['SJC_SAMPLE_2'].split(',')
#Turn them into integers
s1Inc = [int(x) for x in s1Inc]
s1Sk = [int(x) for x in s1Sk]
s2Inc = [int(x) for x in s2Inc]
s2Sk = [int(x) for x in s2Sk]
s1reads = [] #[rep1readcount, rep2readcount, rep3readcount] readcount = inclusion reads + exclusion reads
s2reads = [] #[rep1readcount, rep2readcount, rep3readcount]
for i in range(len(s1Inc)):
s1reads.append(s1Inc[i] + s1Sk[i])
for i in range(len(s2Inc)):
s2reads.append(s2Inc[i] + s2Sk[i])
#We need at least 20 reads in every replicate of both samples for this quantification to be believable
if all(x >= 20 for x in s1reads) and all(x >= 20 for x in s2reads):
readcountpass.append('pass')
else:
readcountpass.append('fail')
#Turn them into strings
s1reads = [str(x) for x in s1reads]
s2reads = [str(x) for x in s2reads]
s1readcounts.append((',').join(s1reads))
s2readcounts.append((',').join(s2reads))
df['s1reads'] = s1readcounts
df['s2reads'] = s2readcounts
df['readcountpass'] = readcountpass
outfile = os.path.splitext(table)[0] + '.prepped.txt'
df.to_csv(outfile, sep = '\t', index = False)
#Given a rMATS output table, get relevant sequences surrounding skipped exons
def getSequences(table, genomefasta):
#Index genome
print 'Indexing genome...'
seq_dict = SeqIO.to_dict(SeqIO.parse(gzip.open(genomefasta), 'fasta'))
print 'Done indexing!'
seqs = {}
for eventclass in ['moreincluded', 'lessincluded', 'control']:
seqs[eventclass] = {} #{exonid : {upstream : seq, downstream : seq, exon: seq}}
#Go through table
df = pd.read_table(table)
for index, row in df.iterrows():
#if row['FDR'] < 0.05 and row['readcountpass'] == 'pass' and row['IncLevelDifference'] >= 0.05:
if row['FDR'] < 0.05 and row['IncLevelDifference'] > 0:
eventclass = 'moreincluded'
#elif row['FDR'] < 0.05 and row['readcountpass'] == 'pass' and row['IncLevelDifference'] <= -0.05:
elif row['FDR'] < 0.05 and row['IncLevelDifference'] < 0:
eventclass = 'lessincluded'
#elif row['FDR'] > 0.05 and row['readcountpass'] == 'pass' and abs(row['IncLevelDifference']) <= 0.05:
elif row['FDR'] > 0.05:
eventclass = 'control'
else:
continue
ID = row['ID']
geneid = row['GeneID']
exonid = geneid + '_' + str(ID)
chrm = row['chr']
strand = row['strand']
exonstart = row['exonStart_0base']
exonend = row['exonEnd']
if strand == '+':
exonseq = seq_dict[chrm].seq[exonstart : exonend]
upstreamseq = seq_dict[chrm].seq[exonstart - 100 : exonstart]
downstreamseq = seq_dict[chrm].seq[exonend : exonend + 100]
elif strand == '-':
exonseq = seq_dict[chrm].seq[exonstart : exonend].reverse_complement()
downstreamseq = seq_dict[chrm].seq[exonstart - 100 : exonstart].reverse_complement()
upstreamseq = seq_dict[chrm].seq[exonend: exonend + 100].reverse_complement()
#Add seq to dictionary
seqs[eventclass][exonid] = {}
seqs[eventclass][exonid]['upstream'] = str(upstreamseq).upper()
seqs[eventclass][exonid]['downstream'] = str(downstreamseq).upper()
seqs[eventclass][exonid]['exon'] = str(exonseq).upper()
print 'Found sequences for {0} more included exons, {1} less included exons, and {2} control exons.'.format(len(seqs['moreincluded']), len(seqs['lessincluded']), len(seqs['control']))
with open('Control_upstream.fasta', 'w') as upstreamfh, open('Control_exon.fasta', 'w') as exonfh, open('Control_downstream.fasta', 'w') as downstreamfh:
for exonid in seqs['control']:
upstreamfh.write('>' + exonid + '\n' + seqs['control'][exonid]['upstream'] + '\n')
exonfh.write('>' + exonid + '\n' + seqs['control'][exonid]['exon'] + '\n')
downstreamfh.write('>' + exonid + '\n' + seqs['control'][exonid]['downstream'] + '\n')
with open('MoreIncluded_upstream.fasta', 'w') as upstreamfh, open('MoreIncluded_exon.fasta', 'w') as exonfh, open('MoreIncluded_downstream.fasta', 'w') as downstreamfh:
for exonid in seqs['moreincluded']:
upstreamfh.write('>' + exonid + '\n' + seqs['moreincluded'][exonid]['upstream'] + '\n')
exonfh.write('>' + exonid + '\n' + seqs['moreincluded'][exonid]['exon'] + '\n')
downstreamfh.write('>' + exonid + '\n' + seqs['moreincluded'][exonid]['downstream'] + '\n')
with open('LessIncluded_upstream.fasta', 'w') as upstreamfh, open('LessIncluded_exon.fasta', 'w') as exonfh, open('LessIncluded_downstream.fasta', 'w') as downstreamfh:
for exonid in seqs['lessincluded']:
upstreamfh.write('>' + exonid + '\n' + seqs['lessincluded'][exonid]['upstream'] + '\n')
exonfh.write('>' + exonid + '\n' + seqs['lessincluded'][exonid]['exon'] + '\n')
downstreamfh.write('>' + exonid + '\n' + seqs['lessincluded'][exonid]['downstream'] + '\n')
preptable(sys.argv[1])
#getSequences(sys.argv[1], sys.argv[2])