-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathgcContPlot.py
91 lines (77 loc) · 2.47 KB
/
gcContPlot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from annotateOffs import *
from collections import defaultdict
import operator
import matplotlib
matplotlib.use('Agg')
matplotlib.rcParams['pdf.fonttype'] = 42
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.backends.backend_pdf as pltBack
def plot(fname):
    """Draw a per-study scatter plot of GC content vs. off-target count.

    Reads the whitespace-delimited table *fname*, whose header line names
    the columns guideName, gcContent and offtargetCount. Rows are grouped
    by study, i.e. the part of the guide name before the first underscore,
    and every study gets its own colour/marker and a legend entry.

    The figure is written to "gcCont.pdf" and "gcCont.png" in the current
    working directory; nothing is returned.
    """
    outfname = "gcCont" + '.pdf'

    # names=True takes the field names from the header line of the file.
    data = np.genfromtxt(fname, names=True, dtype=np.dtype([('name', 'S20'), ('gcContent', 'f8'), ('offtargetCount', 'f8')]))
    # A table with a single data row comes back as a 0-d array, which
    # cannot be iterated; promote it to 1-d.
    data = np.atleast_1d(data)

    studyX = defaultdict(list)
    studyY = defaultdict(list)
    for row in data:
        guideName = row["guideName"]
        # The 'S20' column holds byte strings; on Python 3 they have to be
        # decoded before they can be split with a text separator.
        if not isinstance(guideName, str):
            guideName = guideName.decode("utf-8")
        study = guideName.split("_")[0]
        studyX[study].append(row["gcContent"])
        studyY[study].append(row["offtargetCount"])

    colors = ["green", "blue", "black", "yellow", "red", "grey", "violet", "lightblue"]
    markers = ["o", "s", "+", ">", "<", "^", "o", "+"]

    # Only one figure is created: the original also built a 5x5in/300dpi
    # figure first, but it was replaced before anything was drawn on it.
    fig = plt.figure()

    studyNames = []
    figs = []
    # Sorted so that a given study always gets the same colour and marker,
    # independent of dictionary ordering.
    for i, study in enumerate(sorted(studyX)):
        # Wrap around instead of failing when there are more studies than
        # styles.
        studyFig = plt.scatter(studyX[study], studyY[study],
                               alpha=.5,
                               marker=markers[i % len(markers)],
                               s=60,
                               color=colors[i % len(colors)])
        figs.append(studyFig)
        studyNames.append(study)

    plt.legend(figs,
               studyNames,
               scatterpoints=1,
               loc='upper left',
               ncol=3,
               fontsize=10)
    plt.xlabel("GC content")
    plt.ylabel("Number of off-targets")

    pdf = pltBack.PdfPages(outfname)
    try:
        fig.savefig(pdf, format='pdf')
    finally:
        # Always finalize the PDF, even if rendering failed.
        pdf.close()
    fig.savefig("gcCont.png")
    plt.close(fig)  # release the figure held by pyplot

    print("Wrote %s" % outfname)
def main():
    """Tabulate GC content and off-target count per guide, then plot them.

    Reads "offtargets.tsv" from the current working directory, writes one
    line per guide to "gcCont.tsv" (columns guideName, gcContent,
    offtargetCount, ordered by ascending GC content) and hands that file
    to plot().

    Raises:
        KeyError: if a guide has no row of type "on-target", since that row
            supplies the sequence the GC content is computed from.
    """
    # parse offtargets.tsv
    offsByName = defaultdict(list)
    targetSeqs = dict()
    for row in iterTsvRows("offtargets.tsv"):
        guideName = row.name.split("/")[0]  # remove cell type
        offsByName[guideName].append(row)
        if row.type == "on-target":
            targetSeqs[guideName] = row.seq

    rows = []
    for name, offs in offsByName.items():
        if name not in targetSeqs:
            raise KeyError("no on-target row for guide %r in offtargets.tsv" % name)
        # NOTE(review): len(offs) counts every row of the guide, including
        # the on-target row itself -- confirm that is the intended count.
        rows.append((name, round(gcCont(targetSeqs[name])), len(offs)))

    # Sort on the numeric GC value. Sorting the already-stringified value
    # would order the rows lexicographically (e.g. "100.0" before "45.0").
    rows.sort(key=operator.itemgetter(1))

    # write to tsv file
    fname = "gcCont.tsv"
    headers = ["guideName", "gcContent", "offtargetCount"]
    with open(fname, "w") as ofh:
        ofh.write("\t".join(headers))
        ofh.write("\n")
        for name, gcValue, offCount in rows:
            ofh.write("\t".join([name, str(gcValue), str(offCount)]))
            ofh.write("\n")

    plot(fname)
# Script entry point: run the whole pipeline. There is no __main__ guard, so
# this also executes when the file is imported.
main()