-
Notifications
You must be signed in to change notification settings - Fork 2
/
condensed.py
98 lines (89 loc) · 3.64 KB
/
condensed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.patches import Rectangle
from extract_arrays import extract_arrays, get_refseq, create_random_arrays
def plot_rects(ax, x, i, h47r_p, wt_p, shift):
"""
Plot linked rectangles (i.e., copies).
"""
ax.add_patch(
Rectangle(
(x + shift, i),
h47r_p,
0.5,
facecolor='darkblue',
edgecolor='k',
lw=1))
ax.add_patch(
Rectangle(
(x + h47r_p + shift, i),
wt_p,
0.5,
facecolor='w',
edgecolor='k',
lw=1))
def run(args):
# Figure formatting.
sns.set_style("white")
f, ax = plt.subplots()
sns.despine(ax=ax, top=True, right=True, left=True, bottom=True)
ax.set_xlim([1, (args.cn * 1.5)])
ax.set_ylim([0.5, (args.cn * 1.2)])
ax.set(yticklabels=[], xticklabels=[])
refseq = get_refseq(args.ref)
data = extract_arrays(args.bam, refseq, copy_filter='hard')
arrays = data.arrays
af = float(sum([sum(a) for a in arrays])) / sum([len(a) for a in arrays])
# Create a population with randomly distributed H47R alleles if specified.
if args.rand:
arrays = create_random_arrays(arrays, max_cn=args.cn, af=af)
for cn in range(1, args.cn + 1):
matrix = [array for array in arrays if len(array) == cn]
if args.mixed_only:
if cn == 1: continue
matrix = [array for array in matrix if len(set(array)) == 2]
matrix = sorted(matrix)
m = np.array(matrix)
# Total number of arrays with the specified copy number.
total_arrays = float(m.shape[0])
# Among all arrays of the given copy number, how many H47R alleles
# are observed in the first copy, second copy, etc.?
h47r_copies = np.sum(m, axis=0)
# Convert these numbers to proportions at each copy.
h47r_proportions = [x / total_arrays for x in h47r_copies]
copy_count = 1
for h47r_p in h47r_proportions:
shift = copy_count * 0.2
wt_p = 1 - h47r_p
plot_rects(ax, copy_count, cn, h47r_p, wt_p, shift)
# Overly specific parameters to plot lines in between rectangles
# (i.e., copies) and summary text for each copy number.
if copy_count < cn:
plt.plot([copy_count + h47r_p + wt_p + shift,
copy_count + h47r_p + wt_p + shift + 0.2],
[cn + 0.25, cn + 0.25], '-', c='k', lw=0.75)
if copy_count == cn:
plt.text(copy_count + h47r_p + wt_p + shift + 0.2,
cn + 0.15, str(int(total_arrays)), style='italic')
copy_count += 1
if args.png:
plt.savefig(args.o + '.png', dpi=300)
else:
plt.savefig(args.o + '.eps')
def main(argv):
import argparse
p = argparse.ArgumentParser()
p.add_argument("--bam", required = True, help='Path to sorted BAM file.')
p.add_argument("--ref", required = True, help='Path to FASTA reference genome.')
p.add_argument("-mixed_only", help="Only plot mixed arrays", action="store_true")
p.add_argument("-rand", help='Plot random distribution of alleles', action='store_true')
p.add_argument("-cn", type=int, help='Plot K3L arrays with up to this many copies of the specified region. (default = 5)', default=5)
p.add_argument("-png", help='Output as PNG.', action='store_true')
p.add_argument("-o", help='Name of output plot.', default='condensed')
run(p.parse_args(argv))
if __name__ == "__main__":
import sys
main(sys.argv[1:])