-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdwgsim.py
97 lines (87 loc) · 4.56 KB
/
dwgsim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
import subprocess
import random
import multiprocessing
from config import *
from bokeh.plotting import figure, show
def create_illumina_reads_dwgsim(sequenced_genome_path, reads_folder, num_reads, name, read_length, error_factor=1):
print("\tdwgsim...")
dwgsim_instances = []
file_names1 = ""
file_names2 = ""
reads1 = reads_folder + name + ".bwa.read1.fastq.gz"
reads2 = reads_folder + name + ".bwa.read2.fastq.gz"
# we actually need to seed dwgsim otherwise each instance will create the same data (it uses the current time)
seed = random.randrange(0, 2**6)
num_instances = 1#32
for idx in range(num_instances):
command = [dwgsim_str, "-1", str(read_length), "-2", str(read_length), "-N",
str(int(num_reads/num_instances)), "-r", str(0.0010*error_factor), "-o", "1", "-z",
str(seed), sequenced_genome_path,
reads_folder + "part_" + str(idx) + name]
dwgsim_instances.append(subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE))
seed += 42 # increment the seed so that we have some different number every time
file_names1 += reads_folder + "part_" + str(idx) + name + ".bwa.read1.fastq.gz "
file_names2 += reads_folder + "part_" + str(idx) + name + ".bwa.read2.fastq.gz "
for proc in dwgsim_instances:
proc.wait()
print("\tcat...")
os.system("zcat " + file_names1 + " > " + reads1)
os.system("zcat " + file_names2 + " > " + reads2)
def gen_survivor_error_profile(out_file_name, in_file_name=survivor_error_profile, p_mod=lambda x: x):
with open(os.path.expanduser(in_file_name)) as in_file:
lines = in_file.readlines()[1:]
pos_list = [int(line.split("\t")[0]) for line in lines]
p_stop = [float(line.split("\t")[1]) for line in lines]
p_match = [float(line.split("\t")[2]) for line in lines]
p_missmatch = [float(line.split("\t")[3]) for line in lines]
p_ins = [float(line.split("\t")[4]) for line in lines]
p_del = [float(line.split("\t")[5]) for line in lines]
with open(out_file_name, "w") as out_file:
out_file.write("Pos\tP(stop)\tP(match)\tp(mismatch)\tP(ins)]\tP(del)\n")
for pos in pos_list:
p_stop_val = p_stop[pos]
p_missmatch_val = p_mod(p_missmatch[pos])
p_match_val = 1 - p_mod(1 - p_match[pos])
p_ins_val = p_mod(p_ins[pos])
p_del_val = p_mod(p_del[pos])
out_file.write(str(pos) + "\t" + str(p_stop_val) + "\t" + str(p_match_val) + "\t" + str(p_missmatch_val) +
"\t" + str(p_ins_val) + "\t" + str(p_del_val) + "\n")
def gen_survivor_error_profile_fac(out_file_name, fac=1, survivor_error_profile=survivor_error_profile):
gen_survivor_error_profile(out_file_name + "_" + str(fac) + ".txt",
survivor_error_profile,
p_mod=lambda x: min(1,x*fac))
def create_reads_survivor(sequenced_genome_path, reads_folder, num_reads, name, error_profile):
print("\tsurvivor...")
reads1 = reads_folder + name + ".fasta"
command = survivor_str + sequenced_genome_path + " " + error_profile + " " + str(num_reads) + " " \
+ reads1 + " >/dev/null 2>&1"
p = multiprocessing.Process(target=os.system, name="survivor command", args=(command,))
p.daemon = True # kill p when this process dies...
p.start()
p.join(30)
if p.is_alive():
print("WARNING: survivor is running for 30 secs. It seems to be stuck; killing it now.")
p.terminate()
p.join()
def plot_error_profile(in_file_name):
plot = figure(title=in_file_name)
with open(os.path.expanduser(in_file_name)) as in_file:
x = []
ys = {"stop":[], "match": [], "ins": [], "del": []}
for line in in_file.readlines()[1:]:
pos, stop, match, _, ins, del_ = [float(x) for x in line.split("\t")]
x.append(pos)
ys["stop"].append(stop)
ys["match"].append(match)
ys["ins"].append(ins)
ys["del"].append(del_)
plot.line(x=x, y=ys["stop"], line_color="black", legend_label="stop")
plot.line(x=x, y=ys["match"], line_color="red", legend_label="match")
plot.line(x=x, y=ys["ins"], line_color="yellow", legend_label="ins")
plot.line(x=x, y=ys["del"], line_color="green", legend_label="del")
show(plot)
if __name__ == "__main__":
#plot_error_profile(survivor_error_profile)
gen_survivor_error_profile_fac("test_error_profile", 1 )
plot_error_profile("test_error_profile_1.txt")