-
Notifications
You must be signed in to change notification settings - Fork 8
/
simprily.py
221 lines (171 loc) · 10.1 KB
/
simprily.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#!/usr/local/bin/python
from collections import OrderedDict
from sys import argv
import numpy as np
import os
from alleles_generator.bit_structure import set_seq_bits, set_discovery_bits, set_panel_bits
from alleles_generator.macs_file import AllelesMacsFile
from alleles_generator.seqInfo import create_sequences
from ascertainment.asc_tools import set_asc_bits, make_ped_file, make_map_file, get_SNP_sites
from ascertainment.pseudo_array import pseudo_array_bits
from main_tools.housekeeping import process_args, debugPrint, profile
from main_tools.write_files import create_sim_directories, write_sim_results_file
from processInput import process_input_files
from simulation.run_sim import run_macs
from simulation.sim_tools import get_sim_positions, get_sim_positions_old
from summary_statistics import stat_tools
from summary_statistics.germline_tools import run_germline, process_germline_file
verbos = 0
def main(args):
chr_number = 1
# Use dictionary keys instead of index keys for args
args = process_args(args)
job = str(args['job']) # must be a number
print('JOB {}'.format(job))
prof_option = args['profile']
sim_option = args['sim option']
path = args['path']
[sim_data_dir, germline_out_dir, sim_results_dir] = create_sim_directories(path)
processedData = process_input_files(args['param file'], args['model file'], args)
using_pseudo_array = True
if not processedData.get('discovery') and not processedData.get('sample') and not processedData.get('daf'):
using_pseudo_array = False
debugPrint(3, "Finished processing input\nprocessedData: ", processedData)
### Create a list of Sequence class instances. These will contain the bulk of all sequence-based data
sequences = create_sequences(processedData)
names = [seq.name for seq in sequences]
n_d = sum([1 for seq in sequences if seq.type == 'discovery'])
debugPrint(1,'name\ttotal\tpanel\tgenotyped')
for seq in sequences:
debugPrint(1,'{}\t{}\t{}\t{}'.format(seq.name, seq.tot, seq.panel, seq.genotyped))
total = sum([seq.tot for seq in sequences])
debugPrint(1, 'total samples: {}'.format(sum([seq.genotyped for seq in sequences if seq.type=='discovery'] + [seq.tot for seq in sequences if seq.type=='sample'])))
### Define simulation size
length = processedData['length']
debugPrint(1, 'Perform simulation and get sequences')
pedmap = args['pedmap']
germline = args['germline']
##########################################################################
################## Perform simulation and get sequences ##################
##########################################################################
### Flag to check if the simulation works
SNPs_exceed_available_sites = True
while SNPs_exceed_available_sites:
# add genetic map to macs_args list
macs_args = []
macs_args = processedData['macs_args']
if sim_option == 'macs':
### Run macs and make bitarray
profile(prof_option, path, job, "start_run_macs")
[sequences,position] = run_macs(macs_args, sequences)
profile(prof_option, path, job, "end_run_macs")
nbss = len(sequences[0].bits) / (sequences[0].tot)
if using_pseudo_array:
## get position of the simulated sites and scale it to the "real" position in the SNP chip
sim_positions = get_sim_positions(position, nbss, length)
elif sim_option == 'macs_file':
### Using a static sim output rather than generating from seed
seq_alleles = AllelesMacsFile('tests/test_data/sites1000000.txt')
set_seq_bits(sequences, seq_alleles)
nbss = len(sequences[0].bits) / (sequences[0].tot)
if using_pseudo_array:
## get position of the simulated sites and scale it to the "real" position in the SNP chip
sim_positions = get_sim_positions_old(seq_alleles, nbss, length)
profile(prof_option, path, job, "start_set_discovery_bits")
set_discovery_bits(sequences)
profile(prof_option, path, job, "end_set_discovery_bits")
debugPrint(1, 'Number of sites in simulation: {}'.format(nbss))
assert nbss > 10, "Number of sites is less than 10: {}".format(nbss)
##########################################################################
### Create pseudo array according to ascertainment scheme and template ###
##########################################################################
if using_pseudo_array:
SNPs = get_SNP_sites(args['SNP file'])
debugPrint(1, 'Number of SNPs in Array: {}'.format(len(SNPs)))
profile(prof_option, path, job, "start_set_panel_bits")
asc_panel_bits = set_panel_bits(nbss, sequences)
profile(prof_option, path, job, "end_set_panel_bits")
debugPrint(1,'Number of chromosomes in asc_panel: {}'.format(asc_panel_bits.length()/nbss))
### Get pseudo array sites
debugPrint(2,'Making pseudo array')
profile(prof_option, path, job, "start_pseudo_array_bits")
[pos_asc, nbss_asc, avail_site_indices, avail_sites] = pseudo_array_bits(asc_panel_bits, processedData['daf'], sim_positions, SNPs)
profile(prof_option, path, job, "end_pseudo_array_bits")
nb_avail_sites = len(avail_sites)
SNPs_exceed_available_sites = ( len(SNPs) >= nb_avail_sites )
else:
SNPs = []
SNPs_exceed_available_sites = False
if using_pseudo_array:
profile(prof_option, path, job, "start_set_asc_bits")
set_asc_bits(sequences, nbss_asc, pos_asc, avail_site_indices)
profile(prof_option, path, job, "end_set_asc_bits")
debugPrint(1, 'Calculating summary statistics')
##########################################################################
###################### Calculate summary statistics ######################
##########################################################################
res, head = [], []
### Calculate summary stats from genomes
if nbss > 0: # Simulations must contain at least one segregating site
profile(prof_option, path, job, "start_store_segregating_site_stats")
stat_tools.store_segregating_site_stats(sequences, res, head)
profile(prof_option, path, job, "end_store_segregating_site_stats")
profile(prof_option, path, job, "start_store_pairwise_FSTs")
stat_tools.store_pairwise_FSTs(sequences, n_d, res, head)
profile(prof_option, path, job, "end_store_pairwise_FSTs")
### Calculate summary stats from the ascertained SNPs
if using_pseudo_array:
if nbss_asc > 0:
profile(prof_option, path, job, "start_store_array_segregating_site_stats")
stat_tools.store_array_segregating_site_stats(sequences, res, head)
profile(prof_option, path, job, "end_store_array_segregating_site_stats")
profile(prof_option, path, job, "start_store_array_FSTs")
stat_tools.store_array_FSTs(sequences, res, head)
profile(prof_option, path, job, "end_store_array_FSTs")
debugPrint(2,'Making ped and map files')
ped_file_name = '{0}/macs_asc_{1}_chr{2}.ped'.format(sim_data_dir, job, str(chr_number))
map_file_name = '{0}/macs_asc_{1}_chr{2}.map'.format(sim_data_dir, job, str(chr_number))
out_file_name = '{0}/macs_asc_{1}_chr{2}'.format(germline_out_dir, job, str(chr_number))
if os.path.isfile(out_file_name + '.match'): # Maybe remove if statement
os.remove(ped_file_name)
os.remove(map_file_name)
if using_pseudo_array and pedmap or germline:
profile(prof_option, path, job, "start_make_ped_file")
make_ped_file(ped_file_name, sequences)
profile(prof_option, path, job, "end_make_ped_file")
profile(prof_option, path, job, "start_make_map_file")
make_map_file(map_file_name, pos_asc, chr_number, avail_sites)
profile(prof_option, path, job, "end_make_map_file")
### Use Germline to find IBD on pseduo array ped and map files
do_i_run_germline = int(args['germline'])
debugPrint(1,'run germline? {}'.format("True" if do_i_run_germline else "False"))
if (do_i_run_germline == True):
########################### <CHANGE THIS LATER> ###########################
### Germline seems to be outputting in the wrong unit - so I am putting the min at 3000000 so that it is 3Mb, but should be the default.
profile(prof_option, path, job, "start_run_germline")
# germline = run_germline(ped_file_name, map_file_name, out_file_name, min_m = 3000000)
profile(prof_option, path, job, "end_run_germline")
germline = run_germline(ped_file_name, map_file_name, out_file_name, min_m=300)
########################### </CHANGE THIS LATER> ##########################
### Get IBD stats from Germline output
if os.path.isfile(out_file_name + '.match'):
print('Reading Germline IBD output')
profile(prof_option, path, job, "start_process_germline_file")
[IBD_pairs, IBD_dict] = process_germline_file(out_file_name, names)
profile(prof_option, path, job, "end_process_germline_file")
print('Calculating summary stats')
stats = OrderedDict([('num', len), ('mean', np.mean), ('med', np.median), ('var', np.var)])
profile(prof_option, path, job, "start_store_IBD_stats")
stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head)
stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head, min_val=30)
profile(prof_option, path, job, "end_store_IBD_stats")
debugPrint(1,'finished calculating ss')
write_sim_results_file(sim_results_dir, job, processedData['param_dict'], res, head)
print('')
print('#########################')
print('### PROGRAM COMPLETED ###')
print('#########################')
print('')
profile(prof_option, path, job, "COMPLETE")
if __name__ == '__main__':
main(argv)