# SIMPA.py (forked from salbrec/SIMPA)
import pandas as pd
import numpy as np
from sys import argv, stdout
import os
import pickle
import argparse
from sklearn.ensemble import RandomForestClassifier
# initializing MPI
from mpi4py import MPI
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
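# rank 0 performs the pre- and post-processing; the model training below is
# split across all ranks and the results are gathered back on rank 0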
import utils.pp as pp
import utils.utils as utils
simpa_dir = './'
if argv[0].find('/') >= 0:
simpa_dir = argv[0][: - argv[0][::-1].find('/')]
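# derive the directory of this script from argv[0] so the bundled data files
# can be located when SIMPA is called from a different working directory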
parser = argparse.ArgumentParser(description='SIMPA - Single-cell chIp-seq iMPutAtion')
parser.add_argument('--bed', '-b', type=str, required=True, help='Path to bed file with sparse single-cell input')
parser.add_argument('--targets', '-t', type=str, required=True, help='''Target(s) defining the specific reference experiments
(usually the target used in the scChIP-seq experiment). When multiple targets are provided, separate them by "+"''')
parser.add_argument('--outdir', '-o', type=str, default='./', help='Output directory. Default: "./"')
parser.add_argument('--genome', '-g', type=str, default='hg38', choices=['hg38'], help='Genome assembly')
parser.add_argument('--binsize', '-bs', type=str, default='5kb', choices=['5kb','50kb'], help='Size of the bins (genomic regions)')
parser.add_argument('--estimators', '-e', type=int, default=100, help='Number of trees in Random Forest')
parser.add_argument('--simulate', action='store_true', help='Impute only 100 bins for testing the software')
# parse and pre-process command line arguments
args = parser.parse_args()
bin_size_int = int(args.binsize.replace('bp', '').replace('kb', '000'))
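# e.g. '5kb' -> 5000 and '50kb' -> 50000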
ENCODE_dir = '%sdata/ENCODE/%s/%s/'%(simpa_dir, args.genome, args.binsize)
target_set = set(args.targets.split('+'))
# initialize variables used by all ranks
sc_bins = None
imp_prob_all = None
dimensions = None
training_features = None
uniq_cand_labels = None
candidates = None
freq_map = None
ref_n_bins = None
bin_bed_map = None
# used to map candidate bins to their unique class-label columns
bin_index_map = None
key_index_map = None
if rank == 0:
# initialize variables needed to read in the sparse single-cell input
# that is converted into a set (or list) of bins
allowed_chroms = utils.get_allowed_chrom_str()
chrom_sizes = utils.get_chrom_sizes('%sdata/chromosome_sizes/%s/sizes.tsv'%(simpa_dir, args.genome))
peaks = pp.get_peaks(args.bed, allowed_chroms, enrich_index=-1)
sc_bins, sc_bin_value, max_bin_ID, bin_bed_map = pp.bin_it(peaks, allowed_chroms,
chrom_sizes, bin_size_int)
sc_bins = sorted(list(sc_bins))
print('\nGiven the sparse input, %d genomic regions were converted into %d bins of size %s'%(
sum([len(p) for p in peaks.values()]), len(sc_bins), args.binsize))
# get reference experiments for given target(s)
metadata = pd.read_csv('%sdata/metadata_ENCODE.tsv'%(simpa_dir), sep='\t')
metadata = metadata.loc[metadata['target'].isin(target_set)]
print('Number of available bulk reference experiments: %d (for %s)'%(metadata.shape[0],args.targets))
# read in the reference experiments, already converted into bins
# collect also all bins that are observed in at least one reference experiment
all_ref_bins = set()
ref_bins_map = {}
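# each pickled file stores the set of bin IDs observed in one reference experiment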
for accession in metadata['accession']:
ref_experiment_bins = pickle.load(open(ENCODE_dir+accession, 'rb'))
all_ref_bins = all_ref_bins | ref_experiment_bins
ref_bins_map[accession] = ref_experiment_bins
# calculate the frequencies: how often is a particular bin present across all the
# reference experiments
print('Number of bins with a signal in at least one bulk:', len(all_ref_bins))
freq_map = {}
for bid in all_ref_bins:
freq = sum(bid in bin_set for bin_set in ref_bins_map.values())
freq = float(freq) / float(metadata.shape[0])
freq_map[bid] = freq
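# freq_map[bid] is the fraction of reference experiments in which bin bid occurs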
# number of bins within each reference experiment
ref_n_bins = [len(ref_bins_map[acc]) for acc in metadata['accession']]
# define candidate bins that are potentially imputed
candidates = list(all_ref_bins - set(sc_bins))
candidates.sort()
if args.simulate:
candidates = candidates[:100]
candidates = np.array(candidates)
print('Number of candidate bins: %d%s'%(len(candidates),
'' if not args.simulate else ' (simulation mode is on)'))
# depending on the bins in the sparse input, create the training features matrix
training_features = np.zeros((metadata.shape[0], len(sc_bins)), dtype=np.int64)
for index, accession in enumerate(metadata['accession']):
training_features[index] = np.array([1 if bid in ref_bins_map[accession] else 0 for bid in sc_bins])
print('Shape of matrix for training features:', training_features.shape)
# search for candidate bins with identical class labels: for bins sharing the
# same label vector, only one model is trained and applied to all of them
bin_index_map = {}
key_index_map = {}
uniq_cand_labels = []
index = 0
for bid in candidates:
class_labels = np.array([1 if bid in ref_bins_map[acc] else 0 for acc in metadata['accession']])
key = ''.join(map(str, class_labels))
if key not in key_index_map:
uniq_cand_labels.append(class_labels)
key_index_map[key] = index
index += 1
bin_index_map[bid] = key_index_map[key]
uniq_cand_labels = np.array(uniq_cand_labels)
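# uniq_cand_labels holds one class-label vector per unique pattern;
# bin_index_map points each candidate bin to the row index of its pattern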
print('\n##### Pre-Processing is done ... #####\n')
print('Training %d models for %d candidate bins'%(uniq_cand_labels.shape[0], len(candidates)))
print('Random Forest is used with %d trees'%(args.estimators))
print('The task is shared by %d processors'%(size))
# rank 0 broadcasts the dimensions so that the other ranks can allocate
# the matrices for the training features and the candidate label vectors
dimensions = (metadata.shape[0], len(sc_bins), uniq_cand_labels.shape[0])
stdout.flush()
dimensions = comm.bcast(dimensions, root=0)
n_ref_exps, n_sc_bins, dim1_uniq_cand_labels = dimensions
if rank != 0:
training_features = np.empty((n_ref_exps,n_sc_bins), dtype=np.int64)
uniq_cand_labels = np.empty((dim1_uniq_cand_labels, n_ref_exps), dtype=np.int64)
# rank 0 pre-processed the data, created the training features, and collected the
# candidates. This data is now shared with other ranks by broadcast
comm.Bcast(training_features, root=0)
comm.Bcast(uniq_cand_labels, root=0)
# calculate the range that defines the tasks of the rank
chunk = int(dim1_uniq_cand_labels / size) + 1
f = rank * chunk
t = (rank+1) * chunk
if t > dim1_uniq_cand_labels:
t = dim1_uniq_cand_labels
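# e.g. with 1000 unique label vectors and 4 ranks: chunk = 251, rank 0 covers
# indices 0-250, rank 1 covers 251-501, and the last range is clipped to 1000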
# imp_prob_rank collects the results of each single rank
imp_prob_rank = np.zeros(chunk, dtype=np.float64) - 1.0
# initialize classification algorithm with Random Forest
clf = RandomForestClassifier(n_estimators=args.estimators, random_state=42)
# one model is trained per unique label vector; the more cores are used,
# the fewer models have to be trained by each single rank
artificial_inst = np.array([training_features.shape[1] * [1]])
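# artificial_inst is an all-ones feature vector: a hypothetical experiment
# that contains every bin of the sparse single-cell input; its predicted
# class-1 probability serves as the imputation probability for a bin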
for send_index, reference_index in enumerate(range(f,t)):
class_vector = uniq_cand_labels[reference_index]
prob = None
# check whether the class labels are all 0 or all 1; in these trivial
# cases the probability is known and no model needs to be trained
sum_y = sum(class_vector)
if sum_y == 0 or sum_y == training_features.shape[0]:
prob = 0.0 if sum_y == 0 else 1.0
else:
# train classification model and get the probability
clf.fit(training_features, class_vector)
prob = clf.predict_proba(artificial_inst)[0][1]
imp_prob_rank[send_index] = prob
# MPI-gather of the results from each rank, collected in rank 0
if rank == 0:
imp_prob_all = np.empty((size,chunk), dtype=np.float64)
comm.Gather(imp_prob_rank, imp_prob_all, root=0)
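# note: size*chunk can exceed the number of unique label vectors, so the
# trailing entries of the last rank keep their -1.0 initialization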
if rank == 0:
# extract imputed probabilities and collect reference frequencies
index_prob = {}
index = 0
for r in range(size):
for prob in imp_prob_all[r]:
if index >= len(candidates):
continue
index_prob[index] = prob
index += 1
bin_freq_prob = []
for bid in candidates:
bid_index = bin_index_map[bid]
prob = index_prob[bid_index]
bin_freq_prob.append((bid, freq_map.get(bid,-1.0), prob))
bin_freq_prob = sorted(bin_freq_prob, key=lambda x: x[-1], reverse=True)
# add the single-cell bins to the results
sc_bins_freq_def = [(bid, freq_map.get(bid, 0.0), -1.0) for bid in sc_bins]
bin_freq_prob = sc_bins_freq_def + bin_freq_prob
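# the observed single-cell bins are listed first, marked with an imputed
# probability of -1.0 to distinguish them from imputed bins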
# create the SIMPA output table and the imputed bins bed file
impute_n = int(np.mean(ref_n_bins))
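# the number of bins written to the imputed bed file matches the average
# bin count of the bulk reference experiments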
imputed_bed = ''
extended_bed = 'BinID\tchromosome\tstart\tend\tfrequency\timputed_probability\n'
for n_bins_imp, (binID, frequency, imp_probability) in enumerate(bin_freq_prob):
if n_bins_imp <= impute_n:
imputed_bed += '%s\t%d\t%d\n'%(bin_bed_map[binID])
extended_bed += str(binID) + '\t'
extended_bed += '%s\t%d\t%d\t'%(bin_bed_map[binID])
extended_bed += '%.5f\t%f\n'%(frequency, imp_probability)
# preparing output files
out_prefix = args.outdir
out_prefix += '/' if out_prefix[-1] != '/' else ''
if not os.path.exists(out_prefix):
os.makedirs(out_prefix)
input_file_name = args.bed.split('/')[-1].replace('.bed','')
open('%s%s_imputed.bed'%(out_prefix, input_file_name), 'w').write(imputed_bed)
open('%s%s_freq_prob.simpa'%(out_prefix, input_file_name), 'w').write(extended_bed)
print('\n##### Writing output to "%s" #####\n'%(out_prefix))
print('Reference bulk experiments contain on average %d bins'%(impute_n))
last_comment = ' (not exactly, because simulation mode is on)' if args.simulate else ''
print('%d bins were imputed%s'%(impute_n - len(sc_bins), last_comment))
print('Done!\n')