Classification.py
304 lines (234 loc) · 10.4 KB
# Author: Louis Gomez
# Health and AI Lab
'''
SCRIPT DESCRIPTION:
This file performs classification using the leave-one-patient-out (LOPO) framework. We also
output the AUROC (with confidence intervals), the AUPRC, and accuracy.
The results and feature files are saved in pickle format for use in making plots and other evaluations
performed in the paper, such as model calibration and signal relevance.
Inputs: data   - input file is the feature extraction data
        output - location where the pickle files are stored
'''
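# Example invocation (a sketch; the file and folder names below are hypothetical, not from the repo):
#   python Classification.py features_sah_window.csv Experiments/Results/ sah B lopo
# argv order: <feature extraction file> <output folder> <dataset type: sah/ich> <task: A/B/C> <experiment tag>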
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import sys
import pickle
import xgboost as xgb
from scipy import stats
import itertools
from joblib import Parallel, delayed
from sklearn.model_selection import ParameterSampler
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import classification_report,average_precision_score
from DeLong import delong_roc_variance
def create_hyperparameters(n):
    # Integer grids for tree shape/regularization plus two candidate learning rates
    max_depth = np.linspace(2, 7, (7-2+1), dtype=int)
    min_child_weight = np.linspace(2, 6, (6-2+1), dtype=int)
    gamma = np.linspace(1, 4, (4-1+1), dtype=int)
    learning_rate = [0.1, 0.01]
    params_grid = {"max_depth": max_depth, "min_child_weight": min_child_weight,
                   "gamma": gamma, "learning_rate": learning_rate}
    param_list = list(ParameterSampler(params_grid, n_iter=n, random_state=10))
    return param_list
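# Note: the grid above has 6 * 5 * 4 * 2 = 240 combinations, so the n=240 used later in LOPO
# effectively enumerates the full grid. A single sampled configuration has the form
# (values below are hypothetical, just one possible draw):
#   {'min_child_weight': 4, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 2}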
def delong_ci(ground_truth, predictions, alpha=0.95):
    # Reference implementation:
    # https://github.com/RaulSanchezVazquez/roc_curve_with_confidence_intervals/blob/master/auc_delong_xu.py
    y_true = np.array(ground_truth)
    y_score = np.array(predictions)
    auc, auc_var = delong_roc_variance(y_true, y_score)
    auc_std = np.sqrt(auc_var)
    # Confidence interval from the normal approximation around the AUC estimate
    lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2)
    lower_upper_ci = stats.norm.ppf(
        lower_upper_q,
        loc=auc,
        scale=auc_std)
    lower_upper_ci[lower_upper_ci > 1] = 1
    print('ROC AUC: %s' % auc)
    print('Confidence Interval: %s (95%% confidence)' % str(lower_upper_ci))
def drop_nan_first(X_train, X_test):
    '''
    This function drops the all-NaN columns in the test data (and subsequently the train data)
    before feature selection is performed
    '''
    # Drop a feature column from the test set if every value in it is NaN
    X_test = X_test.dropna(axis=1, how="all").copy()
    # If all features are dropped, we of course cannot make a prediction
    if X_test.empty:
        print("All features dropped")
        return None, None
    test_features_present = X_test.columns.to_list()
    X_train = X_train[test_features_present].copy()
    return X_train, X_test
def new_labels(event_labels, classification_task):
    """Takes in a list of event labels and reconfigures it"""
    if classification_task == "A":
        for num in range(0, len(event_labels)):
            if event_labels[num] in [0, 1, 2]:
                pass
            elif event_labels[num] == 3:
                event_labels[num] = 2
            elif event_labels[num] in [4, 5]:
                event_labels[num] = 3
            else:
                pass
    elif classification_task == "B":
        for num in range(0, len(event_labels)):
            if event_labels[num] in [0, 1]:
                event_labels[num] = 0
            elif event_labels[num] in [2, 3, 4, 5]:
                event_labels[num] = 1
            else:
                pass
    elif classification_task == "C":
        for num in range(0, len(event_labels)):
            if event_labels[num] in [0, 1, 2, 3]:
                event_labels[num] = 0
            elif event_labels[num] in [4, 5]:
                event_labels[num] = 1
            else:
                pass
    return event_labels
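# Label binning summary (derived directly from the mapping above):
#   Task A: keep 0,1,2 as-is, 3 -> 2, {4,5} -> 3  (setup() later keeps only classes 1 vs 2)
#   Task B: {0,1} -> 0, {2,3,4,5} -> 1
#   Task C: {0,1,2,3} -> 0, {4,5} -> 1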
def setup():
    X = pd.read_csv(data)
    if classification_task == "A":
        task = "(1) vs (2,3)"
    elif classification_task == "B":
        task = "(0,1) vs (2,3,4,5)"
    elif classification_task == "C":
        task = "(0,1,2,3) vs (4,5)"
    print("Classification Task is: ", task)
    old_labels = X.Event.to_numpy()
    X["Event"] = new_labels(old_labels, classification_task)  # assign the new binned labels
    if classification_task == "A":
        X = X[(X["Event"] == 1) | (X["Event"] == 2)].copy()
    else:
        X = X[(X["Event"] == 0) | (X["Event"] == 1)].copy()
    label = X.Event.to_numpy()
    mode = stats.mode(label)[0]
    # Assign the minority class as the positive class label
    label = [0 if n == mode else 1 for n in label]
    X["Event"] = label
    global uid
    uid = X["UID"].to_list()
    Y = X["Event"].to_numpy()  # labels
    X.drop(columns=["Event", "UID"], inplace=True)
    X = X.reset_index(drop=True)
    return X, Y, uid
def CV_lopo(config, X_train, y_train):
    featurenames = list(X_train.columns)
    keep = featurenames
    # Class-imbalance weight: ratio of negative to positive samples
    temp_num = list(y_train)
    sum_neg_samples = temp_num.count(0)
    sum_pos_samples = temp_num.count(1)
    scale = sum_neg_samples / sum_pos_samples
    config["scale_pos_weight"] = scale
    model = xgb.XGBClassifier(objective='binary:logistic', max_depth=config["max_depth"],
                              learning_rate=config["learning_rate"], gamma=config["gamma"],
                              min_child_weight=config["min_child_weight"], scale_pos_weight=scale,
                              n_estimators=50, random_state=10)
    # Forward sequential feature selection keeping 20% of the features
    sfs = SequentialFeatureSelector(model, cv=2, n_features_to_select=0.2, scoring='roc_auc', n_jobs=1)
    sfs.fit(X_train, y_train)
    ind = sfs.get_support()
    keep = []
    for n in range(0, len(ind)):
        if ind[n]:
            keep.append(featurenames[n])
    X_train = sfs.transform(X_train)
    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=keep)
    cv_results = xgb.cv(config, dtrain, num_boost_round=100, seed=10, nfold=5, stratified=True, shuffle=True,
                        early_stopping_rounds=10, metrics={"auc"}, verbose_eval=False)
    mean_auc = cv_results['test-auc-mean'].max()
    boost_rounds = cv_results['test-auc-mean'].argmax() + 1  # argmax is a 0-based index; convert to a round count
    return keep, mean_auc, boost_rounds, config
def LOPO(X, Y, uid):
    cons_prediction_prob, cons_predictions, cons_labels = [], [], []
    pid_new = []
    cons_output, feat_imp_store = {}, {}
    pid, Atime = [], []
    # Each UID has the form "<patient id>_<assessment time>"
    for n in uid:
        temp = n.split("_")
        pid.append(int(temp[0]))
        Atime.append(temp[1])
    assert len(pid) == len(Atime)
    u_pid = list(set(pid))
    X.insert(0, "PID", pid)
    X.insert(1, "Label", Y)
    for pid in u_pid:
        print(pid)
        min_auc = 0.5
        selected_info = ()
        data = X.copy(deep=True)
        label = Y.copy()
        # Leave one patient out: train on all other patients, test on the held-out patient
        train = data[data["PID"] != pid].copy()
        test = data[data["PID"] == pid].copy()
        y_train = train["Label"].to_numpy()
        y_test = test["Label"].to_numpy()
        X_train = train.drop(columns=["PID", "Label"])
        X_test = test.drop(columns=["PID", "Label"])
        X_train, X_test = drop_nan_first(X_train, X_test)
        if X_train is None:  # all test features were NaN, so no prediction can be made
            continue
        # Tune hyperparameters and select features
        configs = create_hyperparameters(n=240)
        results = Parallel(n_jobs=170)(delayed(CV_lopo)(config, X_train, y_train) for config in configs)
        for result in results:
            keep, mean_auc, boost_rounds, config = result
            if mean_auc > min_auc:
                min_auc = mean_auc
                selected_info = (keep, boost_rounds, config)
        if not selected_info:  # no configuration beat the 0.5 AUC floor
            continue
        # Test
        features, rounds, config = selected_info
        X_train = X_train[features].copy()
        X_test = X_test[features].copy()
        dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
        dtest = xgb.DMatrix(X_test, label=y_test, feature_names=features)
        config["objective"] = "binary:logistic"
        config["nthread"] = 1
        model = xgb.train(config, dtrain, num_boost_round=rounds, verbose_eval=True)
        pred = model.predict(dtest, validate_features=True)
        prediction = [1 if x > 0.5 else 0 for x in pred]
        cons_prediction_prob.append(pred)
        cons_predictions.append(prediction)
        cons_labels.append(y_test)
        # For each pid, store the collection of valid uids
        pid_new = []
        for uid_str in uid:
            temp = uid_str.split("_")[0]
            if pid == int(temp):
                pid_new.append(uid_str)
        '''Feature Importance'''
        # get_score returns a dictionary with feature names as keys and gain scores as values
        feature_dict = model.get_score(importance_type="gain")
        # Keep the 5 top-performing features by gain
        feature_dict = dict(sorted(feature_dict.items(), key=lambda kv: kv[1], reverse=True)[:5])
        feat_imp_store.update({pid: (feature_dict, features)})
        cons_output.update({pid: (prediction, y_test, pred, pid_new)})
    return cons_predictions, cons_labels, cons_prediction_prob, cons_output, feat_imp_store
if __name__ == "__main__":
    if len(sys.argv) != 6:
        print("Error, incomplete input. Expected: <feature extraction file> <output folder> <dataset type: sah/ich> <classification task: A/B/C> <experiment>")
        sys.exit(1)
    data = sys.argv[1]
    output = sys.argv[2]
    dataset_type = sys.argv[3]
    classification_task = str(sys.argv[4])  # A = UWS/MCS-
    experiment = str(sys.argv[5])
    print("Classification Task: ", classification_task)
    print("Experiment: ", experiment)
    X, Y, uid = setup()
    cons_predictions, cons_labels, cons_prediction_prob, cons_output, feature_imp_store = LOPO(X, Y, uid)
    # Flatten the per-patient lists into single lists of predictions, labels, and probabilities
    cons_predictions = [i for j in cons_predictions for i in j]
    cons_labels = [i for j in cons_labels for i in j]
    pos_prob = [i for j in cons_prediction_prob for i in j]
    print(classification_report(cons_labels, cons_predictions, target_names=["0", "1"]))
    delong_ci(cons_labels, pos_prob)
    # Get AUPRC score here
    print("AUPRC: ", round(average_precision_score(cons_labels, pos_prob), 2))
    # Write assessments to file
    file = output + dataset_type + "_" + classification_task + "_" + experiment + ".pickle"
    with open(file, "wb") as f:
        pickle.dump(cons_output, f, pickle.HIGHEST_PROTOCOL)
    file = "Experiments/Features/" + dataset_type + "_" + classification_task + "_" + experiment + "_features.pickle"
    with open(file, "wb") as f:
        pickle.dump(feature_imp_store, f, pickle.HIGHEST_PROTOCOL)
    print("work done")