-
Notifications
You must be signed in to change notification settings - Fork 0
/
nested_cv_pipeline.py
114 lines (92 loc) · 4.85 KB
/
nested_cv_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from sklearn.metrics import matthews_corrcoef, balanced_accuracy_score, fbeta_score, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
import numpy as np
import pandas as pd
from model_optimization import optimize_one_model
# Number of times the whole outer cross-validation is repeated (the trial
# number is also used as the shuffling seed of the outer split).
NUM_TRIALS = 10
# Number of folds of the outer cross-validation loop.
OUTER_FOLDS = 5

# Score name -> scorer. The key order matters: it defines the column order of
# the results dataframes built from ``scoring.keys()``.
scoring = {
    # F1 for each averaging strategy.
    **{f'F1_{avg}': make_scorer(f1_score, average=avg)
       for avg in ('weighted', 'micro', 'macro')},
    # F2 (beta=2) for each averaging strategy; undefined cases score 0.
    **{f'F2_{avg}': make_scorer(fbeta_score, beta=2, zero_division=0, average=avg)
       for avg in ('weighted', 'micro', 'macro')},
    'MCC': make_scorer(matthews_corrcoef),
    # Referenced by its scikit-learn scorer name instead of a make_scorer object.
    'Balanced_Accuracy': 'balanced_accuracy',
}
def get_outer_scores(y_true, y_pred, results_df, idx):
    """
    Compute all the scores of one experiment and store them in one row of the results dataframe.

    :param y_true: True labels
    :param y_pred: Predicted labels by the best estimator of each inner cv
    :param results_df: The dataframe to fill. It is expected to have one column per score name written
        below ('MCC', 'F1_*', 'F2_*', 'Balanced_Accuracy'). It is modified in place.
    :param idx: The index label of the row the results should be put in
    :return: A copy of the dataframe filled at the indicated index
    """
    fold_scores = {
        'MCC': matthews_corrcoef(y_true, y_pred),
        'F1_weighted': f1_score(y_true, y_pred, average='weighted'),
        'F1_micro': f1_score(y_true, y_pred, average='micro'),
        'F1_macro': f1_score(y_true, y_pred, average='macro'),
        'F2_weighted': fbeta_score(y_true, y_pred, beta=2, zero_division=0, average='weighted'),
        'F2_micro': fbeta_score(y_true, y_pred, beta=2, zero_division=0, average='micro'),
        # F2 must come from fbeta_score(beta=2); using f1_score here would just
        # duplicate the F1_macro column.
        'F2_macro': fbeta_score(y_true, y_pred, beta=2, zero_division=0, average='macro'),
        'Balanced_Accuracy': balanced_accuracy_score(y_true, y_pred),
    }
    for score_name, value in fold_scores.items():
        # One .loc[row, column] call writes into the dataframe itself. Chained
        # indexing (df[column].loc[row] = ...) may assign to a temporary object
        # and leave the dataframe unchanged.
        results_df.loc[idx, score_name] = value
    return results_df.copy()
def run_ncv(data_x, labels, clf_name):
    """
    Run an NCV experiment for a specific classifier whose name is required as input.

    The stratified outer cross-validation (OUTER_FOLDS folds, shuffled with the trial number as seed) is
    repeated NUM_TRIALS times. For every outer fold a 2-component TruncatedSVD is fitted on the training
    part only, both parts are projected with it, ``optimize_one_model`` is called on the projected training
    part and the estimator it returns is scored on the projected held-out part.

    :param data_x: Dataframe of samples and features
    :param labels: numpy array of labels
    :param clf_name: a string indicating the name of the classifier
    :return: tuple
    Result scores of all experiments in a dataframe (experiment_number x score, one row per outer fold of
    each trial, i.e. NUM_TRIALS * OUTER_FOLDS rows) and a list with the best parameters
    for each experiment just for inspection (DO NOT USE)
    """
    n_experiments = NUM_TRIALS * OUTER_FOLDS
    trial_results_df = pd.DataFrame(columns=list(scoring.keys()), index=range(n_experiments))
    best_params_list = []
    # split() yields positional indices, so index the labels as a plain array.
    labels = np.asarray(labels)
    for num_trial in range(NUM_TRIALS):
        outer_cv = StratifiedKFold(n_splits=OUTER_FOLDS, shuffle=True, random_state=num_trial)
        for fold, (train_ix, test_ix) in enumerate(outer_cv.split(data_x, labels)):
            # Positional selection: .loc would look the positions up as index
            # labels and only works when the dataframe has a default index.
            x_train, x_test = data_x.iloc[train_ix], data_x.iloc[test_ix]
            y_train, y_test = labels[train_ix], labels[test_ix]
            # Fit the projection on the training part only, so that no
            # information about the held-out samples leaks into the features
            # the model is tuned and evaluated on.
            svd = TruncatedSVD(n_components=2, random_state=42).fit(x_train)
            x_train = svd.transform(x_train)
            x_test = svd.transform(x_test)
            seed = num_trial
            best_estimator, best_params = optimize_one_model(x_train, y_train, clf_name, seed)
            best_params_list.append(best_params)
            y_pred = best_estimator.predict(x_test)
            # Unique row per (trial, fold); num_trial + fold would make
            # different experiments overwrite each other's scores.
            row = num_trial * OUTER_FOLDS + fold
            trial_results_df = get_outer_scores(y_test, y_pred, trial_results_df, row)
    return trial_results_df, best_params_list
def run_all_classifiers(data, targets):
    """
    START HERE
    Entry point of the nested cv experiment: runs one ncv experiment per classifier name
    ('svm', 'lr', 'gnb', 'rf', 'xgb', in that order) and collects what each of them returns.

    :param data: A dataframe containing the features for all available samples (aka PCs)
    :param targets: The labels after turning them into numbers of range 0 to num_classes-1
    :return: Tuple
    A dictionary mapping each classifier name to the dataframe with its scores,
    A dictionary mapping each classifier name to the best params of each of its experiments
    (Only used for inspection)
    """
    scores_by_clf = {}
    params_by_clf = {}
    for name in ('svm', 'lr', 'gnb', 'rf', 'xgb'):
        print(f'Starting NCV experiment for classifier {name}')
        scores_by_clf[name], params_by_clf[name] = run_ncv(data, targets, name)
    return scores_by_clf, params_by_clf