# gcForest.py
# Forked from pylablanche/gcForest.
import itertools

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
class gcForest(object):
    """Cascade of random-forest layers grown until a validation score stalls.

    Each layer holds ``n_cascadeRF`` pairs of ``RandomForestClassifier``
    objects: one built with ``max_features='sqrt'`` (stored as
    ``_casprf{layer}_{i}``) and one with ``max_features=1`` (stored as
    ``_cascrf{layer}_{i}``).  The class-probability outputs of a layer are
    concatenated with the raw features and fed to the next layer.

    Parameters
    ----------
    cascade_test_size : float or int
        Forwarded as ``test_size`` to ``train_test_split`` to carve out the
        validation split used while growing the cascade.
    n_cascadeRF : int
        Number of forest pairs per layer.
    n_cascadeRFtree : int
        ``n_estimators`` of every forest.
    cascade_layer : int or float
        Upper bound on the number of layers.  The first two layers are always
        trained; the bound only limits growth beyond them.
    min_samples_cascade : float or int
        Forwarded as ``min_samples_split`` to every forest.
    tolerance : float
        Minimum improvement of the validation score required to keep adding
        layers.
    n_jobs : int
        Forwarded as ``n_jobs`` to every forest.
    """

    def __init__(self,
                 cascade_test_size=0.2, n_cascadeRF=2, n_cascadeRFtree=101,
                 cascade_layer=np.inf, min_samples_cascade=0.05,
                 tolerance=0.0, n_jobs=-1):
        self.n_layer = 0
        self._n_samples = 0
        self.n_cascadeRF = int(n_cascadeRF)
        self.cascade_test_size = cascade_test_size
        self.n_cascadeRFtree = int(n_cascadeRFtree)
        self.cascade_layer = cascade_layer
        self.min_samples_cascade = min_samples_cascade
        self.tolerance = tolerance
        self.n_jobs = n_jobs

    def cascade_forest(self, X, y=None):
        """Train the cascade (``y`` given) or run it on new data (``y`` None).

        Parameters
        ----------
        X : array-like
            Input samples, one row per sample.
        y : array-like or None
            Targets.  When provided the cascade is (re)built from scratch;
            when omitted the already-trained layers are applied to ``X``.

        Returns
        -------
        list of ndarray
            ``2 * n_cascadeRF`` class-probability arrays.  In prediction mode
            they come from the last layer's ``predict_proba``; in training
            mode they are the out-of-bag decision functions of the layer
            *before* the last one trained.
        """
        if y is not None:
            self.n_layer = 0
            max_layers = self.cascade_layer
            tol = self.tolerance

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=self.cascade_test_size)

            # Layer 1 is trained on the raw features.
            self.n_layer += 1
            prf_crf_pred_ref = self._cascade_layer(X_train, y_train)
            score_ref = self._cascade_evaluation(X_test, y_test)
            feat_arr = self._create_feat_arr(X_train, prf_crf_pred_ref)

            # Layer 2 is trained on layer-1 outputs + raw features.
            self.n_layer += 1
            prf_crf_pred_layer = self._cascade_layer(feat_arr, y_train)
            score_layer = self._cascade_evaluation(X_test, y_test)

            # Keep stacking layers while the validation score improves by
            # more than `tol` and the configured layer cap is not reached.
            while score_layer > (score_ref + tol) and self.n_layer < max_layers:
                score_ref = score_layer
                prf_crf_pred_ref = prf_crf_pred_layer
                self.n_layer += 1
                feat_arr = self._create_feat_arr(X_train, prf_crf_pred_ref)
                prf_crf_pred_layer = self._cascade_layer(feat_arr, y_train)
                score_layer = self._cascade_evaluation(X_test, y_test)

            # Dump the feature array that fed the last trained layer.
            # NOTE(review): the flattened source does not show whether this
            # dump sat inside or after the loop above; it is done once here.
            gc_array = pd.DataFrame(feat_arr)
            gc_array.to_csv('../gc_array.csv', header = True, index = False)

            # The last layer made things worse: drop its forests.
            if score_layer < score_ref:
                for irf in range(self.n_cascadeRF):
                    delattr(self, '_casprf{}_{}'.format(self.n_layer, irf))
                    delattr(self, '_cascrf{}_{}'.format(self.n_layer, irf))
                self.n_layer -= 1
        else:
            at_layer = 1
            prf_crf_pred_ref = self._cascade_layer(X, layer=at_layer)
            while at_layer < self.n_layer:
                at_layer += 1
                feat_arr = self._create_feat_arr(X, prf_crf_pred_ref)
                prf_crf_pred_ref = self._cascade_layer(feat_arr, layer=at_layer)

        return prf_crf_pred_ref

    def _new_forest_pair(self):
        """Build one unfitted (sqrt-features, single-feature) forest pair."""
        shared = dict(n_estimators=self.n_cascadeRFtree,
                      min_samples_split=self.min_samples_cascade,
                      oob_score=True, n_jobs=self.n_jobs,
                      class_weight='balanced', max_depth=None)
        prf = RandomForestClassifier(max_features='sqrt', **shared)
        crf = RandomForestClassifier(max_features=1, **shared)
        return prf, crf

    def _cascade_layer(self, X, y=None, layer=0):
        """Fit a new layer (``y`` given) or apply stored layer ``layer``.

        Parameters
        ----------
        X : array-like
            Input features for the layer.
        y : array-like or None
            Targets.  When provided, ``n_cascadeRF`` forest pairs are fitted
            and stored under the current ``self.n_layer``.
        layer : int
            Index of the stored layer to apply; only read when ``y`` is None.

        Returns
        -------
        list of ndarray
            For every forest pair, the first forest's output followed by the
            second's: out-of-bag decision functions when fitting,
            ``predict_proba`` outputs when predicting.
        """
        prf_crf_pred = []
        if y is not None:
            print('Adding/Training Layer, n_layer={}'.format(self.n_layer))
            for irf in range(self.n_cascadeRF):
                # A fresh pair per slot: refitting one shared object would
                # leave every stored slot pointing at the same last fit.
                prf, crf = self._new_forest_pair()
                prf.fit(X, y)
                crf.fit(X, y)
                setattr(self, '_casprf{}_{}'.format(self.n_layer, irf), prf)
                setattr(self, '_cascrf{}_{}'.format(self.n_layer, irf), crf)
                prf_crf_pred.append(prf.oob_decision_function_)
                prf_crf_pred.append(crf.oob_decision_function_)
        else:
            for irf in range(self.n_cascadeRF):
                prf = getattr(self, '_casprf{}_{}'.format(layer, irf))
                crf = getattr(self, '_cascrf{}_{}'.format(layer, irf))
                prf_crf_pred.append(prf.predict_proba(X))
                prf_crf_pred.append(crf.predict_proba(X))

        return prf_crf_pred

    def _cascade_evaluation(self, X_test, y_test):
        """Score the current cascade on a validation split.

        The per-forest probabilities are averaged, and column 1 of the result
        is used as the score, so the targets are assumed to be binary.  AUC
        and KS are both printed.

        Returns
        -------
        float
            The KS statistic; this is the value the cascade growth loop
            compares between layers.
        """
        casc_pred_prob = np.mean(self.cascade_forest(X_test), axis=0)
        positive_score = casc_pred_prob[:, 1]
        casc_eva = roc_auc_score(y_true=y_test, y_score=positive_score)
        casc_ks = KS(y_test, positive_score)
        print('Layer validation AUC = {}'.format(casc_eva))
        print('Layer validation KS = {}'.format(casc_ks))
        return casc_ks

    def _create_feat_arr(self, X, prf_crf_pred):
        """Prepend the forests' class probabilities to the raw features.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Raw input features.
        prf_crf_pred : sequence of array-like
            One ``(n_samples, n_classes)`` array per forest.

        Returns
        -------
        ndarray, shape (n_samples, n_forests * n_classes + n_features)
            Per sample: every forest's probabilities in order, then ``X``.
        """
        # (n_forests, n_samples, n_classes) -> (n_samples, n_forests, n_classes)
        swap_pred = np.swapaxes(prf_crf_pred, 0, 1)
        add_feat = swap_pred.reshape([np.shape(X)[0], -1])
        feat_arr = np.concatenate([add_feat, X], axis=1)
        return feat_arr
def KS(y_true, y_pred):
    """Return the largest gap between TPR and FPR along the ROC curve.

    Parameters
    ----------
    y_true : array-like
        True binary labels, forwarded to ``roc_curve``.
    y_pred : array-like
        Scores for the positive class, forwarded to ``roc_curve``.

    Returns
    -------
    float
        ``max(tpr - fpr)`` over all thresholds returned by ``roc_curve``
        (the KS statistic of the score).
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred)
    separation = true_pos_rate - false_pos_rate
    return max(separation)