-
Notifications
You must be signed in to change notification settings - Fork 5
/
module_lightgbm.py
131 lines (98 loc) · 6.04 KB
/
module_lightgbm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""
The purpose of this module is to:
* generate and evaluate predictions for each shopper to buy a certain product in week89 with a lightGBM
Prerequisite:
* At least X_train, X_test, y_train and y_test must be available as np.ndarrays, e.g. generated via module_train_test_splitting
"""
#load libraries
import time
import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
from numpy import savetxt
import pickle
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
def predict_lightgbm(X_train, X_test, y_train, y_test, X_eval = None, y_eval = None, eval_set = False, output_probabilities = True, n_estimators = 300, early_stopping_rounds = 50, num_leaves = 1000, reg_alpha = 0, reg_lambda = 0.5, subsample = 0.5, learning_rate = 0.01, verbose = 200):
    """
    Train a binary LightGBM classifier, plot its evaluation history and print test-set metrics.

    input:
        X_train: np.ndarray containing the training features without the target variable (can be generated by the module: module_train_test_splitting)
        X_test: np.ndarray containing the testing features without the target variable (can be generated by the module: module_train_test_splitting)
        y_train: np.ndarray containing only the target variable of the training set
        y_test: np.ndarray containing only the target variable of the testing set
        X_eval: default = None, features of an additional evaluation set; required when eval_set = True
        y_eval: default = None, target of the additional evaluation set; required when eval_set = True
        eval_set: default = False, whether (X_eval, y_eval) is monitored during training in addition to the training and testing set
        output_probabilities: default = True, if True the evaluation is based on predict_proba (confusion matrix at a 0.5 threshold, auc, binary logloss), otherwise on predict (confusion matrix only)
        n_estimators: default = 300, number of boosting iterations
        early_stopping_rounds: default = 50, stops training if a monitored metric does not improve for this many rounds; None or a value <= 0 disables early stopping
        num_leaves: default = 1000, max. number of leaves per tree
        reg_alpha: default = 0, L1 regularisation, L1 >= 0
        reg_lambda: default = 0.5, L2 regularisation, L2 >= 0
        subsample: default = 0.5, fraction of rows sampled per tree, 0 < subsample <= 1
        learning_rate: default = 0.01, shrinkage rate, learning_rate > 0
        verbose: default = 200, evaluation results are logged every `verbose` iterations; 0 / False disables the log
        -> lightgbm parameters (source: https://lightgbm.readthedocs.io/en/latest/Parameters.html)

    output:
        returns the fitted lgbm.LGBMClassifier
        prints model.best_score_, the training time and the test-set metrics
        shows two plots with the evaluation history (metrics: auc, binary_logloss)
        saves the fitted model as 'lightgbm_model_final.pkl' (path relative to the working directory)
        the predictions themselves are only used for the printed metrics; they are neither saved nor returned

    raises:
        TypeError: if X_train, X_test, y_train or y_test is not an np.ndarray
        ValueError: if eval_set is True but X_eval or y_eval is missing

    NOTE(review): the test set is part of the monitored eval sets, so it can influence early
    stopping; the printed test metrics therefore do not come from fully held-out data.
    NOTE(review): according to the LightGBM parameter docs row subsampling only takes effect
    when subsample_freq > 0, which is not set here - confirm whether `subsample` has any effect.
    """
    _require_ndarrays(X_train = X_train, X_test = X_test, y_train = y_train, y_test = y_test)
    if eval_set and (X_eval is None or y_eval is None):
        raise ValueError('eval_set = True requires both X_eval and y_eval to be provided')

    # The pair that is the training data itself is reported by LightGBM under the key
    # 'training', every further pair under 'valid_<position in the list>'.
    # `curve_labels` maps those generated keys to readable names for the plots.
    monitored_sets = [(X_train, y_train)]
    curve_labels = {}
    if eval_set:
        monitored_sets.append((X_eval, y_eval))
        curve_labels['valid_1'] = 'evaluation'
    monitored_sets.append((X_test, y_test))
    curve_labels['valid_%d' % (len(monitored_sets) - 1)] = 'testing'

    start = time.time()
    model = lgbm.LGBMClassifier(objective = 'binary', num_leaves = num_leaves, reg_alpha = reg_alpha, reg_lambda = reg_lambda, subsample = subsample, learning_rate = learning_rate, n_estimators = n_estimators, metric = ['auc', 'logloss'], random_state = 42)
    model.fit(X_train, y_train, eval_set = monitored_sets, eval_metric = ['auc', 'logloss'], **_fit_control_kwargs(verbose, early_stopping_rounds))
    print(model.best_score_)
    end = time.time()
    print('\nThe computation took %.2f minutes.'%((end - start)/60))

    # show all monitored sets over time; the keys are renamed in place on the model,
    # so the pickled model below carries the readable names as well
    history = model.evals_result_
    for generated_key, label in curve_labels.items():
        history[label] = history.pop(generated_key)
    lgbm.plot_metric(history, metric = 'auc')
    lgbm.plot_metric(history, metric = 'binary_logloss')
    plt.show()

    # save the model
    filename = 'lightgbm_model_final.pkl'
    with open(filename, 'wb') as file:
        pickle.dump(model, file)

    if output_probabilities:
        # column 1 of predict_proba holds the probability of the positive class
        y_pred = model.predict_proba(X_test)[:, 1]
        # confusion matrix on the probabilities thresholded at 0.5
        y_pred_binary = np.where(y_pred >= 0.5, 1, 0)
        confusion_mat = confusion_matrix(y_test, y_pred_binary)
        auc = roc_auc_score(y_test, y_pred)
        binary_log_loss = log_loss(y_test, y_pred)
        print('Confusion Matrix: \n', confusion_mat)
        print('The AUC score is: ', auc)
        print('The Binary_Log_Loss is: ', binary_log_loss)
    else:
        y_pred = model.predict(X_test)
        confusion_mat = confusion_matrix(y_test, y_pred)
        print('Confusion Matrix: ', confusion_mat)
    return model


def _require_ndarrays(**named_arrays):
    """Raise TypeError for every keyword argument that is not an np.ndarray."""
    for name, value in named_arrays.items():
        if not isinstance(value, np.ndarray):
            raise TypeError('%s must be a np.ndarray, got %s' % (name, type(value).__name__))


def _fit_control_kwargs(verbose, early_stopping_rounds):
    """Build the fit() keyword arguments for logging and early stopping for the installed LightGBM."""
    if not hasattr(lgbm, 'log_evaluation'):
        # LightGBM releases without the log_evaluation callback still accept both settings as fit() keywords
        return {'verbose': verbose, 'early_stopping_rounds': early_stopping_rounds}
    # Newer releases dropped these fit() keywords; the callbacks below are their replacement.
    callbacks = [lgbm.log_evaluation(period = int(verbose))]
    if early_stopping_rounds is not None and early_stopping_rounds > 0:
        callbacks.append(lgbm.early_stopping(stopping_rounds = early_stopping_rounds, verbose = bool(verbose)))
    return {'callbacks': callbacks}