# module_lightgbm.py

"""
The purpose of this module is to:
    * generate and evaluate predictions for each shopper to buy a certain product in week89 with a lightGBM

Prerequisite: 
    * At least X_train, X_test, y_train and y_test have to be available as np.ndarrays, e.g. generated via module_train_test_splitting
"""

#load libraries
import time
import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
from numpy import savetxt
import pickle
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

def _require_ndarray(**arrays):
    """Raise a TypeError naming the first keyword argument that is not a np.ndarray."""
    for name, value in arrays.items():
        if not isinstance(value, np.ndarray):
            raise TypeError('%s must be a np.ndarray, got %s' % (name, type(value).__name__))


def _logging_and_stopping_kwargs(verbose, early_stopping_rounds):
    """Return the fit() keyword arguments controlling logging and early stopping for the installed lightgbm."""
    if hasattr(lgbm, 'log_evaluation') and hasattr(lgbm, 'early_stopping'):
        # Newer lightgbm releases dropped the `verbose` / `early_stopping_rounds`
        # arguments of fit() in favour of callbacks.
        callbacks = [lgbm.log_evaluation(period = int(verbose))]
        if early_stopping_rounds is not None and early_stopping_rounds > 0:
            callbacks.append(lgbm.early_stopping(stopping_rounds = early_stopping_rounds, verbose = bool(verbose)))
        return {'callbacks': callbacks}
    # Older lightgbm releases: keep the call exactly as it used to be.
    return {'verbose': verbose, 'early_stopping_rounds': early_stopping_rounds}


def predict_lightgbm(X_train, X_test, y_train, y_test, X_eval = None, y_eval = None, eval_set = False, output_probabilities = True, n_estimators = 300, early_stopping_rounds = 50, num_leaves = 1000, reg_alpha = 0, reg_lambda = 0.5, subsample = 0.5, learning_rate = 0.01, verbose = 200, random_state = 42, model_filename = 'lightgbm_model_final.pkl'):

    """
    Train a binary lightGBM classifier, plot its learning curves, pickle it and print test metrics.

    input:
        X_train: np.ndarray containing the training parameters without the target variable (can be generated by the module: module_train_test_splitting)
        X_test: np.ndarray containing the testing parameters without the target variable (can be generated by the module: module_train_test_splitting)
        y_train: np.ndarray containing only the target variable (can be generated by the module: module_train_test_splitting)
        y_test: np.ndarray containing only the target variable (can be generated by the module: module_train_test_splitting)
        X_eval: default = None (optional), analogous to the description of X_train; required if eval_set = True
        y_eval: default = None (optional), analogous to the description of y_train; required if eval_set = True
        eval_set: default = False, whether an additional evaluation set (X_eval, y_eval) is monitored during training
        output_probabilities: default = True, determines whether the predictions are probabilities or binary labels
        verbose: default = 200, log the evaluation metrics every `verbose` boosting iterations
        random_state: default = 42, seed passed to the classifier
        model_filename: default = 'lightgbm_model_final.pkl', path the fitted model is pickled to

        lightgbm parameters (source: https://lightgbm.readthedocs.io/en/latest/Parameters.html):
        n_estimators: default = 300, number of boosting iterations
        early_stopping_rounds: default = 50, stops training if one metric does not improve for the given early_stopping_rounds
        num_leaves: default = 1000, max. number of leaves per tree
        reg_alpha: default = 0, L1 regularisation, L1 >= 0, reduces overfitting
        reg_lambda: default = 0.5, L2 regularisation, L2 >= 0, reduces overfitting
        subsample: default = 0.5, randomly selects part of the data without resampling, 0 < subsample <= 1
            NOTE(review): lightgbm only applies bagging when the bagging frequency (subsample_freq) is > 0;
            it is not set here, so this value presumably has no effect -- confirm before relying on it.
        learning_rate: default = 0.01, shrinkage rate, learning_rate > 0

    output:
        model (the fitted lgbm.LGBMClassifier is returned)
        plot that shows evaluation results over time (metrics: auc, logloss)
        for binary results: printed confusion matrix
        for probabilities results: printed confusion matrix, auc, binary logloss

    save: lightGBM model (pickled to model_filename); the predictions are only used for the printed metrics and are not written to disk

    raises:
        TypeError: if X_train, X_test, y_train or y_test is not a np.ndarray
        ValueError: if eval_set = True but X_eval or y_eval is missing

    NOTE(review): the test set is part of the monitored sets, so it also takes part in early stopping.
    """
    # Explicit exceptions instead of `assert`, which is stripped when Python runs with -O.
    _require_ndarray(X_train = X_train, X_test = X_test, y_train = y_train, y_test = y_test)
    if eval_set and (X_eval is None or y_eval is None):
        raise ValueError('eval_set = True requires both X_eval and y_eval')

    # The training data is always monitored first; the remaining sets get readable
    # labels after training (lightgbm calls them 'valid_1', 'valid_2', ...).
    monitored_sets = [(X_train, y_train)]
    labels = []
    if eval_set:
        monitored_sets.append((X_eval, y_eval))
        labels.append('evaluation')
    monitored_sets.append((X_test, y_test))
    labels.append('testing')

    start = time.perf_counter()

    model = lgbm.LGBMClassifier(objective = 'binary', num_leaves = num_leaves, reg_alpha = reg_alpha, reg_lambda = reg_lambda, subsample = subsample, learning_rate = learning_rate, n_estimators = n_estimators, metric = ['auc', 'logloss'], random_state = random_state)
    model.fit(X_train, y_train, eval_set = monitored_sets, eval_metric = ['auc', 'logloss'], **_logging_and_stopping_kwargs(verbose, early_stopping_rounds))

    print(model.best_score_)

    end = time.perf_counter()
    print('\nThe computation took %.2f minutes.'%((end - start)/60))

    # show the monitored sets over time; the renamed keys become the plot legend
    # (and are stored with the pickled model, as before)
    for position, label in enumerate(labels, start = 1):
        model.evals_result_[label] = model.evals_result_.pop('valid_%d' % position)
    lgbm.plot_metric(model.evals_result_, metric = 'auc')
    lgbm.plot_metric(model.evals_result_, metric = 'binary_logloss')
    plt.show()

    #save the model
    with open(model_filename, 'wb') as file:
        pickle.dump(model, file)

    if output_probabilities:

        # probability of the positive class (second column of predict_proba)
        y_pred = model.predict_proba(X_test)[:, 1]

        #confusion matrix, using 0.5 as the decision threshold
        y_pred_binary = np.where(y_pred >= 0.5, 1, 0)
        confusion_mat = confusion_matrix(y_test, y_pred_binary)

        #auc score
        auc = roc_auc_score(y_test, y_pred)
        #binary logloss
        binary_log_loss = log_loss(y_test, y_pred)

        print('Confusion Matrix: \n', confusion_mat)
        print('The AUC score is: ', auc)
        print('The Binary_Log_Loss is: ', binary_log_loss)

    else:

        y_pred = model.predict(X_test)

        #confusion matrix
        confusion_mat = confusion_matrix(y_test, y_pred)

        print('Confusion Matrix: ', confusion_mat)

    return model