# LGBM - Hyper Param Search
# forked from TensorDuck/k01-overfitting
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

#
# Prepare the data
#
train = pd.read_csv('train.csv')
# get the labels
y = train.target.values
train.drop(['id', 'target'], inplace=True, axis=1)
x = train.values
#
# Create training and validation sets
#
x, x_test, y, y_test = train_test_split(x, y, test_size=.2, random_state=42, stratify=y)
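# the 80/20 split is stratified so both halves keep the class balance of y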
#
# Create the LightGBM data containers
#
categorical_features = [c for c, col in enumerate(train.columns) if 'cat' in col]
train_data = lightgbm.Dataset(x, label=y)
# if you have categorical variables, use this instead:
# train_data = lightgbm.Dataset(x, label=y, categorical_feature=categorical_features)
test_data = lightgbm.Dataset(x_test, label=y_test)
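# lightgbm.Dataset wraps the numpy arrays in LightGBM's internal binned
# format; test_data is the validation set monitored for early stopping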

#
# Train the model: exhaustive grid search over num_leaves, max_depth and
# min_data_in_leaf (4 * 4 * 5 = 80 combinations)
#
num_leaves_valid_values = [2, 4, 6, 8]
max_depth_valid_values = [2, 3, 4, 5]
min_data_in_leaf_valid_values = [3, 5, 8, 13, 21]
for min_data_in_leaf_value in min_data_in_leaf_valid_values:
    for max_depth_value in max_depth_valid_values:
        for num_leaves_value in num_leaves_valid_values:
            myoutput = {}
            parameters = {
                'objective': 'binary',
                'metric': 'auc',
                'is_unbalance': 'true',
                'boosting': 'gbdt',
                'feature_fraction': 0.5,
                'bagging_fraction': 0.5,
                'bagging_freq': 20,
                'learning_rate': 0.05,
                'verbosity': -1,
                'num_leaves': num_leaves_value,
                'max_depth': max_depth_value,
                'min_data_in_leaf': min_data_in_leaf_value,
            }
            model = lightgbm.train(parameters,
                                   train_data,
                                   verbose_eval=False,
                                   evals_result=myoutput,
                                   valid_sets=test_data,
                                   num_boost_round=5000,
                                   early_stopping_rounds=100)
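            # early stopping halts boosting once the validation AUC has not
            # improved for 100 consecutive rounds; predict() then defaults to
            # the best iteration found (pre-4.0 LightGBM keyword arguments)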
            # report this combination's best validation AUC
            print(min_data_in_leaf_value, max_depth_value, num_leaves_value,
                  max(myoutput['valid_0']['auc']))

#
# Create a submission with the most recently trained model
#
submission = pd.read_csv('test.csv')
ids = submission['id'].values
submission.drop('id', inplace=True, axis=1)
x = submission.values
y = model.predict(x)
# threshold the predicted probabilities at 0.5
binY = [round(i) for i in y]
output = pd.DataFrame({'id': ids, 'target': binY})
output.to_csv("submission.csv", index=False)
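
#
# A minimal sketch of how the printed scores could be collected and ranked
# instead; `results` is a hypothetical name, not part of the original script:
#
# results = []                                        # before the triple loop
# results.append({'min_data_in_leaf': min_data_in_leaf_value,
#                 'max_depth': max_depth_value,       # inside the innermost loop
#                 'num_leaves': num_leaves_value,
#                 'auc': max(myoutput['valid_0']['auc'])})
# best = pd.DataFrame(results).sort_values('auc', ascending=False).iloc[0]
# print('best parameters:', best.to_dict())           # after the loops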