-
Notifications
You must be signed in to change notification settings - Fork 0
/
predict_model_xgboost.py
98 lines (76 loc) · 3.08 KB
/
predict_model_xgboost.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pandas as pd
import numpy as np
import xgboost as xgb
from os.path import join
from sklearn.cross_validation import train_test_split
from sklearn import manifold
data_path = r'/data/'
def preprocess(df):
f = [c for c in df.columns if not c in['fault_severity', 'tsne0', 'tsne1']]
df[f] = np.sqrt(df[f]+3.0/8.0)
return df
#t-Distributed Stochastic Neighbor Embedding for dimentionality reduction
def add_tsne_features(dtrain, dtest):
tmp = dtrain.append(dtest)
tmp = tmp.reindex_axis(dtrain.columns, axis=1) # reorder columns again
features = tmp.columns.drop(['id', 'fault_severity'])[:39]
features = list(features) + ['severity_type']
tmp=tmp[features].groupby('location').mean()
tsne = manifold.TSNE(n_components=2, verbose=2, n_iter=500, perplexity=30)
X_2d = tsne.fit_transform(tmp.as_matrix())
tmp = pd.DataFrame(tmp.reset_index(), columns=('location', 'tsne0', 'tsne1'))
tmp[['tsne0', 'tsne1']] = X_2d
dtrain = dtrain.merge(tmp, on='location', sort=False, how='left')
dtest = dtest.merge(tmp, on='location', sort=False, how='left')
return dtrain, dtest
test = pd.read_csv(join(data_path, 'test_merged.csv'))
train = pd.read_csv(join(data_path, 'train_merged.csv'))
train, test = add_tsne_features(train, test)
features = train.columns.drop(['id', 'fault_severity', 'location'])
mode = 'fulltrain' # validate|fulltrain
n_classes = 3
prepr = True
params = {'objective': 'multi:softprob',
'eval_metric': ['merror', 'mlogloss'],
'num_class': n_classes,
'max_depth': 5,
'eta': 0.05,
'sub_sample': 0.7,
'colsample_bytree': 0.4,
'min_child_weight': 0,
'seed': 1,
'silent': 1}
if prepr:
train = preprocess(train)
X = train[features].values
y = train['fault_severity'].values
if mode == 'validate':
print("Validating...")
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.1,
random_state=15,
stratify=y)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
watchlist = [(dtrain,'train'), (dtest,'eval')]
num_round = 10000
bst = xgb.train(params, dtrain, num_round, watchlist,
early_stopping_rounds=1000)
elif mode == 'fulltrain':
print("Full training...")
dtrain = xgb.DMatrix(X, label=y)
watchlist = [(dtrain,'train')]
num_round = 1126
bst = xgb.train(params, dtrain, num_round, watchlist)
print("Predicting")
if prepr:
test = preprocess(test)
X_test = test[features].values
ypred = bst.predict(xgb.DMatrix(X_test)).reshape(-1, n_classes)
result = pd.read_csv(join(data_path, 'sample_result.csv'))
cols = ['predict_0', 'predict_1', 'predict_2']
result[cols] = ypred
result.to_csv(join(data_path, 'results',
'result.csv'), index=False)
else:
print("Usupported mode")