-
Notifications
You must be signed in to change notification settings - Fork 334
/
2.py
32 lines (24 loc) · 1.06 KB
/
2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
d_train = pd.read_csv("train-0.1m.csv")
d_test = pd.read_csv("test.csv")
d_train_test = d_train.append(d_test)
vars_categ = ["Month","DayofMonth","DayOfWeek","UniqueCarrier", "Origin", "Dest"]
vars_num = ["DepTime","Distance"]
def get_dummies(d, col):
dd = pd.get_dummies(d.ix[:, col])
dd.columns = [col + "_%s" % c for c in dd.columns]
return(dd)
%time X_train_test_categ = pd.concat([get_dummies(d_train_test, col) for col in vars_categ], axis = 1)
X_train_test = pd.concat([X_train_test_categ, d_train_test.ix[:,vars_num]], axis = 1)
y_train_test = np.where(d_train_test["dep_delayed_15min"]=="Y", 1, 0)
X_train = X_train_test[0:d_train.shape[0]]
y_train = y_train_test[0:d_train.shape[0]]
X_test = X_train_test[d_train.shape[0]:]
y_test = y_train_test[d_train.shape[0]:]
md = RandomForestClassifier(n_estimators = 500, n_jobs = -1)
%time md.fit(X_train, y_train)
%time phat = md.predict_proba(X_test)[:,1]
metrics.roc_auc_score(y_test, phat)