-
Notifications
You must be signed in to change notification settings - Fork 0
/
greenwell-boehmke.R
323 lines (255 loc) · 8.66 KB
/
greenwell-boehmke.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
# Setup ------------------------------------------------------------------------
# Simulate 500 observations from the Friedman 1 benchmark regression problem
set.seed(101)  # for reproducibility
trn <- as.data.frame(mlbench::mlbench.friedman1(500))  # see ?mlbench.friedman1
# Rename the predictors from "x.1", ..., "x.10" to "X1", ..., "X10"
names(trn) <- sub(pattern = "^x\\.", replacement = "X", x = names(trn))
# Quick look at the simulated data
tibble::as_tibble(trn)
# Load required packages
library(dplyr)        # for easy data wrangling
library(ggplot2)      # for nicer graphics
library(gridExtra)    # for grid.arrange()
library(randomForest) # for fitting random forests
library(rpart)        # for fitting CART-like decision trees
library(vip)          # for variable importance plots
library(xgboost)      # for fitting GBMs
# Model-specific VI ------------------------------------------------------------
# Fit a single CART-like regression tree
tree <- rpart(y ~ ., data = trn)
# Fit a random forest; importance = TRUE stores importance scores on the fit
set.seed(101)
rfo <- randomForest(y ~ ., data = trn, importance = TRUE)
# Fit a GBM with xgboost
set.seed(102)
bst <- xgboost(
  data = data.matrix(subset(trn, select = -y)),  # feature matrix (drop response)
  label = trn$y,
  # "reg:linear" is deprecated in xgboost; "reg:squarederror" is the same
  # squared-error regression objective under its current name
  objective = "reg:squarederror",
  nrounds = 100,
  max_depth = 5,
  eta = 0.3,
  verbose = 0  # suppress printing
)
# VI scores for the single regression tree (stored on the rpart fit)
vi_tree <- tibble::rownames_to_column(
  data.frame("Importance" = tree$variable.importance),
  "Feature"
)
# VI scores for the RF (stored on the randomForest fit)
vi_rfo <- tibble::rownames_to_column(
  data.frame("Importance" = rfo$importance),
  "Feature"
)
# VI scores for the GBM (Gain column of xgboost's importance table)
vi_bst <- select(
  as.data.frame(xgb.importance(model = bst)),
  Feature,
  Importance = Gain
)
# Plot model-specific VI scores side by side (ggplot2 is attached above;
# grid.arrange() comes from gridExtra)
p1 <- vip(tree) + ggtitle("Single tree")
p2 <- vip(rfo) + ggtitle("Random forest")
p3 <- vip(bst) + ggtitle("Gradient boosting")
# Figure: write the three VIPs to a single-row PDF
pdf("figures/vi-plots.pdf", width = 9, height = 3)
grid.arrange(p1, p2, p3, nrow = 1)
dev.off()
# Extract (tibble of) VI scores; vip and ggplot2 are already attached above,
# so the redundant library() calls were removed
vi(tree) # CART-like decision tree
vi(rfo)  # RF
vi(bst)  # GBM
# Dot-plot variant: top 5 GBM features as red triangles on a light theme
pdf("figures/dot-plot.pdf", width = 7, height = 4.326)
vip(bst, num_features = 5, geom = "point", horizontal = FALSE,
    aesthetics = list(color = "red", shape = 17, size = 4)) +
  theme_light()
dev.off()
# Fit a LM with all main effects and all two-way interactions (y ~ .^2)
linmod <- lm(y ~ .^2, data = trn)
# Backward elimination; trace = 0 suppresses the per-step output
backward <- step(linmod, direction = "backward", trace = 0)
# Extract VI scores; the Sign column records each coefficient's direction
# (POS/NEG) -- importance itself is presumably |t-statistic|; see ?vi_model
vi(backward)
#> # A tibble: 21 x 3
#> Variable Importance Sign
#> <chr> <dbl> <chr>
#> 1 X4 14.2 POS
#> 2 X2 7.31 POS
#> 3 X1 5.63 POS
#> 4 X5 5.21 POS
#> 5 X3:X5 2.46 POS
#> 6 X1:X10 2.41 NEG
#> 7 X2:X6 2.41 NEG
#> 8 X1:X5 2.37 NEG
#> 9 X10 2.21 POS
#> 10 X3:X4 2.01 NEG
#> # … with 11 more rows
# Plot VI scores for every term retained by the backward elimination
pdf("figures/vip-step.pdf", width = 7, height = 4.326)
vip(backward, num_features = length(coef(backward)))
dev.off()
# Load required packages
library(earth)
# Fit a MARS model; degree = 2 permits two-way interaction terms and
# pmethod = "exhaustive" uses an exhaustive search during the pruning pass
mars <- earth(y ~ ., data = trn, degree = 2, pmethod = "exhaustive")
# Extract VI scores; the noise features get zero importance (see transcript)
vi(mars)
#> # A tibble: 10 x 2
#> Variable Importance
#> <chr> <dbl>
#> 1 X4 100
#> 2 X1 83.2
#> 3 X2 83.2
#> 4 X5 59.3
#> 5 X3 43.5
#> 6 X6 0
#> 7 X7 0
#> 8 X8 0
#> 9 X9 0
#> 10 X10 0
# Plot VI scores
pdf("figures/vip-earth.pdf", width = 7, height = 4.326)
vip(mars)
dev.off()
# Load required packages
library(nnet)
# Fit a single-hidden-layer neural network (7 hidden units, weight decay 0.1);
# linout = TRUE requests a linear output unit for regression
set.seed(0803)  # nnet starts from random weights
nn <- nnet(y ~ ., data = trn, size = 7, decay = 0.1, linout = TRUE, maxit = 500)
# VIPs using two NN-specific algorithms (Garson's and Olden's methods)
p1 <- vip(nn, type = "garson")
p2 <- vip(nn, type = "olden")
# Figure X
# NOTE(review): grid.arrange() comes from gridExtra, which this script never
# attaches explicitly -- confirm it is loaded before running
pdf("figures/vip-model-nn.pdf", width = 7, height = 3.5)
grid.arrange(p1, p2, nrow = 1)
dev.off()
# PDP method -------------------------------------------------------------------
# Load required packages
library(pdp)
# Fit a projection pursuit regression (PPR) model (nterms was chosen using the
# caret package with 5 repeats of 5-fold cross-validation)
pp <- ppr(y ~ ., data = trn, nterms = 11)
# Construct a PDP (as a ggplot object) for each of the 10 features
features <- paste0("X", 1:10)
pdps <- lapply(features, FUN = function(x) {
  autoplot(partial(pp, pred.var = x)) +
    ylim(range(trn$y)) +  # common y-axis scale so panels are comparable
    theme_light()
})
# Figure: arrange all 10 PDPs in a 2 x 5 grid
pdf("figures/pdp-ppr.pdf", width = 15, height = 5)
grid.arrange(grobs = pdps, ncol = 5)
dev.off()
# Plot VI scores derived from each feature's partial dependence function
# (method = "pdp"; see the vip package documentation)
p1 <- vip(pp, method = "pdp") + ggtitle("PPR")
p2 <- vip(nn, method = "pdp") + ggtitle("NN")
# Figure X
pdf("figures/vip-ppr-nn.pdf", width = 7, height = 3.5)
grid.arrange(p1, p2, nrow = 1)
dev.off()
# ICE curve method -------------------------------------------------------------
# Construct ICE curves (one curve per training observation) for each feature
ice_curves <- lapply(features, FUN = function(x) {
  p <- partial(pp, pred.var = x, ice = TRUE)
  autoplot(p, alpha = 0.1) +  # low alpha keeps overlapping curves readable
    ylim(range(trn$y)) +
    theme_light()
})
# Figure: arrange all 10 ICE plots in a 2 x 5 grid
pdf("figures/ice-ppr.pdf", width = 15, height = 5)
grid.arrange(grobs = ice_curves, ncol = 5)
dev.off()
# Plot VI scores derived from the individual conditional expectation curves
# (method = "ice")
p1 <- vip(pp, method = "ice") + ggtitle("PPR")
p2 <- vip(nn, method = "ice") + ggtitle("NN")
# Figure X
pdf("figures/vip-ice-ppr-nn.pdf", width = 7, height = 3.5)
grid.arrange(p1, p2, ncol = 2)
dev.off()
# PDP-based VI scores; vi() also stashes the individual PDPs in an attribute
vis <- vi(pp, method = "pdp")
vis
# Figure X: replot the stored PDPs with base graphics in a 2 x 5 grid
pdf("figures/pdp-from-attr.pdf", width = 10, height = 5)
par(mfrow = c(2, 5))
pdp_list <- attr(vis, which = "pdp")  # hoist: extract once, not per iteration
for (name in features) {  # reuse `features` instead of repeating paste0()
  plot(pdp_list[[name]], type = "l", ylim = c(9, 19), las = 1)
}
dev.off()
# Permutation method -----------------------------------------------------------
# Plot permutation-based VI scores, using R-squared as the performance metric
set.seed(2021) # for reproducibility (the feature shuffling is random)
p1 <- vip(pp, method = "permute", target = "y", metric = "rsquared",
pred_wrapper = predict) + ggtitle("PPR")
p2 <- vip(nn, method = "permute", target = "y", metric = "rsquared",
pred_wrapper = predict) + ggtitle("NN")
# Figure X
pdf("figures/vip-permute-ppr-nn.pdf", width = 7, height = 3.5)
grid.arrange(p1, p2, ncol = 2)
dev.off()
# Use 10 Monte Carlo reps (nsim = 10) to average over permutation randomness
set.seed(403) # for reproducibility
vi(pp, method = "permute", target = "y", metric = "rsquared",
pred_wrapper = predict, nsim = 10)
# Custom loss function: mean absolute error (lower values indicate better fit)
mae <- function(actual, predicted) {
  mean(abs(predicted - actual))
}
# Figure X: permutation VI for the NN using the custom MAE loss defined above
set.seed(2321) # for reproducibility
pdf("figures/vip-permute-nn-mae.pdf", width = 7, height = 4.326)
vip(nn, method = "permute", target = "y", metric = mae,
smaller_is_better = TRUE, pred_wrapper = nnet:::predict.nnet) +
ggtitle("Custom loss function: MAE")
dev.off()
# Figure X: permutation VI computed on a random subset of the training data
set.seed(2327) # for reproducibility
pdf("figures/vip-permute-nn-sample.pdf", width = 7, height = 4.326)
# NOTE(review): no pred_wrapper is supplied here -- newer vip releases require
# one for method = "permute"; confirm against the installed version
vip(nn, method = "permute",
train = trn[sample(nrow(trn), size = 400), ], # sample 400 observations
target = "y", metric = "rmse") +
ggtitle("Using a random subset of training data")
dev.off()
# Figure X: show every individual permutation score, not just the averages
set.seed(8264) # for reproducibility
pdf("figures/vip-permute-nn-all.pdf", width = 7, height = 4.326)
vip(nn, method = "permute", target = "y", metric = "rmse", nsim = 10,
all_permutations = TRUE, fill = "red", alpha = 0.5) +
ggtitle("Plotting all permutation scores")
dev.off()
# Use sparklines to characterize feature effects -------------------------------
# First, compute a tibble of variable importance scores using any method
# NOTE(review): unlike the other permutation calls, no seed is set here, so
# these scores are not reproducible -- consider set.seed() for consistency
var_imp <- vi(rfo, method = "permute", metric = "rmse", target = "y")
# Next, convert to an html-based data table with sparklines
add_sparklines(var_imp, fit = rfo)
# Predict the sale price for a home --------------------------------------------
# Load the Ames housing data and split into predictors (X) and response (y)
ames <- AmesHousing::make_ames()
X <- subset(ames, select = -Sale_Price)
y <- ames$Sale_Price
# Load required packages
library(SuperLearner)
# List of base learners for the ensemble
learners <- c("SL.xgboost", "SL.ranger", "SL.earth", "SL.glmnet", "SL.ksvm")
# Stack models; cross-validation uses 5 shuffled folds, verbose = TRUE prints
# progress while each base learner is fit
set.seed(840)
ctrl <- SuperLearner.CV.control(V = 5L, shuffle = TRUE)
sl <- SuperLearner(Y = y, X = X, SL.library = learners, verbose = TRUE,
cvControl = ctrl)
sl # print the fitted ensemble
# Prediction wrapper functions; predict() on the fitted ensemble returns a
# list whose $pred component holds the numeric predictions
imp_fun <- function(object, newdata) {
  # One prediction per row of `newdata` (used for permutation importance)
  predict(object, newdata = newdata)$pred
}
par_fun <- function(object, newdata) {
  # Average prediction over `newdata` (used for the sparkline/PDP summaries)
  mean(imp_fun(object, newdata))
}
# Setup parallel backend
library(doParallel) # load the parallel backend
cl <- makeCluster(5) # use 5 workers
registerDoParallel(cl) # register the parallel backend
# Permutation-based feature importance (5 Monte Carlo reps, run in parallel)
set.seed(278)
var_imp <- vi(sl, method = "permute", train = X, target = y, metric = "rmse",
pred_wrapper = imp_fun, nsim = 5, parallel = TRUE)
# Add sparkline representation of feature effects for the top 10 features
add_sparklines(var_imp[1L:10L, ], fit = sl, pred.fun = par_fun, train = X,
digits = 2, verbose = TRUE, trim.outliers = TRUE,
grid.resolution = 20, parallel = TRUE)
# Shut down cluster
# NOTE(review): if vi() or add_sparklines() errors above, this line never runs
# and the 5 workers leak -- consider tryCatch(finally = stopCluster(cl))
stopCluster(cl)