# CARTmtcars.R
library(rpart)
library(parallel)
# quick look at the data and at the pairwise relationships between the variables
head(mtcars)
plot(mtcars)
# we are trying to predict mpg from the other variables
# functionalize the CART algorithm: grow a maximal tree, then prune it back to the
# best subtree selected with the 1-SE rule on the cross-validated error
getBestTree <- function(data) {
  # grow the maximal tree: the response is the first column, the predictors are the rest
  myMaxTree <- rpart(data[, 1] ~ ., data = data[, -1],
                     control = rpart.control(minsplit = 2, cp = 0))
  # plot(myMaxTree)
  # text(myMaxTree)
  # print(myMaxTree)
  # print(summary(myMaxTree))
  # get the CP table of myMaxTree
  # (columns: 1 = CP, 2 = nsplit, 3 = rel error, 4 = xerror, 5 = xstd)
  myCPtable <- myMaxTree$cptable
  # myPlotCP <- plotcp(myMaxTree)
  # get the error threshold used to select the best tree (1-SE rule)
  # several trees may share the minimal xerror: break ties by smallest xstd,
  # then smallest nsplit, so that the selected row is unique
  myCPtable_sorted <- myCPtable[order(myCPtable[, 4], myCPtable[, 5], myCPtable[, 2]), ]
  myThreshold <- myCPtable_sorted[1, 4] + myCPtable_sorted[1, 5]
  # print("threshold = ")
  # print(myThreshold)
  # get the best CP to be able to get the best tree
  myEligibleCP <- myCPtable[myCPtable[, 4] <= myThreshold, ]
  # if several trees are eligible, keep the one with the fewest splits
  if (is.vector(myEligibleCP) == FALSE) {
    myBestCP <- myEligibleCP[myEligibleCP[, 2] == min(myEligibleCP[, 2]), 1] # must be unique
  } else {
    # otherwise a single row was selected and we are done
    myBestCP <- myEligibleCP[1] # must be unique
  }
  # print("best CP =")
  # print(myBestCP)
  # prune the maximal tree at the selected complexity
  myBestTree <- prune(myMaxTree, cp = myBestCP)
  # plot(myBestTree)
  # text(myBestTree)
  # print(myBestTree)
  # print(summary(myBestTree))
  return(myBestTree)
}
# OPTIONAL
# Let us determine the CART tree associated with mtcars
# myBestTree <- getBestTree(mtcars)
# unstable: the cross-validated errors in rpart are random, so the selected tree can differ from one run to the next
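# OPTIONAL sketch: one way to inspect the selected tree (printcp and plotcp come from rpart, already loaded);
# this assumes the pruning kept at least one split, otherwise the plot call fails on a root-only tree
# printcp(myBestTree)                  # CP table of the pruned tree
# plot(myBestTree); text(myBestTree)   # quick look at its structure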
# Let us now combine bagging with CART and plot the prediction error as a function of K (the number of bootstrap samples)
# split our data, without replacement, into a learning sample and a testing sample
getSplit <- function(data, ts) {
  # ts = testing sample size
  n <- nrow(data)
  testIndex <- sample(1:n, ts)
  return(list(learnData = data[-testIndex, ], testData = data[testIndex, ]))
}
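# OPTIONAL sketch: a single split of mtcars with 10 test rows, just to check the sizes returned by getSplit
# (mtcars has 32 rows, so the learning sample keeps the remaining 22)
# s <- getSplit(mtcars, 10)
# nrow(s$learnData) # 22
# nrow(s$testData)  # 10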
# create a bootstrap sample of size bs, drawn with replacement from the rows of data
getBootstrap <- function(data, bs) { # bs = bootstrap sample (bag) size
  len <- nrow(data)
  myRows <- sample(1:len, bs, replace = TRUE)
  return(data[myRows, ])
}
# from the split data, get the prediction error of a bagging of K bootstrap samples of size bs,
# evaluated on the test sample (of size ts)
getPoint <- function(learnData, testData, K, ts, bs) { # ts = test size, bs = bootstrap sample size
  # get K predictions from K bootstrap samples of size bs drawn from learnData
  # (the first column of testData holds the real Y values, so it is dropped from newdata)
  myKpredict <- replicate(K, predict(getBestTree(getBootstrap(learnData, bs)), newdata = testData[-1]))
  # aggregate the K predictions by averaging them
  myBagPredict <- rowMeans(myKpredict)
  # norm-2 error between the bagged prediction and the real values, scaled by the number of test points
  return(norm(myBagPredict - testData[1], type = "2") / length(myBagPredict))
}
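# OPTIONAL sketch: a single error point, computed on one split with K = 10 bootstrap samples
# (the values K = 10 and bs = nrow(mtcars) are arbitrary choices for illustration)
# s <- getSplit(mtcars, 10)
# getPoint(s$learnData, s$testData, K = 10, ts = 10, bs = nrow(mtcars))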
# prepare a common learning/testing split and the list of K values (2 to Kmax) for the bagging experiment
prepareCompute <- function(Kmax) {
  mySplit <- getSplit(mtcars, 10)
  # the values 2, ..., Kmax, in random order
  myK <- sample(2:Kmax, Kmax - 1)
  myPrep <- list(learnData = mySplit$learnData, testData = mySplit$testData, myK = myK)
  return(myPrep)
}
# process our test serially (no parallelization) and return the error obtained for each K
simpleCompute <- function(Kmax) {
  myPrep <- prepareCompute(Kmax)
  myErr <- sapply(myPrep$myK, function(x) { return(getPoint(myPrep$learnData, myPrep$testData, x, 10, nrow(mtcars))) })
  return(as.vector(myErr))
}
# process our test in parallel, with one worker per detected core, and plot the error as a function of K
parCompute <- function(Kmax) {
  myParPrep <- prepareCompute(Kmax)
  myEnvir <- environment()
  # create a cluster with one worker per detected core
  numCores <- detectCores()
  cl <- makeCluster(numCores)
  # the workers are fresh R sessions: load rpart on them and export the useful objects
  clusterEvalQ(cl, library(rpart))
  clusterExport(cl, c("getBestTree", "getSplit", "getBootstrap", "getPoint", "prepareCompute"))
  clusterExport(cl, c("Kmax", "myParPrep"), envir = myEnvir)
  # parallelize sapply
  myErr <- parSapply(cl, myParPrep$myK, FUN = function(x) { return(getPoint(myParPrep$learnData, myParPrep$testData, x, 10, nrow(mtcars))) })
  # close the cluster
  stopCluster(cl)
  plot(myParPrep$myK, myErr)
  return(as.vector(myErr))
}
# compare the serial and parallel run times for several values of Kmax
myKmaxList <- c(50, 100, 200)
myResults <- lapply(myKmaxList, FUN = function(x) { list(system.time(simpleCompute(x)), system.time(parCompute(x))) })
names(myResults) <- myKmaxList
myResults
# a larger run, where the parallel version is expected to pay off despite the cluster start-up cost
system.time(parCompute(500))
system.time(simpleCompute(500))
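# OPTIONAL sketch: if only wall-clock time matters, the "elapsed" entries of the two timings can be compared directly
# system.time(parCompute(500))["elapsed"]
# system.time(simpleCompute(500))["elapsed"]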