-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDecision Tree-ROC.R
89 lines (64 loc) · 2.4 KB
/
Decision Tree-ROC.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Fitting Classification Tree Models
# We are going to use the Carseats dataset from the ISLR library
library(ISLR) # data library for statistical usage
library(tree) #to fit the decision trees
library(ROCR)
library(pROC)
# ---- Load and prepare the data ----
# The columns are referenced explicitly as Carseats$... instead of calling
# attach(): attach() places a *copy* of the columns on the search path, and
# that copy goes stale as soon as Carseats is reassigned further down.
stopifnot("Sales" %in% names(Carseats)) # guards against re-running on the already-trimmed data
head(Carseats) # shows a data sample
Carseats # show the original data itself
str(Carseats)
range(Carseats$Sales) # shows the range of values in the column

# Create a categorical response based on Sales: "Yes" when Sales >= 8.
# It is built as a factor on purpose: since R 4.0 data.frame() keeps strings
# as character, and tree() needs a factor response to fit a classification
# tree. The explicit levels keep the class order fixed as No < Yes.
High <- factor(
  ifelse(Carseats$Sales >= 8, "Yes", "No"),
  levels = c("No", "Yes")
)
# Numeric alternative: High <- ifelse(Carseats$Sales >= 8, 1, 0)
High
length(High)
dim(Carseats)
# Original note: Carseats$Sales <- as.factor(Carseats$Sales) gives 336 levels
Carseats <- data.frame(Carseats, High)
names(Carseats)
str(Carseats)
# Drop Sales by name (not by position) so the response High is not trivially
# predictable from the column it was derived from.
Carseats <- Carseats[, names(Carseats) != "Sales", drop = FALSE]
names(Carseats)
dim(Carseats)
# ---- Split the data into training and testing halves ----
# A fixed seed makes the random split reproducible from run to run.
set.seed(2)
# Randomly pick half of the row indices for training. seq_len() stays correct
# for an empty data frame (1:0 would yield c(1, 0)), and %/% makes the sample
# size an explicit whole number even when the row count is odd.
train <- sample(seq_len(nrow(Carseats)), nrow(Carseats) %/% 2)
summary(train)
test <- -train # negative indices select every row that is not in the training sample
training_data <- Carseats[train, ] # rows, columns
testing_data <- Carseats[test, ]
# True labels of the test rows, taken from the stand-alone High vector, which
# was built row-for-row alongside Carseats.
Testing_High <- High[test]
# ---- Fit the unpruned tree on the training half ----
# High is the response; every other column of training_data is a candidate
# predictor.
tree_model <- tree(formula = High ~ ., data = training_data)
plot(tree_model)
# pretty = 0 was added so the categorical branches are readable on the plot.
text(tree_model, pretty = 0)
summary(tree_model)

# ---- Evaluate the unpruned tree on the held-out half ----
tree_pred <- predict(tree_model, newdata = testing_data, type = "class")
# Share of test rows whose predicted class differs from the true label,
# i.e. the misclassification error.
mean(tree_pred != Testing_High)
tree_pred
# ---- Prune the tree ----
# Cross-validation tells us where to stop pruning; prune.misclass makes the
# misclassification count (rather than deviance) drive the pruning sequence.
set.seed(3)
cv_tree <- cv.tree(tree_model, FUN = prune.misclass)
names(cv_tree) # includes tree size, error rate ...
plot(cv_tree$size, cv_tree$dev, type = "b") # x axis variable, y axis variable

# Choose the pruning size from the cross-validation result instead of
# hard-coding the value read off the plot (the original used 9): the CV outcome
# depends on the random number generator, so a fixed constant can disagree with
# the actual minimum. Among the sizes that reach the lowest CV error, the
# smallest tree is kept. Root-only trees (size 1) are excluded because the
# plot()/text() calls below need at least one split.
multi_node <- cv_tree$size > 1
lowest_dev <- min(cv_tree$dev[multi_node])
best_size <- min(cv_tree$size[multi_node & cv_tree$dev == lowest_dev])
best_size

pruned_model <- prune.misclass(tree_model, best = best_size)
plot(pruned_model)
text(pruned_model, pretty = 0)
summary(pruned_model)

# Check how the pruned tree does on the held-out half; tree_pred now holds the
# pruned model's predictions.
tree_pred <- predict(pruned_model, newdata = testing_data, type = "class")
mean(tree_pred != Testing_High)
# ---- Confusion matrix ----
# Cross-tabulate the predicted classes (rows) against the true test labels
# (columns), using the most recently assigned tree_pred.
t <- table(predictions = tree_pred, actual = Testing_High)
t

# ---- Accuracy ----
# The diagonal holds the correctly classified counts, so this is the share of
# correct predictions.
sum(diag(t)) / sum(t)

# ---- ROC curve / AUC preparation ----
# NOTE(review): type = "tree" is requested here; an ROC curve presumably needs
# numeric class scores instead (e.g. type = "vector") -- TODO confirm before
# passing pred_model to ROCR/pROC.
pred_model <- predict(pruned_model, newdata = testing_data, type = "tree")
pred_model