-
Notifications
You must be signed in to change notification settings - Fork 0
/
forest.R
203 lines (161 loc) · 6.31 KB
/
forest.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# ************************************************
#
# [Based on Lab 4's code]
#
# simpleDT() :
#
# Create a C5.0 decision tree (rule output, single trial) on the raw
# dataset and evaluate it on the test dataset.
# A decision tree may not need the dataset to be pre-processed
#
# INPUT :
# Data Frame - train - original train dataset
# Data Frame - test - original test dataset
# list - config - list of configurations; config$OUTPUT_FIELD names
#                 the class column
# boolean - plot - TRUE = plot charts (forwarded to testModel())
#
# OUTPUT :
# : Data Frame - measures - performance metrics, as returned by testModel()
#
# ERRORS :
# Stops if train does not contain exactly one column named
# config$OUTPUT_FIELD after NConvertClass() has been applied.
#
# ************************************************
simpleDT <- function(train, test, config, plot = TRUE) {
  train <- NConvertClass(train, config)
  test <- NConvertClass(test, config)

  positionClassOutput <- which(names(train) == config$OUTPUT_FIELD)

  # Negative indexing with an empty index (field not found) would select
  # zero columns rather than "all but the class", so fail loudly instead.
  if (length(positionClassOutput) != 1L) {
    stop("simpleDT: expected exactly one column named '",
         config$OUTPUT_FIELD, "' in train, found ",
         length(positionClassOutput), call. = FALSE)
  }

  tree <- C50::C5.0(x = train[-positionClassOutput],
                    y = factor(train[[positionClassOutput]]),
                    rules = TRUE,
                    trials = 1)

  # plot is passed through so that plot = FALSE is honoured, matching the
  # other model functions in this file.
  measures <- testModel(myModel = tree,
                        testDataset = test,
                        title = "Original Dataset. DT C5.0",
                        config = config,
                        plot = plot)
  measures
}
# ************************************************
#
# [Based on Lab 4's code]
#
# fullDT() :
#
# Create C5 Decision Tree on pre-processed dataset, evaluate it on the
# test dataset and, when plotting is requested, print the model summary,
# the field importance, the rules and write the tree diagram to a PDF.
#
# INPUT :
# Data Frame - train - train dataset
# Data Frame - test - test dataset
# list - config - list of configurations; reads config$OUTPUT_FIELD,
#                 config$TREE_FILENAME and config$NODE_DEPTH
# int - boost - number of trees to boost
# boolean - plot - TRUE = plot charts
#
# OUTPUT :
# : Data Frame - measures - performance metrics, as returned by testModel()
#
# SIDE EFFECTS (only when plot is TRUE) :
# Assigns Global_train_inputs and Global_train_expected in an enclosing
# (normally the global) environment and writes config$TREE_FILENAME.
#
# ERRORS :
# Stops if train does not contain exactly one column named
# config$OUTPUT_FIELD.
#
# ************************************************
fullDT <- function(train, test, config, boost = 1, plot = TRUE) {
  positionClassOutput <- which(names(train) == config$OUTPUT_FIELD)

  # Negative indexing with an empty index (field not found) would select
  # zero columns rather than "all but the class", so fail loudly instead.
  if (length(positionClassOutput) != 1L) {
    stop("fullDT: expected exactly one column named '",
         config$OUTPUT_FIELD, "' in train, found ",
         length(positionClassOutput), call. = FALSE)
  }

  # train data: dataframe with the input fields
  train_inputs <- train[-positionClassOutput]
  # train data: vector with the expected output
  train_expected <- train[[positionClassOutput]]

  # ************************************************
  # Create a standard Decision Tree using the C5.0 algorithm
  # Uses library C50
  # Outputs the tree in the format of rules
  myTitle <- "Preprocessed Dataset. DT C5.0"
  if (boost > 1) {
    myTitle <- paste(myTitle, "BOOSTED=", boost)
  }
  print(myTitle)

  tree <- C50::C5.0(x = train_inputs,
                    y = factor(train_expected),
                    rules = TRUE,
                    trials = boost)

  # Use the created decision tree with the test dataset
  # to determine best classification threshold & calculate metrics
  measures <- testModel(myModel = tree,
                        testDataset = test,
                        config = config,
                        title = myTitle,
                        plot = plot)

  if (plot == TRUE) {
    print(summary(tree))

    # Get importance of the input fields, strongest first
    importance <- C50::C5imp(tree, metric = "usage")
    names(importance) <- "Strength"
    strongest_first <- order(importance$Strength, decreasing = TRUE)
    importance <- importance[strongest_first, , drop = FALSE]
    print(formattable::formattable(importance))

    # Plot the importance fields
    barplot(t(importance), las = 2,
            border = 0, cex.names = 0.7,
            main = myTitle)

    # Function to output the tree as rules to a file
    dftreerules <- NDT5RuleOutput(tree)
    print(formattable::formattable(dftreerules))

    # ************************************************
    # Creates the same C5.0 decision tree & output as a tree structure, plot it
    # The "partykit" library requires the variables (wrongly) to be global,
    # which is the only reason <<- is used here.
    print("Plot decision tree to file called diagram.pdf")
    Global_train_inputs <<- train_inputs
    Global_train_expected <<- train_expected

    # :: is used to specify a function within the named package
    # The model is refitted without rules = TRUE for the tree-structured plot.
    tree <- C50::C5.0(x = Global_train_inputs,
                      factor(Global_train_expected),
                      trials = boost)

    # ::: is used to directly access a member of a package that is internal
    graphtree <- C50:::as.party.C5.0(tree)

    # The plot is large - so print to a big PDF file
    pdf(config$TREE_FILENAME, width = 100, height = 50,
        paper = "special", onefile = FALSE)

    # The number is the node level of the tree to print.
    # finally guarantees the PDF device is closed even if plotting fails,
    # so a failed plot cannot leave a dangling graphics device open.
    tryCatch(
      plot(graphtree[config$NODE_DEPTH]),
      finally = dev.off()
    )
  }
  measures
}
# ************************************************
#
# [Based on Lab 4's code]
#
# randomForest() :
#
# Create Random Forest on pre-processed dataset, evaluate it on the test
# dataset and, when plotting is requested, chart and print the importance
# of the input fields.
#
# INPUT :
# Data Frame - train - train dataset
# Data Frame - test - test dataset
# list - config - list of configurations; reads config$OUTPUT_FIELD
#                 and config$TREE_NUMBER
# boolean - plot - TRUE = output charts/results
#
# OUTPUT :
# : Data Frame - measures - performance metrics, as returned by testModel()
#
# ERRORS :
# Stops if train does not contain exactly one column named
# config$OUTPUT_FIELD.
#
# ************************************************
randomForest <- function(train, test, config, plot = TRUE) {
  positionClassOutput <- which(names(train) == config$OUTPUT_FIELD)

  # Negative indexing with an empty index (field not found) would select
  # zero columns rather than "all but the class", so fail loudly instead.
  if (length(positionClassOutput) != 1L) {
    stop("randomForest: expected exactly one column named '",
         config$OUTPUT_FIELD, "' in train, found ",
         length(positionClassOutput), call. = FALSE)
  }

  myTitle <- paste("Random Forest =", config$TREE_NUMBER, "trees")
  print(myTitle)

  # train data: dataframe with the input fields
  train_inputs <- train[-positionClassOutput]
  # train data: vector with the expected output
  train_expected <- train[[positionClassOutput]]

  # The package function is called fully qualified because this wrapper
  # has the same name as randomForest::randomForest().
  # NOTE(review): mtry is passed as a possibly non-integer square root;
  # confirm how the package rounds it before changing this expression.
  rf <- randomForest::randomForest(train_inputs,
                                   factor(train_expected),
                                   ntree = config$TREE_NUMBER,
                                   importance = TRUE,
                                   mtry = sqrt(ncol(train_inputs)))

  # Use the created forest with the test dataset
  measures <- testModel(myModel = rf,
                        testDataset = test,
                        config = config,
                        title = myTitle,
                        plot = plot)

  if (plot == TRUE) {
    # Get importance of the input fields (type = 1, scaled), strongest first
    importance <- randomForest::importance(rf, scale = TRUE, type = 1)
    strongest_first <- order(importance, decreasing = TRUE)
    importance <- importance[strongest_first, , drop = FALSE]
    colnames(importance) <- "Strength"

    barplot(t(importance), las = 2, border = 0,
            cex.names = 0.7,
            main = myTitle)
    print(formattable::formattable(data.frame(importance)))
  }
  measures
}