-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathRandomForest_clean.R
69 lines (48 loc) · 2.29 KB
/
RandomForest_clean.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Libraries
library(readr)
library(tidyverse)
library(randomForestSRC)
# Load datasets
# Both CSV files are read relative to the current working directory.
questions <- read_csv("questions.csv")
clients <- read_csv("clients.csv")

# Merge datasets on 'ClientUno'
# The questions table calls its client key 'AskedByClientUno'; rename that
# column so both tables share the join key.
names(questions)[names(questions) == "AskedByClientUno"] <- "ClientUno"

# Client columns (join key plus attributes) attached to every question row.
client_cols <- c(
  "ClientUno", "EthnicIdentity",
  "Age", "Gender", "MaritalStatus",
  "NumberInHousehold", "AnnualIncome",
  "AllowedIncome"
)

# all.x = TRUE keeps every question row; questions without a matching client
# get NA in the client columns.
question_cli.df <- merge(
  x = questions,
  y = clients[, client_cols],
  by = "ClientUno",
  all.x = TRUE
)
# Coerce columns to Numeric or Factor
# Columns converted with as.numeric(); entries that cannot be parsed as
# numbers become NA (R emits a coercion warning when that happens).
numeric_cols <- c("Age", "NumberInHousehold", "AnnualIncome", "AllowedIncome")
# Columns converted with as.factor().
factor_cols <- c("Category", "EthnicIdentity", "MaritalStatus", "Gender")

question_cli.df[numeric_cols] <- lapply(
  question_cli.df[numeric_cols],
  as.numeric
)
question_cli.df[factor_cols] <- lapply(
  question_cli.df[factor_cols],
  as.factor
)
# Filter data from Missouri and create factor 'FnC'
# FnC flags whether a question is in the "Family and Children" category:
# 1 if it is, 0 if it is not, NA where Category is missing. The comparison
# result is converted to numeric first so the factor levels are "0"/"1"
# rather than "FALSE"/"TRUE".
question_cli_mo.df <- question_cli.df %>%
  filter(StateAbbr == "MO") %>%
  mutate(FnC = as.factor(as.numeric(Category == "Family and Children")))
# Train/Test Split
# 70% of the Missouri rows (rounded down) are drawn at random for training;
# all remaining rows form the test set.
set.seed(123) # fixed seed so the same rows are drawn on every run
n <- nrow(question_cli_mo.df)
# seq_len() is empty when n == 0, whereas 1:n would give c(1, 0).
# floor() makes the rounding of the fractional sample size explicit.
train <- sample(seq_len(n), size = floor(0.7 * n))
# Positive complement of `train`, in original row order. The negated form
# (-train) selects no rows at all when `train` is empty, instead of every row.
test <- setdiff(seq_len(n), train)
mo.train <- question_cli_mo.df[train, ]
mo.test <- question_cli_mo.df[test, ]
# Random Forest Model (with runtime)
# Fit a forest for the FnC factor on the client attributes of the training
# rows, recording wall-clock timestamps immediately before and after the fit.
start.tm <- Sys.time()
rf.mo1 <- rfsrc(
  formula = FnC ~ EthnicIdentity + Age + MaritalStatus + Gender +
    NumberInHousehold + AnnualIncome + AllowedIncome,
  data = mo.train
)
end.tm <- Sys.time()
# Elapsed fitting time; shown via auto-printing when run at top level.
end.tm - start.tm
# Predictions on test set
# Keep the prediction object so it can be inspected or evaluated afterwards,
# and print it explicitly so the summary also appears when the script is run
# through source(), where top-level values are not auto-printed.
rf.mo1.pred <- predict(rf.mo1, newdata = mo.test)
print(rf.mo1.pred)