-
Notifications
You must be signed in to change notification settings - Fork 0
/
randomForest_SMOTE.R
142 lines (110 loc) · 4.4 KB
/
randomForest_SMOTE.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
library(tidyverse)
library(dplyr)
library(tidymodels)
library(ggplot2)
library(imbalance)
library(randomForest)
# Note: the original author reports that a full run of this script took over
# 5 hours.

# Load data ----
# The CSV column "X" supplies the row names of the data frame.
train_csv <- "feature_engineering/train_data_model.csv"
df <- read.csv(train_csv, row.names = "X")

# Quick inspection: full print, column overview, per-column summary.
df
glimpse(df)
summary(df)

# Plot appearance ----
# Larger base font for all ggplot2 plots, larger text geoms (intended for
# confusion matrix plots), and a fixed plot size for notebook-style output.
theme_set(theme_gray(base_size = 18))
update_geom_defaults("text", list(size = 6))
options(repr.plot.width = 6, repr.plot.height = 5)

# Train/test split ----
# Seed is fixed so the 70/30 split is reproducible between runs.
set.seed(20231124)
data_split <- initial_split(df, prop = 0.7)
train_data <- training(data_split)
test_data <- testing(data_split)
# Class balance reporting and SMOTE resampling ----

# Emit "<prefix><level>: <count>, <level>: <count>, ..." as a message for a
# one-dimensional table of class counts.
report_class_counts <- function(prefix, counts) {
  pairs <- paste(names(counts), counts, sep = ": ", collapse = ", ")
  message(paste0(prefix, pairs))
}

# Class counts of the original (imbalanced) training data.
train_y_categ <- table(select(train_data, fraud_flag))
report_class_counts("Original dataset shape ", train_y_categ)

# Oversample the training data with SMOTE. oversample() is given the class
# column as a factor; afterwards the factor codes (1, 2) are mapped back to
# the labels "0" and "1".
smote_input <- mutate(train_data, fraud_flag = factor(fraud_flag))
smote_output <- oversample(
  smote_input,
  ratio = 0.99,
  method = "SMOTE",
  classAttr = "fraud_flag"
)
smote_train_df <- mutate(
  smote_output,
  fraud_flag = forcats::fct_recode(
    as.factor(as.integer(fraud_flag)),
    "0" = "1",
    "1" = "2"
  )
)

# Class counts of the resampled (balanced) training data.
smote_train_y_categ <- table(select(smote_train_df, fraud_flag))
report_class_counts("Resampled dataset shape ", smote_train_y_categ)

# fct_recode() already returns a factor; this conversion is kept as a guard
# so the downstream classifier always receives a factor response.
smote_train_df$fraud_flag <- as.factor(smote_train_df$fraud_flag)
smote_train_y_categ
# Fit the random forest ----
# All trees are grown in ONE randomForest() call. randomForest() has no
# argument for continuing from an existing forest (unknown arguments are
# silently absorbed by `...`), so calling it inside a loop with an increasing
# `ntree` retrains a brand-new forest on every iteration: 1 + 2 + ... + 100 =
# 5050 trees to obtain a 100-tree model. A single fit yields the same kind of
# model at roughly 1/50 of the cost.
#
# Progress is reported by randomForest itself: `do.trace = 10` prints the
# running OOB error every 10 trees.
#
# The response `fraud_flag` must be a factor for a classification forest.
# Variable importance is not computed (randomForest default); add
# `importance = TRUE` to the call if it is needed.
n_trees <- 100L

start_time <- Sys.time()
rf_model <- randomForest(
  fraud_flag ~ .,
  data = smote_train_df,
  ntree = n_trees,
  keep.forest = TRUE,
  do.trace = 10
)
elapsed_secs <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
cat(sprintf("%d trees fitted in %.2f seconds\n", n_trees, elapsed_secs))

# Print the fitted model summary (call, OOB error estimate, confusion matrix).
rf_model
# Evaluate on the held-out test set ----
# Predict exactly once and reuse the result everywhere below.
# predict.randomForest() breaks vote ties at random, so predicting a second
# time could yield labels that differ from the ones written to the results
# CSV, making the saved predictions and the reported metrics inconsistent.
rf_preds <- predict(rf_model, newdata = test_data)

# Test data with the predicted label appended as `Predicted_Fraud_Flag`.
result_df <- cbind(test_data, Predicted_Fraud_Flag = rf_preds)
result_df
write.csv(
  result_df,
  "feature_engineering/randomforestresults.csv",
  row.names = FALSE
)

# Confusion matrix: rows are the actual labels, columns the predicted labels.
conf_matrix <- table(
  Actual = result_df$fraud_flag,
  Predicted = result_df$Predicted_Fraud_Flag
)
print("Confusion Matrix:")
print(conf_matrix)

# Plain data frame version (row/column names are the class labels) for CSV.
conf_matrix_df <- as.data.frame.matrix(conf_matrix)

# Metrics. Index [2, 2] is treated as the true-positive cell, i.e. this
# assumes the positive (fraud) class is the second level of both the actual
# and predicted labels -- TODO confirm if the label encoding changes.
true_pos <- conf_matrix[2, 2]
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- true_pos / sum(conf_matrix[, 2])
recall <- true_pos / sum(conf_matrix[2, ])
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("\nAccuracy:", accuracy, "\n")
cat("Precision:", precision, "\n")
cat("Recall:", recall, "\n")
cat("F1 Score:", f1_score, "\n")

metrics_df <- data.frame(
  Accuracy = accuracy,
  Precision = precision,
  Recall = recall,
  F1_Score = f1_score
)

# RMSE between actual and predicted labels.
# The predictions are a factor, and as.numeric() on a factor returns the
# internal level codes (1, 2, ...) rather than the labels ("0", "1").
# Converting through as.character() first recovers the real label values and
# is harmless for columns that are already numeric.
label_to_numeric <- function(x) as.numeric(as.character(x))
actual_num <- label_to_numeric(result_df$fraud_flag)
predicted_num <- label_to_numeric(result_df$Predicted_Fraud_Flag)
rmse <- sqrt(mean((actual_num - predicted_num)^2))

cat("Root Mean Squared Error (RMSE):", rmse, "\n")
rmse_df <- data.frame(RMSE = rmse)

# Persist RMSE, metrics, and confusion matrix.
write.csv(rmse_df, "feature_engineering/rmse.csv", row.names = FALSE)
write.csv(
  metrics_df,
  "feature_engineering/model_metrics.csv",
  row.names = FALSE
)
write.csv(
  conf_matrix_df,
  "feature_engineering/confusion_matrix.csv",
  row.names = TRUE
)