-
Notifications
You must be signed in to change notification settings - Fork 0
/
logistic_model_SMOTE.R
64 lines (52 loc) · 1.63 KB
/
logistic_model_SMOTE.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
library(tidyverse)
library(dplyr)
library(tidymodels)
library(ggplot2)
library(imbalance)
# Load data ----
# Read the engineered training set; the CSV column named "X" supplies the
# row names instead of being kept as a regular column.
df <- read.csv(
  file = "feature_engineering/train_data.csv",
  row.names = "X"
)

# Quick look at the raw table: full print, column types, per-column summary
df
glimpse(df)
summary(df)
# Plot appearance ----
# Default plot dimensions read from the repr.plot.* options
# (presumably for notebook rendering -- confirm where this script is run)
options(repr.plot.width = 6, repr.plot.height = 5)
# Larger base font for every ggplot2 plot drawn after this point
theme_set(theme_gray(base_size = 18))
# Larger default size for text geoms (used by confusion matrix plots)
update_geom_defaults("text", list(size = 6))
# Kept for reference (disabled): convert the label to a factor up front
# df$fraud_flag <- as.factor(df$fraud_flag)

# Train/test split ----
# Fixed seed so the 70/30 partition is reproducible between runs.
# NOTE(review): the split is not stratified on fraud_flag; if the positive
# class is rare, consider the `strata` argument of initial_split().
set.seed(20231124)
data_split <- initial_split(data = df, prop = 0.7)
train_data <- data_split %>% training()
test_data <- data_split %>% testing()
# Class balance before resampling ----
# Count the rows per fraud_flag value in the original training set and
# report them as "label: count" pairs on one line.
train_y_categ <- table(select(train_data, fraud_flag))
message(
  "Original dataset shape ",
  paste(names(train_y_categ), train_y_categ, sep = ": ", collapse = ", ")
)
# Resample the training dataset using SMOTE ----
# fraud_flag is converted to a factor for the oversample() call and turned
# back into integer labels afterwards.
#
# The conversion back goes through as.character() on purpose: as.integer()
# applied directly to a factor returns the internal level codes (1, 2, ...),
# not the labels, which would silently recode the classes (e.g. 0/1 -> 1/2)
# and make the resampled labels disagree with train_data / test_data.
# NOTE(review): assumes the fraud_flag labels are integer-like (e.g. 0/1);
# non-numeric labels would become NA here -- confirm against the CSV.
smote_train_df <- train_data %>%
  mutate(fraud_flag = factor(fraud_flag)) %>%
  oversample(ratio = 0.99, method = "SMOTE", classAttr = "fraud_flag") %>%
  mutate(fraud_flag = as.integer(as.character(fraud_flag)))
# Class balance after resampling ----
# Count the rows per fraud_flag value in the SMOTE-resampled training set
# and report them as "label: count" pairs on one line.
smote_train_y_categ <- table(select(smote_train_df, fraud_flag))
message(
  "Resampled dataset shape ",
  paste(
    names(smote_train_y_categ),
    smote_train_y_categ,
    sep = ": ",
    collapse = ", "
  )
)
# Store the class label as a factor before it is used as the model response
smote_train_df[["fraud_flag"]] <- as.factor(smote_train_df[["fraud_flag"]])

# Inspect the resampled data: class counts, full table, column types
smote_train_y_categ
smote_train_df
glimpse(smote_train_df)
# Logistic regression ----
# Regress fraud_flag on every other column of the resampled training set,
# using the binomial family.
logistic_model <- glm(
  formula = fraud_flag ~ .,
  family = binomial,
  data = smote_train_df
)

# Show the fitted model's summary
summary(logistic_model)