model_prep.Rmd

Loading libraries

```{r}

library(dplyr)
library(caTools)
library(rpart)
library(rpart.plot)  
            
library(ipred)       
library(caret)   

library(pROC)

library(Metrics)

library(randomForest)

library(ggplot2)
library(plotly)

```

Loading data from the RDS file

```{r}

# Read the used-cars dataset from its serialized RDS file.
used_cars_data <- readRDS("./data/my_data.rds")

# Open the data viewer only in interactive sessions: View() needs a data
# viewer / GUI and is of no use (and may fail) when the document is rendered
# non-interactively.
if (interactive()) {
  View(used_cars_data)
}

```

Studying data

```{r}

# Frequency tables: number of rows per level of each categorical column.
vehicle_conditions <- count(used_cars_data, condition)

vehicle_status <- count(used_cars_data, title_status)

vehicle_cylinders <- count(used_cars_data, cylinders)

vehicle_paints <- count(used_cars_data, paint_color)

```


Split data into training data and testing data

```{r}

# Fix the RNG state so the partition below is reproducible.
set.seed(101)

# Logical mask produced by caTools::sample.split() on the price column;
# TRUE rows go to the training set, FALSE rows to the test set.
sample <- sample.split(used_cars_data$price, SplitRatio = 0.80)
train_data <- subset(used_cars_data, sample)
test_data <- subset(used_cars_data, !sample)

```

Create random forest model

```{r}

# Fit a random forest predicting price from every other column of the
# training set. importance = TRUE asks randomForest to also compute the
# variable-importance measures.
rf_model <- randomForest(
  price ~ .,
  data = train_data,
  mtry = 3,
  importance = TRUE
)

```

Load random forest model from rds

```{r}

# Restore a previously saved forest from disk; this assignment replaces any
# rf_model object already present in the session.
rf_model <- readRDS(file = "./model/rf_model.rds")

```


Importance and variables usage

```{r}

# Quick alternatives kept for reference:
# varImpPlot(rf_model)
# colnames(rf_model)
# varUsed(rf_model)

# Importance matrix -> data frame, with the predictor names moved from the
# row names into a regular column so ggplot can map them.
importance_matrix <- importance(rf_model)
imp <- as.data.frame(importance_matrix)
imp$varnames <- rownames(importance_matrix)
rownames(imp) <- NULL
# NOTE(review): these hard-coded names assume importance() returned exactly
# two measure columns; for a regression forest randomForest itself labels
# them differently -- confirm the labels below are the intended ones.
names(imp) <- c("MeanDecreaseAccuracy", "MeanDecreaseGini", "varnames")

# Lollipop chart of the first importance measure, predictors ordered by value.
mda_plot <- ggplot(
  imp,
  aes(
    x = reorder(varnames, MeanDecreaseAccuracy),
    y = MeanDecreaseAccuracy,
    colour = varnames
  )
) +
  geom_point(size = 3) +
  geom_segment(
    aes(x = varnames, xend = varnames, y = 0, yend = MeanDecreaseAccuracy),
    alpha = 0.7,
    size = 1
  ) +
  coord_flip() +
  labs(x = "", y = "MeanDecreaseAccuracy", title = "MeanDecreaseAccuracy") +
  theme_bw() +
  theme(legend.position = "none")

# Same chart for the second importance measure.
mdg_plot <- ggplot(
  imp,
  aes(
    x = reorder(varnames, MeanDecreaseGini),
    y = MeanDecreaseGini,
    colour = varnames
  )
) +
  geom_point(size = 3) +
  geom_segment(
    aes(x = varnames, xend = varnames, y = 0, yend = MeanDecreaseGini),
    alpha = 0.7,
    size = 1
  ) +
  coord_flip() +
  labs(x = "", y = "MeanDecreaseGini", title = "MeanDecreaseGini") +
  theme_bw() +
  theme(legend.position = "none")

# Render both charts as interactive plotly widgets.
ggplotly(mda_plot)
ggplotly(mdg_plot)

# Both plots side by side in a single widget:
# subplot(mda_plot, mdg_plot, nrows = 1, margin = 0.04, widths = c(0.5, 0.5))
  

```

Plot random forest model

```{r}

# Visual check for a suitable number of trees (ntree): per the randomForest
# documentation, plot() on a fitted forest draws its error against the
# number of trees grown.
plot(rf_model)

```

Evaluate the model's performance

```{r}

# Predict price for the held-out test rows using the forest.
pred <- predict(rf_model, test_data)

# First few predictions rounded to whole numbers. `digits` is an argument of
# round(), not head(), so it must sit inside the round() call.
head(round(pred, digits = 0))

# First few observed prices, for an eyeball comparison with the predictions.
head(test_data$price)

```

Number of nodes in each tree


```{r}

# Histogram of the per-tree sizes reported by treesize() across the forest.
hist(treesize(rf_model))

# Structure of the first tree in the forest; labelVar = TRUE requests
# variable names / labels instead of numeric indices.
getTree(rfobj = rf_model, k = 1, labelVar = TRUE)

```

Save random forest model as RDS file

```{r}

# Persist the current forest object so later sessions can reload it with
# readRDS() instead of refitting.
saveRDS(rf_model, file = "./model/rf_model.rds")

```