Quiz 3 Review code #35

chendaniely · 2022-06-24T01:44:55Z

Inference

library(tidyverse)
library(tidymodels)


# inference example -----

# Print the data set that is treated as the observed sample below.
mtcars

# Resampling is random: fix the seed so the bootstrap distribution (and every
# number derived from it) is the same on each run of the script.
set.seed(2022)

# Draw 1000 resamples with replacement. A bootstrap resample must be the same
# size as the original sample; a smaller resample would overstate the
# variability of the estimate, so the size is tied to nrow(mtcars) instead of
# a hard-coded number.
bootstrap_samples <- rep_sample_n(
  mtcars,
  size = nrow(mtcars),
  reps = 1000,
  replace = TRUE
)

# One point estimate (mean mpg) per bootstrap replicate.
bootstrap_sample_estimates <- bootstrap_samples %>%
  group_by(replicate) %>% # technically the data is already grouped, but i'm putting this here to be extra explicit
  summarize(avg_mpg = mean(mpg, na.rm = TRUE))

# bootstrap distribution

# Histogram of the per-replicate mean mpg values.
bootstrap_sample_estimates %>%
  ggplot(aes(x = avg_mpg)) +
  geom_histogram()

# Mean of the replicate means, i.e. the centre of the bootstrap distribution.
mean(bootstrap_sample_estimates$avg_mpg)

Linear regression

library(GGally)

# Pairwise plot matrix of every column in mtcars, for a first look at the data.
ggpairs(mtcars)

# linear regression example -----

## very similar line of thinking with the classification models
## https://github.com/UBC-DSCI/dsci-100-student/issues/33

mt_cars <- mtcars %>%
  mutate(am = factor(am)) # why do you need this?

# initial_split() assigns rows to training/testing at random. Fix the seed so
# the split, the fitted model and the test metrics are reproducible.
set.seed(2022)

# 75% of the rows go to training, stratified on the transmission factor `am`.
car_split <- initial_split(mt_cars, prop = 0.75, strata = am)
car_train <- training(car_split)
car_test <- testing(car_split)

# Model specification: linear regression in regression mode, fitted with the
# "lm" engine.
lm_spec <- linear_reg() %>%
  set_mode("regression") %>%
  set_engine("lm")

# Recipe: predict mpg from transmission (am), horsepower (hp) and weight (wt),
# using the training rows only.
car_recipe <- recipe(mpg ~ am + hp + wt, data = car_train)
# do we __have to__ center scale?

# Bundle the model and the recipe into a workflow, then fit it on the
# training set.
lm_fit <- fit(
  workflow() %>%
    add_model(lm_spec) %>%
    add_recipe(car_recipe),
  data = car_train
)
print(lm_fit)

## evaluate the linear model on test data

# Predict mpg for the held-out rows, attach the predictions to the test data,
# and compare the `.pred` column against the true `mpg` values.
lm_test_results <- metrics(
  bind_cols(predict(lm_fit, new_data = car_test), car_test),
  truth = mpg,
  estimate = .pred
)

print(lm_test_results)

## where are all the places the code can go wrong?
## training on the wrong dataset, testing the wrong dataset

Clustering

# clustering

# Keep only the four numeric variables used for clustering.
mtcars_subset <- mtcars %>%
  select(mpg, disp, hp, wt)

# Standardize every variable so each one contributes on the same scale.
# scale() returns a one-column matrix; as.numeric() turns it back into a plain
# numeric vector so the data frame does not end up with matrix columns.
car_km <- mtcars_subset %>%
  mutate(across(everything(), function(x) as.numeric(scale(x))))

# kmeans() starts from randomly chosen centres. Fix the seed for
# reproducibility and use several random starts (matching the nstart = 10 used
# in the elbow section of this script) so that a single unlucky
# initialisation does not decide the clustering.
set.seed(2022)
car_clusters <- kmeans(car_km, centers = 3, nstart = 10)

# Attach the cluster assignment (.cluster) to each standardized observation.
clustered_cars <- augment(car_clusters, car_km)

# Standardized mpg vs hp, coloured by assigned cluster.
ggplot(clustered_cars, aes(x = mpg, y = hp, colour = .cluster)) +
  geom_point(alpha = 0.5, size = 2) +
  theme(text = element_text(size = 20))


## elbow

# For each k from 1 to 10, fit k-means (10 random starts each) and keep only
# the one-row model summary returned by glance(); unnest() then spreads those
# summaries into ordinary columns next to k.
cars_elbow_stats <- tibble(k = 1:10) %>%
  rowwise() %>%
  mutate(
    glanced = list(glance(kmeans(car_km, centers = k, nstart = 10)))
  ) %>%
  unnest(glanced)

# Total within-cluster sum of squares against k: look for the "elbow".
car_elbow_plot <- ggplot(cars_elbow_stats, aes(x = k, y = tot.withinss)) +
  geom_point() +
  geom_line() +
  labs(x = "K", y = "Total within-cluster sum of squares") +
  theme(text = element_text(size = 20))
print(car_elbow_plot)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Quiz 3 Review code #35

Quiz 3 Review code #35

chendaniely commented Jun 24, 2022 •

edited

Loading

Quiz 3 Review code #35

Quiz 3 Review code #35

Comments

chendaniely commented Jun 24, 2022 • edited Loading

Inference

Linear regression

Clustering

chendaniely commented Jun 24, 2022 •

edited

Loading