Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Quiz 3 Review code #35

Open
chendaniely opened this issue Jun 24, 2022 · 0 comments
Open

Quiz 3 Review code #35

chendaniely opened this issue Jun 24, 2022 · 0 comments

Comments

@chendaniely
Copy link
Contributor

chendaniely commented Jun 24, 2022

Inference

library(tidyverse)
library(tidymodels)


# inference example -----

# Resampling is random; fix the seed so the results below are reproducible.
set.seed(2022)

mtcars

# Draw 1000 bootstrap samples. Each one resamples the rows of mtcars with
# replacement and has as many rows as the original sample, so the spread of
# the resulting estimates reflects the sample size we actually observed.
bootstrap_samples <- rep_sample_n(
  mtcars,
  size = nrow(mtcars),
  reps = 1000,
  replace = TRUE
)

# One point estimate (mean mpg) per bootstrap sample.
bootstrap_sample_estimates <- bootstrap_samples %>%
  # technically the data is already grouped, but I'm putting this here to be
  # extra explicit
  group_by(replicate) %>%
  summarize(avg_mpg = mean(mpg, na.rm = TRUE))

# bootstrap distribution

# `bins` is set explicitly (30 is the ggplot2 default) so the choice is
# visible and ggplot2 does not emit its "pick a better value" message.
ggplot(bootstrap_sample_estimates, aes(x = avg_mpg)) +
  geom_histogram(bins = 30)

# Mean of the bootstrap estimates.
bootstrap_sample_estimates %>%
  pull(avg_mpg) %>%
  mean()

Linear regression

library(GGally)

# Pairwise plots of every column in mtcars, as a first look at the data.
ggpairs(mtcars)

# linear regression example -----

## very similar line of thinking with the classification models
## https://github.com/UBC-DSCI/dsci-100-student/issues/33

# initial_split() assigns rows to the training/testing sets at random; fix
# the seed so the split (and the model fit on it) is reproducible.
set.seed(2022)

mt_cars <- mtcars %>%
  mutate(am = factor(am)) # why do you need this?

# 75% of the rows go to training, stratified on `am`.
car_split <- initial_split(mt_cars, prop = 0.75, strata = am)
car_train <- training(car_split)
car_test <- testing(car_split)

# Model specification: linear regression using the "lm" engine.
lm_spec <- linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

# The recipe is defined on the training data only.
car_recipe <- recipe(mpg ~ am + hp + wt, data = car_train)
# do we __have to__ center scale?

# Bundle recipe + model into a workflow and fit it on the training data.
lm_fit <- workflow() %>%
  add_recipe(car_recipe) %>%
  add_model(lm_spec) %>%
  fit(data = car_train)
lm_fit

## evaluate the linear model on test data

# Predict on the held-out rows, attach the predictions to those rows, and
# compare the predicted values (.pred) against the observed mpg.
lm_test_results <- metrics(
  bind_cols(predict(lm_fit, car_test), car_test),
  truth = mpg,
  estimate = .pred
)

lm_test_results

## where are all the places the code can go wrong?
## training on the wrong dataset, testing the wrong dataset

Clustering

# clustering

# Keep only the variables used for clustering.
mtcars_subset <- mtcars %>%
  select(mpg, disp, hp, wt)

# Standardize every column. scale() returns a one-column matrix, so convert
# the result back to a plain numeric vector; otherwise each column of the
# data frame ends up holding a matrix instead of ordinary numbers.
car_km <- mtcars_subset %>%
  mutate(across(everything(), ~ as.numeric(scale(.x))))

# K-means starts from randomly chosen centers. Fix the seed so the clustering
# is reproducible, and use several random starts (nstart = 10, the same value
# the elbow code uses) so the result does not hinge on one initialization.
set.seed(2022)
car_clusters <- kmeans(car_km, centers = 3, nstart = 10)

# Add the cluster assignment (.cluster) to the standardized data.
clustered_cars <- augment(car_clusters, car_km)

ggplot(clustered_cars, aes(x = mpg, y = hp, colour = .cluster)) +
  geom_point(alpha = 0.5, size = 2) +
  theme(text = element_text(size = 20))


## elbow

# Every kmeans() run below starts from random centers; fix the seed so the
# elbow curve is reproducible.
set.seed(2022)

# For each k from 1 to 10: run k-means on the standardized data, keep the
# one-row model summary from glance(), and discard the fitted model itself.
cars_elbow_stats <- tibble(k = 1:10) %>%
  rowwise() %>%
  mutate(car_clusts = list(kmeans(car_km, centers = k, nstart = 10))) %>%
  mutate(glanced = list(glance(car_clusts))) %>%
  select(-car_clusts) %>%
  # rowwise() was only needed for the per-row model fits above; drop it so the
  # result is an ordinary tibble
  ungroup() %>%
  unnest(glanced)

# Total within-cluster sum of squares against k.
car_elbow_plot <- ggplot(cars_elbow_stats, aes(x = k, y = tot.withinss)) +
  geom_point() +
  geom_line() +
  # k only takes the integer values 1 to 10, so put the ticks exactly there
  scale_x_continuous(breaks = 1:10) +
  xlab("K") +
  ylab("Total within-cluster sum of squares") +
  theme(text = element_text(size = 20))
car_elbow_plot
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant