From c0b99877b55f792751aa9ce9e2f124ba9684a2c0 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 19 Sep 2023 17:08:55 -0700 Subject: [PATCH] cv for many models --- .../execute-results/html.json | 20 ++ schedule/slides/00-cv-for-many-models.html | 314 ------------------ ...y-models.Rmd => 00-cv-for-many-models.qmd} | 69 ++-- 3 files changed, 47 insertions(+), 356 deletions(-) create mode 100644 _freeze/schedule/slides/00-cv-for-many-models/execute-results/html.json delete mode 100644 schedule/slides/00-cv-for-many-models.html rename schedule/slides/{00-cv-for-many-models.Rmd => 00-cv-for-many-models.qmd} (51%) diff --git a/_freeze/schedule/slides/00-cv-for-many-models/execute-results/html.json b/_freeze/schedule/slides/00-cv-for-many-models/execute-results/html.json new file mode 100644 index 0000000..eb0f1bc --- /dev/null +++ b/_freeze/schedule/slides/00-cv-for-many-models/execute-results/html.json @@ -0,0 +1,20 @@ +{ + "hash": "ce81d559001112d6fd73786eb2f4d192", + "result": { + "markdown": "---\nlecture: \"00 CV for many models\"\nformat: revealjs\nmetadata-files: \n - _metadata.yml\n---\n---\n---\n\n## {{< meta lecture >}} {.large background-image=\"gfx/smooths.svg\" background-opacity=\"0.3\"}\n\n[Stat 406]{.secondary}\n\n[{{< meta author >}}]{.secondary}\n\nLast modified -- 19 September 2023\n\n\n\n$$\n\\DeclareMathOperator*{\\argmin}{argmin}\n\\DeclareMathOperator*{\\argmax}{argmax}\n\\DeclareMathOperator*{\\minimize}{minimize}\n\\DeclareMathOperator*{\\maximize}{maximize}\n\\DeclareMathOperator*{\\find}{find}\n\\DeclareMathOperator{\\st}{subject\\,\\,to}\n\\newcommand{\\E}{E}\n\\newcommand{\\Expect}[1]{\\E\\left[ #1 \\right]}\n\\newcommand{\\Var}[1]{\\mathrm{Var}\\left[ #1 \\right]}\n\\newcommand{\\Cov}[2]{\\mathrm{Cov}\\left[#1,\\ #2\\right]}\n\\newcommand{\\given}{\\ \\vert\\ }\n\\newcommand{\\X}{\\mathbf{X}}\n\\newcommand{\\x}{\\mathbf{x}}\n\\newcommand{\\y}{\\mathbf{y}}\n\\newcommand{\\P}{\\mathcal{P}}\n\\newcommand{\\R}{\\mathbb{R}}\n\\newcommand{\\norm}[1]{\\left\\lVert #1 \\right\\rVert}\n\\newcommand{\\snorm}[1]{\\lVert #1 \\rVert}\n\\newcommand{\\tr}[1]{\\mbox{tr}(#1)}\n\\newcommand{\\brt}{\\widehat{\\beta}^R_{s}}\n\\newcommand{\\brl}{\\widehat{\\beta}^R_{\\lambda}}\n\\newcommand{\\bls}{\\widehat{\\beta}_{ols}}\n\\newcommand{\\blt}{\\widehat{\\beta}^L_{s}}\n\\newcommand{\\bll}{\\widehat{\\beta}^L_{\\lambda}}\n$$\n\n\n\n\n\n## Some data and 4 models\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndata(\"mobility\", package = \"Stat406\")\n```\n:::\n\n\n**Model 1:** Lasso on all predictors, use CV min\n\n**Model 2:** Ridge on all predictors, use CV min\n\n**Model 3:** OLS on all predictors (no tuning parameters)\n\n**Model 4:** (1) Lasso on all predictors, then (2) OLS on those chosen at CV min\n\n\n> How do I decide between these 4 models?\n\n\n## CV functions\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nkfold_cv <- function(data, estimator, predictor, error_fun, kfolds = 5) {\n fold_labels <- sample(rep(seq_len(kfolds), length.out = nrow(data)))\n errors <- double(kfolds)\n for (fold in seq_len(kfolds)) {\n test_rows <- fold_labels == fold\n train <- data[!test_rows, ]\n test <- data[test_rows, ]\n current_model <- estimator(train)\n test$.preds <- predictor(current_model, test)\n errors[fold] <- error_fun(test)\n }\n mean(errors)\n}\n\nloo_cv <- function(dat) {\n mdl <- lm(Mobility ~ ., data = dat)\n mean( abs(residuals(mdl)) / abs(1 - hatvalues(mdl)) ) # MAE version\n}\n```\n:::\n\n\n\n## Experiment setup\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# prepare our data\n# note that mob has only continuous predictors, otherwise could be trouble\nmob <- mobility[complete.cases(mobility), ] |> select(-ID, -State, -Name)\n# avoid doing this same operation a bunch\nxmat <- function(dat) dat |> select(!Mobility) |> as.matrix()\n\n# set up our model functions\nlibrary(glmnet)\nmod1 <- function(dat, ...) cv.glmnet(xmat(dat), dat$Mobility, type.measure = \"mae\", ...)\nmod2 <- function(dat, ...) cv.glmnet(xmat(dat), dat$Mobility, alpha = 0, type.measure = \"mae\", ...)\nmod3 <- function(dat, ...) glmnet(xmat(dat), dat$Mobility, lambda = 0, ...) # just does lm()\nmod4 <- function(dat, ...) cv.glmnet(xmat(dat), dat$Mobility, relax = TRUE, gamma = 1, type.measure = \"mae\", ...)\n\n# this will still \"work\" on mod3, because there's only 1 s\npredictor <- function(mod, dat) drop(predict(mod, newx = xmat(dat), s = \"lambda.min\"))\n\n# chose mean absolute error just 'cause\nerror_fun <- function(testdata) mean(abs(testdata$Mobility - testdata$.preds))\n```\n:::\n\n\n\n## Run the experiment\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nall_model_funs <- lst(mod1, mod2, mod3, mod4)\nall_fits <- map(all_model_funs, .f = exec, dat = mob)\n\n# unfortunately, does different splits for each method, so we use 10, \n# it would be better to use the _SAME_ splits\nten_fold_cv <- map_dbl(all_model_funs, ~ kfold_cv(mob, .x, predictor, error_fun, 10)) \n\nin_sample_cv <- c(\n mod1 = min(all_fits[[1]]$cvm),\n mod2 = min(all_fits[[2]]$cvm),\n mod3 = loo_cv(mob),\n mod4 = min(all_fits[[4]]$cvm)\n)\n\ntib <- bind_rows(in_sample_cv, ten_fold_cv)\ntib$method = c(\"in_sample\", \"out_of_sample\")\ntib\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 2 × 5\n mod1 mod2 mod3 mod4 method \n \n1 0.0159 0.0161 0.0164 0.0156 in_sample \n2 0.0158 0.0161 0.0165 0.0161 out_of_sample\n```\n:::\n:::\n", + "supporting": [ + "00-cv-for-many-models_files" + ], + "filters": [ + "rmarkdown/pagebreak.lua" + ], + "includes": { + "include-after-body": [ + "\n\n\n" + ] + }, + "engineDependencies": {}, + "preserve": {}, + "postProcess": true + } +} \ No newline at end of file diff --git a/schedule/slides/00-cv-for-many-models.html b/schedule/slides/00-cv-for-many-models.html deleted file mode 100644 index b08314f..0000000 --- a/schedule/slides/00-cv-for-many-models.html +++ /dev/null @@ -1,314 +0,0 @@ - - - - 00 CV for many models - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/schedule/slides/00-cv-for-many-models.Rmd b/schedule/slides/00-cv-for-many-models.qmd similarity index 51% rename from schedule/slides/00-cv-for-many-models.Rmd rename to schedule/slides/00-cv-for-many-models.qmd index f5ab349..fc844c2 100644 --- a/schedule/slides/00-cv-for-many-models.Rmd +++ b/schedule/slides/00-cv-for-many-models.qmd @@ -1,26 +1,17 @@ --- -title: "00 CV for many models" -author: - - "STAT 406" - - "Daniel J. McDonald" -date: 'Last modified - `r Sys.Date()`' +lecture: "00 CV for many models" +format: revealjs +metadata-files: + - _metadata.yml --- - -```{r setup, include=FALSE} -source("rmd_config.R") -library(magrittr) -``` - -```{r css-extras, file="css-extras.R", echo=FALSE} -``` - +{{< include _titleslide.qmd >}} ## Some data and 4 models ```{r} -data(mobility, package = "Stat406") +data("mobility", package = "Stat406") ``` **Model 1:** Lasso on all predictors, use CV min @@ -31,15 +22,15 @@ data(mobility, package = "Stat406") **Model 4:** (1) Lasso on all predictors, then (2) OLS on those chosen at CV min -.emphasis[ -How do I decide between these 4 models? -] --- +> How do I decide between these 4 models? + + +## CV functions ```{r stuff-i-need} kfold_cv <- function(data, estimator, predictor, error_fun, kfolds = 5) { - fold_labels <- sample(rep(seq(kfolds), length.out = nrow(data))) + fold_labels <- sample(rep(seq_len(kfolds), length.out = nrow(data))) errors <- double(kfolds) for (fold in seq_len(kfolds)) { test_rows <- fold_labels == fold @@ -51,50 +42,44 @@ kfold_cv <- function(data, estimator, predictor, error_fun, kfolds = 5) { } mean(errors) } + +loo_cv <- function(dat) { + mdl <- lm(Mobility ~ ., data = dat) + mean( abs(residuals(mdl)) / abs(1 - hatvalues(mdl)) ) # MAE version +} ``` ---- ## Experiment setup ```{r} # prepare our data # note that mob has only continuous predictors, otherwise could be trouble -mob <- mobility[complete.cases(mobility), ] %>% select(-ID, -State, -Name) +mob <- mobility[complete.cases(mobility), ] |> select(-ID, -State, -Name) # avoid doing this same operation a bunch -xmat <- function(dataset) as.matrix(select(dataset, !Mobility)) +xmat <- function(dat) dat |> select(!Mobility) |> as.matrix() # set up our model functions library(glmnet) -mod1 <- function(dataset, ...) cv.glmnet(xmat(dataset), dataset$Mobility, type.measure = "mae", ...) -mod2 <- function(dataset, ...) cv.glmnet(xmat(dataset), dataset$Mobility, alpha = 0, type.measure = "mae", ...) -mod3 <- function(dataset, ...) glmnet(xmat(dataset), dataset$Mobility, lambda = 0, ...) # just does lm() -mod4 <- function(dataset, ...) cv.glmnet(xmat(dataset), dataset$Mobility, relax = TRUE, gamma = 1, - type.measure = "mae", ...) +mod1 <- function(dat, ...) cv.glmnet(xmat(dat), dat$Mobility, type.measure = "mae", ...) +mod2 <- function(dat, ...) cv.glmnet(xmat(dat), dat$Mobility, alpha = 0, type.measure = "mae", ...) +mod3 <- function(dat, ...) glmnet(xmat(dat), dat$Mobility, lambda = 0, ...) # just does lm() +mod4 <- function(dat, ...) cv.glmnet(xmat(dat), dat$Mobility, relax = TRUE, gamma = 1, type.measure = "mae", ...) # this will still "work" on mod3, because there's only 1 s -predictor <- function(modle, dataset) drop(predict(modle, newx = xmat(dataset), s = "lambda.min")) +predictor <- function(mod, dat) drop(predict(mod, newx = xmat(dat), s = "lambda.min")) # chose mean absolute error just 'cause error_fun <- function(testdata) mean(abs(testdata$Mobility - testdata$.preds)) - -# not necessarily useful for choosing in this context, but good for illustration -loo_cv <- function(dataset) { - mdl <- lm(Mobility ~ ., data = dataset) - mean( abs(residuals(mdl)) / abs(1 - hatvalues(mdl)) ) # MAE version -} ``` ---- ## Run the experiment -* I'm using `purrr` functions to do this without loops, 'cause it's prettier ```{r} -library(purrr) -all_model_funs <- list(mod1 = mod1, mod2 = mod2, mod3 = mod3, mod4 = mod4) -all_fits <- map(all_model_funs, ~ do.call(.x, list(dataset = mob))) +all_model_funs <- lst(mod1, mod2, mod3, mod4) +all_fits <- map(all_model_funs, .f = exec, dat = mob) # unfortunately, does different splits for each method, so we use 10, # it would be better to use the _SAME_ splits @@ -109,5 +94,5 @@ in_sample_cv <- c( tib <- bind_rows(in_sample_cv, ten_fold_cv) tib$method = c("in_sample", "out_of_sample") -kableExtra::kable(tib, booktabs = TRUE) -``` \ No newline at end of file +tib +```