add week 8 R materials

UBC-DSCI · Mar 4, 2024 · 2364cd4 · 2364cd4
1 parent 5a9bf82
commit 2364cd4
Show file tree

Hide file tree

Showing 7 changed files with 10,731 additions and 0 deletions.
diff --git a/materials/R/worksheet_regression1/cleanup.R b/materials/R/worksheet_regression1/cleanup.R
@@ -0,0 +1,2 @@
+# clean up data files that students output 
+
diff --git a/materials/R/worksheet_regression1/data/marathon.csv b/materials/R/worksheet_regression1/data/marathon.csv
diff --git a/materials/R/worksheet_regression1/img/k-nn.jpeg b/materials/R/worksheet_regression1/img/k-nn.jpeg
diff --git a/materials/R/worksheet_regression1/img/k-nn_RMSE.jpeg b/materials/R/worksheet_regression1/img/k-nn_RMSE.jpeg
diff --git a/materials/R/worksheet_regression1/tests.R b/materials/R/worksheet_regression1/tests.R
@@ -0,0 +1,377 @@
+library(testthat)
+library(digest)
+library(rlang)
+
+# Round double to precise integer
+#
+# `int_round` works to create an integer corresponding to a number that is 
+# tested up to a particular decimal point of precision. This is useful when 
+# there is a need to compare a numeric value using hashes.
+#
+# @param x Double vector of length one.
+# @param digits Double vector of length one to specify decimal point of precision. Negative numbers can be used to specifying significant digits > 0.1.
+#
+# @return Integer vector of length one corresponding to a particular decimal point of precision.
+#
+# @examples
+# # to get an integer up to two decimals of precision from 234.56789
+# int_round(234.56789, 2)
+#
+# to get an integer rounded to the hundred digit from 234.56789
+# int_round(234.56789, -2)
+int_round <- function(x, digits){
+    x = x * 10^digits
+    xint = as.integer(x)
+    xint1 = xint + 1L
+    if (abs(xint - x) < abs(xint1 - x)){
+        return(xint)
+    }
+    else {
+        return(xint1)
+    }
+}
+
+test_0.0 <- function(){
+    test_that('Solution is incorrect', {
+        expect_equal(digest(answer0.0), '3a5505c06543876fe45598b5e5e5195d')
+    })
+    print("Success!")
+}
+
+test_0.1 <- function(){
+    test_that('Solution is incorrect', {
+        expect_equal(digest(answer0.1), '475bf9280aab63a82af60791302736f6')
+    })
+    print("Success!")
+}
+
+test_0.2 <- function(){
+    test_that('Solution is incorrect', {
+        expect_equal(digest(int_round(answer0.2, 2)), '6953b334169bd7ec7da1c1eda5aaf6a5')
+    })
+    print("Success!")
+}
+
+test_0.3 <- function(){
+    test_that('Solution is incorrect', {
+        expect_equal(digest(answer0.3), '75f1160e72554f4270c809f041c7a776')
+    })
+    print("Success!")
+}
+
+test_1.0 <- function(){
+    test_that('Did not create an object named marathon', {
+        expect_true(exists("marathon")) 
+    })
+    test_that('marathon should be a tibble.', {
+        expect_true('tbl' %in% class(marathon))
+    })
+    test_that('marathon does not contain the correct number of rows and/or columns.', {
+        expect_equal(dim(marathon), c(929, 13))
+    })
+    test_that('The marathon tibble is missing columns.', {
+        expect_true("time_hrs" %in% colnames(marathon))
+        expect_true("max" %in% colnames(marathon))
+    })
+    print("Success!")
+}
+
+test_2.0 <- function(){
+    properties <- c(answer2$mapping, answer2$layers[[1]]$mapping)
+    labels <- answer2$labels
+    test_that('Did not create a plot named answer2', {
+        expect_true(exists("answer2")) 
+    })
+    test_that('marathon_50 does not contain the correct number of rows and/or columns.', {
+        expect_equal(dim(marathon_50), c(50, 13))
+    })
+    test_that('answer2 should use information from marathon_50', {
+        expect_equal(answer2$data, marathon_50) 
+    })
+    test_that('max should be on the x-axis.', {
+        expect_true("max" %in% c(rlang::get_expr(properties$x),
+                                 rlang::get_expr(properties$x)))
+    })
+    test_that('time_hrs should be on the y-axis.', {
+        expect_true("time_hrs" %in% c(rlang::get_expr(properties$y), 
+                                      rlang::get_expr(properties$y)))
+    })
+    test_that('answer2 should be a scatter plot.', {
+        expect_equal(digest(class(rlang::get_expr(answer2$layers[[1]]$geom))[1]), 
+                 '911e5b9debfb523f25ad2ccc01a4b2dd')
+    })
+    test_that('Labels on the axes should be descriptive and human readable.', {
+        expect_false((labels$y) == 'time_hrs')
+        expect_false((labels$x) == 'max')
+    })
+print("Success!")  
+}
+
+test_3.0 <- function(){
+    test_that('Did not create an object called answer3', {
+        expect_true(exists('answer3'))
+    })
+    test_that('answer3 is incorrect', {
+        expect_equal(digest(int_round(answer3, 1)), 'a266aa4a0aa711355be22e0f3b8d91af')
+})
+    print("Success!")
+}
+
+test_4.0 <- function(){
+     test_that('Did not create an object called answer4', {
+        expect_true(exists('answer4'))
+    })
+    test_that('answer4 is incorrect', {
+        expect_equal(digest(int_round(answer4, 1)), '285d156b1b700fbb489df058fdb9e2ee')
+    })
+    print("Success!")
+}
+
+test_5.0 <- function(){
+     test_that('Did not create an object called answer5', {
+        expect_true(exists('answer5'))
+    })
+    test_that('Solution is incorrect', {
+        expect_equal(digest(answer5), '475bf9280aab63a82af60791302736f6')
+    })
+    print("Success!")
+}
+
+test_6.0 <- function(){
+    test_that('Did not create an object named marathon_split', {
+        expect_true(exists("marathon_split")) 
+    })
+    test_that('marathon_split should be rsplit (not a tibble)', {
+        expect_true('rsplit' %in% class(marathon_split))
+    })
+    test_that('Did not create an object named marathon_training', {
+        expect_true(exists('marathon_training'))
+    })
+    test_that('marathon_training does not contain 0.75 of the data.', {
+        expect_equal(dim(marathon_training), c(696,13))
+        expect_equal(digest(int_round(sum(marathon_training$age), 0)), 'fbf206d474a56038342d51f17b0ba4c8')
+    })
+    test_that('Did not create an object named marathon_testing', {
+        expect_true(exists('marathon_testing'))
+    })
+    test_that('marathon_testing does not contain 0.25 of the data.', {
+        expect_equal(dim(marathon_testing), c(233, 13))
+        expect_equal(digest(int_round(sum(marathon_testing$age), 0)), '4a09fa6ca41f8e9bff7eea7fd4565399')
+    })
+    print("Success!")
+}
+
+test_7.0 <- function(){
+    test_that('Did not create an object named marathon_spec', {
+        expect_true(exists("marathon_spec")) 
+    })
+    test_that('neighbors argument is incorrect', {
+        expect_equal(digest(as.character(get_expr(marathon_spec$args$neighbors))), '4b89cff22bb78b28a0a6b7fe28d371f6')
+    })
+    test_that('weight_func is incorrect', {
+        expect_equal(digest(as.character(get_expr(marathon_spec$args$weight_func))), '989de78e881829b4499af3610dfe54fd')
+    })
+    test_that('set_engine is incorrect', {
+        expect_equal(digest(as.character(marathon_spec$engine)), '93fe1d3f0a1fa2e625af1e1eb51a5c33')
+    })
+    test_that('mode is incorrect', {
+        expect_equal(digest(as.character(marathon_spec$mode)), 'b8bdd7015e0d1c6037512fd1396aef1a')
+    })
+    test_that('Did not create an object named marathon_recipe', {
+        expect_true(exists("marathon_recipe"))
+    })
+    test_that('Data in marathon_recipe is not scaled and centered', {
+        expect_equal(digest(int_round(sum(marathon_recipe$template$max), 0)), 'f51f00d1db12d6567be874b8acd7d2e0')
+        expect_equal(digest(int_round(sum(marathon_recipe$template$time_hrs), 0)), '241691b869a7ec8e10915214932a8d86')
+    })
+print("Success!")
+}
+
+test_7.1 <- function(){
+    test_that('Did not create an object called marathon_vfold', {
+        expect_true(exists("marathon_vfold"))
+    })
+    test_that('marathon_vfold does not contain 5 folds', {
+        expect_equal(int_round(length(marathon_vfold$id), 0), 5)
+    })
+    test_that('marathon_vfold should be a cross-validation object', {
+        expect_true('vfold_cv' %in% class(marathon_vfold))
+    })
+    test_that('Did not create an object called marathon_workflow', {
+        expect_true(exists("marathon_workflow"))
+    })
+    test_that('marathon_workflow is not a workflow object', {
+        expect_true('workflow' %in% class(marathon_workflow))
+    })
+    test_that('marathon_workflow does not contain the correct model specification', {
+        expect_equal(digest(as_label(marathon_workflow$fit$actions$model$spec$args$neighbors)), 'c77d97a68a64b2275e17f3075c96d102')
+        expect_equal(digest(as_label(marathon_workflow$fit$actions$model$spec$mode)), 'ac823aa7d8cf42de9175927e6f7682e5')
+        expect_equal(digest(as_label(marathon_workflow$fit$actions$model$spec$engine)), '141555abd68c42d32c04aba2636361c6')
+        expect_true('nearest_neighbor' %in% class(marathon_workflow$fit$actions$model$spec))
+    })
+    test_that('marathon_workflow does not contain the correct recipe', {
+        expect_true('recipe' %in% class(marathon_workflow$pre$actions$recipe$recipe))
+        expect_equal(digest(int_round(sum(marathon_workflow$pre$actions$recipe$recipe$template$max), 0)), 'f51f00d1db12d6567be874b8acd7d2e0')
+        expect_equal(digest(int_round(sum(marathon_workflow$pre$actions$recipe$recipe$template$time_hrs), 0)), '241691b869a7ec8e10915214932a8d86')
+    })
+print("Success!")
+    }
+
+
+test_8.0 <- function(){
+    test_that('Did not create an object named gridvals', {
+        expect_true(exists('gridvals')) 
+    })
+    test_that('gridvals does not contain the correct data and column name', {
+        expect_true('tbl' %in% class(gridvals))
+        expect_true('neighbors' %in% colnames(gridvals))
+        expect_equal(digest(int_round(sum(gridvals), 0)), '251921ac8b52641fc990099b8c3d3b19') 
+    })
+    test_that('Did not create an object named marathon_results', {
+        expect_true(exists('marathon_results'))
+    })
+    test_that('marathon_results is not a tibble', {
+        expect_true('tbl' %in% class(marathon_results))
+    })
+    test_that('marathon_results does not contain the correct data', {
+        expect_equal(dim(marathon_results), c(18, 7))
+        expect_equal(digest(int_round(sum(marathon_results$neighbors), 0)), 'f7783fc3ee9f29933ddb6b84b210f0f6')
+        expect_equal(int_round(unique(marathon_results$n), 0), 5)
+        expect_equal(digest(int_round(sum(marathon_results$mean), 0)), '8eaca7c9b35d05ab15c9125bc92372fa')
+        expect_equal(digest(int_round(sum(marathon_results$std_err), 0)), '1473d70e5646a26de3c52aa1abd85b1f')
+    })
+print("Success!")
+}
+
+
+test_8.1 <- function(){
+    test_that('Did not create an object named marathon_min', {
+        expect_true(exists('marathon_min'))
+   })
+    test_that('marathon_min is not a tibble', {
+        expect_true('tbl' %in% class(marathon_min))
+   })
+    test_that('marathon_min does not contain the correct data', {
+        expect_equal(dim(marathon_min), c(1, 7))
+        expect_true('neighbors' %in% colnames(marathon_min))
+        expect_true('.metric' %in% colnames(marathon_min))
+        expect_true('.estimator' %in% colnames(marathon_min))
+        expect_true('mean' %in% colnames(marathon_min))
+        expect_true('n' %in% colnames(marathon_min))
+        expect_true('std_err' %in% colnames(marathon_min))
+        expect_true('.config' %in% colnames(marathon_min))
+   })
+    test_that('Best K value is incorrect', {
+        expect_equal(digest(int_round(marathon_min$neighbors, 2)), 'd35f53c853d2daeb8607b7f873601c34')
+   })
+    test_that('Metric is incorrect', {
+        expect_equal(digest(marathon_min$.metric), '91a8c46d46a2a25459eaabfa08f35967')
+    })
+    print("Success!")
+}
+
+test_8.2 <- function(){
+    test_that('Did not create an object named k_min', {
+        expect_true(exists('k_min'))
+    })
+    test_that('k_min is not correct', {
+        expect_equal(digest(int_round(k_min, 2)), 'd35f53c853d2daeb8607b7f873601c34')
+    })
+    test_that('Did not create an object named marathon_best_spec', {
+        expect_true(exists('marathon_best_spec'))
+    })
+    test_that('marathon_best_spec has incorrect specifications', {
+        expect_equal(digest(as.character(get_expr(marathon_best_spec$args$neighbors))), '0b942c90bc01f15b084d00fa29bf4cc0')
+    })
+    test_that('weight_func is incorrect', {
+        expect_equal(digest(as.character(get_expr(marathon_best_spec$args$weight_func))), '989de78e881829b4499af3610dfe54fd')
+    })
+    test_that('set_engine is incorrect', {
+        expect_equal(digest(as.character(marathon_best_spec$engine)), '93fe1d3f0a1fa2e625af1e1eb51a5c33')
+    })
+    test_that('mode is incorrect', {
+        expect_equal(digest(as.character(marathon_best_spec$mode)), 'b8bdd7015e0d1c6037512fd1396aef1a')
+    })
+    test_that('Did not create an object named marathon_best_fit', {
+        expect_true(exists('marathon_best_fit'))
+    })
+     test_that('marathon_best_fit should be a workflow', {
+        expect_true('workflow' %in% class(marathon_best_fit))
+    })
+    test_that('marathon_best_fit does not contain the correct model specification', {
+        expect_equal(digest(get_expr(marathon_best_fit$fit$actions$model$spec$args$neighbors)), '7ad692ee809beafa13e6d291d0d5372f')
+        expect_equal(digest(as.character(marathon_best_fit$fit$actions$model$spec$mode)), 'b8bdd7015e0d1c6037512fd1396aef1a')
+        expect_equal(digest(as.character(marathon_best_fit$fit$actions$model$spec$engine)), '93fe1d3f0a1fa2e625af1e1eb51a5c33')
+        expect_true('nearest_neighbor' %in% class(marathon_best_fit$fit$actions$model$spec))
+    })
+    test_that('marathon_best_fit does not contain the correct recipe', {
+        expect_true('recipe' %in% class(marathon_best_fit$pre$actions$recipe$recipe))
+        expect_equal(digest(int_round(sum(marathon_best_fit$pre$actions$recipe$recipe$template$max), 0)), 'f51f00d1db12d6567be874b8acd7d2e0')
+        expect_equal(digest(int_round(sum(marathon_best_fit$pre$actions$recipe$recipe$template$time_hrs), 0)), '241691b869a7ec8e10915214932a8d86')
+    })
+    test_that('Did not create an object named marathon_summary', {
+        expect_true(exists('marathon_summary'))
+    })
+    test_that('marathon_summary is not a tibble', {
+        expect_true('tbl' %in% class(marathon_summary))
+    })
+    test_that('marathon_summary contains the incorrect data', {
+        expect_true('.metric' %in% colnames(marathon_summary))
+        expect_true('.estimator' %in% colnames(marathon_summary))
+        expect_true('.estimate' %in% colnames(marathon_summary))
+        expect_equal(digest(int_round(sum(marathon_summary$.estimate), 0)), '4b5630ee914e848e8d07221556b0a2fb')
+    })
+    print("Success!")
+}
+
+test_8.3 <- function(){
+    test_that('Did not create an objected named answer8.3', {
+        expect_true(exists('answer8.3'))
+    })
+    test_that('answer is incorrect', {
+        expect_equal(digest(answer8.3), 'd2a90307aac5ae8d0ef58e2fe730d38b')
+    })
+    print("Success!")
+}
+
+test_9.0 <- function(){
+    properties <- c(marathon_plot$layers[[1]]$mapping, marathon_plot$mapping)
+    labels <- marathon_plot$labels
+    test_that('Did not create an object named marathon_preds', {
+        expect_true(exists('marathon_preds')) 
+    })
+    test_that('marathon_preds should be a tibble', {
+        expect_true('tbl' %in% class(marathon_preds))
+    })
+    test_that('marathon_preds contains incorrect data', {
+        expect_equal(dim(marathon_preds), c(696, 14))
+        expect_true('.pred' %in% colnames(marathon_preds))
+        expect_equal(digest(int_round(sum(marathon_preds$.pred), 2)), '6796478bc90c68268ec527deb6473273')
+        expect_equal(digest(int_round(sum(marathon_preds$time_hrs), 2)), '8b88838a2398216f8b3254aad44c6f8f') 
+    })
+    test_that('Did not create an object called marathon_plot', {
+        expect_true(exists('marathon_plot'))
+    })
+     test_that('max should be on the x-axis.', {
+        expect_true("max" == rlang::get_expr(properties$x))
+        })
+    test_that('time_hrs should be on the y-axis (try adding geom_point *before* geom_line!)', {
+        expect_true("time_hrs" == rlang::get_expr(properties$y))
+        })
+    test_that('marathon_plot should have full_predictions plotted as a blue line over the data points.', {
+        expect_true('blue' %in% as.character(marathon_plot$layers[[2]]$aes_params))
+        expect_true('GeomLine' %in% c(class(rlang::get_expr(marathon_plot$layers[[1]]$geom)), class(rlang::get_expr(marathon_plot$layers[[2]]$geom))))
+    })
+    test_that('max should be the x argument for geom_line, and geom_line should come *after* geom_point', {
+        expect_true('max' == rlang::get_expr(marathon_plot$layers[[2]]$mapping$x))
+    })
+    test_that('.pred should be the y argument for geom_line,  and geom_line should come *after* geom_point',{
+        expect_true('.pred' == rlang::get_expr(marathon_plot$layers[[2]]$mapping$y))
+    })
+    test_that('Labels on the axes/title and legend need to be changed to be descriptive, nicely formatted, and human readable.', {
+        expect_false((labels$y) == 'time_hrs')
+        expect_false((labels$x) == 'max')
+        expect_false((labels$title == 'k_min'))
+        })
+print("Success!")
+}