diff --git a/materials/tutorial_03/plot.png b/materials/tutorial_03/plot.png
new file mode 100644
index 0000000..1e0b5c9
Binary files /dev/null and b/materials/tutorial_03/plot.png differ
diff --git a/materials/tutorial_03/tests_tutorial_03.R b/materials/tutorial_03/tests_tutorial_03.R
new file mode 100644
index 0000000..ef0cf8a
--- /dev/null
+++ b/materials/tutorial_03/tests_tutorial_03.R
@@ -0,0 +1,293 @@
+# Grader for Question 1.1 (true/false).
+#
+# Reads the global object `answer1.1` and checks, in order, that it exists,
+# that it is exactly "true" or "false" (case-insensitive), and that the
+# digest() hash of its lower-cased value equals the stored answer hash.
+# Prints a hint when the hash equals that of one specific incorrect answer,
+# and prints "Success!" as its final step.
+test_1.1 <- function() {
+  test_that('Did not assign answer to an object called "answer1.1"', {
+    expect_true(exists("answer1.1"))
+  })
+  test_that('Solution should be "true" or "false"', {
+    # Anchored so that strings merely containing "true"/"false" are rejected.
+    expect_match(answer1.1, "^(true|false)$", ignore.case = TRUE)
+  })
+  # Hash once; the value is needed for both the hint and the final check.
+  answer_hash <- digest(tolower(answer1.1))
+  if (answer_hash == "05ca18b596514af73f6880309a21b5dd") {
+    print("Think about potential edge cases.")
+  }
+  test_that("Solution is incorrect", {
+    expect_equal(answer_hash, "d2a90307aac5ae8d0ef58e2fe730d38b")
+  })
+  print("Success!")
+}
+# Grader for Question 1.2 (true/false).
+#
+# Reads the global object `answer1.2` and checks, in order, that it exists,
+# that it is exactly "true" or "false" (case-insensitive), and that the
+# digest() hash of its lower-cased value equals the stored answer hash.
+# Prints a hint when the hash equals that of one specific incorrect answer,
+# and prints "Success!" as its final step.
+test_1.2 <- function() {
+  test_that('Did not assign answer to an object called "answer1.2"', {
+    expect_true(exists("answer1.2"))
+  })
+  test_that('Solution should be "true" or "false"', {
+    # Anchored so that strings merely containing "true"/"false" are rejected.
+    expect_match(answer1.2, "^(true|false)$", ignore.case = TRUE)
+  })
+  # Hash once; the value is needed for both the hint and the final check.
+  answer_hash <- digest(tolower(answer1.2))
+  if (answer_hash == "05ca18b596514af73f6880309a21b5dd") {
+    print("When we sample with replacement, we put the observation we just sampled BACK into the pool before selecting another.")
+  }
+  test_that("Solution is incorrect", {
+    expect_equal(answer_hash, "d2a90307aac5ae8d0ef58e2fe730d38b")
+  })
+  print("Success!")
+}
+# Grader for Question 2.0 (multiple choice, A-D).
+#
+# Reads the global object `answer2.0` and checks that it exists, that it is a
+# single letter from A to D (case-insensitive), and that the digest() hash of
+# its lower-cased value equals the stored answer hash. Two different hints
+# are printed depending on which incorrect answer's hash was produced.
+# Prints "Success!" as its final step.
+test_2.0 <- function() {
+  test_that('Did not assign answer to an object called "answer2.0"', {
+    expect_true(exists("answer2.0"))
+  })
+  test_that('Solution should be a single character ("A", "B", "C", or "D")', {
+    # Anchored character class: exactly one letter, as the message promises.
+    expect_match(answer2.0, "^[a-d]$", ignore.case = TRUE)
+  })
+  answer_hash <- digest(tolower(answer2.0))
+  # Two incorrect choices share the first hint; %in% keeps the condition scalar.
+  shared_hint_hashes <- c(
+    "127a2ec00989b9f7faf671ed470be7f8",
+    "ddf100612805359cd81fdc5ce3b9fbba"
+  )
+  if (answer_hash %in% shared_hint_hashes) {
+    print("Where would the centre of the new sampling distribution be? Would expect either of the edges of the distribution still appear 'cut off'?")
+  } else if (answer_hash == "d110f00cfb1b248e835137025804a23b") {
+    print("Try to visualize what the new sampling disitribution would look like compared to the sampling distribution above.")
+  }
+  test_that("Solution is incorrect", {
+    expect_equal(answer_hash, "6e7a8c1c098e8817e3df3fd1b21149d1")
+  })
+  print("Success!")
+}
+# Grader for Question 2.1.
+#
+# Reads the global object `barrier_pop` and checks that it exists, is a data
+# frame, has exactly the column `root_barrier`, and that three hashed row
+# counts match the stored values: rows where `root_barrier` is NA, all rows,
+# and rows where `root_barrier == "N"`. Prints "Success!" as its final step.
+test_2.1 <- function() {
+  test_that('Did not assign answer to an object called "barrier_pop"', {
+    expect_true(exists("barrier_pop"))
+  })
+  test_that("Solution should be a data frame", {
+    expect_true(inherits(barrier_pop, "data.frame"))
+  })
+  expected_colnames <- c("root_barrier")
+  given_colnames <- colnames(barrier_pop)
+  test_that("Data frame does not have the correct columns", {
+    # Same set of names, in any order.
+    expect_true(setequal(given_colnames, expected_colnames))
+  })
+  test_that("Data frame should not contain any NA values", {
+    na_rows <- nrow(filter(barrier_pop, is.na(root_barrier)))
+    expect_equal(digest(as.integer(na_rows)), "1473d70e5646a26de3c52aa1abd85b1f")
+  })
+  test_that("Data frame does not contain the correct number of rows", {
+    expect_equal(digest(as.integer(nrow(barrier_pop))), "33cdfaf5b5548592e62ab05a10e99d7d")
+  })
+  test_that("Data frame does not contain the correct data", {
+    no_barrier_rows <- nrow(filter(barrier_pop, root_barrier == "N"))
+    expect_equal(digest(as.integer(no_barrier_rows)), "7b52488abfeed248a0eecb9d27db8758")
+  })
+  print("Success!")
+}
+# Grader for Question 2.2.
+#
+# Reads the global plot object `barrier_sampling_dist` and inspects its first
+# layer, its aesthetic mappings, its attached data and its labels: `p` must
+# be mapped to x, the first layer's geom must be a GeomBar, the hashed bin
+# width / row count / rounded sd of `p` must match the stored values, the
+# x label must differ from "p", and a title label must be present.
+# Prints "Success!" as its final step.
+test_2.2 <- function() {
+  test_that('Did not assign answer to an object called "barrier_sampling_dist"', {
+    expect_true(exists("barrier_sampling_dist"))
+  })
+  first_layer <- barrier_sampling_dist$layers[[1]]
+  # Layer-level mappings are listed first, then plot-level ones.
+  properties <- c(first_layer$mapping, barrier_sampling_dist$mapping)
+  test_that("Plot should have p on the x-axis", {
+    expect_true("p" == rlang::get_expr(properties$x))
+  })
+  test_that("Plot is not the correct type", {
+    expect_true(inherits(first_layer$geom, "GeomBar"))
+  })
+  test_that("Plot does not have the correct bin width", {
+    # NOTE(review): as.integer() is applied BEFORE the multiplication, so any
+    # bin width in [0, 1) is truncated to 0 and hashes identically. Left as is
+    # because the stored hash was presumably generated with this expression;
+    # confirm, then move `* 1000` inside as.integer() and regenerate the hash.
+    expect_equal(
+      digest(as.integer(first_layer$stat_params[["binwidth"]]) * 1000),
+      "908d1fd10b357ed0ceaaec823abf81bc"
+    )
+  })
+  test_that("Plot does not use the correct data. Sampling distribution should be drawn by sampling without replacement", {
+    expect_equal(digest(nrow(barrier_sampling_dist$data)), "6e96c307060fba1b1d3a36d2410fd595")
+    expect_equal(digest(round(sd(barrier_sampling_dist$data$p), 7)), "e9db0e152223a01249a4c3225e899af9")
+  })
+  test_that("x-axis label should be descriptive and human readable", {
+    expect_false(barrier_sampling_dist$labels$x == "p")
+  })
+  test_that("Plot should have a title", {
+    expect_true("title" %in% names(barrier_sampling_dist$labels))
+  })
+  print("Success!")
+}
+# Grader for Question 2.3.
+#
+# Reads the global object `barrier_sample` and checks that it exists, is a
+# data frame, has exactly the column `root_barrier`, and that the hashed
+# total row count and the hashed count of rows with `root_barrier == "N"`
+# match the stored values. Prints "Success!" as its final step.
+test_2.3 <- function() {
+  test_that('Did not assign answer to an object called "barrier_sample"', {
+    expect_true(exists("barrier_sample"))
+  })
+  test_that("Solution should be a data frame", {
+    expect_true(inherits(barrier_sample, "data.frame"))
+  })
+  expected_colnames <- c("root_barrier")
+  given_colnames <- colnames(barrier_sample)
+  test_that("Data frame does not have the correct columns", {
+    # Same set of names, in any order.
+    expect_true(setequal(given_colnames, expected_colnames))
+  })
+  test_that("Data frame does not contain the correct number of rows", {
+    expect_equal(digest(as.integer(nrow(barrier_sample))), "be3c152f6f6bcd5f85f9e4cba49b1e48")
+  })
+  test_that("Data frame does not contain the correct data", {
+    no_barrier_rows <- nrow(filter(barrier_sample, root_barrier == "N"))
+    expect_equal(digest(as.integer(no_barrier_rows)), "2a099397e2d2dd0f2a2e5a5b4234867d")
+  })
+  print("Success!")
+}
+# Grader for Question 2.4 (multiple choice, A-G).
+#
+# Reads the global object `answer2.4` and checks that it exists, that it is a
+# single letter from A to G (case-insensitive), and that the digest() hash of
+# its lower-cased value equals the stored answer hash.
+# Prints "Success!" as its final step.
+test_2.4 <- function() {
+  test_that('Did not assign answer to an object called "answer2.4"', {
+    expect_true(exists("answer2.4"))
+  })
+  test_that('Solution should be a single character ("A", "B", "C", "D", "E", "F", or "G")', {
+    # Anchored character class: exactly one letter, as the message promises.
+    expect_match(answer2.4, "^[a-g]$", ignore.case = TRUE)
+  })
+  test_that("Solution is incorrect", {
+    expect_equal(digest(tolower(answer2.4)), "127a2ec00989b9f7faf671ed470be7f8")
+  })
+  print("Success!")
+}
+# Grader for Question 2.5.
+#
+# Reads the global plot object `barrier_bootstrap_dist` and inspects its
+# first layer, its aesthetic mappings, its attached data and its labels: the
+# first layer's geom must be a GeomBar, the hashed bin width / row count /
+# rounded sum of the x-mapped column must match the stored values, the
+# x label must differ from the name of the x-mapped variable, and a title
+# label must be present. Prints "Success!" as its final step.
+test_2.5 <- function() {
+  test_that('Did not assign answer to an object called "barrier_bootstrap_dist"', {
+    expect_true(exists("barrier_bootstrap_dist"))
+  })
+  first_layer <- barrier_bootstrap_dist$layers[[1]]
+  # Layer-level mappings are listed first, then plot-level ones.
+  properties <- c(first_layer$mapping, barrier_bootstrap_dist$mapping)
+  test_that("Plot is not the correct type", {
+    expect_true(inherits(first_layer$geom, "GeomBar"))
+  })
+  test_that("Plot does not have the correct bin width", {
+    # NOTE(review): as.integer() is applied BEFORE the multiplication, so any
+    # bin width in [0, 1) is truncated to 0 and hashes identically. Left as is
+    # because the stored hash was presumably generated with this expression;
+    # confirm, then move `* 1000` inside as.integer() and regenerate the hash.
+    expect_equal(
+      digest(as.integer(first_layer$stat_params[["binwidth"]]) * 1000),
+      "908d1fd10b357ed0ceaaec823abf81bc"
+    )
+  })
+  test_that("Plot does not use the correct data", {
+    expect_equal(digest(nrow(barrier_bootstrap_dist$data)), "6e96c307060fba1b1d3a36d2410fd595")
+    # The column is looked up through the x mapping, so its name is not fixed.
+    expect_equal(digest(round(sum(pull(barrier_bootstrap_dist$data, rlang::get_expr(properties$x))))), "39fe840086944a2ff92b5b0413fada63")
+  })
+  test_that("x-axis label should be descriptive and human readable", {
+    expect_false(barrier_bootstrap_dist$labels$x == toString(rlang::get_expr(properties$x)))
+  })
+  test_that("Plot should have a title", {
+    expect_true("title" %in% names(barrier_bootstrap_dist$labels))
+  })
+  print("Success!")
+}
+# Grader for Question 2.6 (two parts).
+#
+# Part A reads the global `standard_error`, part B the global
+# `standard_deviation`. Each part checks that the object exists, that
+# as.numeric() of it is not NA, and that the digest() hash of the value
+# multiplied by 1e6 and truncated with as.integer() equals the stored hash.
+# Prints "Success!" as its final step.
+test_2.6 <- function() {
+  # part A
+  test_that('Did not assign answer to an object called "standard_error"', {
+    expect_true(exists("standard_error"))
+  })
+  standard_error_num <- as.numeric(standard_error)
+  test_that("Solution should be a number", {
+    expect_false(is.na(standard_error_num))
+  })
+  test_that("Solution is incorrect", {
+    expect_equal(digest(as.integer(standard_error_num * 1000000)), "01b27636cb5a0bc29d3245cf9b5d14d7")
+  })
+  # part B
+  test_that('Did not assign answer to an object called "standard_deviation"', {
+    expect_true(exists("standard_deviation"))
+  })
+  standard_deviation_num <- as.numeric(standard_deviation)
+  test_that("Solution should be a number", {
+    expect_false(is.na(standard_deviation_num))
+  })
+  test_that("Solution is incorrect", {
+    expect_equal(digest(as.integer(standard_deviation_num * 1000000)), "b46b14ab0d3b4cdb172dae88b8921fa6")
+  })
+  print("Success!")
+}
+# Grader for Question 2.7 (true/false).
+#
+# Reads the global object `answer2.7` and checks that it exists, that it is
+# exactly "true" or "false" (case-insensitive), and that the digest() hash of
+# its lower-cased value equals the stored answer hash.
+# Prints "Success!" as its final step.
+test_2.7 <- function() {
+  test_that('Did not assign answer to an object called "answer2.7"', {
+    expect_true(exists("answer2.7"))
+  })
+  test_that('Solution should be "true" or "false"', {
+    # Anchored so that strings merely containing "true"/"false" are rejected.
+    expect_match(answer2.7, "^(true|false)$", ignore.case = TRUE)
+  })
+  test_that("Solution is incorrect", {
+    expect_equal(digest(tolower(answer2.7)), "05ca18b596514af73f6880309a21b5dd")
+  })
+  print("Success!")
+}
+# Grader for Question 3.0.
+#
+# Reads the global object `plum_pop` and checks that it exists, is a data
+# frame, has exactly the column `diameter`, and that the hashed row count and
+# the hashed integer-truncated sum of `diameter` match the stored values.
+# Prints "Success!" as its final step.
+test_3.0 <- function() {
+  test_that('Did not assign answer to an object called "plum_pop"', {
+    expect_true(exists("plum_pop"))
+  })
+  test_that("Solution should be a data frame", {
+    expect_true(inherits(plum_pop, "data.frame"))
+  })
+  expected_colnames <- c("diameter")
+  given_colnames <- colnames(plum_pop)
+  test_that("Data frame does not have the correct columns", {
+    # Same set of names, in any order.
+    expect_true(setequal(given_colnames, expected_colnames))
+  })
+  test_that("Data frame does not contain the correct number of rows", {
+    expect_equal(digest(as.integer(nrow(plum_pop))), "81069550898d54275db061d49bb7f779")
+  })
+  test_that("Data frame does not contain the correct data", {
+    expect_equal(digest(as.integer(sum(plum_pop$diameter))), "c82ea42b76ddfa4115c6472821803b9f")
+  })
+  print("Success!")
+}
diff --git a/materials/tutorial_03/tutorial_03.ipynb b/materials/tutorial_03/tutorial_03.ipynb
new file mode 100644
index 0000000..d286667
--- /dev/null
+++ b/materials/tutorial_03/tutorial_03.ipynb
@@ -0,0 +1,2061 @@
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "38f531f0b13511dae6c173db3a34b41e",
+ "grade": false,
+ "grade_id": "cell-f3ae6db28c5041d9",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "# Tutorial 3: Bootstrapping and its Relationship to the Sampling Distribution"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "db9756e7b71b8b631805409a4e335e65",
+ "grade": false,
+ "grade_id": "cell-712e9ac4e769eb39",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "### Lecture and Tutorial Learning Goals\n",
+ "After completing this week's lecture and tutorial work, you will be able to:\n",
+ "1. Explain why we don’t know/have a sampling distribution in practice/real life.\n",
+ "2. Define bootstrapping.\n",
+ "3. Write a computer script to create a bootstrap distribution to approximate a sampling distribution.\n",
+ "4. Contrast a bootstrap sampling distribution with a sampling distribution obtained using multiple samples."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "30515f2934452486657577710bcb6636",
+ "grade": false,
+ "grade_id": "cell-28cb9e52e89ace8d",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "library(cowplot)\n",
+ "library(datateachr)\n",
+ "library(digest)\n",
+ "library(gridExtra)\n",
+ "library(infer)\n",
+ "library(repr)\n",
+ "library(taxyvr)\n",
+ "library(tidyverse)\n",
+ "source(\"tests_tutorial_03.R\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "0044a8ce4048de32292c6153b18507d1",
+ "grade": false,
+ "grade_id": "cell-dc2ef0d60b4a4933",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 1. Warm-Up Questions\n",
+ "\n",
+ "Let's start off with a few questions about bootstrapping and sampling practices in reality."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "6c325c84f9dbc24ca7d5dd77335510ce",
+ "grade": false,
+ "grade_id": "cell-3e24b34269f128b4",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.0**\n",
+ "<br> {points: 3}\n",
+ "\n",
+ "In 1-2 sentences, explain what bootstrapping is useful for."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "a23919b36d994ade6b9ed4960645ddc4",
+ "grade": true,
+ "grade_id": "cell-8c84f203af7f48fd",
+ "locked": false,
+ "points": 3,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "source": [
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "85c80662a5bdaf88fa4396a949fd85e2",
+ "grade": false,
+ "grade_id": "cell-608fcde8287d2be1",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.1**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "A bootstrap sampling distribution will **always** have a similar width as the sampling distribution it is approximating.\n",
+ "\n",
+ "_Assign your answer to an object called `answer1.1`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "9f5912307ae2c5fb859dd19047b8449b",
+ "grade": false,
+ "grade_id": "cell-3eb563d3ebcd4b3e",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.1 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "159c62cc078cc395c73a49dd100e9700",
+ "grade": true,
+ "grade_id": "cell-ec461d6d65200f1d",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.1()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "e06f748b9e001c4c12d026edd1b3c0a8",
+ "grade": false,
+ "grade_id": "cell-45d0af92c1953424",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.2**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "In reality, when we take a sample from the population, we are sampling with replacement.\n",
+ "\n",
+ "_Assign your answer to an object called `answer1.2`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "715b740d7cf5ba296e0e533c16da3b80",
+ "grade": false,
+ "grade_id": "cell-54bf9d55fc56bbf6",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.2 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "6f9346280e962f7a161c81946e3709e1",
+ "grade": true,
+ "grade_id": "cell-1b1ad2d17f5bb494",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.2()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "7f34576a7df82c2f84f434d90fb048fb",
+ "grade": false,
+ "grade_id": "cell-d6c903c695f1f6a5",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 2. Bootstrap Distribution vs Sampling Distribution\n",
+ "\n",
+ "By now you should understand that not all sampling distributions come out as a nice symmetrical bell shape. In `worksheet_02` you saw one example of this when we were studying the distributions of point estimates for the proportion of properties that are located in downtown Vancouver, using the `tax_2019` data set. In the code cell below, we have re-created this sampling distribution so we can look at it a bit more closely.\n",
+ "\n",
+ "_Use the plot below to answer the **next question**._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "45078ac3d5cba761589d3ae6362d5732",
+ "grade": false,
+ "grade_id": "cell-0b662bd6e3054d2b",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(2410) # Use the same seed as last time to get\n",
+ " # the exact same observations in our samples.\n",
+ "\n",
+ "sampling_dist <- tax_2019 %>% \n",
+ " filter(!is.na(geo_local_area)) %>% \n",
+ " select(geo_local_area) %>% \n",
+ " rep_sample_n(size = 10, reps = 2000) %>% \n",
+ " group_by(replicate) %>% \n",
+ " summarize(x = sum(geo_local_area == \"Downtown\"),\n",
+ " n = n()) %>% \n",
+ " mutate(sample_proportion = x / n) %>% \n",
+ " ggplot(aes(x = sample_proportion)) +\n",
+ " geom_histogram(binwidth = 1/10, colour = \"white\") +\n",
+ " xlab(\"Sample Proportion of Downtown Buildings\") +\n",
+ " ggtitle(\"n = 10\") +\n",
+ " scale_x_continuous(breaks = seq(0, 0.9, 0.1))\n",
+ "sampling_dist"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "a27791770b11804017dfe0d6af4a4985",
+ "grade": false,
+ "grade_id": "cell-d41acc8f82a60bac",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.0**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "The true proportion of buildings in Vancouver that are located downtown is 0.195. Suppose the data was adjusted such that the true proportion is now 0.5, and we created another sampling distribution with samples of size 100 using the code above. How would the symmetry of the new sampling distribution compare to the one generated above?\n",
+ "\n",
+ "A. The new sampling distribution would be less symmetrical.\n",
+ "\n",
+ "B. The symmetry of the new sampling distribution would be about the same.\n",
+ "\n",
+ "C. The new sampling distribution would be more symmetrical.\n",
+ "\n",
+ "D. It is impossible to tell how the symmetry of the new sampling distribution would compare.\n",
+ "\n",
+ "_Assign your answer to an object called `answer2.0`. Your answer should be a single character surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "d32a422a3e1d2411aadd5250d84d22c6",
+ "grade": false,
+ "grade_id": "cell-1487c5a41a521870",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer2.0 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "f45c1240d909b00f9b28a6f39b1b5793",
+ "grade": true,
+ "grade_id": "cell-79e2ecf537661b10",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.0()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "a37e3967abd22349e9e31d730aa45ad4",
+ "grade": false,
+ "grade_id": "cell-f3fec4bdcd74fe8c",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "### Root Barriers\n",
+ "\n",
+ "In this section, we are going to test the limits of bootstrapping to see whether it results in reliable approximations of asymmetrical sampling distributions, such as the one shown above. To do this, we will attempt to use bootstrapping to estimate sampling distributions that we know are even less symmetrical and compare them to see if our estimates are reasonable. One population that we have at our disposal that yields some asymmetrical sampling distributions is the `vancouver_trees` data set from the `datateachr` package. One example of this is the sampling distribution of sample proportions for the `root_barrier` variable; in this section, we will be looking at the proportion of trees that **do not** have a root barrier.\n",
+ "\n",
+ "\n",
+ "\n",
+ "
+ "\n",
+ "Recall that the `vancouver_trees` dataset contains information about public trees planted along boulevards in Vancouver. The `root_barrier` variable in this dataset specifies whether or not a tree was planted with a root barrier. A root barrier is a type of underground wall that protects buildings, sidewalks, and roads from roots, which can severely damage these structures. One example of a type of root barrier is shown in the picture above."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "351f3d626a4db26ff8712f7c9a1a135b",
+ "grade": false,
+ "grade_id": "cell-c719fb42fc24881f",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.1** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Filter `vancouver_trees` such that there are no `NA` values in the `root_barrier` column, and then select only that column. Use the scaffolding provided below as a guide:\n",
+ "\n",
+ "```r\n",
+ "barrier_pop <- vancouver_trees %>% \n",
+ " filter(...) %>% \n",
+ " ...(root_barrier)\n",
+ "```\n",
+ "\n",
+ "_Assign your data frame to an object called `barrier_pop`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "b8c41f19fcece75027afa5a77acfd61b",
+ "grade": false,
+ "grade_id": "cell-65aebf1f840a376d",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(barrier_pop)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "d453b38d0b4b0438c14f6a5ce3ffc859",
+ "grade": true,
+ "grade_id": "cell-5be38a133d838f2f",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.1()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "3e5c9418633f33c559fb99a2ec38e9ea",
+ "grade": false,
+ "grade_id": "cell-3db19654da2479d1",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.2** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Draw 2000 random samples of size 20 from the population `barrier_pop` using the `rep_sample_n` function and a seed of 3909. For each sample, calculate the proportion of trees that **do not** have a root barrier (i.e. where `root_barrier == \"N\"`) as the point estimate. Lastly, visualize the distribution of the sample proportions you just calculated by plotting a histogram using `geom_histogram` with bin widths of 1/20. Add a descriptive title to the plot using `ggtitle` and ensure that the x-axis has a human-readable label. Your final plot should have a variable named `p` on the x-axis. Use the scaffolding provided below as a guide:\n",
+ "\n",
+ "```r\n",
+ "barrier_sampling_dist <- ... %>% \n",
+ " rep_sample_n(size = ..., reps = ..., replace = ...) %>% \n",
+ " ...(replicate) %>% \n",
+ " summarize(x = sum(... == \"N\"),\n",
+ " n = n()) %>% \n",
+ " mutate(p = ... / ...) %>% \n",
+ " ggplot(aes(x = p)) +\n",
+ " geom_histogram(... = ...) +\n",
+ " xlab(\"Proportion\") +\n",
+ " ggtitle(\"Sampling Distribution of Proportions (n = 20)\")\n",
+ "```\n",
+ "\n",
+ "_Assign your plot to an object called `barrier_sampling_dist`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "749d6e267353d126881216786837c83a",
+ "grade": false,
+ "grade_id": "cell-68ae3b69a08fbf94",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(3909) # DO NOT CHANGE!\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "barrier_sampling_dist"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "d55ad4e6a96aa5f120f32e77c85e8c38",
+ "grade": true,
+ "grade_id": "cell-baec19ed025dc2e9",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.2()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "11913c8f65c17bbd77210c78dd465d74",
+ "grade": false,
+ "grade_id": "cell-656b90f2deb5d102",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.3** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Take a single random sample of size 20 from `barrier_pop` using `rep_sample_n` and a seed of 1933. Ensure your resulting data frame only has a single column: `root_barrier`.\n",
+ "\n",
+ "**Hint:** Remember to `ungroup()` before using `select()`!\n",
+ "\n",
+ "_Assign your data frame to an object called `barrier_sample`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "4367ca632b75b5d3ae359a67a64f793f",
+ "grade": false,
+ "grade_id": "cell-d565a00f0fb18d3a",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(1933) # DO NOT CHANGE!\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(barrier_sample)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "883d740dbfe5561607ee39b666e967a0",
+ "grade": true,
+ "grade_id": "cell-789b0060d74dbe20",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.3()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "e8fba40dcf4610cc4c7e0188b330a453",
+ "grade": false,
+ "grade_id": "cell-5cc1c16691bdfe76",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.4** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Now we want to produce a bootstrap sampling distribution using the `barrier_sample` sample we just took, which we will be able to compare to the sampling distribution we generated above. We want to use the exact same scaffolding as **question 2.2** (except the object name we are saving to) to complete the following task: \n",
+ "\n",
+ "> Take 2000 bootstrap samples from `barrier_sample` using `rep_sample_n` with a seed of 2767. Then, calculate the proportion of trees in each sample that do not have a root barrier (`root_barrier == \"N\"`); name the column containing the sample proportions `p`. Lastly, use `geom_histogram` with bin widths of 1/20 to visualize the bootstrap distribution. Add a descriptive title to the plot using `ggtitle` and ensure that the x-axis has a human-readable label. \n",
+ "\n",
+ "**Which two `...`'s in the scaffolding below _must_ be different than the code you used in question 2.2?**\n",
+ "\n",
+ "```R\n",
+ "# LINE 1: bootstrap_dist_20 <- ... %>% \n",
+ "# LINE 2: rep_sample_n(size = ..., reps = ..., replace = ...) %>% \n",
+ "# LINE 3: ...(replicate) %>% \n",
+ "# LINE 4: summarize(x = sum(... == \"N\"),\n",
+ "# LINE 5: n = n()) %>% \n",
+ "# LINE 6: mutate(p = ... / ...) %>% \n",
+ "# LINE 7: ggplot(aes(x = p)) +\n",
+ "# LINE 8: geom_histogram(... = ...) +\n",
+ "# LINE 9: xlab(\"Proportion\") +\n",
+ "# LINE 10: ...(\"n = 20\")\n",
+ "```\n",
+ "\n",
+ "A. The `...` in `LINE 1` and the third `...` from the left in `LINE 2`\n",
+ "\n",
+ "B. The `...` in `LINE 1` and the second `...` from the left in `LINE 8`\n",
+ "\n",
+ "C. The first `...` from the left in `LINE 2` and the third `...` from the left in `LINE 2`\n",
+ "\n",
+ "D. The first `...` from the left in `LINE 2` and the second `...` from the left in `LINE 8`\n",
+ "\n",
+ "E. Some other two `...`'s not listed above.\n",
+ "\n",
+ "F. None of the above; only one `...` must be different.\n",
+ "\n",
+ "G. None of the above; three or more of the `...` must be different.\n",
+ "\n",
+ "_Assign your answer to an object called `answer2.4`. Your answer should be a single character surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "f8bbd80dd4c4cdbeb39f6ed8ea860c06",
+ "grade": false,
+ "grade_id": "cell-bb6ac4a03c970908",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer2.4 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "fcacfde33502575ab5c58484bd216830",
+ "grade": true,
+ "grade_id": "cell-c108dea3f7b6b6e2",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.4()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "a6a5b5bb19434837649ecf829ddad640",
+ "grade": false,
+ "grade_id": "cell-092e11d77cbda9d5",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.5** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Take 2000 bootstrap samples from `barrier_sample` using `rep_sample_n` with a seed of 2767. Then, calculate the proportion of trees in each sample that do not have a root barrier (`root_barrier == \"N\"`). Lastly, use `geom_histogram` with bin widths of 1/20 to visualize the bootstrap distribution. Add a descriptive title to the plot using `ggtitle` and ensure that the x-axis has a human-readable label. \n",
+ "\n",
+ "**Hint:** use your answer to the previous question and your code from **question 2.2**.\n",
+ "\n",
+ "_Assign your plot to an object called `barrier_bootstrap_dist`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "a6e988e4e63b3365325e7d60be2e9330",
+ "grade": false,
+ "grade_id": "cell-49c66e4141824107",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(2767) # DO NOT CHANGE!\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "barrier_bootstrap_dist"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "3b2625cf78306e32c7186da8b945ea32",
+ "grade": true,
+ "grade_id": "cell-1e5db4cb336f35b7",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.5()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "a5449d507c19491aa14350813efcc4d4",
+ "grade": false,
+ "grade_id": "cell-5b7a33251a40cd78",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.6** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "**Note:** this question has two parts!\n",
+ "\n",
+ "a) Calculate the standard deviation of the sampling distribution you generated above (`barrier_sampling_dist`); this is the standard error of the corresponding estimator.\n",
+ "\n",
+ "_Assign your answer to an object called `standard_error`. Your answer should be a single number._\n",
+ "\n",
+ "<br>\n",
+ "\n",
+ "b) Calculate the standard deviation of the bootstrap distribution you generated above (`barrier_bootstrap_dist`).\n",
+ "\n",
+ "_Assign your answer to an object called `standard_deviation`. Your answer should be a single number._\n",
+ "\n",
+ "**Hints:**\n",
+ "- You can get the data that was used to generate a plot using `plot_name$data`, for example: `barrier_sampling_dist$data`.\n",
+ "- You can convert a 1x1 data frame to a number using `as.numeric()`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "9f4b1d0a76fe148b09e2e30c3ed6d181",
+ "grade": false,
+ "grade_id": "cell-723febc1c6611afd",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "standard_error\n",
+ "standard_deviation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "0d66edec900a689d8254cd6926e8976d",
+ "grade": true,
+ "grade_id": "cell-96fbd5f460153e4d",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.6()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "52cc1899a2724b40dae616dc80ed7e2f",
+ "grade": false,
+ "grade_id": "cell-872bca96dfb37d39",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.7** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "The standard deviation of a bootstrap distribution is a \"good guess\" of the standard deviation of the corresponding sampling distribution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "a7477575e102bfb6c8f759f276c7acd5",
+ "grade": false,
+ "grade_id": "cell-98838fdd73e845c4",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer2.7 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "fca85ddd5ac1d7323ac72e1faa9c6be9",
+ "grade": true,
+ "grade_id": "cell-2b590d0c77cfb166",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.7()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "8ff10ec5398dcdb8b846e04a11be84a7",
+ "grade": false,
+ "grade_id": "cell-c1862c7bad5b258e",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.8** \n",
+ "<br> {points: 3}\n",
+ "\n",
+ "Will the standard deviation of a bootstrap distribution **always** be relatively close to the standard deviation of the corresponding sampling distribution?\n",
+ "- If no, describe one situation related to our root barrier scenario above that would result in the `standard_deviation` object from **question 2.6** being very different than the `standard_error` object.\n",
+ "- If yes, explain why no such situation exists."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "fa4ef74f207a8af9d77c47ade5dd5548",
+ "grade": true,
+ "grade_id": "cell-692bb294134028ed",
+ "locked": false,
+ "points": 3,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "source": [
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "88dad7be920980f75fae98697fc803c7",
+ "grade": false,
+ "grade_id": "cell-09699b3910c5e22d",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 3. A Closer Look at Bootstrapping\n",
+ "\n",
+ "There is one \"rule\" related to bootstrapping that we have not mentioned yet:\n",
+ "\n",
+ "> When generating a bootstrap distribution to estimate the sampling distribution for the original sample size, the **bootstrap samples** should be the **same size** as the **original sample** to get a useful estimate.\n",
+ "\n",
+ "For example, we would get poor results if we took a sample of size 30 from the population, and then took many bootstrap samples (resamples from the original sample, with replacement) of size 60 to estimate a sampling distribution for samples of size 30. Why? Let's try it out ourselves to discover the answer. Afterwards, we'll also go through some other questions to continue to solidify our understanding of the various nuances related to bootstrapping."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "29f0247abda2cf0190a6b00d05a8795e",
+ "grade": false,
+ "grade_id": "cell-0bd9dc7d44fd3c04",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "### Pissard Plum Trees\n",
+ "\n",
+ "To explore the \"rule of thumb\" that we mentioned above, we will again use the `vancouver_trees` data set from the `datateachr` package. However, this time the population we are interested in is only the trees with the common name `\"PISSARD PLUM\"`, and the parameter that we are interested in is the standard deviation of the `diameter` of these trees."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "edddfed3762773a96971a5767cad4355",
+ "grade": false,
+ "grade_id": "cell-499c4bc56a311d6f",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "head(vancouver_trees)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "a613408aebd17172332b5689490c2d1b",
+ "grade": false,
+ "grade_id": "cell-b08c8f90c4b7f79d",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.0** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Filter the `vancouver_trees` dataset for the population that we are interested in and then select the variable that we are interested in (your final data frame should have a single column).\n",
+ "\n",
+ "_Assign your data frame to an object called `plum_pop`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "76373c38979f404115129c5182a0c2fc",
+ "grade": false,
+ "grade_id": "cell-bdde492539c6004d",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(plum_pop)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "a25d03df15bf8776893bdbc138012b67",
+ "grade": true,
+ "grade_id": "cell-ae4e3ed82fd5daf8",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.0()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "8f1c3c40a52d7b018041b290dac44fdc",
+ "grade": false,
+ "grade_id": "cell-380227c9b0e5412d",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.1** \n",
+ "<br> {points: 3}\n",
+ "\n",
+ "Take a single random sample of size 10 from `plum_pop` using the `rep_sample_n` function and a seed of 0737. Ensure your resulting data frame only has a single column: `diameter`.\n",
+ "\n",
+ "_Assign your data frame to an object called `plum_sample`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "2c7107f26e57daf5c739e7f9be58d65a",
+ "grade": false,
+ "grade_id": "cell-75af97d43ec269ac",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(0737) # DO NOT CHANGE!\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(plum_sample)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "b746a6c7a74cd1aa6ee9061d8867c4ed",
+ "grade": true,
+ "grade_id": "cell-0986f2eb2717d803",
+ "locked": true,
+ "points": 3,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Here we check to see if you have given your answer the correct object name\n",
+ "# and if your answer is plausible. However, all other tests have been hidden\n",
+ "# so you can practice deciding when you have the correct answer.\n",
+ "test_that('Did not assign answer to an object called \"plum_sample\"', {\n",
+ " expect_true(exists(\"plum_sample\"))\n",
+ "})\n",
+ "test_that(\"Solution should be a data frame\", {\n",
+ " expect_true(\"data.frame\" %in% class(plum_sample))\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "97c11d31d3e9f6b64a86e59f2934cc6d",
+ "grade": false,
+ "grade_id": "cell-997950696b0f714d",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.2** \n",
+ "<br> {points: 3}\n",
+ "\n",
+ "Take 2500 bootstrap samples **of size 100** from the sample you took in the previous question by using the `rep_sample_n` function and a seed of 9284. \n",
+ "\n",
+ "_Assign your data frame to an object called `plum_resamples`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "56d583dd611bcbbe6df83f1b791af971",
+ "grade": false,
+ "grade_id": "cell-b06447756aff9cd5",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(9284) # DO NOT CHANGE!\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(plum_resamples)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "6024169a3b230b918d508557b7862530",
+ "grade": true,
+ "grade_id": "cell-a2e18241939fda90",
+ "locked": true,
+ "points": 3,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Here we check to see if you have given your answer the correct object name\n",
+ "# and if your answer is plausible. However, all other tests have been hidden\n",
+ "# so you can practice deciding when you have the correct answer.\n",
+ "test_that('Did not assign answer to an object called \"plum_resamples\"', {\n",
+ " expect_true(exists(\"plum_resamples\"))\n",
+ "})\n",
+ "test_that(\"Solution should be a data frame\", {\n",
+ " expect_true(\"data.frame\" %in% class(plum_resamples))\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "79ab6cd4efd145db521b5cbefca13a73",
+ "grade": false,
+ "grade_id": "cell-c8eda13872adcd19",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.3** \n",
+ "<br> {points: 3}\n",
+ "\n",
+ "Calculate the standard deviation for each resample that you took in the previous question with `group_by()` and `summarize()`. Name the new column containing the standard deviation `sd`.\n",
+ "\n",
+ "_Assign your data frame to an object called `resample_estimates`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "d8426c02e20209394824498868530160",
+ "grade": false,
+ "grade_id": "cell-704a430b3580ea41",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(resample_estimates)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "acb1bf708fc89826d14bc2707f3eb135",
+ "grade": true,
+ "grade_id": "cell-caa64285e3611f8d",
+ "locked": true,
+ "points": 3,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Here we check to see if you have given your answer the correct object name\n",
+ "# and if your answer is plausible. However, all other tests have been hidden\n",
+ "# so you can practice deciding when you have the correct answer.\n",
+ "test_that('Did not assign answer to an object called \"resample_estimates\"', {\n",
+ " expect_true(exists(\"resample_estimates\"))\n",
+ "})\n",
+ "test_that(\"Solution should be a data frame\", {\n",
+ " expect_true(\"data.frame\" %in% class(resample_estimates))\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "58bc88c1684a60cb15e98ccb71d7de31",
+ "grade": false,
+ "grade_id": "cell-28a80204df4ba87d",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.3** \n",
+ "<br> {points: 3}\n",
+ "\n",
+ "Visualize the bootstrap distribution (of `resample_estimates`) by plotting a histogram using `geom_histogram` with bin widths of 0.25. Ensure that the x-axis has a human-readable label.\n",
+ "\n",
+ "_Assign your plot to an object called `plum_bootstrap_dist`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "131209bf2b21a0c29462c42caf136cf8",
+ "grade": false,
+ "grade_id": "cell-f4e06076985de5fe",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "plum_bootstrap_dist"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "d1871e5552d82012ff62a7c8117ec982",
+ "grade": true,
+ "grade_id": "cell-962d663ea98f596f",
+ "locked": true,
+ "points": 3,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Here we check to see if you have given your answer the correct object name\n",
+ "# and if your answer is plausible. However, all other tests have been hidden\n",
+ "# so you can practice deciding when you have the correct answer.\n",
+ "test_that('Did not assign answer to an object called \"plum_bootstrap_dist\"', {\n",
+ " expect_true(exists(\"plum_bootstrap_dist\"))\n",
+ "})\n",
+ "test_that(\"Solution should be a ggplot object\", {\n",
+ " expect_true(is.ggplot(plum_bootstrap_dist))\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "fef5fec05cefa5ac860f3952033d674f",
+ "grade": false,
+ "grade_id": "cell-1b4a28c6484afef9",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.4** \n",
+ "<br> {points: 3}\n",
+ "\n",
+ "Produce a sampling distribution (**not** a bootstrap distribution) of sample standard deviations for samples of size 10 from the population `plum_pop` using a procedure similar to the previous questions and the last section; use `geom_histogram` with bin widths of 0.25. Also, use 2500 sample replicates and a seed of 2362.\n",
+ "\n",
+ "_Assign your plot to an object called `plum_sampling_dist`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "5add0c837275b32c7deaeb266f9b0f9c",
+ "grade": false,
+ "grade_id": "cell-ad672783439467a1",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(2362) # DO NOT CHANGE!\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "plum_sampling_dist"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "bcc3c9bc04e09bf71c4d853c3e8dc380",
+ "grade": true,
+ "grade_id": "cell-6766ffaf56560b7f",
+ "locked": true,
+ "points": 3,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Here we check to see if you have given your answer the correct object name\n",
+ "# and if your answer is plausible. However, all other tests have been hidden\n",
+ "# so you can practice deciding when you have the correct answer.\n",
+ "test_that('Did not assign answer to an object called \"plum_sampling_dist\"', {\n",
+ " expect_true(exists(\"plum_sampling_dist\"))\n",
+ "})\n",
+ "test_that(\"Solution should be a ggplot object\", {\n",
+ " expect_true(is.ggplot(plum_sampling_dist))\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "9650703e071d3003622c3c388e8ee15a",
+ "grade": false,
+ "grade_id": "cell-7e8315e5d3cbc365",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "In the code cell below, we have used `plot_grid` to plot the sampling distribution and bootstrap distribution side by side.\n",
+ "\n",
+ "**Note:** some of the sample standard deviations are not visible because we have manually set bounds on the x-axis so you can compare the important parts of the distributions more easily.\n",
+ "\n",
+ "_Use the two plots below to answer the next **three questions**._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "fee2002f5ff4d0f86197d3da2c2bc03b",
+ "grade": false,
+ "grade_id": "cell-8fb6ead05de278cb",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "options(repr.plot.width = 12, repr.plot.height = 4)\n",
+ "plot_grid(plum_sampling_dist +\n",
+ " labs(title = \"Sampling Distribution\",\n",
+ " caption = \"Generated using 2500 sample replicates of size 10.\") +\n",
+ " scale_x_continuous(limits = c(0, 10)),\n",
+ " plum_bootstrap_dist +\n",
+ " labs(title = \"Bootstrap Distribution\",\n",
+ " caption = \"Generated using 2500 bootstrap samples of size 100 from a sample of size 10.\") + \n",
+ " scale_x_continuous(limits = c(0, 10)),\n",
+ " ncol = 2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "cb65d84a6c4ab211340c07d46ab1d94b",
+ "grade": false,
+ "grade_id": "cell-2244b50e950e9a0c",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.5** \n",
+ "<br> {points: 3}\n",
+ "\n",
+ "Which statement **best** describes the bootstrap distribution above?\n",
+ "\n",
+ "A. The distribution of many point estimates for the standard deviation of the population, which were acquired by taking many samples from the population and calculating the standard deviation of each sample.\n",
+ "\n",
+ "B. The distribution of many point estimates for the standard deviation of the sampling distribution (which is the standard error of the corresponding estimator), which were acquired by re-sampling from the original sample and calculating the standard deviation of each re-sample.\n",
+ "\n",
+ "C. The distribution of the standard deviations of many samples that were taken from the population.\n",
+ "\n",
+ "D. The distribution of standard deviations for many re-samples that were taken from the original sample.\n",
+ "\n",
+ "_Assign your answer to an object called `answer3.5`. Your answer should be a single character surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "6dfce03f274f48e431ea0ef4659a95ca",
+ "grade": false,
+ "grade_id": "cell-91d590e6b77000e6",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer3.5 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "7ac2bfde607b3bb3ff66ca641e8ef7a1",
+ "grade": true,
+ "grade_id": "cell-edf9b5c8e15d6131",
+ "locked": true,
+ "points": 3,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Here we check to see if you have given your answer the correct object name\n",
+ "# and if your answer is plausible. However, all other tests have been hidden\n",
+ "# so you can practice deciding when you have the correct answer.\n",
+ "test_that('Did not assign answer to an object called \"answer3.5\"', {\n",
+ " expect_true(exists(\"answer3.5\"))\n",
+ "})\n",
+ "test_that('Solution should be a single character (\"A\", \"B\", \"C\", or \"D\")', {\n",
+ " expect_match(answer3.5, \"a|b|c|d\", ignore.case = TRUE)\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "08e96f05c3a16af30e6b3c1ca9907d1b",
+ "grade": false,
+ "grade_id": "cell-a21ce58b3d2da734",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.6** \n",
+ "<br> {points: 3}\n",
+ "\n",
+ "By referencing the plots above, explain why it's not a good idea to take bootstrap samples of a **larger size than the original sample** to estimate the sampling distribution for the original sample size."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "ec10052cbfc89966d146437b3686976b",
+ "grade": true,
+ "grade_id": "cell-ea429f75fae054cc",
+ "locked": false,
+ "points": 3,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "source": [
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "88995f4e7559480dc857d341e63eae24",
+ "grade": false,
+ "grade_id": "cell-5a499689a7ee1a31",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.7** \n",
+ "<br> {points: 3}\n",
+ "\n",
+ "Suppose you took a single sample of size 164 and then took many bootstrap samples of size 10 from the first sample to produce a bootstrap distribution for the mean of the `diameter` variable in the `plum_pop` population. Suppose you wanted to use the standard deviation of the bootstrap distribution to estimate the standard deviation of the sampling distribution of sample means for the `diameter` variable for samples of size 164. How would you expect the estimate to compare to the actual standard error?\n",
+ "\n",
+ "A. The estimate would likely be an under-estimate.\n",
+ "\n",
+ "B. The estimate would likely be accurate.\n",
+ "\n",
+ "C. The estimate would likely be an over-estimate.\n",
+ "\n",
+ "D. There is not enough information to make this comparison.\n",
+ "\n",
+ "_Assign your answer to an object called `answer3.7`. Your answer should be a single character surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "935c8fec12d03944b439bced23c09f10",
+ "grade": false,
+ "grade_id": "cell-fcc688c1a5f5aafc",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer3.7 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "138b7aab3f29269f02391ae81d211739",
+ "grade": true,
+ "grade_id": "cell-77a7fcc29f5d9d08",
+ "locked": true,
+ "points": 3,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Here we check to see if you have given your answer the correct object name\n",
+ "# and if your answer is plausible. However, all other tests have been hidden\n",
+ "# so you can practice deciding when you have the correct answer.\n",
+ "test_that('Did not assign answer to an object called \"answer3.7\"', {\n",
+ " expect_true(exists(\"answer3.7\"))\n",
+ "})\n",
+ "test_that('Solution should be a single character (\"A\", \"B\", \"C\", or \"D\")', {\n",
+ " expect_match(answer3.7, \"a|b|c|d\", ignore.case = TRUE)\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "4d279ea8b312399f1ebff0e7b687e543",
+ "grade": false,
+ "grade_id": "cell-aa93730bf9b99905",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "### More Bootstrapping Nuances"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "daf5b677527958ca272fe85e819c8583",
+ "grade": false,
+ "grade_id": "cell-926dcf19654f9798",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.8** \n",
+ "<br> {points: 3}\n",
+ "\n",
+ "Suppose we generate a bootstrap distribution of sample means of the `diameter` variable in `plum_pop`, using `rep_sample_n` to take a single sample of size 8 from the population and then 3000 bootstrap samples from that sample. The resulting distribution is displayed below with bin widths of 0.25:\n",
+ "\n",
+ "![Bootstrap distribution of sample means](plot.png)\n",
+ "\n",
+ "a) Given that the standard deviation of the `diameter` variable for the population `plum_pop` is around 5.0, is this a shape that you would expect the bootstrap distribution to have?\n",
+ "\n",
+ "b) If you answered yes, justify your answer in 1-2 sentences. If you answered no, justify your answer in 1-2 sentences and describe an error or scenario that would result in such a distribution in another sentence."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "76acfe4edb73921e9bef0df9313caab0",
+ "grade": true,
+ "grade_id": "cell-b89f9dba49eab352",
+ "locked": false,
+ "points": 3,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "source": [
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "eb924b9f35ecc910c3d6a3db9503b20c",
+ "grade": false,
+ "grade_id": "cell-2ea018ce1de43103",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.9** \n",
+ "<br> {points: 3}\n",
+ "\n",
+ "Consider the following single random sample of 6 observations of the reported average hours of screen time a person is exposed to each day:\n",
+ "\n",
+ "| `screen_time` |\n",
+ "| -- |\n",
+ "| 3 |\n",
+ "| 6 |\n",
+ "| 8 |\n",
+ "| 1 |\n",
+ "| 7 |\n",
+ "| 7 |\n",
+ " \n",
+ "Below are two more data frames that are claimed to have been created by bootstrapping from the original sample.\n",
+ "\n",
+ "| `screen_time` |\n",
+ "| -- |\n",
+ "| 6 |\n",
+ "| 7 |\n",
+ "| 6 |\n",
+ "| 7 |\n",
+ "| 7 |\n",
+ "| 1 |\n",
+ "\n",
+ "| `screen_time` |\n",
+ "| -- |\n",
+ "| 7 |\n",
+ "| 1 |\n",
+ "| 7 |\n",
+ "| 3 |\n",
+ "| 6 |\n",
+ "| 8 |\n",
+ "\n",
+ " Consider the values in the two data frames above. Do you agree that the two data frames above were bootstrapped samples? Explain why or why not in your own words in a few sentences."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "c06bcbd38196b61090cc73d7078bdb39",
+ "grade": true,
+ "grade_id": "cell-e29a92ff6d0d764d",
+ "locked": false,
+ "points": 3,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "source": [
+ ]
+ }
+ ],
+ "metadata": {
+ "docker": {
+ "latest_image_tag": "v0.4.0"
+ },
+ "jupytext": {
+ "formats": "ipynb,Rmd"
+ },
+ "kernelspec": {
+ "display_name": "R",
+ "language": "R",
+ "name": "ir"
+ },
+ "language_info": {
+ "codemirror_mode": "r",
+ "file_extension": ".r",
+ "mimetype": "text/x-r-source",
+ "name": "R",
+ "pygments_lexer": "r",
+ "version": "4.2.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
diff --git a/materials/tutorial_04/flow_sample.csv b/materials/tutorial_04/flow_sample.csv
new file mode 100644
index 0000000..e8bb8c2
--- /dev/null
+++ b/materials/tutorial_04/flow_sample.csv
@@ -0,0 +1,110 @@
diff --git a/materials/tutorial_04/tests_tutorial_04.R b/materials/tutorial_04/tests_tutorial_04.R
new file mode 100644
index 0000000..e8738e7
--- /dev/null
+++ b/materials/tutorial_04/tests_tutorial_04.R
@@ -0,0 +1,277 @@
+# Checks the `steam_pop` answer: the object must exist, be a data frame whose
+# only column is `original_price`, and its row count and scaled price total
+# must hash to the stored digests.
+test_1.0 <- function() {
+  test_that('Did not assign answer to an object called "steam_pop"', {
+    expect_true(exists("steam_pop"))
+  })
+  test_that("Solution should be a data frame", {
+    expect_true(inherits(steam_pop, "data.frame"))
+  })
+  test_that("Data frame does not have the correct columns", {
+    # Only the set of column names matters, not their order.
+    expect_setequal(colnames(steam_pop), c("original_price"))
+  })
+  test_that("Data frame does not contain the correct number of rows", {
+    expect_equal(digest(as.integer(nrow(steam_pop))), "bc015e4c2e1698184f3f79d5432f54f1")
+  })
+  test_that("Data frame does not contain the correct data", {
+    # The sum is scaled by 1000 and truncated to an integer before hashing.
+    expect_equal(digest(as.integer(sum(steam_pop$original_price) * 1000)), "4d38676bfa8adb0e95087935662737c9")
+  })
+  print("Success!")
+}
+# Checks the `steam_sample` answer: the object must exist, be a data frame
+# whose only column is `original_price`, and its row count and scaled price
+# total must hash to the stored digests.
+test_1.1 <- function() {
+  test_that('Did not assign answer to an object called "steam_sample"', {
+    expect_true(exists("steam_sample"))
+  })
+  test_that("Solution should be a data frame", {
+    expect_true(inherits(steam_sample, "data.frame"))
+  })
+  test_that("Data frame does not have the correct columns", {
+    # Only the set of column names matters, not their order.
+    expect_setequal(colnames(steam_sample), c("original_price"))
+  })
+  test_that("Data frame does not contain the correct number of rows", {
+    expect_equal(digest(as.integer(nrow(steam_sample))), "16071ab8270571c6c83d682892e00ea5")
+  })
+  test_that("Data frame does not contain the correct data", {
+    # The sum is scaled by 1000 and truncated to an integer before hashing.
+    expect_equal(digest(as.integer(sum(steam_sample$original_price) * 1000)), "9e4221b66064892c1311cfb904d05af9")
+  })
+  print("Success!")
+}
+# Checks the `steam_bootstrapped` answer: the object must exist, be a data
+# frame with exactly the columns `replicate` and `bootstrap_median`, and its
+# row count and scaled total of medians must hash to the stored digests.
+test_1.2 <- function() {
+  test_that('Did not assign answer to an object called "steam_bootstrapped"', {
+    expect_true(exists("steam_bootstrapped"))
+  })
+  test_that("Solution should be a data frame", {
+    expect_true(inherits(steam_bootstrapped, "data.frame"))
+  })
+  test_that("Data frame does not have the correct columns", {
+    # Only the set of column names matters, not their order.
+    expect_setequal(colnames(steam_bootstrapped), c("replicate", "bootstrap_median"))
+  })
+  test_that("Data frame does not contain the correct number of rows", {
+    expect_equal(digest(as.integer(nrow(steam_bootstrapped))), "a6d2eaaf1485f7b5c2c656e014e1835c")
+  })
+  test_that("Data frame does not contain the correct data", {
+    # The sum is scaled by 1000 and truncated to an integer before hashing.
+    expect_equal(digest(as.integer(sum(steam_bootstrapped$bootstrap_median) * 1000)), "81f8e8b8151f4ce03110e473991992a8")
+  })
+  print("Success!")
+}
+# Checks the `steam_ci` answer: the object must exist, be a data frame with
+# exactly the columns `ci_lower` and `ci_upper`, and its row count and scaled
+# interval bounds must hash to the stored digests.
+test_1.3 <- function() {
+  test_that('Did not assign answer to an object called "steam_ci"', {
+    expect_true(exists("steam_ci"))
+  })
+  test_that("Solution should be a data frame", {
+    expect_true(inherits(steam_ci, "data.frame"))
+  })
+  test_that("Data frame does not have the correct columns", {
+    # Only the set of column names matters, not their order.
+    expect_setequal(colnames(steam_ci), c("ci_lower", "ci_upper"))
+  })
+  test_that("Data frame does not contain the correct number of rows", {
+    expect_equal(digest(as.integer(nrow(steam_ci))), "4b5630ee914e848e8d07221556b0a2fb")
+  })
+  test_that("Data frame does not contain the correct data", {
+    # Each bound is scaled by 1000 and truncated to an integer before hashing.
+    expect_equal(digest(as.integer(steam_ci$ci_lower * 1000)), "8da5500b2599f3c65fdea9ebde1e892e")
+    expect_equal(digest(as.integer(steam_ci$ci_upper * 1000)), "5078d99ad73dda092eca9e13110ea1ce")
+  })
+  print("Success!")
+}
+# Checks the `steam_bootstrapped2` answer: the object must exist, be a data
+# frame with exactly the columns `replicate` and `stat`, and its row count and
+# scaled total of `stat` must hash to the stored digests.
+test_1.2_infer <- function() {
+  test_that('Did not assign answer to an object called "steam_bootstrapped2"', {
+    expect_true(exists("steam_bootstrapped2"))
+  })
+  test_that("Solution should be a data frame", {
+    expect_true(inherits(steam_bootstrapped2, "data.frame"))
+  })
+  test_that("Data frame does not have the correct columns", {
+    # Only the set of column names matters, not their order.
+    expect_setequal(colnames(steam_bootstrapped2), c("replicate", "stat"))
+  })
+  test_that("Data frame does not contain the correct number of rows", {
+    expect_equal(digest(as.integer(nrow(steam_bootstrapped2))), "a6d2eaaf1485f7b5c2c656e014e1835c")
+  })
+  test_that("Data frame does not contain the correct data", {
+    # The sum is scaled by 1000 and truncated to an integer before hashing.
+    expect_equal(digest(as.integer(sum(steam_bootstrapped2$stat) * 1000)), "81f8e8b8151f4ce03110e473991992a8")
+  })
+  print("Success!")
+}
+# Checks the `steam_ci2` answer: the object must exist, be a data frame with
+# exactly the columns `lower_ci` and `upper_ci`, and its row count and scaled
+# interval bounds must hash to the stored digests.
+test_1.3_infer <- function() {
+  test_that('Did not assign answer to an object called "steam_ci2"', {
+    expect_true(exists("steam_ci2"))
+  })
+  test_that("Solution should be a data frame", {
+    expect_true(inherits(steam_ci2, "data.frame"))
+  })
+  test_that("Data frame does not have the correct columns", {
+    # Only the set of column names matters, not their order.
+    expect_setequal(colnames(steam_ci2), c("lower_ci", "upper_ci"))
+  })
+  test_that("Data frame does not contain the correct number of rows", {
+    expect_equal(digest(as.integer(nrow(steam_ci2))), "4b5630ee914e848e8d07221556b0a2fb")
+  })
+  test_that("Data frame does not contain the correct data", {
+    # Each bound is scaled by 1000 and truncated to an integer before hashing.
+    expect_equal(digest(as.integer(steam_ci2$lower_ci * 1000)), "8da5500b2599f3c65fdea9ebde1e892e")
+    expect_equal(digest(as.integer(steam_ci2$upper_ci * 1000)), "5078d99ad73dda092eca9e13110ea1ce")
+  })
+  print("Success!")
+}
+# Checks the `steam_ci_plot` answer: the x aesthetic, the geoms of the first
+# three layers, the bin width of the first layer, the plotted data, and the
+# data attached to the two vertical-line layers.
+test_1.4 <- function() {
+  test_that('Did not assign answer to an object called "steam_ci_plot"', {
+    expect_true(exists("steam_ci_plot"))
+  })
+  # Combine the first layer's mapping with the plot-level mapping so `x` can be
+  # looked up regardless of which of the two declares it.
+  properties <- c(steam_ci_plot$layers[[1]]$mapping, steam_ci_plot$mapping)
+  test_that("Plot should have stat on the x-axis", {
+    expect_true("stat" == rlang::get_expr(properties$x))
+  })
+  test_that("Plot does not have the correct layers", {
+    expect_true("GeomBar" %in% class(steam_ci_plot$layers[[1]]$geom))
+    expect_true("GeomVline" %in% class(steam_ci_plot$layers[[2]]$geom))
+    expect_true("GeomVline" %in% class(steam_ci_plot$layers[[3]]$geom))
+  })
+  test_that("Plot does not have the correct bin width", {
+    # Reads `binwidth` from the first layer's `stat_params`, scales it by 1000
+    # and truncates to an integer before hashing.
+    expect_equal(
+      digest(as.integer(mget("stat_params", steam_ci_plot$layers[[1]])[["stat_params"]][["binwidth"]] * 1000)),
+      "189e2f1b2fbb3743811990e9708c226a"
+    )
+  })
+  test_that("Plot does not use the correct data", {
+    expect_equal(digest(nrow(steam_ci_plot$data)), "a6d2eaaf1485f7b5c2c656e014e1835c")
+    expect_equal(digest(as.integer(sum(steam_ci_plot$data$stat) * 1000)), "81f8e8b8151f4ce03110e473991992a8")
+  })
+  test_that("geom_vline layers are not in the correct locations", {
+    # Layers 2 and 3 are compared against the same digests used for the lower
+    # and upper interval bounds, in that order.
+    expect_equal(digest(as.integer(steam_ci_plot$layers[[2]]$data * 1000)), "8da5500b2599f3c65fdea9ebde1e892e")
+    expect_equal(digest(as.integer(steam_ci_plot$layers[[3]]$data * 1000)), "5078d99ad73dda092eca9e13110ea1ce")
+  })
+  print("Success!")
+}
+# Checks the `steam_median` answer: the object must exist, coerce to a
+# non-missing number, and hash (after scaling by 1000 and truncating to an
+# integer) to the stored digest.
+test_1.6 <- function() {
+  test_that('Did not assign answer to an object called "steam_median"', {
+    expect_true(exists("steam_median"))
+  })
+  # Coerce once so both checks below work on the same numeric value.
+  answer_as_numeric <- as.numeric(steam_median)
+  test_that("Solution should be a number", {
+    expect_false(is.na(answer_as_numeric))
+  })
+  test_that("Solution is incorrect", {
+    expect_equal(digest(as.integer(answer_as_numeric * 1000)), "8da5500b2599f3c65fdea9ebde1e892e")
+  })
+  print("Success!")
+}
+# Checks the `answer1.7` answer: the object must exist, contain "true" or
+# "false" (case-insensitive), and hash to the stored digest.
+test_1.7 <- function() {
+  test_that('Did not assign answer to an object called "answer1.7"', {
+    expect_true(exists("answer1.7"))
+  })
+  test_that('Solution should be "true" or "false"', {
+    expect_match(answer1.7, "true|false", ignore.case = TRUE)
+  })
+  test_that("Solution is incorrect", {
+    # Lower-cased before hashing so letter case does not affect the result.
+    expect_equal(digest(tolower(answer1.7)), "05ca18b596514af73f6880309a21b5dd")
+  })
+  print("Success!")
+}
+# Checks the `cancer_ci` answer: the object must exist, be a data frame with
+# exactly the columns `lower_ci` and `upper_ci`, and its row count and scaled
+# interval bounds must hash to the stored digests.
+test_2.1 <- function() {
+  test_that('Did not assign answer to an object called "cancer_ci"', {
+    expect_true(exists("cancer_ci"))
+  })
+  test_that("Solution should be a data frame", {
+    expect_true(inherits(cancer_ci, "data.frame"))
+  })
+  test_that("Data frame does not have the correct columns", {
+    # Only the set of column names matters, not their order.
+    expect_setequal(colnames(cancer_ci), c("lower_ci", "upper_ci"))
+  })
+  test_that("Data frame does not contain the correct number of rows", {
+    expect_equal(digest(as.integer(nrow(cancer_ci))), "4b5630ee914e848e8d07221556b0a2fb")
+  })
+  test_that("Data frame does not contain the correct data", {
+    # Bounds are scaled by 1,000,000 (not 1000 as in the other tests here)
+    # and truncated to an integer before hashing.
+    expect_equal(digest(as.integer(cancer_ci$lower_ci * 1000000)), "81d11d2d9c5edc8dde5943e7e8ccf428")
+    expect_equal(digest(as.integer(cancer_ci$upper_ci * 1000000)), "f79b45b4d0457d5c267b08e74659f678")
+  })
+  print("Success!")
+}
diff --git a/materials/tutorial_04/tutorial_04.ipynb b/materials/tutorial_04/tutorial_04.ipynb
new file mode 100644
index 0000000..7db2761
--- /dev/null
+++ b/materials/tutorial_04/tutorial_04.ipynb
@@ -0,0 +1,1918 @@
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "4ab39923e859202b3dac23e507ed2a90",
+ "grade": false,
+ "grade_id": "cell-0a3a16e4560c3222",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "# Tutorial 4: Confidence Intervals via Bootstrapping"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "af30f6f6426fae8bddd94612c19f7280",
+ "grade": false,
+ "grade_id": "cell-697926f90dfb6d3f",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "#### Lecture and Tutorial Learning Goals:\n",
+ "From this section, students are expected to be able to:\n",
+ "\n",
+ "1. Define what a confidence interval is, and why we want to generate one.\n",
+ "2. Explain how the bootstrap sampling distribution can be used to create confidence intervals.\n",
+ "3. Write a computer script to calculate confidence intervals for a population parameter using bootstrapping.\n",
+ "4. Effectively visualize point estimates and confidence intervals.\n",
+ "5. Interpret and explain results from confidence intervals.\n",
+ "6. Discuss the potential limitations of these methods."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "9be68c508ee8bbb19091c1c26490932f",
+ "grade": false,
+ "grade_id": "cell-9d4ab0c1978d7fc8",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Run this cell before continuing.\n",
+ "library(cowplot)\n",
+ "library(datateachr)\n",
+ "library(digest)\n",
+ "library(infer)\n",
+ "library(repr)\n",
+ "library(taxyvr)\n",
+ "library(tidyverse)\n",
+ "source(\"tests_tutorial_04.R\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "d61511e7aa18bf87dd7c00f4a7313a70",
+ "grade": false,
+ "grade_id": "cell-8f98efcea21ec2e0",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 1. Steam Games\n",
+ "\n",
+ "For the first part of this tutorial, we'll be working with a dataset that contains various attributes, including the name, original price, genre, and description, of over 40,000 different games available on Steam. Steam is a platform where video games (and some other types of applications) are distributed digitally. Essentially, Steam acts as a digital game store and library. Although this particular dataset was compiled in 2019, for this tutorial, we will assume we have data for the entire population for simplicity.\n",
+ "\n",
+ "\n",
+ "\n",
+ "
+ "\n",
+ "This data set, like several of the others we have worked with so far, is included in the `datateachr` package under the object name `steam_games`. The original source of the data set can be found in the documentation (`?steam_games`). We are interested in calculating a **90% confidence interval** for the **median of the `original_price`** of the games in the population. Afterwards, we will interpret the confidence interval and, because we are lucky enough to have access to data for the entire finite population, we will calculate the true median and see whether it is captured by our confidence interval."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "227788338cab9dbc911dc817701eae1c",
+ "grade": false,
+ "grade_id": "cell-ad245a9f4e1c5358",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "?steam_games"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "f33106681c17504006b884904dfa4b86",
+ "grade": false,
+ "grade_id": "cell-e3af05128aa353b2",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.0** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Filter out all `NA` values from the variable we are interested in (`original_price`), and select only that column.\n",
+ "\n",
+ "**Note:** the values of `original_price` are in $USD.\n",
+ "\n",
+ "_Assign your data frame to an object called `steam_pop`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "defe590fe055c73e6f911de6ecb82cea",
+ "grade": false,
+ "grade_id": "cell-fa279e7c87ba536d",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "head(steam_pop)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "43c1d812896a0cb82f8f8f902c736879",
+ "grade": true,
+ "grade_id": "cell-2328820ae2dd9342",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.0()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "fd266d9ca666ec5392f50a66961f7576",
+ "grade": false,
+ "grade_id": "cell-6e83465e672d5f4a",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.1** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Take a single random sample of size 40 from `steam_pop` using `rep_sample_n` and a seed of 2078. After taking the sample, ensure that only the `original_price` column is selected. (_Hint: you will need to ungroup before selecting the column_.)\n",
+ "\n",
+ "_Assign your data frame to an object called `steam_sample`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "4d91b6b44ebf5c30e0e4f55d1fb56609",
+ "grade": false,
+ "grade_id": "cell-b15be21c9148990e",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(2078) # DO NOT CHANGE\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "head(steam_sample)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "3bbcc2e183f04c22b6c1ec3862c7ef22",
+ "grade": true,
+ "grade_id": "cell-aa73f2f5d26ec1a5",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.1()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "4d507dcb0aa7e8203cce2b8427e0a5dc",
+ "grade": false,
+ "grade_id": "cell-f8d2705dd1f8a738",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.2** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Take 1500 bootstrap samples from `steam_sample` using `rep_sample_n` and a seed of 9844. Then, calculate the median of each sample (name this column `bootstrap_median`). Your final data frame should have a `replicate` column and a `bootstrap_median` column.\n",
+ "\n",
+ "_Assign your data frame to an object called `steam_bootstrapped`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "670d26f008167559c8c7301d128bd531",
+ "grade": false,
+ "grade_id": "cell-fb44dfe96bbfc3e2",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(9844) # DO NOT CHANGE\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "head(steam_bootstrapped)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "0a936f8824b87b368ec46cf274e24af7",
+ "grade": true,
+ "grade_id": "cell-ac7825fd6fb09e74",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.2()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "c7f220f713c5a889e7e388537a21feec",
+ "grade": false,
+ "grade_id": "cell-a81081ee88a8dcbb",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.3** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Use the `summarize` and `quantile` functions to calculate a 90% confidence interval for the median. Use the 0.05 and 0.95 quantiles (i.e., the 5th and 95th percentiles) for the lower and upper bounds of the interval, respectively. Name the column containing the lower bound of the interval `ci_lower` and the upper bound `ci_upper`.\n",
+ "\n",
+ "_Assign your data frame to an object called `steam_ci`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "68bc9f6a4e84a69245fb4582e1b86047",
+ "grade": false,
+ "grade_id": "cell-32b6987424efb99e",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(steam_ci)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "2337cba0041021ee6be2b882216291cc",
+ "grade": true,
+ "grade_id": "cell-d5e684abafd5de61",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.3()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "3c7b1ef5c45106ddfa3c376c43ddc3bb",
+ "grade": false,
+ "grade_id": "cell-83b27ff0876de4d5",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "### The `infer` package workflow for bootstrapping (and calculating confidence intervals)\n",
+ "\n",
+ "As you may have seen in [ModernDive](https://moderndive.com/8-confidence-intervals.html#infer-workflow), there is an alternative workflow for generating bootstrap distributions and calculating confidence intervals. The benefits are outlined in [Section 8.4.2](https://moderndive.com/8-confidence-intervals.html#infer-workflow). However, to summarize, the main benefit is that it will allow us to use similar code for inference methods that we will learn about later on, so we can compare and transition from one method to another much more easily. The general workflow for bootstrapping with the `infer` package is summarized in the chart below:\n",
+ "\n",
+ "\n",
+ "
+ "\n",
+ "Given a sample, in the `specify` step, you \"specify\" the variable in the sample that you are interested in. With `generate`, you \"generate\" the bootstrap samples (like using `rep_sample_n`). Then, with `calculate`, you \"calculate\" the statistic you are interested in for each re-sample (like using `group_by(replicate)` and then `summarize`).\n",
+ "\n",
+ "Finally, you can use `get_confidence_interval` to calculate a confidence interval using our bootstrap distribution and quantiles (like using the `summarize` and `quantile` functions).\n",
+ "\n",
+ "Let's re-do **question 1.2** and **question 1.3** using this new workflow for practice."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "79309a4db19c5fb687d20ba011244e74",
+ "grade": false,
+ "grade_id": "cell-d38884f4b37c73d0",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.2 (with the `infer` workflow!)** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Using the `infer` workflow, take 1500 bootstrap samples from `steam_sample` and calculate the **median** of each sample. Use the same seed as you did previously (9844) and the scaffolding provided below as a guide:\n",
+ "\n",
+ "```r\n",
+ "steam_bootstrapped2 <- ... %>% \n",
+ " specify(response = ...) %>% \n",
+ " generate(type = \"bootstrap\", reps = ...) %>% \n",
+ " calculate(stat = \"...\")\n",
+ "```\n",
+ "\n",
+ "_Assign your data frame to an object called `steam_bootstrapped2`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "5ed3133010cfb9ea0197b258fc04a5b1",
+ "grade": false,
+ "grade_id": "cell-1f0df6bdbb62604c",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(9844) # DO NOT CHANGE\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(steam_bootstrapped2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "8b1e65ce3b046d0057a4ab82669b0f72",
+ "grade": true,
+ "grade_id": "cell-b45446b574e94ceb",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.2_infer()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "de8940b2aa2ca55af5ce334a9f9db0c1",
+ "grade": false,
+ "grade_id": "cell-2d3f19165602c754",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.3 (with the `infer` workflow)** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Use the `get_confidence_interval` function with the bootstrap distribution you just generated (`steam_bootstrapped2`) to calculate a 90% confidence interval for the median. Use the scaffolding provided below as a guide:\n",
+ "\n",
+ "```r\n",
+ "steam_ci2 <- steam_bootstrapped2 %>% \n",
+ " get_confidence_interval(level = ..., type = \"percentile\")\n",
+ "\n",
+ "```\n",
+ "\n",
+ "**Note:** you can also use the function `get_ci`, which is the same as the function `get_confidence_interval`, but it is much more concise. Try replacing `get_confidence_interval` with `get_ci`, and the result will be the same!\n",
+ "\n",
+ "_Assign your data frame to an object called `steam_ci2`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "fb1acd4a13b4c79c4b8c332806278127",
+ "grade": false,
+ "grade_id": "cell-236e2d720af60c9f",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "steam_ci2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "057d0b52f21247b5489abcd90fa05b59",
+ "grade": true,
+ "grade_id": "cell-bcb118f1c903bc0c",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.3_infer()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "73b102e923f60fed1360a9ad31944c2b",
+ "grade": false,
+ "grade_id": "cell-787bd139839ff9f2",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.4**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Visualize the confidence interval (`steam_ci2`) with two `geom_vline` layers on top of the bootstrap distribution (`steam_bootstrapped2`) using `geom_histogram` with bin widths of 5. Use the scaffolding provided below as a guide:\n",
+ "\n",
+ "```r\n",
+ "steam_ci_plot <- steam_bootstrapped2 %>% \n",
+ " ggplot(aes(x = ...)) +\n",
+ " ...(binwidth = ...) +\n",
+ " ...(xintercept = steam_ci2[[1]]) +\n",
+ " ...(xintercept = ...)\n",
+ "```\n",
+ "\n",
+ "_Assign your plot to an object called `steam_ci_plot`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "f72b52cd3cc106700ac06b8d1cfc2ca2",
+ "grade": false,
+ "grade_id": "cell-ed597d1944e8d0f4",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "steam_ci_plot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "ac084109fb348bfd0d2c07ea1019cf08",
+ "grade": true,
+ "grade_id": "cell-33d5d981468b489b",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.4()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "706486984cdbda00167d37cf4335da02",
+ "grade": false,
+ "grade_id": "cell-ae991fa30d049a8c",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.5** \n",
+ "<br> {points: 3}\n",
+ "\n",
+ "Consider the effectiveness of the plot above. If you think the plot is effective, list **at least** three reasons why. Otherwise, list **at least** three things that you would change about the plot to make it more effective."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "b2e5749eaf38060acb4fac0382bd343a",
+ "grade": true,
+ "grade_id": "cell-d8bb30be9c34acae",
+ "locked": false,
+ "points": 3,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "source": [
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "83657a8daeca2e0e3765e36459c0e603",
+ "grade": false,
+ "grade_id": "cell-671f9c9171d201f1",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.6** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Calculate the median of the population `steam_pop`.\n",
+ "\n",
+ "_Assign your answer to an object called `steam_median`. Your answer should be a single number._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "5e9c3809092194522cbdeba00bb96e24",
+ "grade": false,
+ "grade_id": "cell-bf6e8d2bad6dbfa0",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "steam_median"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "435b0750da8f883296f6f59acec10960",
+ "grade": true,
+ "grade_id": "cell-54bb7b760cdd994e",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.6()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "15435aaae46548ac208c0d43e9972944",
+ "grade": false,
+ "grade_id": "cell-479025f4f0b088ae",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.7**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "The confidence interval `steam_ci` captures the parameter of interest.\n",
+ "\n",
+ "_Assign your answer to an object called `answer1.7`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "082c06c4e4bbf9245a327a68dd6e658d",
+ "grade": false,
+ "grade_id": "cell-667d895a7bdce38f",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.7 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "b1c28b66220bc6de62706de2b54fa689",
+ "grade": true,
+ "grade_id": "cell-d4ea9b654f5ac678",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.7()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "e417622f4ce890f4c4393df0a309d421",
+ "grade": false,
+ "grade_id": "cell-ad40e9e61ba37ed6",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.8**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Suppose you repeated the process above and took 100 more samples and calculated a 90% confidence interval for each sample. How many of the 100 intervals would you expect to capture the true median of the population?\n",
+ "\n",
+ "_Assign your answer to an object called `answer1.8`. Your answer should be a single integer._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "9990fffac2dfd19de46e1db71c04b23f",
+ "grade": false,
+ "grade_id": "cell-20d3a4e79a063c02",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.8 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "6c4415d8a5aec418f9107d3fb0d5555b",
+ "grade": true,
+ "grade_id": "cell-8e27b4947d4866fb",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Here we check to see if you have given your answer the correct object name\n",
+ "# and if your answer is plausible. However, all other tests have been hidden\n",
+ "# so you can practice deciding when you have the correct answer.\n",
+ "test_that('Did not assign answer to an object called \"answer1.8\"', {\n",
+ " expect_true(exists(\"answer1.8\"))\n",
+ "})\n",
+ "answer_as_numeric <- as.numeric(answer1.8)\n",
+ "test_that(\"Solution should be a number\", {\n",
+ " expect_false(is.na(answer_as_numeric))\n",
+ "})\n",
+ "test_that(\"Solution should be an integer\", {\n",
+ " expect_true(answer_as_numeric %% 1 == 0)\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "579395940a42ea27afeed737af0141df",
+ "grade": false,
+ "grade_id": "cell-90ccaf35bb6ea089",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 2. Breast Cancer Diagnosis\n",
+ "\n",
+ "In this section, we'll be working with a sample of characteristics of the cell nuclei of various breast masses. The data originates from the University of Wisconsin Hospital, where a physician named Dr. William H. Wolberg obtained samples of breast lumps (or tumours) using a fine needle aspiration (FNA) biopsy. Images of the samples were digitized to compute the characteristics of the nuclei that you can find in the sample to work towards Dr. Wolberg's original goal of diagnosing new tumours mathematically using only a single FNA [(he was quite successful!)](https://www.pnas.org/content/87/23/9193.short). In the sample, the nuclei characteristics have been paired with the ultimate diagnosis of the mass (benign or malignant).\n",
+ "\n",
+ "\n",
+ "A screenshot from Xcyt, a program that was developed by Dr. Wolberg for breast mass diagnoses using these data. <br> Image from pages.cs.wisc.edu\n",
+ "\n",
+ "This sample is located in the `datateachr` package, and is named `cancer_sample`. We are interested in estimating the **proportion of the patients from the population whose breast masses are malignant** (`diagnosis == \"M\"`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "d0fa0d8449308f971963465b6d98c30c",
+ "grade": false,
+ "grade_id": "cell-97d69e73d0674bea",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Run this cell before continuing.\n",
+ "head(cancer_sample)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "e9ba371b2fa8d81b257c2eb47df58499",
+ "grade": false,
+ "grade_id": "cell-40898ae24f137226",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.0**\n",
+ "<br>{points: 3}\n",
+ "\n",
+ "Describe the population from which the sample `cancer_sample` was drawn.\n",
+ "\n",
+ "**Note:** this question has a fairly wide range of acceptable answers!"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "758c3dca7a8f12f784480e93cc5f7768",
+ "grade": true,
+ "grade_id": "cell-c8e6f6b5bbccd05b",
+ "locked": false,
+ "points": 3,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "source": [
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "176c8c20ef7f72bebc5fb474dcae51cc",
+ "grade": false,
+ "grade_id": "cell-26332d818a4387fa",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.1**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Use the `infer` package workflow to calculate an 80% confidence interval for the proportion with malignant breast masses (`diagnosis == \"M\"`), using bootstrapping with 1000 replicates. Set your seed to 8943. Your final dataframe should have a single row and two columns named `lower_ci` and `upper_ci`.\n",
+ "\n",
+ "**Hint:** If you're stuck, don't be afraid to explore the [documentation for the `infer` package](https://cran.r-project.org/web/packages/infer/infer.pdf), peek at [Section 8.5 of ModernDive](https://moderndive.com/8-confidence-intervals.html#one-prop-ci), or ask someone for help! You can use your code from the previous section as a blueprint.\n",
+ "\n",
+ "_Assign your data frame to an object called `cancer_ci`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "58015b6a9d8a9e6e428715fbcd47c743",
+ "grade": false,
+ "grade_id": "cell-b3208e919708e3a6",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(8943) # DO NOT CHANGE!\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "cancer_ci"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "d81ba5294bb7787e00ad3ad3c806239b",
+ "grade": true,
+ "grade_id": "cell-a48f1ab67ab90f8f",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.1()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "1eff2d367e1f321f38171a9c0bc2fce8",
+ "grade": false,
+ "grade_id": "cell-c136664317b779e9",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.2**\n",
+ "<br>{points: 3}\n",
+ "\n",
+ "Does the confidence interval capture the population parameter we are interested in? If there is no way to determine this for certain, explain why that is the case."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "0dc8ab43fb616518a71a405b362d4869",
+ "grade": true,
+ "grade_id": "cell-dace4e4a65f80abb",
+ "locked": false,
+ "points": 3,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "source": [
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "1c509c810af1159e3a2a3d46f89b41b1",
+ "grade": false,
+ "grade_id": "cell-439bd5e9841feb5b",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.3**\n",
+ "<br>{points: 3}\n",
+ "\n",
+ "In 1-2 sentences, explain one way you can interpret the confidence interval you calculated above (`cancer_ci`)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "47c8d031c2acec306273fc5d57fa2b61",
+ "grade": true,
+ "grade_id": "cell-7d0ade0ba996d669",
+ "locked": false,
+ "points": 3,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "source": [
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "17ef2f8a26ea5a754f79a87d1748a67d",
+ "grade": false,
+ "grade_id": "cell-e62dad9584edaeb3",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.4**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Suppose you calculated another 80% confidence interval for the population proportion, but with a sample that was 5 times larger than `cancer_sample`. How would you expect this second interval to compare to the first confidence interval you calculated above (`cancer_ci`)?\n",
+ "\n",
+ "A. The second confidence interval would likely be narrower than the first.\n",
+ "\n",
+ "B. The second confidence interval would likely be about the same width as the first.\n",
+ "\n",
+ "C. The second confidence interval would likely be wider than the first.\n",
+ "\n",
+ "D. There is no way to tell how the second interval would compare to the first.\n",
+ "\n",
+ "_Assign your answer to an object called `answer2.4`. Your answer should be a single character surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "ed6880dc41e3c65b21111c33e18ba7a4",
+ "grade": false,
+ "grade_id": "cell-f8ebc2b8a4e59c38",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer2.4 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "9f3d9b7eca19aa91074726481b195095",
+ "grade": true,
+ "grade_id": "cell-1c527e7a33c44b58",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Here we check to see if you have given your answer the correct object name\n",
+ "# and if your answer is plausible. However, all other tests have been hidden\n",
+ "# so you can practice deciding when you have the correct answer.\n",
+ "test_that('Did not assign answer to an object called \"answer2.4\"', {\n",
+ " expect_true(exists(\"answer2.4\"))\n",
+ "})\n",
+ "test_that('Solution should be a single character (\"A\", \"B\", \"C\", or \"D\")', {\n",
+ " expect_match(answer2.4, \"a|b|c|d\", ignore.case = TRUE)\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "59c10a1e2e63fdc78f6ebcdcb371813e",
+ "grade": false,
+ "grade_id": "cell-ede7d2d839cfd7d4",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 3. Bow River at Banff: 100-Year Flood\n",
+ "\n",
+ "Sample quantiles, like the other statistics we have introduced such as the sample mean and standard deviation, can also be considered point estimates of a population parameter. That is, quantiles can be used to describe the distribution of a quantitative variable of a population; each distribution has a unique 0.4 quantile, 0.93 quantile, and so on. One common usage of quantiles is to estimate N-year floods. An N-year flood is defined as the smallest possible severity for a flood that we **expect** to occur once every N years (but could occur more or less than once in any given N-year span). To calculate a point estimate of the N-year flood, one must find a quantile such that $\\frac{1}{N}$ of the known observations fall above it. As with the other population parameters we have mentioned, we can also calculate a **confidence interval** for this parameter, which is what we will be doing in this section.\n",
+ "\n",
+ "\n",
+ "In 2013, a combination of factors led to the Bow River reaching very high flow rates (466 ${\\text{m}^3}/{\\text{s}}$ near Banff), which contributed to extreme flooding throughout Alberta, the Canadian province that lies directly east of British Columbia. One photograph of the event is shown above. For many years the Government of Canada has been collecting hydrometric data at many different stations for several rivers, including the Bow River. So, what's the least severe flood due to high Bow River flow rates that Albertans can expect every 100 years? To answer this question, our goal is to calculate a **95% confidence interval for the 100-year flood** (the $1 - \\frac{1}{100} = 0.99$ quantile) using the **maxima** flow rate data collected at the Banff Bow River station, recorded in ${\\text{m}^3}/{\\text{s}}$. This data is located [here](https://wateroffice.ec.gc.ca/report/historical_e.html?stn=05BB001&dataType=Annual+Extremes&parameterType=Flow&year=2018&mode=Table) but we have already tidied the data for you and included it in the `datateachr` package under the name `flow_sample`.\n",
+ "\n",
+ "`flow_sample` contains information about the maximum _and_ minimum flow rates for each year, so we need to filter the data set for **flow maxima**. We have done this for you in the cell below:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "264de0861282cf155f80a526fe692a06",
+ "grade": false,
+ "grade_id": "cell-bd77bcb71614f9ea",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Run this cell before continuing.\n",
+ "flow_sample <- flow_sample %>% \n",
+ " filter(extreme_type == \"maximum\")\n",
+ "\n",
+ "head(flow_sample)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "0a62b9f437a5fae6ed9b602708345eaf",
+ "grade": false,
+ "grade_id": "cell-899204d1a64f2bfd",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.0**\n",
+ "<br>{points: 3}\n",
+ "\n",
+ "The data contained in `flow_sample` is considered a sample. Describe the population from which the sample was drawn.\n",
+ "\n",
+ "**Note:** there may be more than one solution to this question depending on how one interprets \"population\"."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "50d40f9d93bd6e5036808cde1e064d62",
+ "grade": true,
+ "grade_id": "cell-31c1a7ca3535d01a",
+ "locked": false,
+ "points": 3,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "source": [
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "8cf3095d95de7f39a0c10c4a40daa35c",
+ "grade": false,
+ "grade_id": "cell-c6f632ef113e101b",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.1**\n",
+ "<br>{points: 3}\n",
+ "\n",
+ "Use the `infer` package workflow to generate a bootstrap distribution of the appropriate sample quantiles for `flow_sample` using 3000 bootstrap replicates. Set your seed to 4629. Your final dataframe should have the columns `replicate` and `stat`.\n",
+ "\n",
+ "**IMPORTANT NOTE:** because the `calculate` function does not support using quantiles as the sample statistic, here we have to use the `summarize` function. Thus, in place of the line where you would usually use `calculate` with the `infer` package workflow, you can use the following line:\n",
+ "\n",
+ "```r\n",
+ " ... %>%\n",
+ " summarize(stat = quantile(flow, probs = 0.99))\n",
+ "```\n",
+ "\n",
+ "_Assign your data frame to an object called `flow_bootstrapped`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "b956204cc704c08c7e3a70de5382daf7",
+ "grade": false,
+ "grade_id": "cell-0355d24604d1db72",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(4629) # DO NOT CHANGE!\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(flow_bootstrapped)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "47e10cb1d924af0840b7ffb8ae2bd99c",
+ "grade": true,
+ "grade_id": "cell-6773d36ab69a5944",
+ "locked": true,
+ "points": 3,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Here we check to see if you have given your answer the correct object name\n",
+ "# and if your answer is plausible. However, all other tests have been hidden\n",
+ "# so you can practice deciding when you have the correct answer.\n",
+ "test_that('Did not assign answer to an object called \"flow_bootstrapped\"', {\n",
+ " expect_true(exists(\"flow_bootstrapped\"))\n",
+ "})\n",
+ "test_that(\"Solution should be a data frame\", {\n",
+ " expect_true(\"data.frame\" %in% class(flow_bootstrapped))\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "c63b08b0eaf03c135810545fc1be74f8",
+ "grade": false,
+ "grade_id": "cell-0654e2f42a6a3bc0",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.2**\n",
+ "<br>{points: 3}\n",
+ "\n",
+ "Use the appropriate function from the `infer` package to calculate a 95% confidence interval for the 100-year flood from the bootstrap distribution you just generated. Your final dataframe should have a single row and two columns named `lower_ci` and `upper_ci`.\n",
+ "\n",
+ "_Assign your data frame to an object called `flow_ci`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "183cc268fd2176442c27288ac786313d",
+ "grade": false,
+ "grade_id": "cell-4b586362c8579701",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(flow_ci)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "068c492e6e95a8b4f765c3c8acc46a98",
+ "grade": true,
+ "grade_id": "cell-0452ee4cd567c75f",
+ "locked": true,
+ "points": 3,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Here we check to see if you have given your answer the correct object name\n",
+ "# and if your answer is plausible. However, all other tests have been hidden\n",
+ "# so you can practice deciding when you have the correct answer.\n",
+ "test_that('Did not assign answer to an object called \"flow_ci\"', {\n",
+ " expect_true(exists(\"flow_ci\"))\n",
+ "})\n",
+ "test_that(\"Solution should be a data frame\", {\n",
+ " expect_true(\"data.frame\" %in% class(flow_ci))\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "85ef64954d9286a02648c326d3bb5e51",
+ "grade": false,
+ "grade_id": "cell-38ba2cf206a30f11",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.3**\n",
+ "<br>{points: 3}\n",
+ "\n",
+ "Create an **effective** visualization of the confidence interval `flow_ci` and its associated bootstrap distribution `flow_bootstrapped`. Use whichever layers and arguments you wish. \n",
+ "\n",
+ "**Hint:** if you want some inspiration, check out https://www.r-graph-gallery.com/index.html!\n",
+ "\n",
+ "_Assign your plot to an object called `flow_ci_plot`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "8aef645e0a789179cf12d3127b7f14ca",
+ "grade": true,
+ "grade_id": "cell-b2ea95c2613769e0",
+ "locked": false,
+ "points": 3,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "flow_ci_plot"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "22f33c6ff2c2aadb3c8d44c7e6b363fa",
+ "grade": false,
+ "grade_id": "cell-d8c1d59bb6a4338c",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "_Use your plot above to help you answer the **next 3 questions**._"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "1d27982dacde907bf7d8c0e46c0d276f",
+ "grade": false,
+ "grade_id": "cell-83d097657c5a693e",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.4**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "There is a 50% chance that the true 100-year flood value is captured by the confidence interval `flow_ci`.\n",
+ "\n",
+ "_Assign your answer to an object called `answer3.4`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "5eb1cc95b98041a9d937075d7f7dd5bd",
+ "grade": false,
+ "grade_id": "cell-d9a4eb2e56e2ecbe",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer3.4 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "26733256d3d6d726278d2be915180390",
+ "grade": true,
+ "grade_id": "cell-3eb0b62c08bdfbe9",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Here we check to see if you have given your answer the correct object name\n",
+ "# and if your answer is plausible. However, all other tests have been hidden\n",
+ "# so you can practice deciding when you have the correct answer.\n",
+ "test_that('Did not assign answer to an object called \"answer3.4\"', {\n",
+ " expect_true(exists(\"answer3.4\"))\n",
+ "})\n",
+ "test_that('Answer should be \"true\" or \"false\"', {\n",
+ " expect_match(answer3.4, \"true|false\", ignore.case = TRUE)\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "c0155c66c81fcdbc77819fe910c19e6b",
+ "grade": false,
+ "grade_id": "cell-d03d1d2daa5f780e",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.5**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "We are 95% confident that the true 100-year flood value is captured by the confidence interval `flow_ci`.\n",
+ "\n",
+ "_Assign your answer to an object called `answer3.5`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "ba88552f56b0fc18f9bdccaf0504f839",
+ "grade": false,
+ "grade_id": "cell-f67f3bc2d2cecd09",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer3.5 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "b9c8d89fc1eca272b79a9ea4ae5cd580",
+ "grade": true,
+ "grade_id": "cell-2c25ad9596dad135",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Here we check to see if you have given your answer the correct object name\n",
+ "# and if your answer is plausible. However, all other tests have been hidden\n",
+ "# so you can practice deciding when you have the correct answer.\n",
+ "test_that('Did not assign answer to an object called \"answer3.5\"', {\n",
+ " expect_true(exists(\"answer3.5\"))\n",
+ "})\n",
+ "test_that('Answer should be \"true\" or \"false\"', {\n",
+ " expect_match(answer3.5, \"true|false\", ignore.case = TRUE)\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "7d005791cc14cfdbef2be34df2bb7ee1",
+ "grade": false,
+ "grade_id": "cell-008bc1078d0f8846",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.6**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "If we increased the confidence level of the confidence interval `flow_ci`, we would expect that it would become narrower.\n",
+ "\n",
+ "_Assign your answer to an object called `answer3.6`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "2484a9220502b270eb1ead910fbdb627",
+ "grade": false,
+ "grade_id": "cell-91d3dae1fa9afd59",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer3.6 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "25ca65c55b35cb478c4100f649bf87ea",
+ "grade": true,
+ "grade_id": "cell-8375139f411d5d82",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Here we check to see if you have given your answer the correct object name\n",
+ "# and if your answer is plausible. However, all other tests have been hidden\n",
+ "# so you can practice deciding when you have the correct answer.\n",
+ "test_that('Did not assign answer to an object called \"answer3.6\"', {\n",
+ " expect_true(exists(\"answer3.6\"))\n",
+ "})\n",
+ "test_that('Answer should be \"true\" or \"false\"', {\n",
+ " expect_match(answer3.6, \"true|false\", ignore.case = TRUE)\n",
+ "})"
+ ]
+ }
+ ],
+ "metadata": {
+ "docker": {
+ "latest_image_tag": "v0.4.0"
+ },
+ "jupytext": {
+ "formats": "ipynb,Rmd"
+ },
+ "kernelspec": {
+ "display_name": "R",
+ "language": "R",
+ "name": "ir"
+ },
+ "language_info": {
+ "codemirror_mode": "r",
+ "file_extension": ".r",
+ "mimetype": "text/x-r-source",
+ "name": "R",
+ "pygments_lexer": "r",
+ "version": "4.2.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
diff --git a/materials/worksheet_03/section_3.R b/materials/worksheet_03/section_3.R
new file mode 100644
index 0000000..78ea63c
--- /dev/null
+++ b/materials/worksheet_03/section_3.R
@@ -0,0 +1,13 @@
+# Simulated sample for Section 3: 40 exponential draws (rate 0.25), shifted
+# down by 1 and rounded up to whole years, keeping only values from 0 to 10.
+ubc_sample <- tibble(full_years = ceiling(rexp(n = 40, rate = 0.25) - 1)) %>%
+  filter(full_years >= 0, full_years <= 10)
+# Histogram of the sample with one bar and one axis tick per whole year.
+sample_dist <- ggplot(ubc_sample, aes(x = full_years)) +
+  geom_histogram(binwidth = 1, colour = "white") +
+  labs(title = "Sample Distribution", x = "# Full Years At UBC") +
+  scale_x_continuous(breaks = seq(0, 10, 1))
\ No newline at end of file
diff --git a/materials/worksheet_03/section_4.R b/materials/worksheet_03/section_4.R
new file mode 100644
index 0000000..b754765
--- /dev/null
+++ b/materials/worksheet_03/section_4.R
@@ -0,0 +1,120 @@
+# =========================== SECTION 3 =========================== #
+# Simulated sample: 40 exponential draws (rate 0.25), shifted down by 1,
+# rounded up to whole years, keeping only values from 0 to 10.
+ubc_sample <- tibble(full_years = rexp(n = 40, rate = 0.25) - 1) %>%
+  ceiling() %>%
+  filter(full_years >= 0, full_years <= 10)
+# Histogram of the sample with one bar and one axis tick per whole year.
+sample_dist <- ubc_sample %>%
+  ggplot(aes(x = full_years)) +
+  geom_histogram(binwidth = 1, colour = "white") +
+  ggtitle("Sample Distribution") +
+  xlab("# Full Years At UBC") +
+  scale_x_continuous(breaks = seq(0, 10, 1))
+# =========================== SECTION 4 =========================== #
+# Helper: histogram of the means of 2000 samples of size `sample_size` taken
+# from `population` with rep_sample_n(); the plot title records the size.
+.plot_sampling_dist <- function(population, sample_size) {
+  population %>%
+    rep_sample_n(size = sample_size, reps = 2000) %>%
+    group_by(replicate) %>%
+    summarise(mean_land_value = mean(current_land_value)) %>%
+    ggplot(aes(x = mean_land_value)) +
+    geom_histogram(binwidth = 15000) +
+    xlab("Mean Land Value (CAD)") +
+    ggtitle(paste0("n = ", sample_size))
+}
+# Helper: mean of the sample means stored in a distribution plot's data,
+# rounded to 2 decimal places.
+.dist_mean <- function(dist_plot) {
+  round(mean(dist_plot$data$mean_land_value), 2)
+}
+# Helper: give a distribution plot the shared x-axis (400,000 to 1,200,000,
+# vertical tick labels) and a red vertical line at `centre`.
+.mark_mean <- function(dist_plot, centre) {
+  dist_plot +
+    theme(axis.text.x = element_text(angle = 90)) +
+    scale_x_continuous(breaks = seq(400000, 1200000, 200000),
+                       limits = c(400000, 1200000)) +
+    geom_vline(xintercept = centre, colour = "red")
+}
+# Helper: bold, left-aligned heading drawn on an empty canvas.
+.grid_title <- function(label) {
+  ggdraw() +
+    draw_label(label,
+               fontface = "bold",
+               x = 0,
+               hjust = 0) +
+    theme(plot.margin = margin(0, 0, 0, 7))
+}
+# The samples are random, so the order of these three calls (n = 10, 30, 100)
+# determines how the random-number stream is consumed; keep it fixed.
+sampling_dist_10 <- .plot_sampling_dist(multi_family_strata, 10)
+sampling_dist_30 <- .plot_sampling_dist(multi_family_strata, 30)
+sampling_dist_100 <- .plot_sampling_dist(multi_family_strata, 100)
+# NOTE(review): bootstrap_dist_10/30/100 are not created in this script;
+# presumably they are built in the worksheet before it is run - confirm.
+bootstrap_10_mean <- .dist_mean(bootstrap_dist_10)
+bootstrap_30_mean <- .dist_mean(bootstrap_dist_30)
+bootstrap_100_mean <- .dist_mean(bootstrap_dist_100)
+sampling_10_mean <- .dist_mean(sampling_dist_10)
+sampling_30_mean <- .dist_mean(sampling_dist_30)
+sampling_100_mean <- .dist_mean(sampling_dist_100)
+# Top half of the final figure: the three bootstrap distributions side by side
+# under their heading.
+bootstrap_dist_row <- plot_grid(.mark_mean(bootstrap_dist_10, bootstrap_10_mean),
+                                .mark_mean(bootstrap_dist_30, bootstrap_30_mean),
+                                .mark_mean(bootstrap_dist_100, bootstrap_100_mean),
+                                ncol = 3)
+title <- .grid_title("Bootstrap Sampling Distributions (of Sample Means)")
+bootstrap_dist_grid <- plot_grid(title,
+                                 bootstrap_dist_row,
+                                 ncol = 1,
+                                 rel_heights = c(0.1, 1))
+# Bottom half: the three sampling distributions, laid out the same way.
+sampling_dist_row <- plot_grid(.mark_mean(sampling_dist_10, sampling_10_mean),
+                               .mark_mean(sampling_dist_30, sampling_30_mean),
+                               .mark_mean(sampling_dist_100, sampling_100_mean),
+                               ncol = 3)
+# `title` is deliberately reassigned; it ends up holding the sampling heading.
+title <- .grid_title("Sampling Distributions (of Sample Means)")
+sampling_dist_grid <- plot_grid(title,
+                                sampling_dist_row,
+                                ncol = 1,
+                                rel_heights = c(0.1, 1))
+# Bootstrap grid stacked above the sampling grid for direct comparison.
+all_dist_grid <- plot_grid(bootstrap_dist_grid, sampling_dist_grid, ncol = 1)
diff --git a/materials/worksheet_03/tests_worksheet_03.R b/materials/worksheet_03/tests_worksheet_03.R
new file mode 100644
index 0000000..aea73a1
--- /dev/null
+++ b/materials/worksheet_03/tests_worksheet_03.R
@@ -0,0 +1,593 @@
+# Grades Question 1.0: each of `answer1.0.0` .. `answer1.0.3` must exist, be a
+# single letter naming one of the two options the question offers ("A" or
+# "B", any case), and hash to its expected digest.
+test_1.0 <- function() {
+  test_that('Did not assign answer to an object called "answer1.0.0"', {
+    expect_true(exists("answer1.0.0"))
+  })
+  test_that('Did not assign answer to an object called "answer1.0.1"', {
+    expect_true(exists("answer1.0.1"))
+  })
+  test_that('Did not assign answer to an object called "answer1.0.2"', {
+    expect_true(exists("answer1.0.2"))
+  })
+  test_that('Did not assign answer to an object called "answer1.0.3"', {
+    expect_true(exists("answer1.0.3"))
+  })
+  # The question only lists options A and B, so the format check (and its
+  # message) is limited to those. The pattern is anchored so that a longer
+  # string that merely contains the letter is rejected here.
+  test_that('Solutions should be a single character ("A" or "B")', {
+    expect_match(answer1.0.0, "^[ab]$", ignore.case = TRUE)
+    expect_match(answer1.0.1, "^[ab]$", ignore.case = TRUE)
+    expect_match(answer1.0.2, "^[ab]$", ignore.case = TRUE)
+    expect_match(answer1.0.3, "^[ab]$", ignore.case = TRUE)
+  })
+  test_that("One or more solutions are incorrect", {
+    expect_equal(digest(tolower(answer1.0.0)), "127a2ec00989b9f7faf671ed470be7f8")
+    expect_equal(digest(tolower(answer1.0.1)), "ddf100612805359cd81fdc5ce3b9fbba")
+    expect_equal(digest(tolower(answer1.0.2)), "ddf100612805359cd81fdc5ce3b9fbba")
+    expect_equal(digest(tolower(answer1.0.3)), "127a2ec00989b9f7faf671ed470be7f8")
+  })
+  print("Success!")
+}
+# Grades Question 1.1: `answer1.1` must exist, be "true" or "false" (any
+# case), and hash to the expected digest. One specific wrong answer prints a
+# hint before the final check fails.
+test_1.1 <- function() {
+  test_that('Did not assign answer to an object called "answer1.1"', {
+    expect_true(exists("answer1.1"))
+  })
+  test_that('Solution should be "true" or "false"', {
+    # Anchored: only the bare word is accepted, not text that contains it.
+    expect_match(answer1.1, "^(true|false)$", ignore.case = TRUE)
+  })
+  answer_hash <- digest(tolower(answer1.1))
+  if (answer_hash == "d2a90307aac5ae8d0ef58e2fe730d38b") {
+    print("What is required to generate a bootstrap distribution?")
+  }
+  test_that("Solution is incorrect", {
+    expect_equal(answer_hash, "05ca18b596514af73f6880309a21b5dd")
+  })
+  print("Success!")
+}
+# Grades Question 1.2: `answer1.2` must exist, be "true" or "false" (any
+# case), and hash to the expected digest. One specific wrong answer prints a
+# hint before the final check fails.
+test_1.2 <- function() {
+  test_that('Did not assign answer to an object called "answer1.2"', {
+    expect_true(exists("answer1.2"))
+  })
+  test_that('Solution should be "true" or "false"', {
+    # Anchored: only the bare word is accepted, not text that contains it.
+    expect_match(answer1.2, "^(true|false)$", ignore.case = TRUE)
+  })
+  answer_hash <- digest(tolower(answer1.2))
+  if (answer_hash == "05ca18b596514af73f6880309a21b5dd") {
+    print("What do we need to compute a population parameter? Is it realistic?")
+  }
+  test_that("Solution is incorrect", {
+    expect_equal(answer_hash, "d2a90307aac5ae8d0ef58e2fe730d38b")
+  })
+  print("Success!")
+}
+# Grades Question 2.0: `answer2.0` must exist, be a single letter A-F (any
+# case), and hash to the expected digest.
+test_2.0 <- function() {
+  test_that('Did not assign answer to an object called "answer2.0"', {
+    expect_true(exists("answer2.0"))
+  })
+  test_that('Solution should be a single character ("A", "B", "C", "D", "E", or "F")', {
+    # Anchored so the "single character" requirement is actually enforced.
+    expect_match(answer2.0, "^[a-f]$", ignore.case = TRUE)
+  })
+  test_that("Solution is incorrect", {
+    expect_equal(digest(tolower(answer2.0)), "6e7a8c1c098e8817e3df3fd1b21149d1")
+  })
+  print("Success!")
+}
+# Grades Question 2.1: `answer2.1` must exist, be a single letter A-C (any
+# case), and hash to the expected digest. Each of the two other recognised
+# answers prints its own hint before the final check fails.
+test_2.1 <- function() {
+  test_that('Did not assign answer to an object called "answer2.1"', {
+    expect_true(exists("answer2.1"))
+  })
+  test_that('Solution should be a single character ("A", "B", or "C")', {
+    # Anchored so the "single character" requirement is actually enforced.
+    expect_match(answer2.1, "^[a-c]$", ignore.case = TRUE)
+  })
+  answer_hash <- digest(tolower(answer2.1))
+  if (answer_hash == "ddf100612805359cd81fdc5ce3b9fbba") {
+    print("Is there any characteristic of the sample that guarantees it was taken without replacement?")
+  } else if (answer_hash == "6e7a8c1c098e8817e3df3fd1b21149d1") {
+    print("Is there any scenario where you wouldn't know if the sample was taken with or without replacement?")
+  }
+  test_that("Solution is incorrect", {
+    expect_equal(answer_hash, "127a2ec00989b9f7faf671ed470be7f8")
+  })
+  print("Success!")
+}
+# Grades Question 2.2: `answer2.2` must exist, be a single letter A-C (any
+# case), and hash to the expected digest. Each of the two other recognised
+# answers prints its own hint before the final check fails.
+test_2.2 <- function() {
+  test_that('Did not assign answer to an object called "answer2.2"', {
+    expect_true(exists("answer2.2"))
+  })
+  test_that('Solution should be a single character ("A", "B", or "C")', {
+    # Anchored so the "single character" requirement is actually enforced.
+    expect_match(answer2.2, "^[a-c]$", ignore.case = TRUE)
+  })
+  answer_hash <- digest(tolower(answer2.2))
+  if (answer_hash == "127a2ec00989b9f7faf671ed470be7f8") {
+    print("Is there any characteristic of the sample that guarantees it was taken with replacement?")
+  } else if (answer_hash == "ddf100612805359cd81fdc5ce3b9fbba") {
+    print("Is there any characteristic of the sample that guarantees it was taken without replacement?")
+  }
+  test_that("Solution is incorrect", {
+    expect_equal(answer_hash, "6e7a8c1c098e8817e3df3fd1b21149d1")
+  })
+  print("Success!")
+}
+# Grades Question 2.3: `answer2.3` must exist, be "true" or "false" (any
+# case), and hash to the expected digest. One specific wrong answer prints a
+# hint before the final check fails.
+test_2.3 <- function() {
+  test_that('Did not assign answer to an object called "answer2.3"', {
+    expect_true(exists("answer2.3"))
+  })
+  test_that('Solution should be "true" or "false"', {
+    # Anchored: only the bare word is accepted, not text that contains it.
+    expect_match(answer2.3, "^(true|false)$", ignore.case = TRUE)
+  })
+  answer_hash <- digest(tolower(answer2.3))
+  if (answer_hash == "05ca18b596514af73f6880309a21b5dd") {
+    print("Read carefully!")
+  }
+  test_that("Solution is incorrect", {
+    expect_equal(answer_hash, "d2a90307aac5ae8d0ef58e2fe730d38b")
+  })
+  print("Success!")
+}
+# Grades Question 3.0: `bootstrap_sample` must exist, be a data frame whose
+# columns are exactly `replicate` and `full_years` (in any order), and have
+# the expected row count and `full_years` total (both compared via digests).
+test_3.0 <- function() {
+  test_that('Did not assign answer to an object called "bootstrap_sample"', {
+    expect_true(exists("bootstrap_sample"))
+  })
+  test_that("Solution should be a data frame", {
+    expect_true(inherits(bootstrap_sample, "data.frame"))
+  })
+  expected_colnames <- c("replicate", "full_years")
+  given_colnames <- colnames(bootstrap_sample)
+  test_that("Data frame does not have the correct columns", {
+    # Order-insensitive comparison of the two sets of column names.
+    expect_true(setequal(expected_colnames, given_colnames))
+  })
+  test_that("Data frame does not contain the correct number of rows", {
+    expect_equal(digest(as.integer(nrow(bootstrap_sample))), "20c9a920779e3feca5b4ed6948450f8a")
+  })
+  test_that("Data frame does not contain the correct data", {
+    expect_equal(digest(as.integer(sum(bootstrap_sample$full_years))), "dc1d757c56fd15656b23102dc5f6727c")
+  })
+  print("Success!")
+}
+# Grades Question 3.1: `ubc_sample_mean` must exist, convert to a non-NA
+# number, and (rounded to 3 decimals) hash to the expected digest.
+test_3.1 <- function() {
+  test_that('Did not assign answer to an object called "ubc_sample_mean"', {
+    expect_true(exists("ubc_sample_mean"))
+  })
+  answer_as_numeric <- as.numeric(ubc_sample_mean)
+  test_that("Solution should be a number", {
+    expect_false(is.na(answer_as_numeric))
+  })
+  test_that("Solution is incorrect", {
+    # NOTE(review): this hashes the object as supplied, not
+    # `answer_as_numeric` (test_3.2 hashes the converted value). Left as is
+    # because the stored digest depends on the exact object being hashed.
+    expect_equal(digest(round(ubc_sample_mean, 3)), "baf0148fd27fa92338a8c33829af5128")
+  })
+  print("Success!")
+}
+# Grades Question 3.2: `bootstrap_sample_mean` must exist, convert to a
+# non-NA number, and match the expected value after scaling by 1e9 and
+# truncating to an integer (compared via its digest).
+test_3.2 <- function() {
+  test_that('Did not assign answer to an object called "bootstrap_sample_mean"', {
+    expect_true(exists("bootstrap_sample_mean"))
+  })
+  answer_as_numeric <- as.numeric(bootstrap_sample_mean)
+  test_that("Solution should be a number", {
+    expect_false(is.na(answer_as_numeric))
+  })
+  test_that("Solution is incorrect", {
+    scaled_answer <- as.integer(answer_as_numeric * 1000000000)
+    expect_equal(digest(scaled_answer), "328b8c66e7f4b6511242246eab9bd08b")
+  })
+  print("Success!")
+}
+# Grades Question 3.3: `answer3.3` must exist, be "true" or "false" (any
+# case), and hash to the expected digest. One specific wrong answer prints a
+# hint before the final check fails.
+test_3.3 <- function() {
+  test_that('Did not assign answer to an object called "answer3.3"', {
+    expect_true(exists("answer3.3"))
+  })
+  test_that('Solution should be "true" or "false"', {
+    # Anchored: only the bare word is accepted, not text that contains it.
+    expect_match(answer3.3, "^(true|false)$", ignore.case = TRUE)
+  })
+  answer_hash <- digest(tolower(answer3.3))
+  if (answer_hash == "d2a90307aac5ae8d0ef58e2fe730d38b") {
+    print("Would the mean change if we sampled without replacement?")
+  }
+  test_that("Solution is incorrect", {
+    expect_equal(answer_hash, "05ca18b596514af73f6880309a21b5dd")
+  })
+  print("Success!")
+}
+# Grades Question 3.4: `bootstrap_sample_dist` must exist and be a plot whose
+# first layer is a GeomBar with `full_years` on x, the expected bin width and
+# data (compared via digests), a non-default x label, and a title.
+test_3.4 <- function() {
+  test_that('Did not assign answer to an object called "bootstrap_sample_dist"', {
+    expect_true(exists("bootstrap_sample_dist"))
+  })
+  first_layer <- bootstrap_sample_dist$layers[[1]]
+  # Aesthetics may be set on the layer or on the plot; layer entries come first.
+  properties <- c(first_layer$mapping, bootstrap_sample_dist$mapping)
+  test_that("Plot should have full_years on the x-axis", {
+    expect_true("full_years" == rlang::get_expr(properties$x))
+  })
+  test_that("Plot is not the correct type", {
+    expect_true(inherits(first_layer$geom, "GeomBar"))
+  })
+  test_that("Plot does not have the correct bin width", {
+    binwidth <- mget("stat_params", first_layer)[["stat_params"]][["binwidth"]]
+    expect_equal(
+      digest(as.integer(binwidth)),
+      "4b5630ee914e848e8d07221556b0a2fb"
+    )
+  })
+  test_that("Plot does not use the correct data", {
+    expect_equal(digest(nrow(bootstrap_sample_dist$data)), "20c9a920779e3feca5b4ed6948450f8a")
+    expect_equal(digest(as.integer(sum(bootstrap_sample_dist$data$full_years))), "dc1d757c56fd15656b23102dc5f6727c")
+  })
+  test_that("x-axis label should be descriptive and human readable", {
+    expect_false(bootstrap_sample_dist$labels$x == "full_years")
+  })
+  test_that("Plot should have a title", {
+    expect_true("title" %in% names(bootstrap_sample_dist$labels))
+  })
+  print("Success!")
+}
+# Grades Question 3.5: `answer3.5` must exist, be "true" or "false" (any
+# case), and hash to the expected digest. One specific wrong answer prints a
+# hint before the final check fails.
+test_3.5 <- function() {
+  test_that('Did not assign answer to an object called "answer3.5"', {
+    expect_true(exists("answer3.5"))
+  })
+  test_that('Solution should be "true" or "false"', {
+    # Anchored: only the bare word is accepted, not text that contains it.
+    expect_match(answer3.5, "^(true|false)$", ignore.case = TRUE)
+  })
+  answer_hash <- digest(tolower(answer3.5))
+  if (answer_hash == "05ca18b596514af73f6880309a21b5dd") {
+    print("Think through the process of sampling with replacement carefully")
+  }
+  test_that("Solution is incorrect", {
+    expect_equal(answer_hash, "d2a90307aac5ae8d0ef58e2fe730d38b")
+  })
+  print("Success!")
+}
+# Grades Question 4.0: `sample_10` must exist, be a data frame whose only
+# column is `current_land_value`, and have the expected row count and column
+# total (both compared via digests).
+test_4.0 <- function() {
+  test_that('Did not assign answer to an object called "sample_10"', {
+    expect_true(exists("sample_10"))
+  })
+  test_that("Solution should be a data frame", {
+    expect_true(inherits(sample_10, "data.frame"))
+  })
+  expected_colnames <- c("current_land_value")
+  given_colnames <- colnames(sample_10)
+  test_that("Data frame does not have the correct columns", {
+    # Order-insensitive comparison of the two sets of column names.
+    expect_true(setequal(expected_colnames, given_colnames))
+  })
+  test_that("Data frame does not contain the correct number of rows", {
+    expect_equal(digest(as.integer(nrow(sample_10))), "71db8a6cad03244e6e50f0ad8bc95a65")
+  })
+  test_that("Data frame does not contain the correct data", {
+    expect_equal(digest(as.integer(sum(sample_10$current_land_value))), "3f92c7fab3c6a839e5ab5aacda979f3a")
+  })
+  print("Success!")
+}
+# Grades Question 4.1: `resampled_means_10` must exist, be a data frame whose
+# columns are exactly `replicate` and `mean_land_value` (in any order), and
+# have the expected row count and `mean_land_value` total (via digests).
+test_4.1 <- function() {
+  test_that('Did not assign answer to an object called "resampled_means_10"', {
+    expect_true(exists("resampled_means_10"))
+  })
+  test_that("Solution should be a data frame", {
+    expect_true(inherits(resampled_means_10, "data.frame"))
+  })
+  expected_colnames <- c("replicate", "mean_land_value")
+  given_colnames <- colnames(resampled_means_10)
+  test_that("Data frame does not have the correct columns", {
+    # Order-insensitive comparison of the two sets of column names.
+    expect_true(setequal(expected_colnames, given_colnames))
+  })
+  test_that("Data frame does not contain the correct number of rows", {
+    expect_equal(digest(as.integer(nrow(resampled_means_10))), "6e96c307060fba1b1d3a36d2410fd595")
+  })
+  test_that("Data frame does not contain the correct data", {
+    expect_equal(digest(as.integer(sum(resampled_means_10$mean_land_value))), "1de19f20fb7008fb8a562077cbcc2cf0")
+  })
+  print("Success!")
+}
+# Grades Question 4.2: `bootstrap_dist_10` must exist and be a plot whose
+# first layer is a GeomBar with `mean_land_value` on x, the expected bin
+# width and data (compared via digests), a non-default x label, and a title.
+test_4.2 <- function() {
+  test_that('Did not assign answer to an object called "bootstrap_dist_10"', {
+    expect_true(exists("bootstrap_dist_10"))
+  })
+  first_layer <- bootstrap_dist_10$layers[[1]]
+  # Aesthetics may be set on the layer or on the plot; layer entries come first.
+  properties <- c(first_layer$mapping, bootstrap_dist_10$mapping)
+  test_that("Plot should have mean_land_value on the x-axis", {
+    expect_true("mean_land_value" == rlang::get_expr(properties$x))
+  })
+  test_that("Plot is not the correct type", {
+    expect_true(inherits(first_layer$geom, "GeomBar"))
+  })
+  test_that("Plot does not have the correct bin width", {
+    binwidth <- mget("stat_params", first_layer)[["stat_params"]][["binwidth"]]
+    expect_equal(
+      digest(as.integer(binwidth)),
+      "829aba66b0d64feac09b067c4cce133c"
+    )
+  })
+  test_that("Plot does not use the correct data", {
+    expect_equal(digest(nrow(bootstrap_dist_10$data)), "6e96c307060fba1b1d3a36d2410fd595")
+    # NOTE(review): hashes the whole truncated column (the round() after
+    # as.integer() has nothing left to round), whereas test_4.5 / test_4.7
+    # hash the integer sum. Kept verbatim because the stored digest depends
+    # on this exact expression.
+    expect_equal(digest(round(as.integer(bootstrap_dist_10$data$mean_land_value))), "60b3274d21a3e83b615c7731adfd0d79")
+  })
+  test_that("x-axis label should be descriptive and human readable", {
+    expect_false(bootstrap_dist_10$labels$x == "mean_land_value")
+  })
+  test_that("Plot should have a title", {
+    expect_true("title" %in% names(bootstrap_dist_10$labels))
+  })
+  print("Success!")
+}
+# Grades Question 4.3: `answer4.3` must exist, be "true" or "false" (any
+# case), and hash to the expected digest. One specific wrong answer prints a
+# hint before the final check fails.
+test_4.3 <- function() {
+  test_that('Did not assign answer to an object called "answer4.3"', {
+    expect_true(exists("answer4.3"))
+  })
+  test_that('Solution should be "true" or "false"', {
+    # Anchored: only the bare word is accepted, not text that contains it.
+    expect_match(answer4.3, "^(true|false)$", ignore.case = TRUE)
+  })
+  answer_hash <- digest(tolower(answer4.3))
+  if (answer_hash == "05ca18b596514af73f6880309a21b5dd") {
+    print("Note: the distribution above is a BOOTSTRAP distribution")
+  }
+  test_that("Solution is incorrect", {
+    expect_equal(answer_hash, "d2a90307aac5ae8d0ef58e2fe730d38b")
+  })
+  print("Success!")
+}
+# Grades Question 4.4: `sample_30` must exist, be a data frame whose only
+# column is `current_land_value`, and have the expected row count and column
+# total (both compared via digests).
+test_4.4 <- function() {
+  test_that('Did not assign answer to an object called "sample_30"', {
+    expect_true(exists("sample_30"))
+  })
+  test_that("Solution should be a data frame", {
+    expect_true(inherits(sample_30, "data.frame"))
+  })
+  expected_colnames <- c("current_land_value")
+  given_colnames <- colnames(sample_30)
+  test_that("Data frame does not have the correct columns", {
+    # Order-insensitive comparison of the two sets of column names.
+    expect_true(setequal(expected_colnames, given_colnames))
+  })
+  test_that("Data frame does not contain the correct number of rows", {
+    expect_equal(digest(as.integer(nrow(sample_30))), "7d2842cab7725fd8f382293e410d42b2")
+  })
+  test_that("Data frame does not contain the correct data", {
+    expect_equal(digest(as.integer(sum(sample_30$current_land_value))), "20a5caed5f997ae762307c6a6e98c276")
+  })
+  print("Success!")
+}
+# Grades Question 4.5: `bootstrap_dist_30` must exist and be a plot whose
+# first layer is a GeomBar with `mean_land_value` on x, the expected bin
+# width and data (compared via digests), a non-default x label, and a title.
+test_4.5 <- function() {
+  test_that('Did not assign answer to an object called "bootstrap_dist_30"', {
+    expect_true(exists("bootstrap_dist_30"))
+  })
+  first_layer <- bootstrap_dist_30$layers[[1]]
+  # Aesthetics may be set on the layer or on the plot; layer entries come first.
+  properties <- c(first_layer$mapping, bootstrap_dist_30$mapping)
+  test_that("Plot should have mean_land_value on the x-axis", {
+    expect_true("mean_land_value" == rlang::get_expr(properties$x))
+  })
+  test_that("Plot is not the correct type", {
+    expect_true(inherits(first_layer$geom, "GeomBar"))
+  })
+  test_that("Plot does not have the correct bin width", {
+    binwidth <- mget("stat_params", first_layer)[["stat_params"]][["binwidth"]]
+    expect_equal(
+      digest(as.integer(binwidth)),
+      "829aba66b0d64feac09b067c4cce133c"
+    )
+  })
+  test_that("Plot does not use the correct data", {
+    expect_equal(digest(nrow(bootstrap_dist_30$data)), "6e96c307060fba1b1d3a36d2410fd595")
+    expect_equal(digest(as.integer(sum(bootstrap_dist_30$data$mean_land_value))), "a6d96db9e40e16a8b310b518ba018d08")
+  })
+  test_that("x-axis label should be descriptive and human readable", {
+    expect_false(bootstrap_dist_30$labels$x == "mean_land_value")
+  })
+  test_that("Plot should have a title", {
+    expect_true("title" %in% names(bootstrap_dist_30$labels))
+  })
+  print("Success!")
+}
+# Grades Question 4.6: `sample_100` must exist, be a data frame whose only
+# column is `current_land_value`, and have the expected row count and column
+# total (both compared via digests).
+test_4.6 <- function() {
+  test_that('Did not assign answer to an object called "sample_100"', {
+    expect_true(exists("sample_100"))
+  })
+  test_that("Solution should be a data frame", {
+    expect_true(inherits(sample_100, "data.frame"))
+  })
+  expected_colnames <- c("current_land_value")
+  given_colnames <- colnames(sample_100)
+  test_that("Data frame does not have the correct columns", {
+    # Order-insensitive comparison of the two sets of column names.
+    expect_true(setequal(expected_colnames, given_colnames))
+  })
+  test_that("Data frame does not contain the correct number of rows", {
+    expect_equal(digest(as.integer(nrow(sample_100))), "5d6e7fe43b3b73e5fd2961d5162486fa")
+  })
+  test_that("Data frame does not contain the correct data", {
+    expect_equal(digest(as.integer(sum(sample_100$current_land_value))), "225421bda4b1e262315936431acd58aa")
+  })
+  print("Success!")
+}
+# Grades Question 4.7: `bootstrap_dist_100` must exist and be a plot whose
+# first layer is a GeomBar with `mean_land_value` on x, the expected bin
+# width and data (compared via digests), a non-default x label, and a title.
+test_4.7 <- function() {
+  test_that('Did not assign answer to an object called "bootstrap_dist_100"', {
+    expect_true(exists("bootstrap_dist_100"))
+  })
+  first_layer <- bootstrap_dist_100$layers[[1]]
+  # Aesthetics may be set on the layer or on the plot; layer entries come first.
+  properties <- c(first_layer$mapping, bootstrap_dist_100$mapping)
+  test_that("Plot should have mean_land_value on the x-axis", {
+    expect_true("mean_land_value" == rlang::get_expr(properties$x))
+  })
+  test_that("Plot is not the correct type", {
+    expect_true(inherits(first_layer$geom, "GeomBar"))
+  })
+  test_that("Plot does not have the correct bin width", {
+    binwidth <- mget("stat_params", first_layer)[["stat_params"]][["binwidth"]]
+    expect_equal(
+      digest(as.integer(binwidth)),
+      "829aba66b0d64feac09b067c4cce133c"
+    )
+  })
+  test_that("Plot does not use the correct data", {
+    expect_equal(digest(nrow(bootstrap_dist_100$data)), "6e96c307060fba1b1d3a36d2410fd595")
+    expect_equal(digest(as.integer(sum(bootstrap_dist_100$data$mean_land_value))), "518a8fc51da6a689fee9e954795fb1d2")
+  })
+  test_that("x-axis label should be descriptive and human readable", {
+    expect_false(bootstrap_dist_100$labels$x == "mean_land_value")
+  })
+  test_that("Plot should have a title", {
+    expect_true("title" %in% names(bootstrap_dist_100$labels))
+  })
+  print("Success!")
+}
+# Grades Question 4.8: `answer4.8` must exist, be a single letter A-E (any
+# case), and hash to the expected digest. Each of the four other recognised
+# answers prints its own hint before the final check fails.
+test_4.8 <- function() {
+  test_that('Did not assign answer to an object called "answer4.8"', {
+    expect_true(exists("answer4.8"))
+  })
+  test_that('Solution should be a single character ("A", "B", "C", "D", or "E")', {
+    # Anchored so the "single character" requirement is actually enforced.
+    expect_match(answer4.8, "^[a-e]$", ignore.case = TRUE)
+  })
+  answer_hash <- digest(tolower(answer4.8))
+  if (answer_hash == "127a2ec00989b9f7faf671ed470be7f8") {
+    print("Compare the red vertical lines for each bootstrap distribution and its corresponding sampling distribution")
+  } else if (answer_hash == "ddf100612805359cd81fdc5ce3b9fbba") {
+    print("Are the widths truly identical?")
+  } else if (answer_hash == "6e7a8c1c098e8817e3df3fd1b21149d1") {
+    print("In other words, do the plots become more narrow as the sample size increases?")
+  } else if (answer_hash == "d110f00cfb1b248e835137025804a23b") {
+    print("The sampling & bootstrap distributions for larger sample sizes may not look EXACTLY symmetrical and bell-shaped, but consider the OVERALL trend as sample size increases")
+  }
+  test_that("Solution is incorrect", {
+    expect_equal(answer_hash, "93a9078c6326f37b481d3e99b60ad987")
+  })
+  print("Success!")
+}
+# Grades Question 4.9: `answer4.9` must exist, be "true" or "false" (any
+# case), and hash to the expected digest. One specific wrong answer prints a
+# hint before the final check fails.
+test_4.9 <- function() {
+  test_that('Did not assign answer to an object called "answer4.9"', {
+    expect_true(exists("answer4.9"))
+  })
+  test_that('Solution should be "true" or "false"', {
+    # Anchored: only the bare word is accepted, not text that contains it.
+    expect_match(answer4.9, "^(true|false)$", ignore.case = TRUE)
+  })
+  answer_hash <- digest(tolower(answer4.9))
+  if (answer_hash == "d2a90307aac5ae8d0ef58e2fe730d38b") {
+    print("Do the bootstrap distributions above look at least SOMEWHAT similar to their corresponding sampling distributions?")
+  }
+  test_that("Solution is incorrect", {
+    expect_equal(answer_hash, "05ca18b596514af73f6880309a21b5dd")
+  })
+  print("Success!")
+}
diff --git a/materials/worksheet_03/worksheet_03.ipynb b/materials/worksheet_03/worksheet_03.ipynb
new file mode 100644
index 0000000..f09092e
--- /dev/null
+++ b/materials/worksheet_03/worksheet_03.ipynb
@@ -0,0 +1,2169 @@
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "e74aceff7ff293312405c2ef890ab6ab",
+ "grade": false,
+ "grade_id": "cell-e883b594bd088087",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "# Worksheet 3: Bootstrapping and its Relationship to the Sampling Distribution"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "049bda460bad4f83117d546bbe10321b",
+ "grade": false,
+ "grade_id": "cell-9460eb96253e3080",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "### Lecture and Tutorial Learning Goals\n",
+ "After completing this week's lecture and tutorial work, you will be able to:\n",
+ "1. Explain why we don’t know/have a sampling distribution in practice/real life.\n",
+ "2. Define bootstrapping.\n",
+ "3. Write a computer script to create a bootstrap distribution to approximate a sampling distribution.\n",
+ "4. Contrast a bootstrap sampling distribution with a sampling distribution obtained using multiple samples."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "8e4ea5f6af2c4baffbf1f0d12d6eda7e",
+ "grade": false,
+ "grade_id": "cell-9e25e98e883dccff",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Run this cell before continuing.\n",
+ "library(cowplot)\n",
+ "library(digest)\n",
+ "library(gridExtra)\n",
+ "library(infer)\n",
+ "library(repr)\n",
+ "library(taxyvr)\n",
+ "library(tidyverse)\n",
+ "source(\"tests_worksheet_03.R\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "dd2381152ee03caebb7e99863d972111",
+ "grade": false,
+ "grade_id": "cell-2ab7d5e0c77883ab",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 1. Warm Up Questions"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "81d9e7a4fe5388d982aa72dff10517fd",
+ "grade": false,
+ "grade_id": "cell-c5774e72ef309fbb",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.0**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Below is a table that lists out various types of distributions that require sampling to generate. The right column of the table is empty but should describe the location(s) from which one **must** sample (at some point) to produce the corresponding distribution. Fill in the blanks of the table by assigning the object name with the letter associated with the phrase that is the most appropriate for that cell. The options are listed below the table.\n",
+ "\n",
+ "| Distribution | Requires sampling from? |\n",
+ "| ------------------------------- | ----------------------- |\n",
+ "| Sampling distribution | `answer1.0.0` |\n",
+ "| Bootstrap sample distribution | `answer1.0.1` |\n",
+ "| Bootstrap sampling distribution | `answer1.0.2` |\n",
+ "| Sample distribution | `answer1.0.3` |\n",
+ "\n",
+ "A. The population.\n",
+ "\n",
+ "B. A sample from the population.\n",
+ "\n",
+ "_Assign your answers to the objects `answer1.0.0`, `answer1.0.1`, `answer1.0.2`, and `answer1.0.3`. Each answer should be a single character surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "d933570fd74dcecd4c3ede0df82979c6",
+ "grade": false,
+ "grade_id": "cell-bdb1f0ffdd6da9e6",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.0.0 <- ...\n",
+ "# answer1.0.1 <- ...\n",
+ "# answer1.0.2 <- ...\n",
+ "# answer1.0.3 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "a37902e636d6912317218aa0234327d1",
+ "grade": true,
+ "grade_id": "cell-f7e3a38d921ae45d",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.0()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "d874e7dc59ae1b6f7c4c1166db8b3071",
+ "grade": false,
+ "grade_id": "cell-33fde50e9b5f9dcb",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.1**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "You are tasked with finding the proportion of UBC undergraduate students who own one or more pairs of Blundstone boots. You only have the time and resources to take a single sample of around 45 students.\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "Given the scenario above, it would be possible to generate a bootstrap sampling distribution of point estimates for the population parameter of interest.\n",
+ "\n",
+ "_Assign your answer to an object called `answer1.1`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "16c627d85e3336f342a3949a83805a57",
+ "grade": false,
+ "grade_id": "cell-dbaa0d908d99de44",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.1 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "be3f456359b02aac45982de80b8c40a7",
+ "grade": true,
+ "grade_id": "cell-24bcecb2e78c1557",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.1()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "a93b0b1951576157f6e6d8ccb7ff3664",
+ "grade": false,
+ "grade_id": "cell-2ec23f379874e209",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.2**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "In reality, we most often have the ability to directly compute the population parameter that we are interested in.\n",
+ "\n",
+ "_Assign your answer to an object called `answer1.2`. Your answer should be either \"true\" or \"false\" surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "ae47cc4ba564ec7b1ecd14cc91e527f4",
+ "grade": false,
+ "grade_id": "cell-f61e1c57732aee28",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.2 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "3a685666096c714972c27a148d1cec03",
+ "grade": true,
+ "grade_id": "cell-d8a18c40c1276dca",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.2()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "ffaf96915907a06de727048b6b85e97a",
+ "grade": false,
+ "grade_id": "cell-7e27e396adcfb7e1",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 2. We Need More Samples (An Introduction to Bootstrapping)\n",
+ "\n",
+ "If you recall from the last tutorial, you were given a hypothetical scenario where you did not have access to data for the population of interest. Instead, you had two different samples from the population, and we got stuck when we wanted to study the variation (or, more formally, the standard error) of our point estimates (also called the *sampling variation*). The reason was that we need many point estimates to study sampling variation, but we only had one for each sample size. So how do we study sampling variation without taking more samples from the population to produce a sampling distribution?\n",
+ "\n",
+ "
+ "\n",
+ "![](https://i1.wp.com/www.r-bloggers.com/wp-content/uploads/2010/06/boot.jpg?resize=210%2C294&ssl=1)\n",
+ "
+ "\n",
+ "The solution to this is **bootstrapping** (or specifically, **bootstrap resampling with replacement**). The term bootstrapping originates from the phrase \"to pull oneself up by one's bootstraps\", which refers to completing a seemingly impossible task with no external help. In statistics, bootstrapping is the idea of sampling from our original sample **with replacement** (also called **resampling with replacement**) to generate a **bootstrap sampling distribution**. Sampling with replacement means that each time we choose an observation from the population or sample, we return it before randomly selecting another. With this procedure, the original sample acts as an *estimate* of the population, and resampling with replacement gives us enough samples, and enough *sampling variation*, to produce an approximation of the sampling distributions we generated in previous weeks. In this sense, our single sample has acted as our \"bootstrap\", and we have used it to \"pull ourselves up\" and create an approximation of the desired sampling distribution."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "df916681627390ccf488a47195a383c7",
+ "grade": false,
+ "grade_id": "cell-3cf9e544012b7eda",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.0**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Suppose you had a single sample of size 20 held in the data frame `sample` and you wanted to study the variation of a point estimate from samples of this size. This means we need many more samples of size 20 to generate a sampling distribution. **Which of the following statements is true about the approach below?**\n",
+ "\n",
+ "```r\n",
+ "more_samples <- sample %>% \n",
+ " rep_sample_n(size = 20, reps = 1400, replace = FALSE)\n",
+ "```\n",
+ "\n",
+ "A. This approach would enable us to approximate the sampling distribution so we can study sampling variation.\n",
+ "\n",
+ "B. This approach would not give us enough samples (we need to increase the `reps` argument).\n",
+ "\n",
+ "C. This approach would give us 1400 identical samples.\n",
+ "\n",
+ "D. This approach is known as sampling with replacement.\n",
+ "\n",
+ "E. Both A & D.\n",
+ "\n",
+ "F. None of the above.\n",
+ "\n",
+ "_Assign your answer to an object called `answer2.0`. Your answer should be a single character surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "8375ebf6af5ce95260d4cb00a41b051f",
+ "grade": false,
+ "grade_id": "cell-17beaf6d485bc828",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer2.0 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "e51028a0572b891beb67716c21550221",
+ "grade": true,
+ "grade_id": "cell-f6e3d66d982cf39a",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.0()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "f33976e1c18d68a5ade0e984f120cd83",
+ "grade": false,
+ "grade_id": "cell-18405f12fa50f6dc",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "Use the following scenario to answer **Question 2.1 and Question 2.2**:\n",
+ "\n",
+ "--- \n",
+ "Suppose you had a small bowl of 9 Canadian dimes with the following production years (listed in increasing order):\n",
+ "\n",
+ "```\n",
+ "1994, 1995, 1997, 1999, 2000, 2000, 2001, 2001, 2012\n",
+ "```\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "ab231eb67909277b48bf8163db5c8260",
+ "grade": false,
+ "grade_id": "cell-d94685a11194b662",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.1**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Assume we sampled 9 dimes out of the bowl, one at a time, in the order below. However, you do not know whether we returned the dime that we chose before taking another (sampling with replacement) or not (sampling without replacement).\n",
+ "\n",
+ "```\n",
+ "1997, 1999, 1994, 2001, 1994, 2012, 2001, 2012, 2000\n",
+ "```\n",
+ "\n",
+ "Given the sample above, was it a result of sampling **with replacement** or **without replacement**?\n",
+ "\n",
+ "A. The sample is a result of sampling with replacement.\n",
+ "\n",
+ "B. The sample is a result of sampling without replacement.\n",
+ "\n",
+ "C. There is not enough information to answer the question.\n",
+ "\n",
+ "_Assign your answer to an object called `answer2.1`. Your answer should be a single character surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "5e36bb0b2a26dde9e17568ed46925a91",
+ "grade": false,
+ "grade_id": "cell-cd12b59b81ebaeb9",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer2.1 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "f2067ce2693cc3c8ad43736261aba0b6",
+ "grade": true,
+ "grade_id": "cell-c4693651ea64e97c",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.1()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "d95b93a9dcd453ab7bc53ea911a20ba5",
+ "grade": false,
+ "grade_id": "cell-08bdbd88e7b40015",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.2**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Assume we sampled 9 dimes out of the bowl, one at a time, in the order below. However, you do not know whether we returned the dime that we chose before taking another (sampling with replacement) or not (sampling without replacement).\n",
+ "\n",
+ "```\n",
+ "2012, 1994, 2001, 1999, 1997, 2001, 2000, 2000, 1995\n",
+ "```\n",
+ "\n",
+ "Given the sample above, was it a result of sampling **with replacement** or **without replacement**?\n",
+ "\n",
+ "A. The sample is a result of sampling with replacement.\n",
+ "\n",
+ "B. The sample is a result of sampling without replacement.\n",
+ "\n",
+ "C. There is not enough information to answer the question.\n",
+ "\n",
+ "_Assign your answer to an object called `answer2.2`. Your answer should be a single character surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "95f9ad1a3003658ab271d2604708266a",
+ "grade": false,
+ "grade_id": "cell-991f90f96be76120",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer2.2 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "2c8659d46d7e6d22ee8a88a3e6c5b67b",
+ "grade": true,
+ "grade_id": "cell-3280774d9788a5d5",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.2()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "61b80ba1c212bf63799c9a6d085ace2c",
+ "grade": false,
+ "grade_id": "cell-255aba27bffea910",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.3**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "When you take a bootstrap sample, you are sampling with replacement from the population.\n",
+ "\n",
+ "_Assign your answer to an object called `answer2.3`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "853e6af568e83670a5caf3ef03554251",
+ "grade": false,
+ "grade_id": "cell-92f4cdff4df3b451",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer2.3 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "f3e7beb2227a225a33dead228535280f",
+ "grade": true,
+ "grade_id": "cell-771676480db29167",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.3()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "70e070af628cbd8539e7a0b8cfa51fac",
+ "grade": false,
+ "grade_id": "cell-93ad5f31ad660c76",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 3. Sample Distribution vs. Bootstrap Sample Distribution(s)\n",
+ "\n",
+ "Before we jump into creating bootstrap sampling distributions, let's take a moment to gain some more insight into how sampling with replacement introduces **sampling variation**. Consider the population of all current UBC students, and suppose we were interested in the mean number of **full** years that they have been studying at UBC. Assume we had the following sample of size 35 from the population named `ubc_sample`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "d14cdb0ff78b73b6fd68a60939a79dfa",
+ "grade": false,
+ "grade_id": "cell-88a0a3192fae352b",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Run this cell before continuing.\n",
+ "source(\"section_3.R\") # Runs code to produce the sample.\n",
+ "head(ubc_sample)\n",
+ "nrow(ubc_sample)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "8a8cf635a38be44f2901addce4802b7d",
+ "grade": false,
+ "grade_id": "cell-fc10e0e5c79ee757",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.0**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Take a single bootstrap sample from `ubc_sample` using `rep_sample_n` with a seed of 0242. The resulting data frame should have two columns: `replicate` and `full_years`.\n",
+ "\n",
+ "_Assign your data frame to an object called `bootstrap_sample`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "f214b5c5350112dff4169d55358ee62f",
+ "grade": false,
+ "grade_id": "cell-f4bcf34fa7f83066",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(0242) # DO NOT CHANGE!\n",
+ "\n",
+ "# bootstrap_sample <- ... %>% \n",
+ "# rep_sample_n(size = ..., replace = ...)\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(bootstrap_sample)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "b4dc83238133562d93a056382c82967d",
+ "grade": true,
+ "grade_id": "cell-79158cb203e2a650",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_3.0()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "f9e11a47bbcd76cd58d1fb07b276a599",
+ "grade": false,
+ "grade_id": "cell-b8690eaec88d238e",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.1** \n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Calculate the mean of the sample (`ubc_sample`).\n",
+ "\n",
+ "_Assign your answer to an object called `ubc_sample_mean`. Your answer should be a single number._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "cc3ce514b9ffebb16f5acd81f7be293a",
+ "grade": false,
+ "grade_id": "cell-a955e4165977b263",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "ubc_sample_mean"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "c8d54cae12388dfb03078367a99aff7d",
+ "grade": true,
+ "grade_id": "cell-9f921dd2e40bf012",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_3.1()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "9fcc4194ddd2e1d5cb779e6c6b295af7",
+ "grade": false,
+ "grade_id": "cell-8a25972eddd20fcf",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.2** \n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Calculate the mean of the bootstrap sample that you took in **question 3.0** (`bootstrap_sample`).\n",
+ "\n",
+ "_Assign your answer to an object called `bootstrap_sample_mean`. Your answer should be a single number._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "40ad6c4c1dee67b2b9510c7d7f002d64",
+ "grade": false,
+ "grade_id": "cell-33cecf1485afe722",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "bootstrap_sample_mean"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "aa3cd6f8ff2a48b518dba2512b0b6b5d",
+ "grade": true,
+ "grade_id": "cell-a46af22ef1b12f46",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.2()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "1e882d99420422b1cc79f8452bd0a6de",
+ "grade": false,
+ "grade_id": "cell-1b210e15af6de543",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.3**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "The mean of the bootstrap sample is different from the mean of the original sample because we sampled **with replacement** from the original sample.\n",
+ "\n",
+ "_Assign your answer to an object called `answer3.3`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "498f4ea04b65e2a1353401438c632a5b",
+ "grade": false,
+ "grade_id": "cell-b20cbe04061d9bfb",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer3.3 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "96693a976429b40e15f92597ae85b371",
+ "grade": true,
+ "grade_id": "cell-3616eec8b17904be",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.3()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "81db23884bb7667dc2012b47ac62d960",
+ "grade": false,
+ "grade_id": "cell-af13c58c5dd4a8e0",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.4**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Plot the bootstrap sample you just took by filling in the scaffolding below to create a histogram with bin widths of 1.\n",
+ "\n",
+ "**Notes:**\n",
+ "- The `colour = \"white\"` argument in `geom_histogram` adds outlines to the histogram bars.\n",
+ "- The `scale_x_continuous` layer modifies the x-axis scale to make it more readable.\n",
+ "\n",
+ "_Assign your plot to an object called `bootstrap_sample_dist`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "b446434c118aeee3915d1ae435a83f89",
+ "grade": false,
+ "grade_id": "cell-15b934fd4299ba32",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# bootstrap_sample_dist <- ... %>% \n",
+ "# ...(aes(x = ...)) + \n",
+ "# geom_histogram(binwidth = ..., colour = \"white\") +\n",
+ "# ...(\"Bootstrap Sample Distribution\") +\n",
+ "# xlab(\"# Full Years At UBC\") + \n",
+ "# scale_x_continuous(breaks = seq(0, 10, 1), limits = c(-0.5, 10.5)) \n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "bootstrap_sample_dist"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "a03f5988d55a328f67d53f23706ad9e4",
+ "grade": true,
+ "grade_id": "cell-817473b6609acfbc",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_3.4()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "481c182a3e79e1d6d67a38965e6bd6a1",
+ "grade": false,
+ "grade_id": "cell-b64797421e69d54a",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "In the code cell below, we have used `plot_grid` to plot the sample distribution and bootstrap distribution together.\n",
+ "\n",
+ "_Use the set of plots below to answer the **next question**._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "a4403ff4afb08bca8ea2a3932af4d3cd",
+ "grade": false,
+ "grade_id": "cell-c259d96c4d2a6bcc",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "plot_grid(sample_dist, bootstrap_sample_dist, ncol = 1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "911720cfbbe57cf16bea2fd7af4cfe13",
+ "grade": false,
+ "grade_id": "cell-2b75e7dead17cf81",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.5**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Suppose we repeated the process above, but using a different seed when we took our bootstrap sample (`bootstrap_sample`) from the original sample (`ubc_sample`).\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "We are guaranteed that the two distributions above will be different.\n",
+ "\n",
+ "_Assign your answer to an object called `answer3.5`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "a69ab9222d9fbfc01b96a8aa2124f821",
+ "grade": false,
+ "grade_id": "cell-26b26e8fec74b1bd",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer3.5 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "252bb0234596e3cac9b38a4a3b703735",
+ "grade": true,
+ "grade_id": "cell-7e74ab7f7467ea07",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.5()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "a824dffb4b2c4ac5bba5719ec5065092",
+ "grade": false,
+ "grade_id": "cell-da1566194fd1cfbc",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 4. Sampling Distributions vs. Bootstrap Sampling Distributions\n",
+ "\n",
+ "Now that we understand that we can use **bootstrapping** to produce an **approximation** of a sampling distribution, we are going to use it to approximate the sampling distributions that we produced in `worksheet_02` and compare the two. You will produce a bootstrap sampling distribution from a single sample of each size (10, 30, and 100) for the tax assessment values in the population of **multiple-family dwellings in strata housing** in Vancouver. We will give you the corresponding sampling distributions, and then you'll be able to compare the two types of distributions for each sample size. Let's get started!\n",
+ "\n",
+ "Recall our population of interest (**multiple-family dwellings in strata housing** in Vancouver) originates from the `tax_2019` data set included in the `taxyvr` R package. Again, since you have already done this in `worksheet_01`, we have done it for you in the code cell below. To filter the `tax_2019` data set for the population that we are interested in, we need the `current_land_value` of properties that meet the following criteria:\n",
+ "- **Have a `current_land_value` greater than \\$1:** Some properties are assigned a value of `NA` and these are the properties undergoing big renovations. These values get amended after the improvement and are reflected in the following year's assessment. The same occurs with homes that are assessed at \\$0.\n",
+ "- **Are of `legal_type` `\"STRATA\"`**\n",
+ "- **Are of `zone_category` `\"Multiple Family Dwelling\"`**\n",
+ "\n",
+ "_If you need a refresher on the `tax_2019` data set and where it came from, please look back at `worksheet_01` and re-read the introduction of section 2 there._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "3649e69f61d54d29252de4e2856e99b5",
+ "grade": false,
+ "grade_id": "cell-99cae01db68690dd",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Run this cell before continuing.\n",
+ "multi_family_strata <- tax_2019 %>% \n",
+ " filter(!is.na(current_land_value),\n",
+ " current_land_value > 1,\n",
+ " legal_type == \"STRATA\",\n",
+ " zone_category == \"Multiple Family Dwelling\") %>% \n",
+ " select(current_land_value)\n",
+ "head(multi_family_strata)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "9e191607b86ad6a63a39194b7543b554",
+ "grade": false,
+ "grade_id": "cell-2924b562681fb65f",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 4.0** \n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Imagine you only have the resources to take a single random sample of size 10 from the population of interest. Use `rep_sample_n` with a seed of 2485 to take a single sample of size 10 from `multi_family_strata`. Select only the `current_land_value` column so your data frame has a single column.\n",
+ "\n",
+ "**Note:** we use `ungroup()` here because `rep_sample_n` returns a data frame that is grouped by `replicate`, but we want an ungrouped data frame so we can select `current_land_value` by itself! If you are curious, try commenting out the `ungroup() %>%` line to see why.\n",
+ "\n",
+ "_Assign your data frame to an object called `sample_10`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "949f1aa51d25c9b452f4d08485b8c3ea",
+ "grade": false,
+ "grade_id": "cell-83a392262f6ac62b",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(2485) # DO NOT CHANGE!\n",
+ "\n",
+ "# sample_10 <- multi_family_strata %>% \n",
+ "# rep_sample_n(size = ..., replace = ...) %>%\n",
+ "# ungroup() %>% \n",
+ "# ...(...)\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "sample_10"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "c2b0d03051ab69a08601ecea6b621ddb",
+ "grade": true,
+ "grade_id": "cell-cf37e3b4dbe8561a",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_4.0()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "49c1281a35f9e15af46e17a8d612c1fc",
+ "grade": false,
+ "grade_id": "cell-59dd69eb7fc74b9c",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 4.1** \n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Take 2000 bootstrap samples from the sample you just took (`sample_10`) using `rep_sample_n` and a seed of 0510. Then, calculate the mean for each bootstrap sample and name the resulting column `mean_land_value`. The resulting data frame should have two columns: `replicate` and `mean_land_value`.\n",
+ "\n",
+ "_Assign your data frame to an object called `resampled_means_10`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "36cc4b7fe0ed21a1e3c2a1760b6f4d78",
+ "grade": false,
+ "grade_id": "cell-e1bdb3abbcdf5c99",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(0510) # DO NOT CHANGE!\n",
+ "\n",
+ "# resampled_means_10 <- sample_10 %>% \n",
+ "# rep_sample_n(size = ..., reps = ..., replace = ...) %>% \n",
+ "# group_by(...) %>% \n",
+ "# summarise(mean_land_value = ...(...))\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(resampled_means_10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "346bc403cee192f45e6f2a8333f5f149",
+ "grade": true,
+ "grade_id": "cell-02812ebacc5a610e",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_4.1()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "9d8a1e43e9b2b544e6b04537469b9e38",
+ "grade": false,
+ "grade_id": "cell-7d2e4bca670a3bdf",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 4.2** \n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Visualize the distribution of the sample means (the `mean_land_value` variable in `resampled_means_10`) by plotting a histogram using `geom_histogram` with the argument `binwidth = 15000`. Add a title of \"n = 10\" to the plot using `ggtitle` and ensure that the x-axis has a human-readable label.\n",
+ "\n",
+ "_Assign your plot to an object called `bootstrap_dist_10`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "b96c162e48e4f585b0eba1f6e528adf1",
+ "grade": false,
+ "grade_id": "cell-e192cfc3e979b2de",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# bootstrap_dist_10 <- ... %>% \n",
+ "# ggplot(aes(x = ...)) +\n",
+ "# ...(... = ...) +\n",
+ "# xlab(\"Mean Land Value (CAD)\") +\n",
+ "# ...(\"n = 10\")\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "bootstrap_dist_10"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "14b80a4842fad51c935b10924cf4c80e",
+ "grade": true,
+ "grade_id": "cell-3015035368ebbfa5",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_4.2()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "b795545cf15831b0f00a3c6e737d0560",
+ "grade": false,
+ "grade_id": "cell-e7bd16fe8aea02c8",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 4.3** \n",
+ "<br>{points: 1}\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "In order to produce the distribution above, we had to sample with replacement from the population many times.\n",
+ "\n",
+ "_Assign your answer to an object called `answer4.3`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "ca8a114e086de3938fa25a3170f33a0b",
+ "grade": false,
+ "grade_id": "cell-e034e1d8b3ac69cc",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer4.3 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "8487c36668fea112156c2f5f15cc1509",
+ "grade": true,
+ "grade_id": "cell-689ca07402031a13",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_4.3()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "130396c8ceed4b4f36f3346c3f15a2f7",
+ "grade": false,
+ "grade_id": "cell-a05dbc9672af2e83",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 4.4** \n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Imagine you only have the resources to take a single random sample of size 30 from the population of interest. Use `rep_sample_n` with a seed of `8183` to take a single sample of size 30 from `multi_family_strata`. Select only the `current_land_value` column so your data frame has a single column.\n",
+ "\n",
+ "_Assign your data frame to an object called `sample_30`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "d0e1573392c019bd2bdc608e2e0b47e4",
+ "grade": false,
+ "grade_id": "cell-b225d2da00c136f1",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# set.seed(...)\n",
+ "# sample_30 <- multi_family_strata %>% \n",
+ "# rep_sample_n(size = ..., replace = ...) %>%\n",
+ "# ungroup() %>% \n",
+ "# ...(...)\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(sample_30)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "1ac976c768fb2e13db61aac7b2a6f682",
+ "grade": true,
+ "grade_id": "cell-5e16848f066c67f3",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_4.4()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "91e393cd562df45d9b4b00fd6e6101a3",
+ "grade": false,
+ "grade_id": "cell-0f542f85b72df1e9",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 4.5** \n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Using the same strategy as you did above, take 2000 bootstrap samples from the sample you just took (`sample_30`), but this time use the seed `7032`. Then, calculate the mean for each sample (name the new column `mean_land_value`). Lastly, visualize the distribution of the sample means you just calculated by plotting a histogram using `geom_histogram` with the argument `binwidth = 15000`. Add a title of \"n = 30\" to the plot using `ggtitle` and ensure that the x-axis has a human-readable label.\n",
+ "\n",
+ "_Assign your plot to an object called `bootstrap_dist_30`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "be930ff3667542012421313f23ff2330",
+ "grade": false,
+ "grade_id": "cell-4153c51957605ac6",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "bootstrap_dist_30"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "bd33c6640fb1aa2f928d1917cdcf1279",
+ "grade": true,
+ "grade_id": "cell-ae86fbe13570d25d",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_4.5()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "f5066eb786f267d0e7340ba78acd13f5",
+ "grade": false,
+ "grade_id": "cell-7d5754b45a9fc421",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 4.6** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Imagine you only have the resources to take a single random sample of size 100 from the population of interest. Use `rep_sample_n` with a seed of 5029 to take a single sample of size 100 from `multi_family_strata`. Select only the `current_land_value` column so your data frame has a single column.\n",
+ "\n",
+ "_Assign your data frame to an object called `sample_100`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "b981dd49c7404a9155dc8181f59e93cc",
+ "grade": false,
+ "grade_id": "cell-f4b5f6c9f6fd9cd3",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(5029) # DO NOT CHANGE!\n",
+ "\n",
+ "# sample_100 <- multi_family_strata %>% \n",
+ "# rep_sample_n(size = ..., replace = ...) %>%\n",
+ "# ungroup() %>% \n",
+ "# ...(...)\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(sample_100)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "baec50d95daae77ce910af9e4828c0d5",
+ "grade": true,
+ "grade_id": "cell-0141641c277ad8c5",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_4.6()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "d5331aa79d7ba10513f8c6e2c32a2588",
+ "grade": false,
+ "grade_id": "cell-d92f8052dc246ca2",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 4.7** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Using the same strategy as you did above, take 2000 bootstrap samples from the sample you just took (`sample_100`), but this time use the seed 2334. Then, calculate the mean for each sample (name the new column `mean_land_value`). Lastly, visualize the distribution of the sample means you just calculated by plotting a histogram using `geom_histogram` with the argument `binwidth = 15000`. Add a title of \"n = 100\" to the plot using `ggtitle` and ensure that the x-axis has a human-readable label.\n",
+ "\n",
+ "_Assign your plot to an object called `bootstrap_dist_100`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "5d6a2971b24aa9ed3c9af725624172bf",
+ "grade": false,
+ "grade_id": "cell-a1d586c8ab8df866",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(2334) # DO NOT CHANGE!\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "bootstrap_dist_100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "c4d73d1d87049ca9f0048791b549c269",
+ "grade": true,
+ "grade_id": "cell-4b14c14a2338c9b2",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_4.7()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "41ed2d9c2e7861b8f5a83af38658d4e6",
+ "grade": false,
+ "grade_id": "cell-f2d6ff479742e96a",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "In the code cell below, we have run code that generates the same sampling distributions you produced in `worksheet_02`, and displays them in a row **below** the three bootstrap distributions that you produced. We have sorted the plots by increasing order of sample size from left to right. The red vertical line shows the location of the mean for each plot.\n",
+ "\n",
+ "**Note**: a very small number of the sample means are not visible because we manually set bounds on the x-axis so you can compare the distributions more easily. Also, the code is inside a different file (which we ran from within this worksheet with `source`) to save some vertical space in the worksheet.\n",
+ "\n",
+ "_Use the set of plots below to answer the **next two questions**._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "f918b0600d171180bcbc7c986ad31ca7",
+ "grade": false,
+ "grade_id": "cell-9cd9f9cad6d53153",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Run this cell before continuing.\n",
+ "options(repr.plot.width = 15, repr.plot.height = 10)\n",
+ "source(\"section_4.R\") # Runs code to produce the sampling distributions & arrange in grid.\n",
+ "all_dist_grid"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "4a353c7faf48cc890b02ed6c233533a4",
+ "grade": false,
+ "grade_id": "cell-6c4a90e25b93fd69",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 4.8**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Considering the set of plots above, which statement below **is not** correct:\n",
+ "\n",
+ "A. The centre of the sampling distribution and bootstrap distributions with the same sample size are different.\n",
+ "\n",
+ "B. The width of the sampling distribution and bootstrap distributions with the same sample size are different.\n",
+ "\n",
+ "C. For both the sampling distributions and bootstrap distributions, as the sample size increases, the standard deviation decreases.\n",
+ "\n",
+ "D. For both the sampling distributions and bootstrap distributions, as the sample size increases, the distributions appear more bell-shaped.\n",
+ "\n",
+ "E. The sampling distribution and bootstrap distribution are just as different from each other, no matter the sample size.\n",
+ "\n",
+ "_Assign your answer to an object called `answer4.8`. Your answer should be a single character surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "9819d4e2949aa63c9f90e53d4b77c93a",
+ "grade": false,
+ "grade_id": "cell-cbd968352c879e51",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer4.8 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "72635afd8894aec2d751f6e0f887e375",
+ "grade": true,
+ "grade_id": "cell-6e8d7076805c9ed8",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_4.8()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "3e35702f1908d3498c454982ecd88223",
+ "grade": false,
+ "grade_id": "cell-0561298f07cbc750",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 4.9**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "When we only have access to a single sample from the population of interest, we can obtain a useful approximation of standard error (specifically, the standard deviation of the distribution of an estimator, or more generally, how much we expect our point estimates to vary) by producing a bootstrap distribution.\n",
+ "\n",
+ "_Assign your answer to an object called `answer4.9`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "ed858e77a7303be2481bef27ff65c947",
+ "grade": false,
+ "grade_id": "cell-9786afeac35dc96a",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer4.9 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "266e4a8991f1e66f2ae9e5ca94f64c4e",
+ "grade": true,
+ "grade_id": "cell-9d78c06d071d02a2",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_4.9()"
+ ]
+ }
+ ],
+ "metadata": {
+ "docker": {
+ "latest_image_tag": "v0.4.0"
+ },
+ "jupytext": {
+ "formats": "ipynb,Rmd"
+ },
+ "kernelspec": {
+ "display_name": "R",
+ "language": "R",
+ "name": "ir"
+ },
+ "language_info": {
+ "codemirror_mode": "r",
+ "file_extension": ".r",
+ "mimetype": "text/x-r-source",
+ "name": "R",
+ "pygments_lexer": "r",
+ "version": "4.2.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
diff --git a/materials/worksheet_04/bootstrap_dists_worksheet_04.R b/materials/worksheet_04/bootstrap_dists_worksheet_04.R
new file mode 100644
index 0000000..dec7ab6
--- /dev/null
+++ b/materials/worksheet_04/bootstrap_dists_worksheet_04.R
@@ -0,0 +1,41 @@
+# Turn off warning output for everything that runs after this line; the
+# setting is not restored anywhere in this script.
+options(warn = -1)
+# Build one bootstrap distribution of the mean tree diameter.
+#
+# seed: value passed to set.seed() so the draw is reproducible.
+# Reads the global `sunset_pop` (not defined in this script).
+# Returns a one-column data frame (`diameter_mean`) with one row per
+# bootstrap replicate (1500 replicates).
+generate_bootstrap_df <- function(seed) {
+ set.seed(seed)
+ sunset_pop %>%
+ # Step 1: a single sample of 24 observations, without replacement.
+ rep_sample_n(size = 24, reps = 1, replace = FALSE) %>%
+ ungroup() %>%
+ select(diameter) %>%
+ # Step 2: resample that sample with replacement, 1500 times.
+ rep_sample_n(size = 24, reps = 1500, replace = TRUE) %>%
+ group_by(replicate) %>%
+ summarize(diameter_mean = mean(diameter), .groups = "drop") %>%
+ select(diameter_mean)
+# One bootstrap distribution per seed (10 in total).
+# Note: R reads the literals 0017 and 0730 as the numbers 17 and 730.
+seeds <- c(5457, 4457, 5192, 4808, 0017, 4492, 8499, 0730, 8704, 2070)
+bootstrap_dfs <- lapply(seeds, generate_bootstrap_df)
+# Overall range of the bootstrap means, used below as shared x-axis limits.
+all_means <- unlist(bootstrap_dfs)
+max_mean <- max(all_means)
+min_mean <- min(all_means)
+# Histogram of one bootstrap distribution.
+#
+# df: data frame with a `diameter_mean` column.
+# Reads the globals `pop_mean` (not defined in this script; drawn as a red
+# vertical line) and `min_mean` / `max_mean` (shared x-axis limits).
+# Returns a ggplot object.
+generate_bootstrap_plots <- function(df) {
+ df %>%
+ ggplot(aes(x = diameter_mean)) +
+ geom_histogram(binwidth = 0.5) +
+ geom_vline(xintercept = pop_mean, colour = "red") +
+ labs(
+ title = "n = 24",
+ x = "Mean Diameter (cm)"
+ ) +
+ scale_x_continuous(limits = c(min_mean, max_mean))
+# Arrange the per-seed histograms in a grid with 5 columns.
+plot_row <- plot_grid(plotlist = lapply(bootstrap_dfs, generate_bootstrap_plots), ncol = 5)
+# Left-aligned bold heading drawn on its own canvas.
+title <- ggdraw() +
+ draw_label("Bootstrap Sampling Distributions (of Sample Means)",
+ fontface = "bold",
+ x = 0,
+ hjust = 0
+ ) +
+ theme(plot.margin = margin(0, 0, 0, 7))
+# Final figure: heading stacked above the grid (heading gets 0.1 of the
+# relative height, the grid gets 1).
+bootstrap_dists <- plot_grid(title, plot_row, ncol = 1, rel_heights = c(0.1, 1))
diff --git a/materials/worksheet_04/bootstrap_samples.csv b/materials/worksheet_04/bootstrap_samples.csv
new file mode 100644
index 0000000..1fb7871
--- /dev/null
+++ b/materials/worksheet_04/bootstrap_samples.csv
@@ -0,0 +1,30001 @@
diff --git a/materials/worksheet_04/generate_estimates_worksheet_04.R b/materials/worksheet_04/generate_estimates_worksheet_04.R
new file mode 100644
index 0000000..75ad2a6
--- /dev/null
+++ b/materials/worksheet_04/generate_estimates_worksheet_04.R
@@ -0,0 +1,45 @@
+# This script is used to generate the data frames used in section 3.
+# NOTE(review): no set.seed() call is visible in this script, so the random
+# draws below may differ between runs - confirm whether a seed is set by the
+# caller.
+#
+# Population: diameters of trees whose neighbourhood is "SUNSET", multiplied
+# by 2.54 (presumably an inches-to-centimetres conversion - confirm the
+# units of `vancouver_trees$diameter`).
+sunset_pop <- vancouver_trees %>%
+ filter(neighbourhood_name == "SUNSET") %>%
+ select(diameter) %>%
+ mutate(diameter = diameter * 2.54)
+# Draw 1000 bootstrap resamples from `sample`.
+#
+# sample: data frame holding one sample.
+# Each resample has as many rows as `sample` and is drawn with replacement.
+# Returns the resamples as one ungrouped data frame.
+generate_bootstrap_samples <- function(sample) {
+ sample %>%
+ rep_sample_n(reps = 1000, size = nrow(sample), replace = TRUE) %>%
+ ungroup()
+# Mean diameter of each bootstrap resample.
+#
+# bootstrap_sample: data frame with `replicate` and `diameter` columns.
+# Returns one row per replicate with its mean in `bootstrap_mean`.
+compute_bootstrap_means <- function(bootstrap_sample) {
+ bootstrap_sample %>%
+ group_by(replicate) %>%
+ summarize(bootstrap_mean = mean(diameter), .groups = "drop")
+# Take 100 samples of size 30 from the population (without replacement),
+# nest each sample into a list-column, and attach to every sample its
+# bootstrap resamples and the means of those resamples.
+sunset_bootstrap_means <- sunset_pop %>%
+ rep_sample_n(size = 30, reps = 100, replace = FALSE) %>%
+ nest() %>%
+ ungroup() %>%
+ rename(sample = data) %>%
+ rename(sample_id = replicate) %>%
+ mutate(bootstrap_samples = map(sample, generate_bootstrap_samples)) %>%
+ mutate(bootstrap_means = map(bootstrap_samples, compute_bootstrap_means))
+# Export the raw bootstrap resamples for sample 1 only.
+# Note: every sample is unnested before the filter keeps `sample_id == 1`.
+bootstrap_samples <- sunset_bootstrap_means %>%
+ select(sample_id, bootstrap_samples) %>%
+ unnest(bootstrap_samples) %>%
+ filter(sample_id == 1)
+write.csv(bootstrap_samples, "bootstrap_samples.csv", row.names = FALSE)
+# Export the bootstrap means of every sample (one row per sample_id and
+# bootstrap replicate).
+sampling_dist_estimates <- sunset_bootstrap_means %>%
+ select(sample_id, bootstrap_means) %>%
+ unnest(bootstrap_means)
+write.csv(sampling_dist_estimates, "sampling_dist_estimates.csv", row.names = FALSE)
\ No newline at end of file
diff --git a/materials/worksheet_04/sampling_dist_estimates.csv b/materials/worksheet_04/sampling_dist_estimates.csv
new file mode 100644
index 0000000..df3f055
--- /dev/null
+++ b/materials/worksheet_04/sampling_dist_estimates.csv
@@ -0,0 +1,100001 @@
diff --git a/materials/worksheet_04/tests_worksheet_04.R b/materials/worksheet_04/tests_worksheet_04.R
new file mode 100644
index 0000000..b29d681
--- /dev/null
+++ b/materials/worksheet_04/tests_worksheet_04.R
@@ -0,0 +1,741 @@
+# Grader for Question 1.0.
+# Checks that `answer1.0` exists, is a 5-character string made only of the
+# letters A-E (either case), and that the digest of its lower-cased value
+# matches the stored hash. Prints "Success!" at the end.
+test_1.0 <- function() {
+ test_that('Did not assign answer to an object called "answer1.0"', {
+ expect_true(exists("answer1.0"))
+ })
+ test_that('Solution should be a string of ALL the following characters ("A", "B", "C", "D", and "E")', {
+ # Inside a character class `|` is a literal character, so the previous
+ # pattern "^[aA|bB|cC|dD|eE]{5}$" also accepted pipe characters.
+ expect_match(answer1.0, "^[abcdeABCDE]{5}$")
+ })
+ test_that("Solution is incorrect", {
+ expect_equal(digest(tolower(answer1.0)), "aa817d77bfc37cf923b17fe3465952f0")
+ })
+ print("Success!")
+# Grader for Question 1.1 (multiple choice).
+# Checks that `answer1.1` exists and contains one of the letters a-d (the
+# pattern is not anchored), prints a hint when the answer's hash matches one
+# of three specific wrong answers, then compares the hash with the expected
+# one. Prints "Success!" at the end.
+test_1.1 <- function() {
+ test_that('Did not assign answer to an object called "answer1.1"', {
+ expect_true(exists("answer1.1"))
+ })
+ test_that('Solution should be a single character ("A", "B", "C", or "D")', {
+ expect_match(answer1.1, "a|b|c|d", ignore.case = TRUE)
+ })
+ # Hash the lower-cased answer so the check is case-insensitive.
+ answer_hash <- digest(tolower(answer1.1))
+ if (answer_hash == "ddf100612805359cd81fdc5ce3b9fbba") {
+ print("Think about how the shape of the bootstrap distribution is affected by sample size")
+ } else if (answer_hash == "6e7a8c1c098e8817e3df3fd1b21149d1") {
+ print("The centre of the bootstrap distribution may not be the same, but the standard deviation is a good estimate of the standard error of an estimator")
+ } else if (answer_hash == "d110f00cfb1b248e835137025804a23b") {
+ # Hint text fixed: it previously read "fromv modern dive".
+ print("Think about the pennies example from modern dive")
+ }
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "127a2ec00989b9f7faf671ed470be7f8")
+ })
+ print("Success!")
+# Grader for Question 2.0.
+# Checks that `pop_mean` exists, is not a data frame, converts to a non-NA
+# number, and matches the expected value. The value is scaled by 1e6 and
+# truncated to an integer before hashing.
+test_2.0 <- function() {
+ test_that('Did not assign answer to an object called "pop_mean"', {
+ expect_true(exists("pop_mean"))
+ })
+ test_that("Answer object should be a single number, not a dataframe", {
+ expect_false("data.frame" %in% class(pop_mean))
+ })
+ answer_as_numeric <- as.numeric(pop_mean)
+ test_that("Solution should be a single number", {
+ expect_false(is.na(answer_as_numeric))
+ })
+ test_that("Solution is incorrect", {
+ expect_equal(digest(as.integer(answer_as_numeric * 1000000)), "47ec131903ad9e44ca74761a81ba7e29")
+ })
+ print("Success!")
+# Grader for Question 2.1.
+# Checks that `answer2.1` exists, converts to a non-NA whole number, and that
+# its integer hash equals any one of three accepted hashes.
+test_2.1 <- function() {
+ test_that('Did not assign answer to an object called "answer2.1"', {
+ expect_true(exists("answer2.1"))
+ })
+ answer_as_numeric <- as.numeric(answer2.1)
+ test_that("Solution should be a number", {
+ expect_false(is.na(answer_as_numeric))
+ })
+ test_that("Solution should be an integer", {
+ expect_true(answer_as_numeric %% 1 == 0)
+ })
+ test_that("Solution is incorrect", {
+ # Three different answers are accepted as correct.
+ expect_true(digest(as.integer(answer_as_numeric)) == "7c7124efff5c7039a1b1e7cba65c5379" | digest(as.integer(answer_as_numeric)) == "9d08099943f8627959cfb8ecee0d2f5d" | digest(as.integer(answer_as_numeric)) == "8eaca7c9b35d05ab15c9125bc92372fa")
+ })
+ print("Success!")
+# Grader for Question 2.2.
+# Checks that `sample_1` exists, is a data frame whose only column is
+# `diameter`, has the expected number of rows, and that the integer part of
+# sum(diameter) hashes to the expected value.
+test_2.2 <- function() {
+ test_that('Did not assign answer to an object called "sample_1"', {
+ expect_true(exists("sample_1"))
+ })
+ test_that("Solution should be a data frame", {
+ expect_true("data.frame" %in% class(sample_1))
+ })
+ expected_colnames <- c("diameter")
+ given_colnames <- colnames(sample_1)
+ test_that("Data frame does not have the correct columns", {
+ # Symmetric difference of the two name sets must be empty, i.e. no
+ # missing and no extra columns.
+ expect_equal(length(setdiff(
+ union(expected_colnames, given_colnames),
+ intersect(expected_colnames, given_colnames)
+ )), 0)
+ })
+ test_that("Data frame does not contain the correct number of rows", {
+ expect_equal(digest(as.integer(nrow(sample_1))), "2bbdc9479e5ddf03425adc57599af655")
+ })
+ test_that("Data frame does not contain the correct data", {
+ expect_equal(digest(as.integer(sum(sample_1$diameter))), "1d7b23b077358cf4fa53bf3d1e1d81d8")
+ })
+ print("Success!")
+# Grader for Question 2.3.
+# Checks that `upper_quantile` exists, converts to a non-NA number, and
+# matches the expected hash.
+test_2.3 <- function() {
+ test_that('Did not assign answer to an object called "upper_quantile"', {
+ expect_true(exists("upper_quantile"))
+ })
+ answer_as_numeric <- as.numeric(upper_quantile)
+ test_that("Solution should be a number", {
+ expect_false(is.na(answer_as_numeric))
+ })
+ test_that("Solution is incorrect", {
+ # NOTE(review): the value is truncated by as.integer() BEFORE it is
+ # multiplied by 1e6, so only its integer part is graded (test_2.0 scales
+ # first, then truncates). Confirm this is intended; changing it requires
+ # regenerating the expected hash.
+ expect_equal(digest(as.integer(answer_as_numeric) * 1000000), "f7be34588ada5ef454b8117a06dd0d33")
+ })
+ print("Success!")
+# Grader for Question 2.4 (true/false).
+# Checks that `answer2.4` exists and contains "true" or "false" (any case),
+# prints a hint when the hash matches one specific wrong answer, then
+# compares the hash of the lower-cased answer with the expected one.
+test_2.4 <- function() {
+ test_that('Did not assign answer to an object called "answer2.4"', {
+ expect_true(exists("answer2.4"))
+ })
+ test_that('Solution should be "true" or "false"', {
+ expect_match(answer2.4, "true|false", ignore.case = TRUE)
+ })
+ answer_hash <- digest(tolower(answer2.4))
+ if (answer_hash == "d2a90307aac5ae8d0ef58e2fe730d38b") {
+ print("Revisit the definition of a quantile")
+ }
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "05ca18b596514af73f6880309a21b5dd")
+ })
+ print("Success!")
+# Grader for Question 2.5.
+# Checks that `lower_quantile` exists, converts to a non-NA number, and
+# matches the expected hash.
+test_2.5 <- function() {
+ test_that('Did not assign answer to an object called "lower_quantile"', {
+ expect_true(exists("lower_quantile"))
+ })
+ answer_as_numeric <- as.numeric(lower_quantile)
+ test_that("Solution should be a number", {
+ expect_false(is.na(answer_as_numeric))
+ })
+ test_that("Solution is incorrect", {
+ # NOTE(review): the value is truncated by as.integer() BEFORE it is
+ # multiplied by 1e6, so only its integer part is graded. Confirm this is
+ # intended; changing it requires regenerating the expected hash.
+ expect_equal(digest(as.integer(answer_as_numeric) * 1000000), "50f9a6d45084d41172a5f0fefc4178a5")
+ })
+ print("Success!")
+# Grader for Question 2.6 (plot).
+# Checks that `sample_quantile_plot` exists, maps `diameter` to x, has a bar
+# layer followed by two vertical-line layers, uses the expected bin width and
+# data, places the vertical lines at the expected positions, and has a
+# non-default x label and a title.
+test_2.6 <- function() {
+ test_that('Did not assign answer to an object called "sample_quantile_plot"', {
+ expect_true(exists("sample_quantile_plot"))
+ })
+ # Combine layer-level and plot-level aesthetics so the mapping is found
+ # wherever it was declared.
+ properties <- c(sample_quantile_plot$layers[[1]]$mapping, sample_quantile_plot$mapping)
+ test_that("Plot should have diameter on the x-axis", {
+ expect_true("diameter" == rlang::get_expr(properties$x))
+ })
+ test_that("Plot does not have the correct layers", {
+ expect_true("GeomBar" %in% class(sample_quantile_plot$layers[[1]]$geom))
+ expect_true("GeomVline" %in% class(sample_quantile_plot$layers[[2]]$geom))
+ expect_true("GeomVline" %in% class(sample_quantile_plot$layers[[3]]$geom))
+ })
+ test_that("Plot does not have the correct bin width", {
+ # NOTE(review): the bin width is truncated by as.integer() before being
+ # multiplied by 1000, so only its integer part is checked - confirm.
+ expect_equal(
+ digest(as.integer(mget("stat_params", sample_quantile_plot$layers[[1]])[["stat_params"]][["binwidth"]]) * 1000),
+ "998633da79c2a3f44fe6482751ba47e1"
+ )
+ })
+ test_that("Plot does not use the correct data", {
+ expect_equal(digest(nrow(sample_quantile_plot$data)), "2bbdc9479e5ddf03425adc57599af655")
+ # NOTE(review): this exact expected hash is reused for unrelated sums in
+ # test_3.1, test_3.3, test_3.7, test_3.9 and test_3.10, which suggests
+ # as.integer(sum * 1e6) overflows to NA here and the check is vacuous -
+ # confirm and regenerate with a smaller multiplier if so.
+ expect_equal(digest(as.integer(sum(sample_quantile_plot$data$diameter) * 1000000)), "c7f66da1cae4f223b9bae717f05900f7")
+ })
+ test_that("Vertical line layers are not in the correct locations", {
+ expect_equal(digest(as.numeric(sample_quantile_plot$layers[[2]]$data) * 1000000), "cbca884fded8a271a9d2c0cafee98a90")
+ expect_equal(digest(as.numeric(sample_quantile_plot$layers[[3]]$data) * 1000000), "e202f4968c4813a4470552909accad76")
+ })
+ test_that("x-axis label should be descriptive and human readable", {
+ expect_false(sample_quantile_plot$labels$x == "diameter")
+ })
+ test_that("Plot should have a title", {
+ expect_true("title" %in% names(sample_quantile_plot$labels))
+ })
+ print("Success!")
+# Grader for Question 2.7.
+# Checks that `p_below` exists, converts to a non-NA number, and matches the
+# expected value (scaled by 1e6 and truncated to an integer before hashing).
+test_2.7 <- function() {
+ test_that('Did not assign answer to an object called "p_below"', {
+ expect_true(exists("p_below"))
+ })
+ answer_as_numeric <- as.numeric(p_below)
+ test_that("Solution should be a number", {
+ expect_false(is.na(answer_as_numeric))
+ })
+ test_that("Solution is incorrect", {
+ expect_equal(digest(as.integer(answer_as_numeric * 1000000)), "19fdc64eb993dca43f6f6d5136ae0208")
+ })
+ print("Success!")
+# Grader for Question 2.8.
+# Checks that `p_between` exists, converts to a non-NA number, and matches
+# the expected value (scaled by 1e6 and truncated to an integer before
+# hashing).
+test_2.8 <- function() {
+ test_that('Did not assign answer to an object called "p_between"', {
+ expect_true(exists("p_between"))
+ })
+ answer_as_numeric <- as.numeric(p_between)
+ test_that("Solution should be a number", {
+ expect_false(is.na(answer_as_numeric))
+ })
+ test_that("Solution is incorrect", {
+ expect_equal(digest(as.integer(answer_as_numeric * 1000000)), "f4ea81e7a356c1aea215c7f4295cc2e2")
+ })
+ print("Success!")
+# Grader for Question 2.9 (multiple choice).
+# Checks that `answer2.9` exists and contains one of the letters a-d (the
+# pattern is not anchored), prints a hint for each of three specific wrong
+# answers, then compares the hash of the lower-cased answer with the
+# expected one.
+test_2.9 <- function() {
+ test_that('Did not assign answer to an object called "answer2.9"', {
+ expect_true(exists("answer2.9"))
+ })
+ test_that('Solution should be a single character ("A", "B", "C", or "D")', {
+ expect_match(answer2.9, "a|b|c|d", ignore.case = TRUE)
+ })
+ answer_hash <- digest(tolower(answer2.9))
+ if (answer_hash == "127a2ec00989b9f7faf671ed470be7f8") {
+ print("If 25% of the observations fall below lower_quantile and 25% fall above upper_quantile, what proportion lie between the two?")
+ } else if (answer_hash == "ddf100612805359cd81fdc5ce3b9fbba") {
+ print("Revisit the definition of a quantile: if 75% of the observations lie below, what proportion lie above?")
+ } else if (answer_hash == "d110f00cfb1b248e835137025804a23b") {
+ print("Revisit the definition of a quantile")
+ }
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "6e7a8c1c098e8817e3df3fd1b21149d1")
+ })
+ print("Success!")
+# Grader for Question 2.10 (true/false).
+# Checks that `answer2.10` exists and contains "true" or "false" (any case),
+# prints a hint when the hash matches one specific wrong answer, then
+# compares the hash of the lower-cased answer with the expected one.
+test_2.10 <- function() {
+ test_that('Did not assign answer to an object called "answer2.10"', {
+ expect_true(exists("answer2.10"))
+ })
+ test_that('Solution should be "true" or "false"', {
+ expect_match(answer2.10, "true|false", ignore.case = TRUE)
+ })
+ answer_hash <- digest(tolower(answer2.10))
+ if (answer_hash == "d2a90307aac5ae8d0ef58e2fe730d38b") {
+ print("Revisit the definition of quantiles. If 10% of the observation lie below the lower quantile and 40% lie above the upper quantile, what proportion lie between?")
+ }
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "05ca18b596514af73f6880309a21b5dd")
+ })
+ print("Success!")
+# Grader for Question 3.0.
+# Checks that `sample_2` exists, is a data frame whose only column is
+# `diameter`, has the expected number of rows, and that sum(diameter)
+# (scaled by 1e6, truncated to an integer) hashes to the expected value.
+test_3.0 <- function() {
+ test_that('Did not assign answer to an object called "sample_2"', {
+ expect_true(exists("sample_2"))
+ })
+ test_that("Solution should be a data frame", {
+ expect_true("data.frame" %in% class(sample_2))
+ })
+ expected_colnames <- c("diameter")
+ given_colnames <- colnames(sample_2)
+ test_that("Data frame does not have the correct columns", {
+ # Symmetric difference of the two name sets must be empty.
+ expect_equal(length(setdiff(
+ union(expected_colnames, given_colnames),
+ intersect(expected_colnames, given_colnames)
+ )), 0)
+ })
+ test_that("Data frame does not contain the correct number of rows", {
+ expect_equal(digest(as.integer(nrow(sample_2))), "7d2842cab7725fd8f382293e410d42b2")
+ })
+ test_that("Data frame does not contain the correct data", {
+ expect_equal(digest(as.integer(sum(sample_2$diameter) * 1000000)), "e1e301e732cbe4988c21ffec8bc2c7e3")
+ })
+ print("Success!")
+# Grader for Question 3.1.
+# Checks that `bootstrap_dist` exists, is a data frame whose only column is
+# `mean_diameter`, has the expected number of rows, and that the hashed sum
+# of `mean_diameter` matches the expected value.
+test_3.1 <- function() {
+ test_that('Did not assign answer to an object called "bootstrap_dist"', {
+ expect_true(exists("bootstrap_dist"))
+ })
+ test_that("Solution should be a data frame", {
+ expect_true("data.frame" %in% class(bootstrap_dist))
+ })
+ expected_colnames <- c("mean_diameter")
+ given_colnames <- colnames(bootstrap_dist)
+ test_that("Data frame does not have the correct columns", {
+ # Symmetric difference of the two name sets must be empty.
+ expect_equal(length(setdiff(
+ union(expected_colnames, given_colnames),
+ intersect(expected_colnames, given_colnames)
+ )), 0)
+ })
+ test_that("Data frame does not contain the correct number of rows", {
+ expect_equal(digest(as.integer(nrow(bootstrap_dist))), "b6a6227038bf9be67533a45a6511cc7e")
+ })
+ test_that("Data frame does not contain the correct data", {
+ # NOTE(review): this exact expected hash is reused for unrelated sums in
+ # test_2.6, test_3.3, test_3.7, test_3.9 and test_3.10, which suggests
+ # as.integer(sum * 1e6) overflows to NA here and the check is vacuous -
+ # confirm and regenerate with a smaller multiplier if so.
+ expect_equal(digest(as.integer(sum(bootstrap_dist$mean_diameter) * 1000000)), "c7f66da1cae4f223b9bae717f05900f7")
+ })
+ print("Success!")
+# Grader for Question 3.2.
+# Checks that `ci` exists, is a data frame with exactly the columns
+# `ci_lower` and `ci_upper`, and that both bounds hash to the expected
+# values.
+test_3.2 <- function() {
+ test_that('Did not assign answer to an object called "ci"', {
+ expect_true(exists("ci"))
+ })
+ test_that("Solution should be a data frame", {
+ expect_true("data.frame" %in% class(ci))
+ })
+ expected_colnames <- c("ci_lower", "ci_upper")
+ given_colnames <- colnames(ci)
+ test_that("Data frame does not have the correct columns", {
+ # Symmetric difference of the two name sets must be empty.
+ expect_equal(length(setdiff(
+ union(expected_colnames, given_colnames),
+ intersect(expected_colnames, given_colnames)
+ )), 0)
+ })
+ test_that("Data frame does not contain the correct data", {
+ # NOTE(review): each bound is truncated by as.integer() BEFORE being
+ # multiplied by 1e6, so only the integer part of the interval limits is
+ # graded - confirm this is intended.
+ expect_equal(digest(as.integer(ci$ci_lower) * 1000000), "d4ef090268118448c4f4b8f7c2cef425")
+ expect_equal(digest(as.integer(ci$ci_upper) * 1000000), "9e62aede040a3339d7fe4f57cb3b61da")
+ })
+ print("Success!")
+# Grader for Question 3.3 (plot).
+# Checks that `ci_plot` exists, maps `mean_diameter` to x, has a bar layer
+# first and a vertical-line layer third, uses the expected bin width and
+# data, and has a non-default x label and a title.
+test_3.3 <- function() {
+ test_that('Did not assign answer to an object called "ci_plot"', {
+ expect_true(exists("ci_plot"))
+ })
+ # Combine layer-level and plot-level aesthetics so the mapping is found
+ # wherever it was declared.
+ properties <- c(ci_plot$layers[[1]]$mapping, ci_plot$mapping)
+ test_that("Plot should have mean_diameter on the x-axis", {
+ expect_true("mean_diameter" == rlang::get_expr(properties$x))
+ })
+ test_that("Plot does not have the correct layers", {
+ # Only layers 1 and 3 are inspected; the geom of layer 2 is not checked.
+ expect_true("GeomBar" %in% class(ci_plot$layers[[1]]$geom))
+ expect_true("GeomVline" %in% class(ci_plot$layers[[3]]$geom))
+ })
+ test_that("Plot does not have the correct bin width", {
+ expect_equal(
+ digest(as.integer(mget("stat_params", ci_plot$layers[[1]])[["stat_params"]][["binwidth"]])),
+ "4b5630ee914e848e8d07221556b0a2fb"
+ )
+ })
+ test_that("Plot does not use the correct data", {
+ expect_equal(digest(nrow(ci_plot$data)), "b6a6227038bf9be67533a45a6511cc7e")
+ # NOTE(review): this exact expected hash is reused for unrelated sums in
+ # test_2.6, test_3.1, test_3.7, test_3.9 and test_3.10, which suggests
+ # as.integer(sum * 1e6) overflows to NA here and the check is vacuous -
+ # confirm and regenerate with a smaller multiplier if so.
+ expect_equal(digest(as.integer(sum(ci_plot$data$mean_diameter) * 1000000)), "c7f66da1cae4f223b9bae717f05900f7")
+ })
+ test_that("x-axis label should be descriptive and human readable", {
+ expect_false(ci_plot$labels$x == "mean_diameter")
+ })
+ test_that("Plot should have a title", {
+ expect_true("title" %in% names(ci_plot$labels))
+ })
+ print("Success!")
+# Grader for Question 3.4 (true/false).
+# Checks that `answer3.4` exists and contains "true" or "false" (any case),
+# prints a hint when the hash matches one specific wrong answer, then
+# compares the hash of the lower-cased answer with the expected one.
+test_3.4 <- function() {
+ test_that('Did not assign answer to an object called "answer3.4"', {
+ expect_true(exists("answer3.4"))
+ })
+ test_that('Solution should be "true" or "false"', {
+ expect_match(answer3.4, "true|false", ignore.case = TRUE)
+ })
+ answer_hash <- digest(tolower(answer3.4))
+ if (answer_hash == "d2a90307aac5ae8d0ef58e2fe730d38b") {
+ print("Take a look at the mapping for the different layers of the plot")
+ }
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "05ca18b596514af73f6880309a21b5dd")
+ })
+ print("Success!")
+# Grader for Question 3.5 (true/false).
+# Checks that `answer3.5` exists and contains "true" or "false" (any case),
+# prints a hint when the hash matches one specific wrong answer, then
+# compares the hash of the lower-cased answer with the expected one.
+test_3.5 <- function() {
+ test_that('Did not assign answer to an object called "answer3.5"', {
+ expect_true(exists("answer3.5"))
+ })
+ test_that('Solution should be "true" or "false"', {
+ expect_match(answer3.5, "true|false", ignore.case = TRUE)
+ })
+ answer_hash <- digest(tolower(answer3.5))
+ if (answer_hash == "05ca18b596514af73f6880309a21b5dd") {
+ print("Compare the ranges of the sample ID column")
+ }
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "d2a90307aac5ae8d0ef58e2fe730d38b")
+ })
+ print("Success!")
+# Grader for Question 3.6 (true/false).
+# Checks that `answer3.6` exists and contains "true" or "false" (any case),
+# prints a hint when the hash matches one specific wrong answer, then
+# compares the hash of the lower-cased answer with the expected one.
+# NOTE(review): the hint text is identical to the one in test_3.5 - confirm
+# it applies to this question as well.
+test_3.6 <- function() {
+ test_that('Did not assign answer to an object called "answer3.6"', {
+ expect_true(exists("answer3.6"))
+ })
+ test_that('Solution should be "true" or "false"', {
+ expect_match(answer3.6, "true|false", ignore.case = TRUE)
+ })
+ answer_hash <- digest(tolower(answer3.6))
+ if (answer_hash == "05ca18b596514af73f6880309a21b5dd") {
+ print("Compare the ranges of the sample ID column")
+ }
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "d2a90307aac5ae8d0ef58e2fe730d38b")
+ })
+ print("Success!")
+# Grader for Question 3.7.
+# Checks that `sampling_dist_estimate` exists, is a data frame with exactly
+# the columns `replicate` and `bootstrap_mean`, has the expected number of
+# rows, and that the hashed sum of `bootstrap_mean` matches the expected
+# value.
+test_3.7 <- function() {
+ test_that('Did not assign answer to an object called "sampling_dist_estimate"', {
+ expect_true(exists("sampling_dist_estimate"))
+ })
+ test_that("Solution should be a data frame", {
+ expect_true("data.frame" %in% class(sampling_dist_estimate))
+ })
+ expected_colnames <- c("replicate", "bootstrap_mean")
+ given_colnames <- colnames(sampling_dist_estimate)
+ test_that("Data frame does not have the correct columns", {
+ # Symmetric difference of the two name sets must be empty.
+ expect_equal(length(setdiff(
+ union(expected_colnames, given_colnames),
+ intersect(expected_colnames, given_colnames)
+ )), 0)
+ })
+ test_that("Data frame does not contain the correct number of rows", {
+ expect_equal(digest(as.integer(nrow(sampling_dist_estimate))), "b6a6227038bf9be67533a45a6511cc7e")
+ })
+ test_that("Data frame does not contain the correct data", {
+ # NOTE(review): this exact expected hash is reused for unrelated sums in
+ # test_2.6, test_3.1, test_3.3, test_3.9 and test_3.10, which suggests
+ # as.integer(sum * 1e6) overflows to NA here and the check is vacuous -
+ # confirm and regenerate with a smaller multiplier if so.
+ expect_equal(digest(as.integer(sum(sampling_dist_estimate$bootstrap_mean) * 1000000)), "c7f66da1cae4f223b9bae717f05900f7")
+ })
+ print("Success!")
+# Grader for Question 3.8.
+# Checks that `intervals` exists, is a data frame with exactly the columns
+# `sample_id`, `ci_lower` and `ci_upper`, has the expected number of rows,
+# and that the sums of both bounds hash to the expected values.
+test_3.8 <- function() {
+ test_that('Did not assign answer to an object called "intervals"', {
+ expect_true(exists("intervals"))
+ })
+ test_that("Solution should be a data frame", {
+ expect_true("data.frame" %in% class(intervals))
+ })
+ expected_colnames <- c("sample_id", "ci_lower", "ci_upper")
+ given_colnames <- colnames(intervals)
+ test_that("Data frame does not have the correct columns", {
+ # Symmetric difference of the two name sets must be empty.
+ expect_equal(length(setdiff(
+ union(expected_colnames, given_colnames),
+ intersect(expected_colnames, given_colnames)
+ )), 0)
+ })
+ test_that("Data frame does not contain the correct number of rows", {
+ expect_equal(digest(as.integer(nrow(intervals))), "5d6e7fe43b3b73e5fd2961d5162486fa")
+ })
+ test_that("Data frame does not contain the correct data", {
+ # The sums are scaled by 1e5 here, not the 1e6 used by most other tests
+ # in this file.
+ expect_equal(digest(as.integer(sum(intervals$ci_lower) * 100000)), "475863f7b59a7ac90ba822dc3f1e28f3")
+ expect_equal(digest(as.integer(sum(intervals$ci_upper) * 100000)), "f487416aca9e23b69bc7dee15ed812ca")
+ })
+ print("Success!")
+# Grader for Question 3.9.
+# Checks that `intervals_captured` exists, is a data frame with exactly the
+# columns `sample_id`, `ci_lower`, `ci_upper` and `captured`, has the
+# expected number of rows, and that the hashed column sums match the
+# expected values.
+test_3.9 <- function() {
+ test_that('Did not assign answer to an object called "intervals_captured"', {
+ expect_true(exists("intervals_captured"))
+ })
+ test_that("Solution should be a data frame", {
+ expect_true("data.frame" %in% class(intervals_captured))
+ })
+ expected_colnames <- c("sample_id", "ci_lower", "ci_upper", "captured")
+ given_colnames <- colnames(intervals_captured)
+ test_that("Data frame does not have the correct columns", {
+ # Symmetric difference of the two name sets must be empty.
+ expect_equal(length(setdiff(
+ union(expected_colnames, given_colnames),
+ intersect(expected_colnames, given_colnames)
+ )), 0)
+ })
+ test_that("Data frame does not contain the correct number of rows", {
+ expect_equal(digest(as.integer(nrow(intervals_captured))), "5d6e7fe43b3b73e5fd2961d5162486fa")
+ })
+ test_that("Data frame does not contain the correct data", {
+ expect_equal(digest(as.integer(sum(intervals_captured$ci_lower) * 1000000)), "5e2e6071008dd2bd800537489b3a59c5")
+ # NOTE(review): the ci_upper hash below is reused for unrelated sums in
+ # test_2.6, test_3.1, test_3.3 and test_3.7, which suggests
+ # as.integer(sum * 1e6) overflows to NA here and the check is vacuous
+ # (test_3.8 scales the same kind of sum by 1e5 instead) - confirm.
+ expect_equal(digest(as.integer(sum(intervals_captured$ci_upper) * 1000000)), "c7f66da1cae4f223b9bae717f05900f7")
+ expect_equal(digest(as.integer(sum(intervals_captured$captured))), "e444a32cd8c806b12b8baff9696a342f")
+ })
+ print("Success!")
+# Grader for Question 3.10: validates the ggplot object `many_ci_plot`.
+# Checks, in order: the object exists; x is mapped to ci_lower and xend to
+# ci_upper; the plot's data matches the expected digest() hashes; the x-axis
+# label is not the raw column name; and the plot has a title label.
+test_3.10 <- function() {
+ test_that('Did not assign answer to an object called "many_ci_plot"', {
+ expect_true(exists("many_ci_plot"))
+ })
+ # Layer-level aesthetics are listed first, so `$x` / `$xend` resolve to the
+ # first layer's mapping when present and fall back to the plot-level mapping.
+ properties <- c(many_ci_plot$layers[[1]]$mapping, many_ci_plot$mapping)
+ test_that("Lower boundary of x-axis segments should be ci_lower", {
+ expect_true("ci_lower" == rlang::get_expr(properties$x))
+ })
+ test_that("Upper boundary of x-axis segments should be ci_upper", {
+ expect_true("ci_upper" == rlang::get_expr(properties$xend))
+ })
+ test_that("Plot does not use the correct data", {
+ # Sums are scaled by 1e6 and truncated by as.integer() before hashing.
+ expect_equal(digest(as.integer(sum(many_ci_plot$data$ci_lower) * 1000000)), "5e2e6071008dd2bd800537489b3a59c5")
+ expect_equal(digest(as.integer(sum(many_ci_plot$data$ci_upper) * 1000000)), "c7f66da1cae4f223b9bae717f05900f7")
+ expect_equal(digest(as.integer(sum(many_ci_plot$data$captured))), "e444a32cd8c806b12b8baff9696a342f")
+ })
+ test_that("x-axis label should be descriptive and human readable", {
+ # Must inspect the plot under test; this check previously read `ci_plot`,
+ # a different object than the one validated by every other check here.
+ expect_false(many_ci_plot$labels$x == "ci_lower")
+ })
+ test_that("Plot should have a title", {
+ expect_true("title" %in% names(many_ci_plot$labels))
+ })
+ print("Success!")
+test_3.11 <- function() { # Grader for Question 3.11: validates the numeric answer stored in `answer3.11`.
+ test_that('Did not assign answer to an object called "answer3.11"', {
+ expect_true(exists("answer3.11"))
+ })
+ answer_as_numeric <- as.numeric(answer3.11) # coerce so a numeric string is accepted too; non-numeric text becomes NA
+ test_that("Solution should be a number", {
+ expect_false(is.na(answer_as_numeric))
+ })
+ test_that("Solution should be an integer", {
+ expect_true(answer_as_numeric %% 1 == 0) # whole-number check: no fractional part
+ })
+ test_that("Solution is incorrect", {
+ expect_equal(digest(as.integer(answer_as_numeric)), "e444a32cd8c806b12b8baff9696a342f") # same hash string that test_3.9 / test_3.10 expect for sum(captured)
+ })
+ print("Success!")
+test_3.12 <- function() { # Grader for Question 3.12: validates the numeric answer stored in `answer3.12`.
+ test_that('Did not assign answer to an object called "answer3.12"', {
+ expect_true(exists("answer3.12"))
+ })
+ answer_as_numeric <- as.numeric(answer3.12) # coerce so a numeric string is accepted too; non-numeric text becomes NA
+ test_that("Solution should be a number", {
+ expect_false(is.na(answer_as_numeric))
+ })
+ test_that("Solution should be an integer", {
+ expect_true(answer_as_numeric %% 1 == 0) # whole-number check: no fractional part
+ })
+ test_that("Solution is incorrect", {
+ expect_equal(digest(as.integer(answer_as_numeric)), "5ea3aa881cac20eac898460bc769efae") # answer is compared via the digest() hash of its integer value
+ })
+ print("Success!")
+test_3.13 <- function() { # Grader for Question 3.13: validates the true/false answer stored in `answer3.13`.
+ test_that('Did not assign answer to an object called "answer3.13"', {
+ expect_true(exists("answer3.13"))
+ })
+ test_that('Solution should be "true" or "false"', {
+ expect_match(answer3.13, "true|false", ignore.case = TRUE) # pattern is unanchored: any string containing "true" or "false" passes this format check
+ })
+ answer_hash <- digest(tolower(answer3.13)) # lower-cased before hashing, so letter case does not matter
+ if (answer_hash == "d2a90307aac5ae8d0ef58e2fe730d38b") { # this hash differs from the expected one below, so the hint is shown for that specific wrong answer
+ print("These are two common interpretations of a confidence interval. HOWEVER, note that we DO NOT say 'The interval has a 90% CHANCE OF CAPTURING THE TRUE PARAMETER'.")
+ }
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "05ca18b596514af73f6880309a21b5dd")
+ })
+ print("Success!")
+# Grader for Question 4.0: validates the true/false answer stored in
+# `answer4.0`. Checks that the object exists and contains "true" or "false"
+# (case-insensitive), prints a hint for one specific wrong answer, then
+# compares the digest() hash of the lower-cased answer to the expected hash.
+test_4.0 <- function() {
+ test_that('Did not assign answer to an object called "answer4.0"', {
+ expect_true(exists("answer4.0"))
+ })
+ test_that('Solution should be "true" or "false"', {
+ expect_match(answer4.0, "true|false", ignore.case = TRUE)
+ })
+ answer_hash <- digest(tolower(answer4.0))
+ # This hash differs from the expected one below, so the hint is only shown
+ # for that particular incorrect answer.
+ if (answer_hash == "05ca18b596514af73f6880309a21b5dd") {
+ print("This is a common interpretation of confidence intervals. Once an interval is calculated, it either captures or does not capture the true parameter (i.e., the probability of the interval capturing the true parameter is either 0 or 1)")
+ }
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "d2a90307aac5ae8d0ef58e2fe730d38b")
+ })
+ print("Success!")
+test_4.1 <- function() { # Grader for Question 4.1: validates the multiple-choice answer stored in `answer4.1`.
+ test_that('Did not assign answer to an object called "answer4.1"', {
+ expect_true(exists("answer4.1"))
+ })
+ test_that('Solution should be a single character ("A", "B", "C", "D", "E", "F", "G", "H", "I")', {
+ expect_match(answer4.1, "a|b|c|d|e|f|g|h|i", ignore.case = TRUE) # NOTE(review): pattern is unanchored, so any string containing one of these letters passes; only the hash check below pins the exact answer
+ })
+ test_that("Solution is incorrect", {
+ expect_equal(digest(tolower(answer4.1)), "fe98eba4312fd761affde1df9b1b51ea") # lower-cased before hashing, so letter case does not matter
+ })
+ print("Success!")
+test_4.2 <- function() { # Grader for Question 4.2: validates the true/false answer stored in `answer4.2`.
+ test_that('Did not assign answer to an object called "answer4.2"', {
+ expect_true(exists("answer4.2"))
+ })
+ test_that('Solution should be "true" or "false"', {
+ expect_match(answer4.2, "true|false", ignore.case = TRUE) # pattern is unanchored: any string containing "true" or "false" passes this format check
+ })
+ answer_hash <- digest(tolower(answer4.2)) # lower-cased before hashing, so letter case does not matter
+ if (answer_hash == "d2a90307aac5ae8d0ef58e2fe730d38b") { # this hash differs from the expected one below, so the hint is shown for that specific wrong answer
+ print("So long as the proportion of the bootstrap distribution that falls between the two quantiles is equal to the confidence level of the interval, the confidence interval is still valid and we can interpret it the same way!")
+ }
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "05ca18b596514af73f6880309a21b5dd")
+ })
+ print("Success!")
+test_4.3 <- function() { # Grader for Question 4.3: validates the multiple-choice answer stored in `answer4.3`.
+ test_that('Did not assign answer to an object called "answer4.3"', {
+ expect_true(exists("answer4.3"))
+ })
+ test_that('Solution should be a single character ("A", "B", "C", or "D")', {
+ expect_match(answer4.3, "a|b|c|d", ignore.case = TRUE) # NOTE(review): pattern is unanchored, so any string containing one of these letters passes; only the hash check below pins the exact answer
+ })
+ answer_hash <- digest(tolower(answer4.3)) # lower-cased before hashing, so letter case does not matter
+ if (answer_hash == "127a2ec00989b9f7faf671ed470be7f8") { # first hint: shown for one specific hash that is not the expected one
+ print("Is this more likely than the other potential confidence level?")
+ } else if (answer_hash == "ddf100612805359cd81fdc5ce3b9fbba" | answer_hash == "6e7a8c1c098e8817e3df3fd1b21149d1") { # second hint: shared by two other non-expected hashes
+ print("Think about how the confidence level relates to the proportion of intervals that would capture the true parameter if you calculated many, many confidence intervals.")
+ }
+ test_that("Solution is incorrect", {
+ expect_equal(answer_hash, "d110f00cfb1b248e835137025804a23b")
+ })
+ print("Success!")
diff --git a/materials/worksheet_04/worksheet_04.ipynb b/materials/worksheet_04/worksheet_04.ipynb
new file mode 100644
index 0000000..ad081d9
--- /dev/null
+++ b/materials/worksheet_04/worksheet_04.ipynb
@@ -0,0 +1,2903 @@
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "183a2e078152dd161a30387b5a0e5db4",
+ "grade": false,
+ "grade_id": "cell-c3be035a676e3d8f",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "# Worksheet 4: Confidence Intervals via Bootstrapping"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "97171ab5adfcd750c67c4bb92da174c2",
+ "grade": false,
+ "grade_id": "cell-e74e47156fd30ffe",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "#### Lecture and Tutorial Learning Goals:\n",
+ "From this section, students are expected to be able to:\n",
+ "\n",
+ "1. Define what a confidence interval is and why we want to generate one.\n",
+ "2. Explain how the bootstrap sampling distribution can be used to create confidence intervals.\n",
+ "3. Write a computer script to calculate confidence intervals for a population parameter using bootstrapping.\n",
+ "4. Effectively visualize point estimates and confidence intervals.\n",
+ "5. Interpret and explain results from confidence intervals.\n",
+ "6. Discuss the potential limitations of these methods."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "1472cf43296097795109b54543e4eee0",
+ "grade": false,
+ "grade_id": "cell-e3dd9d74ab20965b",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Run this cell before continuing.\n",
+ "library(cowplot)\n",
+ "library(datateachr)\n",
+ "library(infer)\n",
+ "library(repr)\n",
+ "library(tidyverse)\n",
+ "source(\"tests_worksheet_04.R\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "f7e7f33d097396294395d7212ef07a16",
+ "grade": false,
+ "grade_id": "cell-04657c2d7e49dc5d",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 1. Short Recap & Warm-Up\n",
+ "\n",
+ "Before we start exploring the new material for this week, let's remind ourselves of some of the most important points that we covered in the previous week by answering a couple of questions."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "e84f69315a3126d27024d3117f6632c3",
+ "grade": false,
+ "grade_id": "cell-277aef32f735ac0e",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.0**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Consider the following statements pertaining to some concepts that we have learned about so far: \n",
+ "\n",
+ "---\n",
+ "\n",
+ "- From a given sample, we obtain a point estimate that estimates a (1) `...`.\n",
+ "\n",
+ "- (2) `...` is a function of (i.e., depends on) the sample, and its standard deviation is called the standard error.\n",
+ "\n",
+ "- From bootstrap samples, we obtain a bootstrap distribution, which (3) `...` the sampling distribution.\n",
+ "\n",
+ "- The (4) `...` of the bootstrap distribution estimates the (5)`...` of a statistic.\n",
+ "\n",
+ "---\n",
+ "\n",
+ "Notice that some terms are missing, as indicated by the five blanks (`...`). Your job is to match the following terms with their appropriate locations in the statements above:\n",
+ "\n",
+ "A. Standard deviation\n",
+ "\n",
+ "B. Standard error\n",
+ "\n",
+ "C. Estimator\n",
+ "\n",
+ "D. Estimates\n",
+ "\n",
+ "E. Parameter\n",
+ "\n",
+ "Your answer should be a string containing the letters associated with the terms **in the same order as they would appear above:** `(1)(2)(3)(4)(5)`. Each letter must be used exactly once. For example, one potential solution is `\"ABCDE\"`.\n",
+ "\n",
+ "_Assign your answer to an object called `answer1.0`. Your answer should be a string containing the letters \"A\", \"B\", \"C\", \"D\", and \"E\" in any order._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "7d68b56e6852681b46c0bef0ca97cae9",
+ "grade": false,
+ "grade_id": "cell-1727cffb8fad502f",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.0 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "1861854a0a5451c2ac811312d881821e",
+ "grade": true,
+ "grade_id": "cell-44068c9ca1db48aa",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.0()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "45ef2eabad40d15b0c17912520d44a88",
+ "grade": false,
+ "grade_id": "cell-eea64fe8856b291f",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 1.1**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Which statement below is **not** true?\n",
+ "\n",
+ "A. Given a sample of size 100, we could get a useful estimate of the sampling distribution for samples of size 100 by taking many bootstrap samples of size 70 from the original sample.\n",
+ "\n",
+ "B. As sample size increases, the standard deviation of the bootstrap distribution resulting from a given sample generally decreases.\n",
+ "\n",
+ "C. The standard deviation of a bootstrap distribution can provide a reliable estimate of the standard error of an estimator, even if the estimator's distribution is asymmetrical.\n",
+ "\n",
+ "D. The procedures for drawing a normal sample and a bootstrap sample are very similar, with the exception of where we sample from, and the type of sampling (with vs. without replacement).\n",
+ "\n",
+ "_Assign your answer to an object called `answer1.1`. Your answer should be a single character surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "2fe9c30eec4bb5eaf011a71050add8c2",
+ "grade": false,
+ "grade_id": "cell-6e7c99c9098e9e01",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer1.1 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "6215c6106144eca0b5a679ae057a300f",
+ "grade": true,
+ "grade_id": "cell-0e9dc9ae6c2a29f4",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_1.1()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "7106f48195e0b39444f688ef4640e15f",
+ "grade": false,
+ "grade_id": "cell-0ba462b13e472a93",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 2. Introduction to Confidence Intervals\n",
+ "\n",
+ "So far, we have spent a lot of time understanding sampling distributions as well as one method used to estimate them (bootstrapping). But what's the point of sampling distributions in reality? We know that it tells us about **sampling variation**, which we quantify as the **standard error** of an estimator (the standard deviation of the estimator's distribution). But how are sampling distributions used in practice?\n",
+ "> We use a sampling distribution to give a range of plausible values for a population parameter. This range of values is known as a **confidence interval**.\n",
+ "\n",
+ "In this section, we'll present the basic idea behind confidence intervals and the formal concepts that are used to define them."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "58a980684fe89d1059aa5ba425d5e883",
+ "grade": false,
+ "grade_id": "cell-509ae1c00179cdf6",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "### Vancouver Street Trees\n",
+ "Because we are introducing something brand new, let's revisit a dataset that we are familiar with: `vancouver_trees`. This time, we'll consider our population to be all of the public trees planted in Sunset. Our parameter of interest will be the mean of the `diameter` variable. In the cell below we have filtered the data frame for the population of interest and selected the variable that we need. We have also converted the diameter column from inches to centimetres."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "e0224a5939c6f3f1d7c342af24004229",
+ "grade": false,
+ "grade_id": "cell-acaf959d1d48ef8b",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Run this cell before continuing.\n",
+ "sunset_pop <- \n",
+ " vancouver_trees %>% \n",
+ " filter(neighbourhood_name == \"SUNSET\") %>% \n",
+ " select(diameter) %>% \n",
+ " mutate(diameter = diameter * 2.54)\n",
+ "\n",
+ "head(sunset_pop)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "747ecae948dd00caf20056ace5423ec4",
+ "grade": false,
+ "grade_id": "cell-04d92e2140412282",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.0** \n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Calculate the mean of the population of interest.\n",
+ "\n",
+ "_Assign your answer to an object called `pop_mean`. Your answer should be a single number._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "5870082f29c7af2bc0a07ad09b9d480d",
+ "grade": false,
+ "grade_id": "cell-9316dff880f18da5",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "pop_mean"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "196684b82d6ee223b1e0cc56de6c899b",
+ "grade": true,
+ "grade_id": "cell-21c011af0ae860c8",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.0()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "ce5c633f0e833e0ee1ee9c11b150657a",
+ "grade": false,
+ "grade_id": "cell-668169d4e9139f50",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "In the cell below, we run code that generates 10 different bootstrap distributions. Each distribution estimates the sampling distribution of the mean of `sunset_pop` for samples of size 24. Hence, each bootstrap distribution was generated by taking a single sample of size 24 from the population `sunset_pop`, taking many resamples from this sample, and calculating the mean of each resample. We can consider the bootstrap distributions to depict some plausible values for the population parameter of interest, based on the original sample that was used to generate them.\n",
+ "\n",
+ "_Use the 10 bootstrap distributions below to answer the **next question**._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "bffc7c9538a99849fc74c7c746436d66",
+ "grade": false,
+ "grade_id": "cell-1e4480115f4769d7",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Run this cell before continuing.\n",
+ "options(repr.plot.width = 20, repr.plot.height = 8)\n",
+ "source(\"bootstrap_dists_worksheet_04.R\")\n",
+ "print(bootstrap_dists)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "b3e0527a37de3fba1e6d57c7d6aa847e",
+ "grade": false,
+ "grade_id": "cell-c93453feb55827dc",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.1**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "How many of the bootstrap distributions **reasonably \"cover\"** the true mean of the population?\n",
+ "\n",
+ "**Hint:** there may be more than one correct answer.\n",
+ "\n",
+ "_Assign your answer to an object called `answer2.1`. Your answer should be a single integer._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "7cfab1f6ac89fe05a6b9fe1642b35981",
+ "grade": false,
+ "grade_id": "cell-b718f74c7ad73204",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer2.1 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "9e22ed2ce555b908ef76b2c1861d5ba3",
+ "grade": true,
+ "grade_id": "cell-cf6d3812af64bde2",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.1()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "b9294ec77530d4272a3fa3cb0f89136e",
+ "grade": false,
+ "grade_id": "cell-08219e12485b6dbd",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "### Quantiles\n",
+ "Is there a clear answer to the previous question? Not really... The meaning of \"reasonably covers\" may differ from person to person; where would you draw the line between \"covers\" and \"does not cover\"? To avoid relying on our own intuition, we need a more rigorous way to say whether or not a bootstrap distribution \"covers\" the parameter of interest. To do this, we will use **quantiles**. The general definition of quantiles that we will be using in this course is as follows:\n",
+ "> The $p$th quantile is the value in a data set such that a proportion $p$ of the observations fall below it and a proportion $1 - p$ fall above it.\n",
+ "\n",
+ "The $p$th quantile is also commonly referred to as the $(p \\times 100)$th **percentile**. For example, the 0.025th quantile is also referred to as the 2.5th percentile. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "c74f4cddedf55e5eebe967cbf1c87381",
+ "grade": false,
+ "grade_id": "cell-5a8d653eb31a90a3",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.2**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Take a single sample of size 150 from `sunset_pop` using `rep_sample_n`. Be sure to `ungroup` and select only the variable that we are interested in (`diameter`). Use the scaffolding provided below as a guide:\n",
+ "\n",
+ "```r\n",
+ "sample_1 <- \n",
+ " ... %>% \n",
+ " rep_sample_n(reps = ..., size = ..., replace = ...) %>% \n",
+ " ungroup() %>% \n",
+ " select(...)\n",
+ "```\n",
+ "\n",
+ "_Assign your data frame to an object called `sample_1`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "1fc531e7d0de9f119023504a0ce1ec45",
+ "grade": false,
+ "grade_id": "cell-631427c9071eaba9",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(8622) # DO NOT CHANGE THIS!\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "head(sample_1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "c4a1124b5a515db2d2e351ccd13a6d78",
+ "grade": true,
+ "grade_id": "cell-a081d7650da43c44",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.2()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "596548b20ff36900203aa0f0d3c94854",
+ "grade": false,
+ "grade_id": "cell-21b0d343665f8c2b",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.3**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Calculate the 0.75th quantile of `sample_1` using R's `quantile` function. Use the scaffolding provided below as a guide:\n",
+ "\n",
+ "```r\n",
+ "upper_quantile <- \n",
+ " sample_1 %>% \n",
+ " pull(...) %>% \n",
+ " quantile(...)\n",
+ "```\n",
+ "\n",
+ "**Hints:**\n",
+ "1. `quantile` takes a numeric vector for the first argument\n",
+ "2. You can use `pull` to get a single column from a data frame as a vector, for example, `pull(data_frame, column_name)`\n",
+ "\n",
+ "_Assign your answer to an object called `upper_quantile`. Your answer should be a single number._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "1cfe81118b002bfd513511024952058a",
+ "grade": false,
+ "grade_id": "cell-cacf4f1c9c1e26dc",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "upper_quantile"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "515f58bd3fac17448778d18d174d2edf",
+ "grade": true,
+ "grade_id": "cell-8b38703a2532492b",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.3()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "40031598c204784c11e54a3aa7011a4d",
+ "grade": false,
+ "grade_id": "cell-1decc52a2880a0bf",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.4**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "One correct interpretation of the value of `upper_quantile` is: approximately 25% of observations in the sample `sample_1` lie above the value of `upper_quantile`.\n",
+ "\n",
+ "_Assign your answer to an object called `answer2.4`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "cd655f0bca6b8f1877dbf65bda9a25ce",
+ "grade": false,
+ "grade_id": "cell-43b8be334725790e",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer2.4 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "8736c07cf06fa7e47cfa0f7bb6ff87e5",
+ "grade": true,
+ "grade_id": "cell-e04d21251d8de8fb",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.4()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "3f1cddce2df0ce3a11e0fd195bd1e297",
+ "grade": false,
+ "grade_id": "cell-bffd96979f8468c2",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.5**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Calculate the 0.25th quantile of `sample_1` using R's `quantile` function.\n",
+ "\n",
+ "_Assign your answer to an object called `lower_quantile`. Your answer should be a single number._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "814bbce0a89c4234521912aca790e781",
+ "grade": false,
+ "grade_id": "cell-80552f5495dd76e7",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "lower_quantile"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "78ac2dc569ec290d5ae87331ed8efd30",
+ "grade": true,
+ "grade_id": "cell-00effe9dd34235fa",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.5()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "c7d218d5a27bc7de70941552bb6a8f27",
+ "grade": false,
+ "grade_id": "cell-6509ea4d827fc45e",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.6**\n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Plot the distribution of the sample `sample_1` using `geom_histogram` with bin widths of 2. Additionally, add vertical lines on top of the histogram at the location of the 0.25th and 0.75th quantiles using `geom_vline`. Ensure your plot has descriptive, human-readable labels with units and a title. Use the scaffolding provided below as a guide:\n",
+ "\n",
+ "```r\n",
+ "sample_quantile_plot <- \n",
+ " ... %>% \n",
+ " ggplot(aes(x = ...)) +\n",
+ " theme_bw() +\n",
+ " ...(binwidth = ..., color = 'white') +\n",
+ " geom_vline(... = lower_quantile, colour = \"red\", size = 1) +\n",
+ " ...(xintercept = upper_quantile, colour = \"red\", size = 1) +\n",
+ " labs(title = \"Sample Distribution (150 observations from population)\",\n",
+ " x = ...) + \n",
+ " theme(text = element_text(size = 20))\n",
+ "\n",
+ "```\n",
+ "\n",
+ "_Assign your plot to an object called `sample_quantile_plot`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "909345903fd70ea9f975184b2a63de52",
+ "grade": false,
+ "grade_id": "cell-f733caa7b3918b85",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "options(repr.plot.width = 10, repr.plot.height = 6)\n",
+ "sample_quantile_plot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "759eae3254135d1caba084aaa23981d6",
+ "grade": true,
+ "grade_id": "cell-4d14bf1e59651b64",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.6()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "3bb61b1a51e28558b51a8efd9ac84884",
+ "grade": false,
+ "grade_id": "cell-d39d1428b752e6ef",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.7** \n",
+ "<br>{points: 1}\n",
+ "\n",
+ "Calculate the proportion of `sample_1` that falls **below** the 0.75th quantile you calculated earlier (`upper_quantile`). Use the scaffolding provided below as a guide:\n",
+ "\n",
+ "```r\n",
+ "p_below <-\n",
+ " sample_1 %>% \n",
+ " summarise(prop = mean(... < ...)) %>% \n",
+ " pull(...)\n",
+ "```\n",
+ "\n",
+ "_Assign your answer to an object called `p_below`. Your answer should be a single number._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "9d763dd1a7bfe17fdb5fca30f17f087e",
+ "grade": false,
+ "grade_id": "cell-d881d2aa312309a9",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "p_below"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "7853b57e1f5ecbb3a2d35d7855f3ae09",
+ "grade": true,
+ "grade_id": "cell-6f42ef47e030eb3c",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.7()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "ffdaacf49cc21620b726d5a35467de00",
+ "grade": false,
+ "grade_id": "cell-4182c2bf8c74aae5",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.8** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Calculate the proportion of `sample_1` that falls **between** the 0.25th and 0.75th quantiles that you calculated earlier (`lower_quantile` and `upper_quantile`, respectively).\n",
+ "\n",
+ "**Hint:** one method of calculating the answer to this question is very similar to the method used in the previous question.\n",
+ "\n",
+ "_Assign your answer to an object called `p_between`. Your answer should be a single number._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "4336cecfbda7a3986b175a4bb30a1e00",
+ "grade": false,
+ "grade_id": "cell-8eec876afcff3471",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "p_between"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "f16daa65c3b1ad236e036305a75d9d9a",
+ "grade": true,
+ "grade_id": "cell-ba5a902a9f7c2d70",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.8()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "ca6dee0ccb9c06f4c732644669ca2dbe",
+ "grade": false,
+ "grade_id": "cell-2f242a5fcf503103",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.9**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Assume that we are certain that the sample `sample_1` is unbiased and representative of the population (i.e. we can consider it to be a good estimate of the population). Which of the following interpretations **of the interval** (`lower_quantile`, `upper_quantile`) between the two quantiles of the sample distribution is **incorrect**?\n",
+ "\n",
+ "A. If we picked a random observation from the population, we can estimate that there is approximately a 50% chance that its diameter would fall within the interval (`lower_quantile`, `upper_quantile`).\n",
+ "\n",
+ "B. Approximately 25% of the diameters of the observations in the sample fall above the value of `upper_quantile`.\n",
+ "\n",
+ "C. If we drew another random sample and randomly picked an observation, there is approximately a 75% chance that its diameter would fall above the value of `lower_quantile`.\n",
+ "\n",
+ "D. We can estimate that approximately 25% of the diameters of the observations in the population fall below the value of `lower_quantile`.\n",
+ "\n",
+ "_Assign your answer to an object called `answer2.9`. Your answer should be a single character surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "915bd5f052f829e1a0ad49c7a2dc330c",
+ "grade": false,
+ "grade_id": "cell-15bf5fcc9a1caf7a",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer2.9 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "010b2b359de650775499b6d8921a79ab",
+ "grade": true,
+ "grade_id": "cell-05e987f5d1c2ac77",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.9()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "697fef9b4320c8b04b90bf99cb57324c",
+ "grade": false,
+ "grade_id": "cell-9f27e8a9508261b2",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 2.10**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Like the previous question, assume that we are certain that the sample `sample_1` is unbiased and representative of the population (i.e. we can consider it to be a good estimate of the population). However, suppose that you calculated the 0.1th quantile for `lower_quantile` and the 0.6th quantile for `upper_quantile` for the sample, instead of the 0.25th and 0.75th quantiles as you did previously.\n",
+ "\n",
+ "Given the above scenario, would the correctness of the following interpretation of the interval (`lower_quantile`, `upper_quantile`) remain **unchanged**?\n",
+ "\n",
+ "> If we picked a random observation from the population, we can estimate that there is approximately a 50% chance that it would fall within the interval (`lower_quantile`, `upper_quantile`).\n",
+ "\n",
+ "_Assign your answer to an object called `answer2.10`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "c917edf8c3bc1325b83d95bbc8feea8e",
+ "grade": false,
+ "grade_id": "cell-7cb07710ff20a77a",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer2.10 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "bbc07b9bed4a9074781864d35f23cc38",
+ "grade": true,
+ "grade_id": "cell-7c17acaaaaafaf03",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_2.10()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "181316faa3b135e913817670ff954118",
+ "grade": false,
+ "grade_id": "cell-d5e11d0ae75cba1a",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 3. Formalizing Confidence Intervals\n",
+ "\n",
+ "Now that we understand quantiles and how we can interpret them, we can rephrase the vague question of\n",
+ "> How many of the bootstrap distributions **reasonably \"cover\"** the true mean of the population?\n",
+ "\n",
+ "that we encountered at the start of the previous section. Specifically, we could ask something like\n",
+ "> Does the interval between the 0.1th and 0.9th quantiles capture the true mean of the population?\n",
+ "\n",
+ "instead, where the answer does not rely on one's intuition. That interval referenced in the question above is called a **confidence interval**, and in particular, it is called an **80% confidence interval**. One should think of a confidence interval as a **range of plausible** values for the population parameter, which may or may not fall within the interval. This is significantly different than a point estimate, which is a **single plausible value** for the population parameter. A popular analogy used to compare point estimates to confidence intervals is fishing with a spear vs. fishing with a net, respectively.\n",
+ "\n",
+ "
+ "\n",
+ "![](https://d33wubrfki0l68.cloudfront.net/45f6d2e16255dbcb42de86336e1e49ef732aa5da/8bcd0/images/shutterstock/point_estimate_vs_conf_int.png)\n",
+ "
+ "With a spear, we are aiming for a particular fish (a single value) and hoping to catch it, whereas with a net, we are aiming for a larger area in space (a range of plausible values) and hoping that the fish is captured in the net."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "d9f2dd2455331b347dcae4c95ae58873",
+ "grade": false,
+ "grade_id": "cell-f79e0d2285a3d223",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "### Calculating & Visualizing a Single 90% Confidence Interval Using Bootstrapping\n",
+ "To get a clearer picture of how we can use quantiles and a bootstrap sampling distribution to calculate a confidence interval, let's calculate one from scratch. Specifically, we'll calculate a single 90% confidence interval for the mean of the `diameter` variable for the population `sunset_pop`. Afterwards, we'll visualize it and compare it to the true mean.\n",
+ "> As usual, it is important to note that we do not usually have access to the population parameter of interest; many of these questions are purely for your understanding of confidence intervals. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "7532b3a418b00de62097fa93b4ee5e13",
+ "grade": false,
+ "grade_id": "cell-e79671482c0c74aa",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.0** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Take a single random sample of size 30 from `sunset_pop` using `rep_sample_n` and a seed of 0120. Ensure the resulting data frame only has a single column: `diameter`. Use the scaffolding provided below as a guide:\n",
+ "\n",
+ "```r\n",
+ "sample_2 <- \n",
+ " sunset_pop %>% \n",
+ " rep_sample_n(reps = ..., size = ..., replace = ...) %>% \n",
+ " ungroup() %>% \n",
+ " select(...)\n",
+ "```\n",
+ "\n",
+ "**Note:** don't forget that `rep_sample_n` returns a grouped data frame.\n",
+ "\n",
+ "_Assign your data frame to an object called `sample_2`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "818a2468d1a6640170310f2bcd48f1f6",
+ "grade": false,
+ "grade_id": "cell-9662bf3faea4b384",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(0120) # DO NOT CHANGE!\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(sample_2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "019fd32a7836b161864d6cc98fdaf35c",
+ "grade": true,
+ "grade_id": "cell-655ca510c6b77ddd",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.0()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "aeb41e8fae2865744e13df91c47ef1fd",
+ "grade": false,
+ "grade_id": "cell-c91006d0be04f2ed",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.1** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Generate a bootstrap distribution of sample means from `sample_2` by re-sampling with replacement 1000 times using `rep_sample_n` and a seed of 5127. Then, calculate the mean of each bootstrap sample. Name the new column containing the bootstrap sample means `mean_diameter`, and select only that column. Use the scaffolding provided below as a guide:\n",
+ "```r\n",
+ "bootstrap_dist <- \n",
+ " sample_2 %>% \n",
+ " rep_sample_n(reps = ..., size = ..., replace = ...) %>% \n",
+ " group_by(...) %>% \n",
+ " ...(mean_diameter = ...(diameter)) %>% \n",
+ " select(mean_diameter)\n",
+ "```\n",
+ "\n",
+ "_Assign your data frame to an object called `bootstrap_dist`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "85e98a5c8bf41a7d1ef9f273a24f1183",
+ "grade": false,
+ "grade_id": "cell-3210e119a4b45f52",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "set.seed(5127) # DO NOT CHANGE!\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(bootstrap_dist)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "52b56bb12c56ade4e1d8dd475c8cca3d",
+ "grade": true,
+ "grade_id": "cell-468f79ddac80be06",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.1()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "375b024ae7cddd5823f0714301e8d7cc",
+ "grade": false,
+ "grade_id": "cell-c687afc82623f277",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.2** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Calculate the 5th and 95th percentiles of `bootstrap_dist` using the `quantile` and `summarize` functions. Name the column containing the 5th percentile `ci_lower` and the column containing the 95th percentile `ci_upper`. Use the scaffolding provided below as a guide:\n",
+ "\n",
+ "```r\n",
+ "ci <- \n",
+ " bootstrap_dist %>% \n",
+ " ...(ci_lower = quantile(..., 0.05),\n",
+ " ci_upper = ...(mean_diameter, ...))\n",
+ "```\n",
+ "\n",
+ "_Assign your data frame to an object called `ci`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "7dd8bb470fd3982c2e16b2f06a91da0f",
+ "grade": false,
+ "grade_id": "cell-5c3ecec57f5961ed",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(ci)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "25424d0f6e5da845fb7b4b460f65d333",
+ "grade": true,
+ "grade_id": "cell-06d2c37656961ba4",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.2()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "8bd77c538396c6e7777ff3f10ed94d77",
+ "grade": false,
+ "grade_id": "cell-49ce2db61cd6187f",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.3**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Plot the confidence interval (represented by a translucent rectangle) and true mean of the population (represented by a vertical line, generated by `geom_vline`) over the bootstrap distribution `bootstrap_dist` (visualized as a histogram with bin widths of 1, generated by `geom_histogram`). Use the scaffolding provided below as a guide:\n",
+ "\n",
+ "```r\n",
+ "ci_plot <- \n",
+ " ... %>% \n",
+ " ggplot(aes(x = ...)) +\n",
+ " ...(binwidth = ..., colour = \"white\", fill = \"grey\") +\n",
+ " annotate(\"rect\", xmin = ci$ci_lower, xmax = ci$ci_upper, ymin = 0, ymax = Inf,\n",
+ " fill = \"deepskyblue\",\n",
+ " alpha = 0.3) +\n",
+ " geom_vline(xintercept = pop_mean,\n",
+ " size = 2,\n",
+ " colour = \"red\") +\n",
+ " labs(title = \"Bootstrap distribution with 90% confidence interval\",\n",
+ " x = \"Mean tree diameter (cm)\") +\n",
+ " theme_bw() # Sets a theme for better visibility\n",
+ "```\n",
+ "\n",
+ "**Note:** recall that you already calculated the true mean of the population earlier and saved it to an object named `pop_mean`. \n",
+ "\n",
+ "_Assign your plot to an object called `ci_plot`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "834a61d580660a32a40df0d326166168",
+ "grade": false,
+ "grade_id": "cell-eb29c1f5f79ebd0d",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "ci_plot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "8fec8121a401bd8b2eeb0fae41d722d8",
+ "grade": true,
+ "grade_id": "cell-f9895e50fbccccc1",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.3()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "d212480892614c224ecf26fa6fccfb4f",
+ "grade": false,
+ "grade_id": "cell-8f5235449d935656",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.4**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "The parameter of interest (the mean diameter of public street trees located in Sunset) is captured by the confidence interval that we calculated in **question 3.2** and visualized above.\n",
+ "\n",
+ "_Assign your answer to an object called `answer3.4`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "2b4f7394fbf078c68b8c6b128ba271f7",
+ "grade": false,
+ "grade_id": "cell-fc39e7ebca011d85",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer3.4 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "3d6764157a2a6f6432b73e27766d9f9a",
+ "grade": true,
+ "grade_id": "cell-c3b87542b1ddd91e",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.4()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "8b7072f87952b92da5c354cd8d193832",
+ "grade": false,
+ "grade_id": "cell-5045ad6c08aad93f",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "### Calculating & Visualizing Many 90% Confidence Intervals Using Bootstrapping\n",
+ "How can we interpret a confidence interval when we don't have access to the population parameter (i.e. in reality)? Does our interpretation change as the width of the interval changes? How could we choose the width in the first place? To answer these questions, we're going to produce 100 different 90% confidence intervals to see how they behave. The bulk of the work has already been done for you in the cell below. We took 100 samples of size 30 and used each one to produce an estimate of the sampling distribution using bootstrapping with 1000 repetitions, essentially repeating the procedure you followed at the start of this section 100 times over."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "643589ff9f3bfff4303cc0a7b19babf2",
+ "grade": false,
+ "grade_id": "cell-44364736f4a0c09a",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Run this cell before continuing.\n",
+ "sampling_dist_estimates <- read_csv(\"sampling_dist_estimates.csv\",show_col_types = FALSE)\n",
+ "bootstrap_samples <- read_csv(\"bootstrap_samples.csv\",show_col_types = FALSE)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "5387cae40cac9b0481d2baa4e96fd705",
+ "grade": false,
+ "grade_id": "cell-e4c194c8f89c4d32",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "However, here are a few questions to check your understanding of what went on behind the scenes.\n",
+ "\n",
+ "_Use the following information to answer the **next 3 questions**._\n",
+ "\n",
+ "Consider the following two data frames:\n",
+ "1. `sampling_dist_estimates`: contains 100 estimates of the sampling distribution as 100 different tibbles within the column `bootstrap_mean`. The `sample_id` column ranges from 1 to 100 (since we took 100 samples)\n",
+ "2. `bootstrap_samples`: a bootstrap data frame, where `sample_id == 1` for all rows"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "b7a3703a7bc5d5320a704e46ecf88d4d",
+ "grade": false,
+ "grade_id": "cell-050b99444ff299dd",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "print(sampling_dist_estimates)\n",
+ "print(bootstrap_samples)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "6a4453baf832eef9c73a943d7d599bc8",
+ "grade": false,
+ "grade_id": "cell-89d9be6420a6b0c2",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.5**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "`bootstrap_samples` contains all of the bootstrap samples that were used to calculate the `bootstrap_mean` variable in `sampling_dist_estimates` for all of the 100 original samples.\n",
+ "\n",
+ "_Assign your answer to an object called `answer3.5`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "5b0f6fa4f6eae6c958ef1d9d241ba212",
+ "grade": false,
+ "grade_id": "cell-6119935a39ff6560",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer3.5 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "6c319ec28704f515a21eea857c4977db",
+ "grade": true,
+ "grade_id": "cell-38d43d4c70b0e718",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.5()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "f1d651113dec9bbbca5e6387fddbe716",
+ "grade": false,
+ "grade_id": "cell-00d2e4f54afbf569",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.6**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "Given **only** the `sampling_dist_estimates` data frame, we could produce the `bootstrap_samples` data frame.\n",
+ "\n",
+ "_Assign your answer to an object called `answer3.6`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "a0d5bf6493917cee336024e1d9af5a5d",
+ "grade": false,
+ "grade_id": "cell-1098ceb1db0279d4",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer3.6 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "2e30a13ab82689ff9c1e9b736ef84bab",
+ "grade": true,
+ "grade_id": "cell-3368d315642a0ac1",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.6()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "215b6934f99349d90940620a039105e6",
+ "grade": false,
+ "grade_id": "cell-adfe76c5059f0f22",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.7** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Use `bootstrap_samples` to re-create the bootstrap distribution from `sampling_dist_estimates` that was generated using `sample_id == 1`. Your resulting data frame should have two columns: `replicate` and `bootstrap_mean`.\n",
+ "\n",
+ "_Assign your data frame to an object called `sampling_dist_estimate`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "c2d92d8510cd5a8f93afffbec8a3fc25",
+ "grade": false,
+ "grade_id": "cell-42da114f25e415ff",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "print(sampling_dist_estimate)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "fc8a3a77b187fbaab314772fe550bebf",
+ "grade": true,
+ "grade_id": "cell-f5c211ade1be51d0",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.7()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "79412bd3b0ce71a9d3dd323e3fce32d6",
+ "grade": false,
+ "grade_id": "cell-cc5f6279584f2648",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.8** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Now that we understand where `sampling_dist_estimates` came from, calculate a 90% confidence interval from each bootstrap distribution using the 0.05th and 0.95th quantiles. Use the scaffolding provided below as a guide:\n",
+ "\n",
+ "```r\n",
+ "intervals <- \n",
+ " sampling_dist_estimates %>% \n",
+ " ...(...) %>% \n",
+ " summarize(ci_lower = ...(bootstrap_mean, ...),\n",
+ " ci_upper = ...(..., ...))\n",
+ "```\n",
+ "\n",
+ "**Hint:** The procedure is very similar to **question 3.2**, but with one extra step.\n",
+ "\n",
+ "_Assign your data frame to an object called `intervals`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "d9d52d32128045d543bb1730de433e5d",
+ "grade": false,
+ "grade_id": "cell-cfcd85d8fdffd524",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "print(intervals)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "febd710620699b8335d0001d9555837b",
+ "grade": true,
+ "grade_id": "cell-d2e624405bcd8f41",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.8()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "cd04c388c55e6083d7b004d4b1b87dfc",
+ "grade": false,
+ "grade_id": "cell-d62749d5f278136a",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.9** \n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Add a variable named `captured` to `intervals` that indicates whether the confidence interval described in that row captures the true mean (`pop_mean`). If the true mean is captured, `captured` should be `TRUE`, otherwise it should be `FALSE`. Use the scaffolding provided below as a guide:\n",
+ "\n",
+ "```r\n",
+ "intervals_captured <- \n",
+ " intervals %>% \n",
+ " mutate(captured = (ci_lower <= ... & pop_mean <= ...))\n",
+ "```\n",
+ "\n",
+ "_Assign your answer to an object called `intervals_captured`. Your answer should be a data frame._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "c54c3ea3dd0940d8ff143114e5b90e21",
+ "grade": false,
+ "grade_id": "cell-f04e9dff8487a991",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "head(intervals_captured)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "5e68f3b739d20b42e12a4c827c51c38f",
+ "grade": true,
+ "grade_id": "cell-84b7cebc8e7d4385",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.9()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "3c92688640bbbe26a581a2fe07c35b77",
+ "grade": false,
+ "grade_id": "cell-5cc314f8a903e16d",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.10**\n",
+ "<br> {points: 1}\n",
+ "\n",
+ "Visualize all 100 confidence intervals. Each interval is represented by a horizontal line (`geom_segment`) and is coloured depending on whether it captures the true mean of the population, which is represented as a vertical line (`geom_vline`). The ID of the sample should be on the y-axis. Use the scaffolding provided below as a guide:\n",
+ "\n",
+ "```r\n",
+ "many_ci_plot <- \n",
+ " ... %>%\n",
+ " ggplot() +\n",
+ " scale_colour_manual(breaks = c(\"TRUE\", \"FALSE\"), # Change colour scale for better visibility.\n",
+ " values = c(\"grey\", \"black\")) +\n",
+ " ...(aes(x = ...,\n",
+ " xend = ...,\n",
+ " y = sample_id,\n",
+ " yend = sample_id,\n",
+ " colour = ...)) +\n",
+ " ...(xintercept = ..., colour = \"red\", size = 1) +\n",
+ " labs(title = \"100 90% Confidence Intervals\",\n",
+ " y = \"Sample ID\",\n",
+ " x = \"Diameter (cm)\",\n",
+ " colour = \"Captured?\") +\n",
+ " theme_bw() # Sets a theme for better visibility.\n",
+ "```\n",
+ "\n",
+ "_Assign your plot to an object called `many_ci_plot`._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "fe3895af9714b09d0c25caec77beb342",
+ "grade": false,
+ "grade_id": "cell-5e4348b127e03245",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "options(repr.plot.width = 5, repr.plot.height = 6)\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer\n",
+ "\n",
+ "many_ci_plot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "9606434424e06eaaa268a50f04d742f0",
+ "grade": true,
+ "grade_id": "cell-2d43ad356138ddbc",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.10()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "39d326c1f2da136427ee1c65acac9995",
+ "grade": false,
+ "grade_id": "cell-b42b0e3fe75aa077",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.11**\n",
+    "<br> {points: 1}\n",
+ "\n",
+ "Using the plot above, how many of the 100 confidence intervals \"capture\" the true mean of the population?\n",
+ "\n",
+ "_Assign your answer to an object called `answer3.11`. Your answer should be a single integer._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "cc32a6c480eda1aaa7b66984b06bd764",
+ "grade": false,
+ "grade_id": "cell-8f63c686e8f83932",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer3.11 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "27e3db9c7494e4e4b03415b764e52a3b",
+ "grade": true,
+ "grade_id": "cell-21fc9714f214f663",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.11()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "580c706518be5471bc03eaf6d6e1d550",
+ "grade": false,
+ "grade_id": "cell-1a3b8516cd23c692",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.12**\n",
+    "<br> {points: 1}\n",
+ "\n",
+ "If we repeated the above experiment, but with 1000 different 90% confidence intervals instead of 100, how many of the intervals would you **expect** to capture the true mean of the population?\n",
+ "\n",
+ "_Assign your answer to an object called `answer3.12`. Your answer should be a single integer._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "a0daee792b58a33e2a480e6cea6a10d2",
+ "grade": false,
+ "grade_id": "cell-bb9290a90cf1a962",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer3.12 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "7e9b935a0cccc9b4fbbd16537f731433",
+ "grade": true,
+ "grade_id": "cell-a82f209ebd0dc15e",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.12()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "346331f01e6b8c0111d994bd674f3df7",
+ "grade": false,
+ "grade_id": "cell-8b120440dfa4831f",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 3.13**\n",
+    "<br> {points: 1}\n",
+ "\n",
+    "Suppose you **did not** have data for the entire population of public street trees in Sunset. Consider a single **90% confidence interval** that you calculated by bootstrapping a single sample of size $n$.\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "We can interpret the confidence interval as: we are 90% confident that the true mean is captured by the interval. Or, in other words, across all 90% confidence intervals that could be calculated for the mean of the population of interest, we can expect that 90% of the intervals contain the true mean. \n",
+ "\n",
+ "_Assign your answer to an object called `answer3.13`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "44150755686d6e57996e0cffe50a7e00",
+ "grade": false,
+ "grade_id": "cell-88e21c6567e30d81",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer3.13 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "994629e40a2541ba73e7d0f1c6467947",
+ "grade": true,
+ "grade_id": "cell-db5f3330adee8e77",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_3.13()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "82b36b834b8c2d53367b363bd7bdcfde",
+ "grade": false,
+ "grade_id": "cell-efa920c654bbd2a6",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "## 4. Conclusion\n",
+ "Here are a few more questions that target some of the nuances related to confidence intervals."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "a5c8f28905dccb125c3c274ea915cfff",
+ "grade": false,
+ "grade_id": "cell-c2dd7916ef433d4c",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 4.0**\n",
+    "<br> {points: 1}\n",
+ "\n",
+ "Suppose you **did not** have access to data for the entire population of public street trees in Sunset. Assume you took a single sample of size 75 and used bootstrapping to calculate a 95% confidence interval for the **standard deviation** of the diameter of the population, which turned out to be\n",
+ "> (12.52cm, 22.33cm)\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+ "There is a 95% chance that the true standard deviation of the diameter of the population of public street trees in Sunset will fall within this interval.\n",
+ "\n",
+ "_Assign your answer to an object called `answer4.0`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "fb7e4a1325b32d7d578101edc4c11d98",
+ "grade": false,
+ "grade_id": "cell-e8d058e353ab4f72",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer4.0 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "d268daf5d4d2a0804921334073a9a66d",
+ "grade": true,
+ "grade_id": "cell-bbd47776f99243d4",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_4.0()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "fa7803c203c5437e746612693df46921",
+ "grade": false,
+ "grade_id": "cell-0a22a20c35a80f51",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 4.1**\n",
+    "<br> {points: 1}\n",
+ "\n",
+ "Suppose you were interested in calculating a 90% confidence interval for the **true proportion** of UBC students who lived on campus during their first year. Which of the following can be considered as \"random\"?\n",
+ "\n",
+ "A. The true proportion of the population of interest.\n",
+ "\n",
+ "B. The lower bound of a 99% confidence interval for the proportion of the population of interest.\n",
+ "\n",
+ "C. The upper bound of a 99% confidence interval for the proportion of the population of interest.\n",
+ "\n",
+ "D. A confidence interval of (0.44, 0.82) that you calculated by taking a sample of size 50 from the population of UBC students.\n",
+ "\n",
+ "E. All of the above.\n",
+ "\n",
+ "F. Only A, B, and C.\n",
+ "\n",
+ "G. Only B, C, and D.\n",
+ "\n",
+ "H. Only A and D.\n",
+ "\n",
+ "I. Only B and C.\n",
+ "\n",
+ "_Assign your answer to an object called `answer4.1`. Your answer should be a single character surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "132cdf28c4def2f627ed4ed0d6761a0c",
+ "grade": false,
+ "grade_id": "cell-0f3aa2532492f4c5",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer4.1 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "6ab647311923217d60f7b428e4eaf57f",
+ "grade": true,
+ "grade_id": "cell-2def133d8fcf3278",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_4.1()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "37afce45f7da32854fb0d285e54a966c",
+ "grade": false,
+ "grade_id": "cell-f5e68ef10f101d6c",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 4.2**\n",
+    "<br> {points: 1}\n",
+ "\n",
+ "Suppose you calculated two 50% confidence intervals for the mean of the number of years that it takes a UBC student to complete their undergraduate degree using bootstrapping. For the first confidence interval, you used the 0.1th and 0.6th quantiles, and for the second, you used the 0.25th and 0.75th quantiles.\n",
+ "\n",
+ "True or false?\n",
+ "\n",
+    "Both confidence intervals are valid **and** they can be interpreted in the same way.\n",
+ "\n",
+ "_Assign your answer to an object called `answer4.2`. Your answer should be either \"true\" or \"false\", surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "5ca4e50d295f6b91470bcf040b2fcef6",
+ "grade": false,
+ "grade_id": "cell-2d10c846379de836",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer4.2 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "29aaebe67dc4b6aceb758399112a8bcd",
+ "grade": true,
+ "grade_id": "cell-957569e757b313b3",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "test_4.2()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "markdown",
+ "checksum": "5c5c014b3df86ff62b6d879418a12023",
+ "grade": false,
+ "grade_id": "cell-df732433c2fb284d",
+ "locked": true,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ }
+ },
+ "source": [
+ "**Question 4.3**\n",
+    "<br> {points: 1}\n",
+ "\n",
+ "Consider the following (**impossible**) scenario:\n",
+ "\n",
+ "> Congratulations! You've just made a statistical breakthrough and found a way to access the true sampling distribution for any population using only a single sample! There is no need for bootstrapping anymore, as you can determine the true sampling distribution directly.\n",
+ "> \n",
+    "> However, you're still unable to work out the population parameter you are interested in. So, you decide to use the 0.1th and 0.9th quantiles of the **true sampling distribution** to formulate a confidence interval for the population parameter.\n",
+ "\n",
+ "What confidence level is **most likely** to be associated with this confidence interval that you calculated in the scenario above? \n",
+ "\n",
+    "Hint: The center of a sampling distribution is approximately its 0.5 quantile.\n",
+ "\n",
+ "A. 0%\n",
+ "\n",
+ "B. 50%\n",
+ "\n",
+ "C. 80%\n",
+ "\n",
+ "D. 100%\n",
+ "\n",
+ "_Assign your answer to an object called `answer4.3`. Your answer should be a single character surrounded by quotes._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "122fbeba6c65707fa56efbb999b0b90e",
+ "grade": false,
+ "grade_id": "cell-9d1f6cfbbe4c3ce5",
+ "locked": false,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# answer4.3 <- ...\n",
+ "\n",
+ "# your code here\n",
+ "fail() # No Answer - remove if you provide an answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "deletable": false,
+ "editable": false,
+ "nbgrader": {
+ "cell_type": "code",
+ "checksum": "4d91af6a81c601565225160e5dd541c7",
+ "grade": true,
+ "grade_id": "cell-fd193800f5a81b99",
+ "locked": true,
+ "points": 1,
+ "schema_version": 3,
+ "solution": false,
+ "task": false
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "test_4.3()"
+ ]
+ }
+ ],
+ "metadata": {
+ "docker": {
+ "latest_image_tag": "v0.4.0"
+ },
+ "jupytext": {
+ "formats": "ipynb,Rmd"
+ },
+ "kernelspec": {
+ "display_name": "R",
+ "language": "R",
+ "name": "ir"
+ },
+ "language_info": {
+ "codemirror_mode": "r",
+ "file_extension": ".r",
+ "mimetype": "text/x-r-source",
+ "name": "R",
+ "pygments_lexer": "r",
+ "version": "4.2.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4