From 7293a21dc8d817e207383ef956823ad53f9a3f75 Mon Sep 17 00:00:00 2001
From: Geoff Pleiss <824157+gpleiss@users.noreply.github.com>
Date: Wed, 23 Oct 2024 22:05:03 -0700
Subject: [PATCH] Update last of classification slides
---
.../execute-results/html.json | 5 +-
.../figure-revealjs/unnamed-chunk-3-1.svg | 1715 +++++++++++
.../figure-revealjs/unnamed-chunk-4-1.svg | 1609 ++++++-----
.../figure-revealjs/unnamed-chunk-5-1.svg | 1909 +-----------
.../figure-revealjs/unnamed-chunk-6-1.svg | 225 +-
.../execute-results/html.json | 7 +
.../figure-revealjs/plot-d1-1.svg | 675 +++++
.../figure-revealjs/plot-partition-1.svg | 500 ++++
.../figure-revealjs/unnamed-chunk-3-1.svg | 574 ++++
.../figure-revealjs/unnamed-chunk-4-1.svg | 2570 +++++++++++++++++
.../figure-revealjs/unnamed-chunk-6-1.svg | 261 ++
.../figure-revealjs/unnamed-chunk-7-1.svg | 416 ++-
.../figure-revealjs/unnamed-chunk-8-1.svg | 214 ++
.../figure-revealjs/unnamed-chunk-9-1.svg | 574 ++++
schedule/slides/00-classification-losses.qmd | 74 +-
schedule/slides/17-nonlinear-classifiers.qmd | 341 +--
16 files changed, 8606 insertions(+), 3063 deletions(-)
create mode 100644 _freeze/schedule/slides/00-classification-losses/figure-revealjs/unnamed-chunk-3-1.svg
diff --git a/_freeze/schedule/slides/00-classification-losses/execute-results/html.json b/_freeze/schedule/slides/00-classification-losses/execute-results/html.json
index 4b83642..8716f3c 100644
--- a/_freeze/schedule/slides/00-classification-losses/execute-results/html.json
+++ b/_freeze/schedule/slides/00-classification-losses/execute-results/html.json
@@ -1,7 +1,8 @@
{
- "hash": "a52fe0e79bf5e4db92eca003f346d642",
+ "hash": "03d8618359c4ba22b893a73c0d7d89b5",
"result": {
- "markdown": "---\nlecture: \"00 Evaluating classifiers\"\nformat: revealjs\nmetadata-files: \n - _metadata.yml\n---\n---\n---\n\n## {{< meta lecture >}} {.large background-image=\"gfx/smooths.svg\" background-opacity=\"0.3\"}\n\n[Stat 406]{.secondary}\n\n[{{< meta author >}}]{.secondary}\n\nLast modified -- 16 October 2023\n\n\n\n$$\n\\DeclareMathOperator*{\\argmin}{argmin}\n\\DeclareMathOperator*{\\argmax}{argmax}\n\\DeclareMathOperator*{\\minimize}{minimize}\n\\DeclareMathOperator*{\\maximize}{maximize}\n\\DeclareMathOperator*{\\find}{find}\n\\DeclareMathOperator{\\st}{subject\\,\\,to}\n\\newcommand{\\E}{E}\n\\newcommand{\\Expect}[1]{\\E\\left[ #1 \\right]}\n\\newcommand{\\Var}[1]{\\mathrm{Var}\\left[ #1 \\right]}\n\\newcommand{\\Cov}[2]{\\mathrm{Cov}\\left[#1,\\ #2\\right]}\n\\newcommand{\\given}{\\ \\vert\\ }\n\\newcommand{\\X}{\\mathbf{X}}\n\\newcommand{\\x}{\\mathbf{x}}\n\\newcommand{\\y}{\\mathbf{y}}\n\\newcommand{\\P}{\\mathcal{P}}\n\\newcommand{\\R}{\\mathbb{R}}\n\\newcommand{\\norm}[1]{\\left\\lVert #1 \\right\\rVert}\n\\newcommand{\\snorm}[1]{\\lVert #1 \\rVert}\n\\newcommand{\\tr}[1]{\\mbox{tr}(#1)}\n\\newcommand{\\brt}{\\widehat{\\beta}^R_{s}}\n\\newcommand{\\brl}{\\widehat{\\beta}^R_{\\lambda}}\n\\newcommand{\\bls}{\\widehat{\\beta}_{ols}}\n\\newcommand{\\blt}{\\widehat{\\beta}^L_{s}}\n\\newcommand{\\bll}{\\widehat{\\beta}^L_{\\lambda}}\n$$\n\n\n\n\n\n## How do we measure accuracy?\n\n[So far]{.secondary} --- 0-1 loss. If correct class, lose 0 else lose 1.\n\n[Asymmetric classification loss]{.secondary} --- If correct class, lose 0 else lose something.\n\nFor example, consider facial recognition. Goal is \"person OK\", \"person has expired passport\", \"person is a known terrorist\"\n\n1. If classify OK, but was terrorist, lose 1,000,000\n1. If classify OK, but expired passport, lose 2\n1. If classify terrorist, but was OK, lose 100\n1. If classify terrorist, but was expired passport, lose 10\n1. etc.\n\n. . .\n\n\nResults in a 3x3 matrix of losses with 0 on the diagonal.\n\n\n::: {.cell layout-align=\"center\" R.options='{\"scipen\":8}'}\n::: {.cell-output .cell-output-stdout}\n```\n [,1] [,2] [,3]\n[1,] 0 2 30\n[2,] 10 0 100\n[3,] 1000000 50000 0\n```\n:::\n:::\n\n\n\n## Deviance loss\n\nSometimes we output [probabilities]{.secondary} as well as class labels.\n\nFor example, logistic regression returns the probability that an observation is in class 1. $P(Y_i = 1 \\given x_i) = 1 / (1 + \\exp\\{-x'_i \\hat\\beta\\})$\n\nLDA and QDA produce probabilities as well. So do Neural Networks (typically)\n\n(Trees \"don't\", neither does KNN, though you could fake it)\n\n. . .\n\n
\n\n* Deviance loss for 2-class classification is $-2\\textrm{loglikelihood}(y, \\hat{p}) = -2 (y_i x'_i\\hat{\\beta} - \\log (1-\\hat{p}))$\n\n(Technically, it's the difference between this and the loss of the null model, but people play fast and loose)\n\n* Could also use cross entropy or Gini index.\n\n\n\n## Calibration\n\nSuppose we predict some probabilities for our data, how often do those events happen?\n\nIn principle, if we predict $\\hat{p}(x_i)=0.2$ for a bunch of events observations $i$, we'd like to see about 20% 1 and 80% 0. (In training set and test set)\n\nThe same goes for the other probabilities. If we say \"20% chance of rain\" it should rain 20% of such days.\n\n\nOf course, we didn't predict **exactly** $\\hat{p}(x_i)=0.2$ ever, so lets look at $[.15, .25]$.\n\n\n::: {.cell layout-align=\"center\" output-location='fragment'}\n\n```{.r .cell-code code-line-numbers=\"1-6|7|8-9\"}\nn <- 250\ndat <- tibble(\n x = seq(-5, 5, length.out = n),\n p = 1 / (1 + exp(-x)),\n y = rbinom(n, 1, p)\n)\nfit <- glm(y ~ x, family = binomial, data = dat)\ndat$phat <- predict(fit, type = \"response\") # predicted probabilities\ndat |>\n filter(phat > .15, phat < .25) |>\n summarize(target = .2, obs = mean(y))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 1 × 2\n target obs\n \n1 0.2 0.222\n```\n:::\n:::\n\n\n\n## Calibration plot\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbinary_calibration_plot <- function(y, phat, nbreaks = 10) {\n dat <- tibble(y = y, phat = phat) |>\n mutate(bins = cut_number(phat, n = nbreaks))\n midpts <- quantile(dat$phat, seq(0, 1, length.out = nbreaks + 1), na.rm = TRUE)\n midpts <- midpts[-length(midpts)] + diff(midpts) / 2\n sum_dat <- dat |>\n group_by(bins) |>\n summarise(\n p = mean(y, na.rm = TRUE),\n se = sqrt(p * (1 - p) / n())\n ) |>\n arrange(p)\n sum_dat$x <- midpts\n\n ggplot(sum_dat, aes(x = x)) +\n geom_errorbar(aes(ymin = pmax(p - 1.96 * se, 0), ymax = pmin(p + 1.96 * se, 1))) +\n geom_point(aes(y = p), colour = blue) +\n geom_abline(slope = 1, intercept = 0, colour = orange) +\n ylab(\"observed frequency\") +\n xlab(\"average predicted probability\") +\n coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) +\n geom_rug(data = dat, aes(x = phat), sides = \"b\")\n}\n```\n:::\n\n\n\n## Amazingly well-calibrated\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbinary_calibration_plot(dat$y, dat$phat, 20L)\n```\n\n::: {.cell-output-display}\n![](00-classification-losses_files/figure-revealjs/unnamed-chunk-4-1.svg){fig-align='center'}\n:::\n:::\n\n\n\n## Less well-calibrated\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n![](00-classification-losses_files/figure-revealjs/unnamed-chunk-5-1.svg){fig-align='center'}\n:::\n:::\n\n\n\n\n## True positive, false negative, sensitivity, specificity\n\nTrue positive rate\n: \\# correct predict positive / \\# actual positive (1 - FNR)\n\nFalse negative rate\n: \\# incorrect predict negative / \\# actual positive (1 - TPR), Type II Error\n\nTrue negative rate\n: \\# correct predict negative / \\# actual negative\n\nFalse positive rate\n: \\# incorrect predict positive / \\# actual negative (1 - TNR), Type I Error\n\nSensitivity\n: TPR, 1 - Type II error\n\nSpecificity\n: TNR, 1 - Type I error\n\n\n\n## ROC and thresholds\n\nROC (Receiver Operating Characteristic) Curve\n: TPR (sensitivity) vs. FPR (1 - specificity)\n \nAUC (Area under the curve)\n: Integral of ROC. 
Closer to 1 is better.\n \nSo far, we've been thresholding at 0.5, though you shouldn't always do that. \n \nWith unbalanced data (say 10% 0 and 90% 1), if you care equally about predicting both classes, you might want to choose a different cutoff (like in LDA).\n \nTo make the [ROC]{.secondary} we look at our errors [as we vary the cutoff]{.secondary}\n \n\n## ROC curve\n\n\n\n::: {.cell layout-align=\"center\" output-location='column-fragment'}\n\n```{.r .cell-code}\nroc <- function(prediction, y) {\n op <- order(prediction, decreasing = TRUE)\n preds <- prediction[op]\n y <- y[op]\n noty <- 1 - y\n if (any(duplicated(preds))) {\n y <- rev(tapply(y, preds, sum))\n noty <- rev(tapply(noty, preds, sum))\n }\n tibble(\n FPR = cumsum(noty) / sum(noty),\n TPR = cumsum(y) / sum(y)\n )\n}\n\nggplot(roc(dat$phat, dat$y), aes(FPR, TPR)) +\n geom_step(colour = blue, size = 2) +\n geom_abline(slope = 1, intercept = 0)\n```\n\n::: {.cell-output-display}\n![](00-classification-losses_files/figure-revealjs/unnamed-chunk-6-1.svg){fig-align='center'}\n:::\n:::\n\n\n\n\n## Other stuff\n\n![](gfx/huge-roc.png)\n\n* Source: worth exploring [Wikipedia](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)\n",
+ "engine": "knitr",
+ "markdown": "---\nlecture: \"00 Evaluating classifiers\"\nformat: revealjs\nmetadata-files: \n - _metadata.yml\n---\n\n\n## {{< meta lecture >}} {.large background-image=\"gfx/smooths.svg\" background-opacity=\"0.3\"}\n\n[Stat 406]{.secondary}\n\n[{{< meta author >}}]{.secondary}\n\nLast modified -- 23 October 2024\n\n\n\n\n\n$$\n\\DeclareMathOperator*{\\argmin}{argmin}\n\\DeclareMathOperator*{\\argmax}{argmax}\n\\DeclareMathOperator*{\\minimize}{minimize}\n\\DeclareMathOperator*{\\maximize}{maximize}\n\\DeclareMathOperator*{\\find}{find}\n\\DeclareMathOperator{\\st}{subject\\,\\,to}\n\\newcommand{\\E}{E}\n\\newcommand{\\Expect}[1]{\\E\\left[ #1 \\right]}\n\\newcommand{\\Var}[1]{\\mathrm{Var}\\left[ #1 \\right]}\n\\newcommand{\\Cov}[2]{\\mathrm{Cov}\\left[#1,\\ #2\\right]}\n\\newcommand{\\given}{\\ \\vert\\ }\n\\newcommand{\\X}{\\mathbf{X}}\n\\newcommand{\\x}{\\mathbf{x}}\n\\newcommand{\\y}{\\mathbf{y}}\n\\newcommand{\\P}{\\mathcal{P}}\n\\newcommand{\\R}{\\mathbb{R}}\n\\newcommand{\\norm}[1]{\\left\\lVert #1 \\right\\rVert}\n\\newcommand{\\snorm}[1]{\\lVert #1 \\rVert}\n\\newcommand{\\tr}[1]{\\mbox{tr}(#1)}\n\\newcommand{\\brt}{\\widehat{\\beta}^R_{s}}\n\\newcommand{\\brl}{\\widehat{\\beta}^R_{\\lambda}}\n\\newcommand{\\bls}{\\widehat{\\beta}_{ols}}\n\\newcommand{\\blt}{\\widehat{\\beta}^L_{s}}\n\\newcommand{\\bll}{\\widehat{\\beta}^L_{\\lambda}}\n\\newcommand{\\U}{\\mathbf{U}}\n\\newcommand{\\D}{\\mathbf{D}}\n\\newcommand{\\V}{\\mathbf{V}}\n$$\n\n\n\n\n\n## How do we measure accuracy?\n\n[So far]{.secondary} --- 0-1 loss. If correct class, lose 0 else lose 1.\n\n. . .\n\n[Generalization: Asymmetric classification loss]{.secondary} --- If correct class, lose 0 else lose something.\n\n\\\nE.g. MRI screening. Goal is \"person OK\", \"person has a disease\"\n\n1. If classify OK, but was disease, lose 1,000,000\n1. If classify disease, but was OK, lose 10\n1. etc.\n\n. . .\n\n\nResults in a 2x2 matrix of losses with 0 on the diagonal.\n\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output .cell-output-stdout}\n\n```\n [,1] [,2]\n[1,] 0 1000000\n[2,] 10 0\n```\n\n\n:::\n:::\n\n\n\n\n## Deviance loss\n\nSometimes we output [probabilities]{.secondary} as well as class labels.\n\nFor example, logistic regression returns the probability that an observation is in class 1. $P(Y_i = 1 \\given x_i) = 1 / (1 + \\exp\\{-x'_i \\hat\\beta\\})$\n\nLDA and QDA produce probabilities as well. So do Neural Networks (typically)\n\n(Trees \"don't\", neither does KNN, though you could fake it)\n\n. . .\n\n
\n\n* Deviance loss for 2-class classification is $-2\\textrm{loglikelihood}(y, \\hat{p}) = -2 (y_i x'_i\\hat{\\beta} - \\log (1-\\hat{p}))$\n\n\n\n* Could also use cross entropy or Gini index.\n\n\n\n## Calibration\n\nSuppose we predict some probabilities for our data, how often do those events happen?\n\nIn principle, if we predict $\\hat{p}(x_i)=0.2$ for a bunch of events observations $i$, we'd like to see about 20% 1 and 80% 0. (In training set and test set)\n\nThe same goes for the other probabilities. If we say \"20% chance of rain\" it should rain 20% of such days.\n\n\nOf course, we didn't predict **exactly** $\\hat{p}(x_i)=0.2$ ever, so lets look at $[.15, .25]$.\n\n\n## Calibration plot\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nn <- 250\ndat <- tibble(\n x = seq(-5, 5, length.out = n),\n p = 1 / (1 + exp(-x)),\n y = rbinom(n, 1, p)\n)\nfit <- glm(y ~ x, family = binomial, data = dat)\ndat$phat <- predict(fit, type = \"response\") # predicted probabilities\nbinary_calibration_plot <- function(y, phat, nbreaks = 10) {\n dat <- tibble(y = y, phat = phat) |>\n mutate(bins = cut_number(phat, n = nbreaks))\n midpts <- quantile(dat$phat, seq(0, 1, length.out = nbreaks + 1), na.rm = TRUE)\n midpts <- midpts[-length(midpts)] + diff(midpts) / 2\n sum_dat <- dat |>\n group_by(bins) |>\n summarise(\n p = mean(y, na.rm = TRUE),\n se = sqrt(p * (1 - p) / n())\n ) |>\n arrange(p)\n sum_dat$x <- midpts\n\n ggplot(sum_dat, aes(x = x)) +\n geom_errorbar(aes(ymin = pmax(p - 1.96 * se, 0), ymax = pmin(p + 1.96 * se, 1))) +\n geom_point(aes(y = p), colour = blue) +\n geom_abline(slope = 1, intercept = 0, colour = orange) +\n ylab(\"observed frequency\") +\n xlab(\"average predicted probability\") +\n coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) +\n geom_rug(data = dat, aes(x = phat), sides = \"b\")\n}\n```\n:::\n\n\n\n\n## Amazingly well-calibrated\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbinary_calibration_plot(dat$y, dat$phat, 20L)\n```\n\n::: {.cell-output-display}\n![](00-classification-losses_files/figure-revealjs/unnamed-chunk-3-1.svg){fig-align='center'}\n:::\n:::\n\n\n\n\n## Less well-calibrated\n\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n![](00-classification-losses_files/figure-revealjs/unnamed-chunk-4-1.svg){fig-align='center'}\n:::\n:::\n\n\n\n\n\n## True positive, false negative, sensitivity, specificity\n\nTrue positive rate\n: \\# correct predict positive / \\# actual positive (1 - FNR)\n\nFalse negative rate\n: \\# incorrect predict negative / \\# actual positive (1 - TPR), Type II Error\n\nTrue negative rate\n: \\# correct predict negative / \\# actual negative\n\nFalse positive rate\n: \\# incorrect predict positive / \\# actual negative (1 - TNR), Type I Error\n\nSensitivity\n: TPR, 1 - Type II error\n\nSpecificity\n: TNR, 1 - Type I error\n\n\n\n## Decision making\n\nGiven a logistic regression output $\\hat P(Y \\mid X) = 0.56$,\nshould we assign $\\hat Y = 1$ or $\\hat Y = 0$?\n\nE.g. $P(Y=1 \\mid X)$ is predicted probability that email $X$ is spam. \\\nDo we send it to the spam folder ($\\hat Y=1$) or the inbox ($\\hat Y=0$)?\n\n. . .\n\nSo far we've been making the \"decision\" $\\hat Y=1$ if $\\hat P(Y=1 \\mid X) > \\hat P(Y=0 \\mid X)$ \\\ni.e. 
$\\hat Y = \\begin{cases} 1 & \\hat P(Y=1 \\mid X) > 0.5 \\\\ 0 & \\mathrm{o.w.} \\end{cases}$.\n\nBut maybe (for our application) a \"better\" decision is $$\\hat Y = \\begin{cases} 1 & \\hat P(Y=1 \\mid X) > t \\\\ 0 & \\mathrm{o.w.} \\end{cases}$$\n\n\n## ROC and thresholds\n\nROC (Receiver Operating Characteristic) Curve\n: TPR (sensitivity) vs. FPR (1 - specificity)\n: [Each point corresponds to a different $0 \\leq t \\leq 1$.]{.small}\n \nAUC (Area under the curve)\n: Integral of ROC. Closer to 1 is better.\n\n\n\n\n::: {.cell layout-align=\"center\" output-location='column'}\n\n```{.r .cell-code}\nroc <- function(prediction, y) {\n op <- order(prediction, decreasing = TRUE)\n preds <- prediction[op]\n y <- y[op]\n noty <- 1 - y\n if (any(duplicated(preds))) {\n y <- rev(tapply(y, preds, sum))\n noty <- rev(tapply(noty, preds, sum))\n }\n tibble(\n FPR = cumsum(noty) / sum(noty),\n TPR = cumsum(y) / sum(y)\n )\n}\n\nggplot(roc(dat$phat, dat$y), aes(FPR, TPR)) +\n geom_step(colour = blue, size = 2) +\n geom_abline(slope = 1, intercept = 0)\n```\n\n::: {.cell-output-display}\n![](00-classification-losses_files/figure-revealjs/unnamed-chunk-5-1.svg){fig-align='center'}\n:::\n:::\n\n\n\n## Other stuff\n\n![](gfx/huge-roc.png)\n\n* Source: worth exploring [Wikipedia](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)\n\n\n\n\n\n# Next time ... {background-image=\"https://i1.wp.com/bdtechtalks.com/wp-content/uploads/2018/12/artificial-intelligence-deep-learning-neural-networks-ai.jpg?w=1392&ssl=1\" background-opacity=.4}\n\n\n[Module 4]{.secondary}\n\n[boosting, bagging, random forests, and neural nets]{.secondary}\n",
"supporting": [
"00-classification-losses_files"
],
diff --git a/_freeze/schedule/slides/00-classification-losses/figure-revealjs/unnamed-chunk-3-1.svg b/_freeze/schedule/slides/00-classification-losses/figure-revealjs/unnamed-chunk-3-1.svg
new file mode 100644
index 0000000..76545cb
--- /dev/null
+++ b/_freeze/schedule/slides/00-classification-losses/figure-revealjs/unnamed-chunk-3-1.svg
@@ -0,0 +1,1715 @@
[... SVG markup of the new figure omitted ...]
diff --git a/_freeze/schedule/slides/00-classification-losses/figure-revealjs/unnamed-chunk-4-1.svg b/_freeze/schedule/slides/00-classification-losses/figure-revealjs/unnamed-chunk-4-1.svg
index fdd4da7..22d1cb3 100644
--- a/_freeze/schedule/slides/00-classification-losses/figure-revealjs/unnamed-chunk-4-1.svg
+++ b/_freeze/schedule/slides/00-classification-losses/figure-revealjs/unnamed-chunk-4-1.svg
@@ -3,1742 +3,1913 @@
[... SVG markup omitted from this hunk ...]
diff --git a/_freeze/schedule/slides/00-classification-losses/figure-revealjs/unnamed-chunk-5-1.svg b/_freeze/schedule/slides/00-classification-losses/figure-revealjs/unnamed-chunk-5-1.svg
index f534d4b..e990367 100644
--- a/_freeze/schedule/slides/00-classification-losses/figure-revealjs/unnamed-chunk-5-1.svg
+++ b/_freeze/schedule/slides/00-classification-losses/figure-revealjs/unnamed-chunk-5-1.svg
@@ -1,1929 +1,248 @@
[... SVG markup omitted from this hunk ...]
diff --git a/_freeze/schedule/slides/17-nonlinear-classifiers/execute-results/html.json b/_freeze/schedule/slides/17-nonlinear-classifiers/execute-results/html.json
index de69554..795d6fe 100644
--- a/_freeze/schedule/slides/17-nonlinear-classifiers/execute-results/html.json
+++ b/_freeze/schedule/slides/17-nonlinear-classifiers/execute-results/html.json
@@ -1,8 +1,15 @@
{
+<<<<<<< HEAD
"hash": "5725965e671aeb548cbdcca02b931368",
"result": {
"engine": "knitr",
"markdown": "---\nlecture: \"17 Nonlinear classifiers\"\nformat: revealjs\nmetadata-files: \n - _metadata.yml\n---\n\n\n\n## {{< meta lecture >}} {.large background-image=\"gfx/smooths.svg\" background-opacity=\"0.3\"}\n\n[Stat 406]{.secondary}\n\n[{{< meta author >}}]{.secondary}\n\nLast modified -- 21 October 2024\n\n\n\n\n\n\n\n$$\n\\DeclareMathOperator*{\\argmin}{argmin}\n\\DeclareMathOperator*{\\argmax}{argmax}\n\\DeclareMathOperator*{\\minimize}{minimize}\n\\DeclareMathOperator*{\\maximize}{maximize}\n\\DeclareMathOperator*{\\find}{find}\n\\DeclareMathOperator{\\st}{subject\\,\\,to}\n\\newcommand{\\E}{E}\n\\newcommand{\\Expect}[1]{\\E\\left[ #1 \\right]}\n\\newcommand{\\Var}[1]{\\mathrm{Var}\\left[ #1 \\right]}\n\\newcommand{\\Cov}[2]{\\mathrm{Cov}\\left[#1,\\ #2\\right]}\n\\newcommand{\\given}{\\ \\vert\\ }\n\\newcommand{\\X}{\\mathbf{X}}\n\\newcommand{\\x}{\\mathbf{x}}\n\\newcommand{\\y}{\\mathbf{y}}\n\\newcommand{\\P}{\\mathcal{P}}\n\\newcommand{\\R}{\\mathbb{R}}\n\\newcommand{\\norm}[1]{\\left\\lVert #1 \\right\\rVert}\n\\newcommand{\\snorm}[1]{\\lVert #1 \\rVert}\n\\newcommand{\\tr}[1]{\\mbox{tr}(#1)}\n\\newcommand{\\brt}{\\widehat{\\beta}^R_{s}}\n\\newcommand{\\brl}{\\widehat{\\beta}^R_{\\lambda}}\n\\newcommand{\\bls}{\\widehat{\\beta}_{ols}}\n\\newcommand{\\blt}{\\widehat{\\beta}^L_{s}}\n\\newcommand{\\bll}{\\widehat{\\beta}^L_{\\lambda}}\n\\newcommand{\\U}{\\mathbf{U}}\n\\newcommand{\\D}{\\mathbf{D}}\n\\newcommand{\\V}{\\mathbf{V}}\n$$\n\n\n\n\n## Last time\n\n\nWe reviewed logistic regression\n\n$$\\begin{aligned}\nPr(Y = 1 \\given X=x) & = \\frac{\\exp\\{\\beta_0 + \\beta^{\\top}x\\}}{1 + \\exp\\{\\beta_0 + \\beta^{\\top}x\\}} \\\\\nPr(Y = 0 \\given X=x) & = \\frac{1}{1 + \\exp\\{\\beta_0 + \\beta^{\\top}x\\}}=1-\\frac{\\exp\\{\\beta_0 + \\beta^{\\top}x\\}}{1 + \\exp\\{\\beta_0 + \\beta^{\\top}x\\}}\\end{aligned}$$\n\n## Make it nonlinear\n\nWe can make LDA or logistic regression have non-linear decision boundaries by mapping the features to a higher dimension (just like with regular regression)\n\nSay:\n\n__Polynomials__\n\n$(x_1, x_2) \\mapsto \\left(1,\\ x_1,\\ x_1^2,\\ x_2,\\ x_2^2,\\ x_1 x_2\\right)$\n\n\n\n\n::: {.cell layout-align=\"center\"}\n\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndat1 <- generate_lda_2d(100, Sigma = .5 * diag(2)) |> mutate(y = as.factor(y))\nlogit_poly <- glm(y ~ x1 * x2 + I(x1^2) + I(x2^2), dat1, family = \"binomial\")\nlda_poly <- lda(y ~ x1 * x2 + I(x1^2) + I(x2^2), dat1)\n```\n:::\n\n\n\n\n\n\n## Visualizing the classification boundary\n\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code code-fold=\"true\"}\nlibrary(cowplot)\ngr <- expand_grid(x1 = seq(-2.5, 3, length.out = 100), x2 = seq(-2.5, 3, length.out = 100))\npts_logit <- predict(logit_poly, gr)\npts_lda <- predict(lda_poly, gr)\ng0 <- ggplot(dat1, aes(x1, x2)) +\n scale_shape_manual(values = c(\"0\", \"1\"), guide = \"none\") +\n geom_raster(data = tibble(gr, disc = pts_logit), aes(x1, x2, fill = disc)) +\n geom_point(aes(shape = as.factor(y)), size = 4) +\n coord_cartesian(c(-2.5, 3), c(-2.5, 3)) +\n scale_fill_viridis_b(n.breaks = 6, alpha = .5, name = \"log odds\") +\n ggtitle(\"Polynomial logit\") +\n theme(legend.position = \"bottom\", legend.key.width = unit(1.5, \"cm\"))\ng1 <- ggplot(dat1, aes(x1, x2)) +\n scale_shape_manual(values = c(\"0\", \"1\"), guide = \"none\") +\n geom_raster(data = tibble(gr, disc = pts_lda$x), aes(x1, x2, fill = disc)) +\n geom_point(aes(shape = as.factor(y)), size = 4) +\n coord_cartesian(c(-2.5, 3), c(-2.5, 
3)) +\n scale_fill_viridis_b(n.breaks = 6, alpha = .5, name = bquote(delta[1] - delta[0])) +\n ggtitle(\"Polynomial lda\") +\n theme(legend.position = \"bottom\", legend.key.width = unit(1.5, \"cm\"))\nplot_grid(g0, g1)\n```\n\n::: {.cell-output-display}\n![](17-nonlinear-classifiers_files/figure-revealjs/plot-d1-1.svg){fig-align='center'}\n:::\n:::\n\n\n\n\nA linear decision boundary in the higher-dimensional space corresponds to a non-linear decision boundary in low dimensions.\n\n\n\n## Trees\n\n::: flex\n\n::: w-50\nWe saw regression trees last module\n\nClassification trees are \n\n- More natural\n- Slightly different computationally\n\nEverything else is pretty much the same\n:::\n\n::: w-50\n![](https://upload.wikimedia.org/wikipedia/commons/e/eb/Decision_Tree.jpg)\n:::\n:::\n\n\n\n## Axis-parallel splits\n\nLike with regression trees, classification trees operate by greedily splitting the predictor space\n\n\n\n\n::: {.cell layout-align=\"center\"}\n\n:::\n\n\n\n\n::: flex\n::: w-50\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnames(bakeoff)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n [1] \"winners\" \n [2] \"series\" \n [3] \"age\" \n [4] \"occupation\" \n [5] \"hometown\" \n [6] \"percent_star\" \n [7] \"percent_technical_wins\" \n [8] \"percent_technical_bottom3\"\n [9] \"percent_technical_top3\" \n[10] \"technical_highest\" \n[11] \"technical_lowest\" \n[12] \"technical_median\" \n[13] \"judge1\" \n[14] \"judge2\" \n[15] \"viewers_7day\" \n[16] \"viewers_28day\" \n```\n\n\n:::\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsmalltree <- tree(\n winners ~ technical_median + percent_star,\n data = bakeoff\n)\n```\n:::\n\n\n\n\n:::\n\n\n::: w-50\n\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code code-fold=\"true\"}\npar(mar = c(5, 5, 0, 0) + .1)\nplot(bakeoff$technical_median, bakeoff$percent_star,\n pch = c(\"-\", \"+\")[bakeoff$winners + 1], cex = 2, bty = \"n\", las = 1,\n ylab = \"% star baker\", xlab = \"times above median in technical\",\n col = orange, cex.axis = 2, cex.lab = 2\n)\npartition.tree(smalltree,\n add = TRUE, col = blue,\n ordvars = c(\"technical_median\", \"percent_star\")\n)\n```\n\n::: {.cell-output-display}\n![](17-nonlinear-classifiers_files/figure-revealjs/plot-partition-1.svg){fig-align='center'}\n:::\n:::\n\n\n\n:::\n:::\n\n\n## When do trees do well?\n\n::: flex\n::: w-50\n![](gfx/8.7.png)\n:::\n\n::: w-50\n\n[2D example]{.hand}\n\n[Top Row:]{.primary} \n\ntrue decision boundary is linear\n\n🍎 linear classifier \n\n👎 tree with axis-parallel splits\n\n[Bottom Row:]{.primary}\n\ntrue decision boundary is non-linear\n\n🤮 A linear classifier can't capture the true decision boundary\n\n🍎 decision tree is successful.\n:::\n:::\n\n\n\n\n## How do we build a tree?\n\n\n1. Divide the predictor space into\n$J$ non-overlapping regions $R_1, \\ldots, R_J$ \n\n > this is done via greedy, recursive binary splitting\n\n2. 
Every observation that falls into a given region $R_j$ is given the same prediction\n\n > determined by majority (or plurality) vote in that region.\n\n\n\n[Important:]{.hand}\n\n* Trees can only make rectangular regions that are aligned with the coordinate axis.\n* The fit is _greedy_, which means that after a split is made, all further decisions are conditional on that split.\n\n\n\n\n\n\n## How do we measure quality of fit?\n\n\nLet $p_{mk}$ be the proportion of training observations in the $m^{th}$\nregion that are from the $k^{th}$ class.\n\n| | |\n|---|---|\n| __classification error rate:__ | $E = 1 - \\max_k (\\widehat{p}_{mk})$|\n| __Gini index:__ | $G = \\sum_k \\widehat{p}_{mk}(1-\\widehat{p}_{mk})$ |\n| __cross-entropy:__ | $D = -\\sum_k \\widehat{p}_{mk}\\log(\\widehat{p}_{mk})$|\n\n\nBoth Gini and cross-entropy measure the purity of the classifier (small if all $p_{mk}$ are near zero or 1). \n\nThese are preferred over the classification error rate. \n\nClassification error is hard to optimize.\n\nWe build a classifier by growing a tree that minimizes $G$ or $D$.\n\n\n\n## Pruning the tree\n\n\n* Cross-validation can be used to directly prune the tree, \n\n* But it is computationally expensive (combinatorial complexity).\n\n* Instead, we use _weakest link pruning_, (Gini version)\n\n$$\\sum_{m=1}^{|T|} \\sum_{k \\in R_m} \\widehat{p}_{mk}(1-\\widehat{p}_{mk}) + \\alpha |T|$$\n\n* $|T|$ is the number of terminal nodes. \n\n* Essentially, we are trading training fit (first term) with model complexity (second) term (compare to lasso).\n\n* Now, cross-validation can be used to pick $\\alpha$.\n\n\n\n\n## Advantages and disadvantages of trees (again)\n\n🎉 Trees are very easy to explain (much easier than even linear regression). \n\n🎉 Some people believe that decision trees mirror human decision. \n\n🎉 Trees can easily be displayed graphically no matter the dimension of the data.\n\n🎉 Trees can easily handle qualitative predictors without the need to create dummy variables.\n\n💩 Trees aren't very good at prediction.\n\n💩 Trees are highly variable. Small changes in training data $\\Longrightarrow$ big changes in the tree.\n\nTo fix these last two, we can try to grow many trees and average their performance. \n\n. . 
.\n\nWe do this next module\n\n\n## KNN classifiers\n\n* We saw $k$-nearest neighbors in the last module.\n\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(class)\nknn3 <- knn(dat1[, -1], gr, dat1$y, k = 3)\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code code-fold=\"true\"}\ngr$nn03 <- knn3\nggplot(dat1, aes(x1, x2)) +\n scale_shape_manual(values = c(\"0\", \"1\"), guide = \"none\") +\n geom_raster(data = tibble(gr, disc = knn3), aes(x1, x2, fill = disc), alpha = .5) +\n geom_point(aes(shape = as.factor(y)), size = 4) +\n coord_cartesian(c(-2.5, 3), c(-2.5, 3)) +\n scale_fill_manual(values = c(orange, blue), labels = c(\"0\", \"1\")) +\n theme(\n legend.position = \"bottom\", legend.title = element_blank(),\n legend.key.width = unit(2, \"cm\")\n )\n```\n\n::: {.cell-output-display}\n![](17-nonlinear-classifiers_files/figure-revealjs/unnamed-chunk-3-1.svg){fig-align='center'}\n:::\n:::\n\n\n\n\n\n## Choosing $k$ is very important\n\n\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code code-fold=\"true\"}\nset.seed(406406406)\nks <- c(1, 2, 5, 10, 20)\nnn <- map(ks, ~ as_tibble(knn(dat1[, -1], gr[, 1:2], dat1$y, .x)) |> \n set_names(sprintf(\"k = %02s\", .x))) |>\n list_cbind() |>\n bind_cols(gr)\npg <- pivot_longer(nn, starts_with(\"k =\"), names_to = \"k\", values_to = \"knn\")\n\nggplot(pg, aes(x1, x2)) +\n geom_raster(aes(fill = knn), alpha = .6) +\n facet_wrap(~ k) +\n scale_fill_manual(values = c(orange, green), labels = c(\"0\", \"1\")) +\n geom_point(data = dat1, mapping = aes(x1, x2, shape = as.factor(y)), size = 4) +\n theme_bw(base_size = 18) +\n scale_shape_manual(values = c(\"0\", \"1\"), guide = \"none\") +\n coord_cartesian(c(-2.5, 3), c(-2.5, 3)) +\n theme(\n legend.title = element_blank(),\n legend.key.height = unit(3, \"cm\")\n )\n```\n\n::: {.cell-output-display}\n![](17-nonlinear-classifiers_files/figure-revealjs/unnamed-chunk-4-1.svg){fig-align='center'}\n:::\n:::\n\n\n\n\n* How should we choose $k$?\n\n* Scaling is also very important. \"Nearness\" is determined by distance, so better to standardize your data first.\n\n* If there are ties, break randomly. So even $k$ is strange.\n\n\n## `knn.cv()` (leave one out)\n\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nkmax <- 20\nerr <- map_dbl(1:kmax, ~ mean(knn.cv(dat1[, -1], dat1$y, k = .x) != dat1$y))\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n![](17-nonlinear-classifiers_files/figure-revealjs/unnamed-chunk-6-1.svg){fig-align='center'}\n:::\n:::\n\n\n\n\nI would use the _largest_ (odd) `k` that is close to the minimum. 
\nThis produces simpler, smoother, decision boundaries.\n\n\n## Alternative (using deviance loss, I think this is right)\n\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code code-fold=\"true\"}\ndev <- function(y, prob, prob_min = 1e-5) {\n y <- as.numeric(as.factor(y)) - 1 # 0/1 valued\n m <- mean(y)\n prob_max <- 1 - prob_min\n prob <- pmin(pmax(prob, prob_min), prob_max)\n lp <- (1 - y) * log(1 - prob) + y * log(prob)\n ly <- (1 - y) * log(1 - m) + y * log(m)\n 2 * (ly - lp)\n}\nknn.cv_probs <- function(train, cl, k = 1) {\n o <- knn.cv(train, cl, k = k, prob = TRUE)\n p <- attr(o, \"prob\")\n o <- as.numeric(as.factor(o)) - 1\n p[o == 0] <- 1 - p[o == 0]\n p\n}\ndev_err <- map_dbl(1:kmax, ~ mean(dev(dat1$y, knn.cv_probs(dat1[, -1], dat1$y, k = .x))))\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n![](17-nonlinear-classifiers_files/figure-revealjs/unnamed-chunk-8-1.svg){fig-align='center'}\n:::\n:::\n\n\n\n\n\n\n\n## Final version\n\n\n::: flex\n::: w-50\n\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code code-fold=\"true\"}\nkopt <- max(which(err == min(err)))\nkopt <- kopt + 1 * (kopt %% 2 == 0)\ngr$opt <- knn(dat1[, -1], gr[, 1:2], dat1$y, k = kopt)\ntt <- table(knn(dat1[, -1], dat1[, -1], dat1$y, k = kopt), dat1$y, dnn = c(\"predicted\", \"truth\"))\nggplot(dat1, aes(x1, x2)) +\n theme_bw(base_size = 24) +\n scale_shape_manual(values = c(\"0\", \"1\"), guide = \"none\") +\n geom_raster(data = gr, aes(x1, x2, fill = opt), alpha = .6) +\n geom_point(aes(shape = y), size = 4) +\n coord_cartesian(c(-2.5, 3), c(-2.5, 3)) +\n scale_fill_manual(values = c(orange, green), labels = c(\"0\", \"1\")) +\n theme(\n legend.position = \"bottom\", legend.title = element_blank(),\n legend.key.width = unit(2, \"cm\")\n )\n```\n\n::: {.cell-output-display}\n![](17-nonlinear-classifiers_files/figure-revealjs/unnamed-chunk-9-1.svg){fig-align='center'}\n:::\n:::\n\n\n\n\n:::\n\n::: w-50\n\n* Best $k$: 19\n\n* Misclassification error: 0.17\n\n* Confusion matrix:\n\n\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output .cell-output-stdout}\n\n```\n truth\npredicted 1 2\n 1 41 6\n 2 11 42\n```\n\n\n:::\n:::\n\n\n\n\n:::\n:::\n\n# Next time ... {background-image=\"https://i1.wp.com/bdtechtalks.com/wp-content/uploads/2018/12/artificial-intelligence-deep-learning-neural-networks-ai.jpg?w=1392&ssl=1\" background-opacity=.4}\n\n\n[Module 4]{.secondary}\n\n[boosting, bagging, random forests, and neural nets]{.secondary}\n",
+=======
+ "hash": "66668c5323e08003556d8ec7fd8757dc",
+ "result": {
+ "engine": "knitr",
+ "markdown": "---\nlecture: \"17 Nonlinear classifiers\"\nformat: revealjs\nmetadata-files: \n - _metadata.yml\n---\n\n\n## {{< meta lecture >}} {.large background-image=\"gfx/smooths.svg\" background-opacity=\"0.3\"}\n\n[Stat 406]{.secondary}\n\n[{{< meta author >}}]{.secondary}\n\nLast modified -- 23 October 2024\n\n\n\n\n\n$$\n\\DeclareMathOperator*{\\argmin}{argmin}\n\\DeclareMathOperator*{\\argmax}{argmax}\n\\DeclareMathOperator*{\\minimize}{minimize}\n\\DeclareMathOperator*{\\maximize}{maximize}\n\\DeclareMathOperator*{\\find}{find}\n\\DeclareMathOperator{\\st}{subject\\,\\,to}\n\\newcommand{\\E}{E}\n\\newcommand{\\Expect}[1]{\\E\\left[ #1 \\right]}\n\\newcommand{\\Var}[1]{\\mathrm{Var}\\left[ #1 \\right]}\n\\newcommand{\\Cov}[2]{\\mathrm{Cov}\\left[#1,\\ #2\\right]}\n\\newcommand{\\given}{\\ \\vert\\ }\n\\newcommand{\\X}{\\mathbf{X}}\n\\newcommand{\\x}{\\mathbf{x}}\n\\newcommand{\\y}{\\mathbf{y}}\n\\newcommand{\\P}{\\mathcal{P}}\n\\newcommand{\\R}{\\mathbb{R}}\n\\newcommand{\\norm}[1]{\\left\\lVert #1 \\right\\rVert}\n\\newcommand{\\snorm}[1]{\\lVert #1 \\rVert}\n\\newcommand{\\tr}[1]{\\mbox{tr}(#1)}\n\\newcommand{\\brt}{\\widehat{\\beta}^R_{s}}\n\\newcommand{\\brl}{\\widehat{\\beta}^R_{\\lambda}}\n\\newcommand{\\bls}{\\widehat{\\beta}_{ols}}\n\\newcommand{\\blt}{\\widehat{\\beta}^L_{s}}\n\\newcommand{\\bll}{\\widehat{\\beta}^L_{\\lambda}}\n\\newcommand{\\U}{\\mathbf{U}}\n\\newcommand{\\D}{\\mathbf{D}}\n\\newcommand{\\V}{\\mathbf{V}}\n$$\n\n\n\n\n## Two lectures ago\n\n\nWe discussed logistic regression\n\n$$\\begin{aligned}\nPr(Y = 1 \\given X=x) & = \\frac{\\exp\\{\\beta_0 + \\beta^{\\top}x\\}}{1 + \\exp\\{\\beta_0 + \\beta^{\\top}x\\}} \\\\\nPr(Y = 0 \\given X=x) & = \\frac{1}{1 + \\exp\\{\\beta_0 + \\beta^{\\top}x\\}}=1-\\frac{\\exp\\{\\beta_0 + \\beta^{\\top}x\\}}{1 + \\exp\\{\\beta_0 + \\beta^{\\top}x\\}}\\end{aligned}$$\n\n## Make it nonlinear\n\nWe can make logistic regression have non-linear decision boundaries by mapping the features to a higher dimension (just like with linear regression)\n\nSay:\n\n__Polynomials__\n\n$(x_1, x_2) \\mapsto \\left(1,\\ x_1,\\ x_1^2,\\ x_2,\\ x_2^2,\\ x_1 x_2\\right)$\n\n\n\n::: {.cell layout-align=\"center\"}\n\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndat1 <- generate_lda_2d(100, Sigma = .5 * diag(2)) |> mutate(y = as.factor(y))\nlogit_poly <- glm(y ~ x1 * x2 + I(x1^2) + I(x2^2), dat1, family = \"binomial\")\n```\n:::\n\n\n\n\n\n## Visualizing the classification boundary\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code code-fold=\"true\"}\nlibrary(cowplot)\ngr <- expand_grid(x1 = seq(-2.5, 3, length.out = 100), x2 = seq(-2.5, 3, length.out = 100))\npts_logit <- predict(logit_poly, gr)\ng0 <- ggplot(dat1, aes(x1, x2)) +\n scale_shape_manual(values = c(\"0\", \"1\"), guide = \"none\") +\n geom_raster(data = tibble(gr, disc = pts_logit), aes(x1, x2, fill = disc)) +\n geom_point(aes(shape = as.factor(y)), size = 4) +\n coord_cartesian(c(-2.5, 3), c(-2.5, 3)) +\n scale_fill_viridis_b(n.breaks = 6, alpha = .5, name = \"log odds\") +\n ggtitle(\"Polynomial logit\") +\n theme(legend.position = \"bottom\", legend.key.width = unit(1.5, \"cm\"))\nplot_grid(g0)\n```\n\n::: {.cell-output-display}\n![](17-nonlinear-classifiers_files/figure-revealjs/plot-d1-1.svg){fig-align='center'}\n:::\n:::\n\n\n\nA linear decision boundary in the higher-dimensional space corresponds to a non-linear decision boundary in low dimensions.\n\n\n## KNN classifiers\n\n* We saw $k$-nearest neighbors in the last module.\n\n\n\n::: 
{.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(class)\nknn3 <- knn(dat1[, -1], gr, dat1$y, k = 3)\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code code-fold=\"true\"}\ngr$nn03 <- knn3\nggplot(dat1, aes(x1, x2)) +\n scale_shape_manual(values = c(\"0\", \"1\"), guide = \"none\") +\n geom_raster(data = tibble(gr, disc = knn3), aes(x1, x2, fill = disc), alpha = .5) +\n geom_point(aes(shape = as.factor(y)), size = 4) +\n coord_cartesian(c(-2.5, 3), c(-2.5, 3)) +\n scale_fill_manual(values = c(orange, blue), labels = c(\"0\", \"1\")) +\n theme(\n legend.position = \"bottom\", legend.title = element_blank(),\n legend.key.width = unit(2, \"cm\")\n )\n```\n\n::: {.cell-output-display}\n![](17-nonlinear-classifiers_files/figure-revealjs/unnamed-chunk-3-1.svg){fig-align='center'}\n:::\n:::\n\n\n\n\n## Choosing $k$ is very important\n\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code code-fold=\"true\"}\nset.seed(406406406)\nks <- c(1, 2, 5, 10, 20)\nnn <- map(ks, ~ as_tibble(knn(dat1[, -1], gr[, 1:2], dat1$y, .x)) |> \n set_names(sprintf(\"k = %02s\", .x))) |>\n list_cbind() |>\n bind_cols(gr)\npg <- pivot_longer(nn, starts_with(\"k =\"), names_to = \"k\", values_to = \"knn\")\n\nggplot(pg, aes(x1, x2)) +\n geom_raster(aes(fill = knn), alpha = .6) +\n facet_wrap(~ k) +\n scale_fill_manual(values = c(orange, green), labels = c(\"0\", \"1\")) +\n geom_point(data = dat1, mapping = aes(x1, x2, shape = as.factor(y)), size = 4) +\n theme_bw(base_size = 18) +\n scale_shape_manual(values = c(\"0\", \"1\"), guide = \"none\") +\n coord_cartesian(c(-2.5, 3), c(-2.5, 3)) +\n theme(\n legend.title = element_blank(),\n legend.key.height = unit(3, \"cm\")\n )\n```\n\n::: {.cell-output-display}\n![](17-nonlinear-classifiers_files/figure-revealjs/unnamed-chunk-4-1.svg){fig-align='center'}\n:::\n:::\n\n\n\n* How should we choose $k$?\n\n* Scaling is also very important. \"Nearness\" is determined by distance, so better to standardize your data first.\n\n* If there are ties, break randomly. So even $k$ is strange.\n\n\n## `knn.cv()` (leave one out)\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nkmax <- 20\nerr <- map_dbl(1:kmax, ~ mean(knn.cv(dat1[, -1], dat1$y, k = .x) != dat1$y))\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n![](17-nonlinear-classifiers_files/figure-revealjs/unnamed-chunk-6-1.svg){fig-align='center'}\n:::\n:::\n\n\n\nI would use the _largest_ (odd) `k` that is close to the minimum. 
\nThis produces simpler, smoother, decision boundaries.\n\n\n\n## Final version\n\n\n::: flex\n::: w-50\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code code-fold=\"true\"}\nkopt <- max(which(err == min(err)))\nkopt <- kopt + 1 * (kopt %% 2 == 0)\ngr$opt <- knn(dat1[, -1], gr[, 1:2], dat1$y, k = kopt)\ntt <- table(knn(dat1[, -1], dat1[, -1], dat1$y, k = kopt), dat1$y, dnn = c(\"predicted\", \"truth\"))\nggplot(dat1, aes(x1, x2)) +\n theme_bw(base_size = 24) +\n scale_shape_manual(values = c(\"0\", \"1\"), guide = \"none\") +\n geom_raster(data = gr, aes(x1, x2, fill = opt), alpha = .6) +\n geom_point(aes(shape = y), size = 4) +\n coord_cartesian(c(-2.5, 3), c(-2.5, 3)) +\n scale_fill_manual(values = c(orange, green), labels = c(\"0\", \"1\")) +\n theme(\n legend.position = \"bottom\", legend.title = element_blank(),\n legend.key.width = unit(2, \"cm\")\n )\n```\n\n::: {.cell-output-display}\n![](17-nonlinear-classifiers_files/figure-revealjs/unnamed-chunk-7-1.svg){fig-align='center'}\n:::\n:::\n\n\n\n:::\n\n::: w-50\n\n* Best $k$: 19\n\n* Misclassification error: 0.17\n\n* Confusion matrix:\n\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output .cell-output-stdout}\n\n```\n truth\npredicted 1 2\n 1 41 6\n 2 11 42\n```\n\n\n:::\n:::\n\n\n\n:::\n:::\n\n\n\n## Trees\n\n::: flex\n\n::: w-50\nWe saw regression trees last module\n\nClassification trees are \n\n- More natural\n- Slightly different computationally\n\nEverything else is pretty much the same\n:::\n\n::: w-50\n![](https://upload.wikimedia.org/wikipedia/commons/e/eb/Decision_Tree.jpg)\n:::\n:::\n\n\n\n## Axis-parallel splits\n\nLike with regression trees, classification trees operate by greedily splitting the predictor space\n\n\n\n::: {.cell layout-align=\"center\"}\n\n:::\n\n\n\n::: flex\n::: w-50\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnames(bakeoff)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n [1] \"winners\" \n [2] \"series\" \n [3] \"age\" \n [4] \"occupation\" \n [5] \"hometown\" \n [6] \"percent_star\" \n [7] \"percent_technical_wins\" \n [8] \"percent_technical_bottom3\"\n [9] \"percent_technical_top3\" \n[10] \"technical_highest\" \n[11] \"technical_lowest\" \n[12] \"technical_median\" \n[13] \"judge1\" \n[14] \"judge2\" \n[15] \"viewers_7day\" \n[16] \"viewers_28day\" \n```\n\n\n:::\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsmalltree <- tree(\n winners ~ technical_median + percent_star,\n data = bakeoff\n)\n```\n:::\n\n\n\n:::\n\n\n::: w-50\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code code-fold=\"true\"}\npar(mar = c(5, 5, 0, 0) + .1)\nplot(bakeoff$technical_median, bakeoff$percent_star,\n pch = c(\"-\", \"+\")[bakeoff$winners + 1], cex = 2, bty = \"n\", las = 1,\n ylab = \"% star baker\", xlab = \"times above median in technical\",\n col = orange, cex.axis = 2, cex.lab = 2\n)\npartition.tree(smalltree,\n add = TRUE, col = blue,\n ordvars = c(\"technical_median\", \"percent_star\")\n)\n```\n\n::: {.cell-output-display}\n![](17-nonlinear-classifiers_files/figure-revealjs/plot-partition-1.svg){fig-align='center'}\n:::\n:::\n\n\n:::\n:::\n\n\n## When do trees do well?\n\n::: flex\n::: w-50\n![](gfx/8.7.png)\n:::\n\n::: w-50\n\n[2D example]{.hand}\n\n[Top Row:]{.primary} \n\ntrue decision boundary is linear\n\n🍎 linear classifier \n\n👎 tree with axis-parallel splits\n\n[Bottom Row:]{.primary}\n\ntrue decision boundary is non-linear\n\n🤮 A linear classifier can't capture the true decision boundary\n\n🍎 decision tree is 
successful.\n:::\n:::\n\n\n\n\n## How do we build a tree?\n\n\n1. Divide the predictor space into\n$J$ non-overlapping regions $R_1, \\ldots, R_J$ \n\n > this is done via greedy, recursive binary splitting\n\n2. Every observation that falls into a given region $R_j$ is given the same prediction\n\n > determined by majority (or plurality) vote in that region.\n\n\n\n[Important:]{.hand}\n\n* Trees can only make rectangular regions that are aligned with the coordinate axis.\n\n* We use a *greedy* (not optimal) algorithm to fit the tree\n\n\n## Flashback: Constructing Trees for Regression\n\n* While ($\\mathtt{depth} \\ne \\mathtt{max.depth}$):\n * For each existing region $R_k$\n * For a given *splitting variable* $j$ and *split value* $s$,\n define\n $$\n \\begin{align}\n R_k^> &= \\{x \\in R_k : x^{(j)} > s\\} \\\\\n R_k^< &= \\{x \\in R_k : x^{(j)} > s\\}\n \\end{align}\n $$\n * Choose $j$ and $s$ \n to *maximize quality of fit*; i.e.\n $$\\min |R_k^>| \\cdot \\widehat{Var}(R_k^>) + |R_k^<| \\cdot \\widehat{Var}(R_k^<)$$\n\n. . .\n\n[We have to change this last line for classification]{.secondary}\n\n\n\n\n\n## How do we measure quality of fit?\n\n\nLet $p_{mk}$ be the proportion of training observations in the $m^{th}$\nregion that are from the $k^{th}$ class.\n\n| | |\n|---|---|\n| __classification error rate:__ | $E = 1 - \\max_k (\\widehat{p}_{mk})$|\n| __Gini index:__ | $G = \\sum_k \\widehat{p}_{mk}(1-\\widehat{p}_{mk})$ |\n| __cross-entropy:__ | $D = -\\sum_k \\widehat{p}_{mk}\\log(\\widehat{p}_{mk})$|\n\n\nBoth Gini and cross-entropy measure the purity of the classifier (small if all $p_{mk}$ are near zero or 1). \n\nClassification error is hard to optimize.\n\nWe build a classifier by growing a tree that minimizes $G$ or $D$.\n\n\n\n\n\n\n## Advantages and disadvantages of trees (again)\n\n🎉 Trees are very easy to explain (much easier than even linear regression). \n\n🎉 Some people believe that decision trees mirror human decision. \n\n🎉 Trees can easily be displayed graphically no matter the dimension of the data.\n\n🎉 Trees can easily handle qualitative predictors without the need to create dummy variables.\n\n💩 Trees aren't very good at prediction.\n\n💩 Trees are highly variable. Small changes in training data $\\Longrightarrow$ big changes in the tree.\n\nTo fix these last two, we can try to grow many trees and average their performance. \n\n. . .\n\nWe do this next module\n",
+>>>>>>> 1c36b39 (Update last of classification slides)
"supporting": [
"17-nonlinear-classifiers_files"
],
diff --git a/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/plot-d1-1.svg b/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/plot-d1-1.svg
index 657a170..eaacf57 100644
--- a/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/plot-d1-1.svg
+++ b/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/plot-d1-1.svg
@@ -1,4 +1,5 @@
+<<<<<<< HEAD
@@ -1314,5 +1315,679 @@
+=======
[... SVG markup omitted from this hunk ...]
+>>>>>>> 1c36b39 (Update last of classification slides)
diff --git a/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/plot-partition-1.svg b/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/plot-partition-1.svg
index c52c7d5..f6f4e40 100644
--- a/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/plot-partition-1.svg
+++ b/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/plot-partition-1.svg
@@ -2,6 +2,7 @@
+<<<<<<< HEAD
@@ -512,5 +513,504 @@
+=======
[... SVG markup omitted from this hunk ...]
+>>>>>>> 1c36b39 (Update last of classification slides)
diff --git a/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-3-1.svg b/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-3-1.svg
index 1e0279a..6a44f05 100644
--- a/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-3-1.svg
+++ b/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-3-1.svg
@@ -2,6 +2,7 @@
+<<<<<<< HEAD
@@ -582,5 +583,578 @@
+=======
[... SVG markup omitted from this hunk ...]
+>>>>>>> 1c36b39 (Update last of classification slides)
diff --git a/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-4-1.svg b/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-4-1.svg
index f9d64bd..2e2624c 100644
--- a/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-4-1.svg
+++ b/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-4-1.svg
@@ -2,6 +2,7 @@
+<<<<<<< HEAD
@@ -2599,5 +2600,2574 @@
+=======
[... SVG markup omitted from this hunk ...]
+>>>>>>> 1c36b39 (Update last of classification slides)
diff --git a/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-6-1.svg b/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-6-1.svg
index ab3b6eb..f552efc 100644
--- a/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-6-1.svg
+++ b/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-6-1.svg
@@ -2,6 +2,7 @@
+<<<<<<< HEAD
@@ -271,5 +272,265 @@
+=======
+ [SVG figure markup omitted]
+>>>>>>> 1c36b39 (Update last of classification slides)
diff --git a/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-7-1.svg b/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-7-1.svg
index e5f32d9..ae30e68 100644
--- a/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-7-1.svg
+++ b/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-7-1.svg
@@ -3,587 +3,575 @@
- [old SVG figure markup omitted]
+ [new SVG figure markup omitted]
diff --git a/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-8-1.svg b/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-8-1.svg
index c0b3517..88568ed 100644
--- a/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-8-1.svg
+++ b/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-8-1.svg
@@ -2,6 +2,7 @@
+<<<<<<< HEAD
@@ -224,5 +225,218 @@
+=======
+ [SVG figure markup omitted]
+>>>>>>> 1c36b39 (Update last of classification slides)
diff --git a/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-9-1.svg b/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-9-1.svg
index 8160a1b..4cb207b 100644
--- a/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-9-1.svg
+++ b/_freeze/schedule/slides/17-nonlinear-classifiers/figure-revealjs/unnamed-chunk-9-1.svg
@@ -2,6 +2,7 @@
+<<<<<<< HEAD
@@ -587,5 +588,578 @@
+=======
+ [SVG figure markup omitted]
+>>>>>>> 1c36b39 (Update last of classification slides)
diff --git a/schedule/slides/00-classification-losses.qmd b/schedule/slides/00-classification-losses.qmd
index ff2f7c0..354f6ab 100644
--- a/schedule/slides/00-classification-losses.qmd
+++ b/schedule/slides/00-classification-losses.qmd
@@ -12,23 +12,24 @@ metadata-files:
[So far]{.secondary} --- 0-1 loss. If correct class, lose 0 else lose 1.
-[Asymmetric classification loss]{.secondary} --- If correct class, lose 0 else lose something.
+. . .
+
+[Generalization: Asymmetric classification loss]{.secondary} --- If correct class, lose 0 else lose something.
-For example, consider facial recognition. Goal is "person OK", "person has expired passport", "person is a known terrorist"
+\
+E.g. MRI screening. Goal is "person OK", "person has a disease"
-1. If classify OK, but was terrorist, lose 1,000,000
-1. If classify OK, but expired passport, lose 2
-1. If classify terrorist, but was OK, lose 100
-1. If classify terrorist, but was expired passport, lose 10
+1. If classify OK, but was disease, lose 1,000,000
+1. If classify disease, but was OK, lose 10
1. etc.
. . .
-Results in a 3x3 matrix of losses with 0 on the diagonal.
+Results in a 2x2 matrix of losses with 0 on the diagonal.
```{r echo=FALSE, R.options=list(scipen=8)}
-matrix(c(0, 10, 1000000, 2, 0, 50000, 30, 100, 0), nrow = 3)
+matrix(c(0, 10, 1000000, 0), nrow = 2)
```
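
Given such a loss matrix, predictions should minimize expected loss rather than simply take the most probable class. A minimal R sketch (not from the slides; the probabilities are invented, and rows of `L` are read as the predicted class and columns as the true class, matching the numbers above):

```r
L <- matrix(c(0, 10, 1000000, 0), nrow = 2)  # rows = predicted, cols = true
phat <- c(OK = 0.98, disease = 0.02)         # hypothetical predicted probabilities
expected_loss <- L %*% phat                  # expected loss of each possible call
which.min(expected_loss)                     # -> 2: call it "disease" even though P(OK) = 0.98
```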
@@ -48,7 +49,7 @@ LDA and QDA produce probabilities as well. So do Neural Networks (typically)
* Deviance loss for 2-class classification is $-2\textrm{loglikelihood}(y, \hat{p}) = -2 (y_i x'_i\hat{\beta} + \log (1-\hat{p}_i))$
-(Technically, it's the difference between this and the loss of the null model, but people play fast and loose)
+
* Could also use cross entropy or Gini index.
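
Written in terms of $\hat p$ rather than $x'\hat\beta$, the same quantity is $-2\,(y\log\hat p + (1-y)\log(1-\hat p))$. A small numerical sketch with made-up values (not the slides' data):

```r
deviance_loss <- function(y, phat) -2 * (y * log(phat) + (1 - y) * log(1 - phat))
y    <- c(1, 0, 1)
phat <- c(0.9, 0.2, 0.4)
deviance_loss(y, phat)  # ~0.21, 0.45, 1.83 -- small when phat agrees with y
```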
@@ -65,9 +66,10 @@ The same goes for the other probabilities. If we say "20% chance of rain" it sho
Of course, we didn't predict **exactly** $\hat{p}(x_i)=0.2$ ever, so let's look at $[.15, .25]$.
+
+## Calibration plot
+
```{r}
-#| code-line-numbers: "1-6|7|8-9"
-#| output-location: fragment
n <- 250
dat <- tibble(
x = seq(-5, 5, length.out = n),
@@ -76,15 +78,6 @@ dat <- tibble(
)
fit <- glm(y ~ x, family = binomial, data = dat)
dat$phat <- predict(fit, type = "response") # predicted probabilities
-dat |>
- filter(phat > .15, phat < .25) |>
- summarize(target = .2, obs = mean(y))
-```
-
-
-## Calibration plot
-
-```{r}
binary_calibration_plot <- function(y, phat, nbreaks = 10) {
dat <- tibble(y = y, phat = phat) |>
mutate(bins = cut_number(phat, n = nbreaks))
@@ -152,26 +145,34 @@ Specificity
+## Decision making
+
+Given a logistic regression output $\hat P(Y = 1 \mid X) = 0.56$,
+should we assign $\hat Y = 1$ or $\hat Y = 0$?
+
+E.g. $P(Y=1 \mid X)$ is predicted probability that email $X$ is spam. \
+Do we send it to the spam folder ($\hat Y=1$) or the inbox ($\hat Y=0$)?
+
+. . .
+
+So far we've been making the "decision" $\hat Y=1$ if $\hat P(Y=1 \mid X) > \hat P(Y=0 \mid X)$ \
+i.e. $\hat Y = \begin{cases} 1 & \hat P(Y=1 \mid X) > 0.5 \\ 0 & \mathrm{o.w.} \end{cases}$.
+
+But maybe (for our application) a "better" decision is $$\hat Y = \begin{cases} 1 & \hat P(Y=1 \mid X) > t \\ 0 & \mathrm{o.w.} \end{cases}$$
+
+
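
A small sketch of what changing the threshold does, assuming `dat$phat` and `dat$y` from the earlier chunk (the value `t = 0.8` is arbitrary, e.g. only sending mail to spam when we're quite sure):

```r
t <- 0.8
yhat <- as.integer(dat$phat > t)        # 1 = spam folder, 0 = inbox
table(predicted = yhat, truth = dat$y)  # error counts shift as t moves
```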
## ROC and thresholds
ROC (Receiver Operating Characteristic) Curve
: TPR (sensitivity) vs. FPR (1 - specificity)
+: [Each point corresponds to a different $0 \leq t \leq 1$.]{.small}
AUC (Area under the curve)
: Integral of ROC. Closer to 1 is better.
-
-So far, we've been thresholding at 0.5, though you shouldn't always do that.
-
-With unbalanced data (say 10% 0 and 90% 1), if you care equally about predicting both classes, you might want to choose a different cutoff (like in LDA).
-
-To make the [ROC]{.secondary} we look at our errors [as we vary the cutoff]{.secondary}
-
-
-## ROC curve
```{r}
-#| output-location: column-fragment
+#| output-location: column
#| fig-width: 8
#| fig-height: 6
roc <- function(prediction, y) {
@@ -194,10 +195,19 @@ ggplot(roc(dat$phat, dat$y), aes(FPR, TPR)) +
geom_abline(slope = 1, intercept = 0)
```
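
One way to compute the AUC from this `roc()` output is the trapezoid rule. A sketch, assuming `roc()` returns a data frame with `FPR` and `TPR` columns as used in the plot:

```r
auc <- function(roc_df) {
  roc_df <- roc_df[order(roc_df$FPR, roc_df$TPR), ]  # ensure FPR is increasing
  with(roc_df, sum(diff(FPR) * (head(TPR, -1) + tail(TPR, -1)) / 2))
}
auc(roc(dat$phat, dat$y))  # closer to 1 is better
```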
-
-
## Other stuff
![](gfx/huge-roc.png)
* Source: worth exploring [Wikipedia](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)
+
+
+
+
+
+# Next time ... {background-image="https://i1.wp.com/bdtechtalks.com/wp-content/uploads/2018/12/artificial-intelligence-deep-learning-neural-networks-ai.jpg?w=1392&ssl=1" background-opacity=.4}
+
+
+[Module 4]{.secondary}
+
+[boosting, bagging, random forests, and neural nets]{.secondary}
diff --git a/schedule/slides/17-nonlinear-classifiers.qmd b/schedule/slides/17-nonlinear-classifiers.qmd
index 366cce2..53dba6b 100644
--- a/schedule/slides/17-nonlinear-classifiers.qmd
+++ b/schedule/slides/17-nonlinear-classifiers.qmd
@@ -7,10 +7,10 @@ metadata-files:
{{< include _titleslide.qmd >}}
-## Last time
+## Two lectures ago
-We reviewed logistic regression
+We discussed logistic regression
$$\begin{aligned}
Pr(Y = 1 \given X=x) & = \frac{\exp\{\beta_0 + \beta^{\top}x\}}{1 + \exp\{\beta_0 + \beta^{\top}x\}} \\
@@ -18,7 +18,7 @@ Pr(Y = 0 \given X=x) & = \frac{1}{1 + \exp\{\beta_0 + \beta^{\top}x\}}=1-\frac{\
## Make it nonlinear
-We can make LDA or logistic regression have non-linear decision boundaries by mapping the features to a higher dimension (just like with regular regression)
+We can make logistic regression have non-linear decision boundaries by mapping the features to a higher dimension (just like with linear regression)
Say:
@@ -45,7 +45,6 @@ generate_lda_2d <- function(
```{r}
dat1 <- generate_lda_2d(100, Sigma = .5 * diag(2)) |> mutate(y = as.factor(y))
logit_poly <- glm(y ~ x1 * x2 + I(x1^2) + I(x2^2), dat1, family = "binomial")
-lda_poly <- lda(y ~ x1 * x2 + I(x1^2) + I(x2^2), dat1)
```
@@ -54,12 +53,11 @@ lda_poly <- lda(y ~ x1 * x2 + I(x1^2) + I(x2^2), dat1)
```{r plot-d1}
#| code-fold: true
-#| fig-width: 10
+#| fig-width: 5
#| fig-height: 5
library(cowplot)
gr <- expand_grid(x1 = seq(-2.5, 3, length.out = 100), x2 = seq(-2.5, 3, length.out = 100))
pts_logit <- predict(logit_poly, gr)
-pts_lda <- predict(lda_poly, gr)
g0 <- ggplot(dat1, aes(x1, x2)) +
scale_shape_manual(values = c("0", "1"), guide = "none") +
geom_raster(data = tibble(gr, disc = pts_logit), aes(x1, x2, fill = disc)) +
@@ -68,21 +66,143 @@ g0 <- ggplot(dat1, aes(x1, x2)) +
scale_fill_viridis_b(n.breaks = 6, alpha = .5, name = "log odds") +
ggtitle("Polynomial logit") +
theme(legend.position = "bottom", legend.key.width = unit(1.5, "cm"))
-g1 <- ggplot(dat1, aes(x1, x2)) +
+plot_grid(g0)
+```
+
+A linear decision boundary in the higher-dimensional space corresponds to a non-linear decision boundary in low dimensions.
+
+
+## KNN classifiers
+
+<<<<<<< HEAD
+=======
+* We saw $k$-nearest neighbors in the last module.
+
+```{r}
+library(class)
+knn3 <- knn(dat1[, -1], gr, dat1$y, k = 3)
+```
+
+```{r}
+#| code-fold: true
+#| fig-width: 8
+#| fig-height: 4
+gr$nn03 <- knn3
+ggplot(dat1, aes(x1, x2)) +
scale_shape_manual(values = c("0", "1"), guide = "none") +
- geom_raster(data = tibble(gr, disc = pts_lda$x), aes(x1, x2, fill = disc)) +
+ geom_raster(data = tibble(gr, disc = knn3), aes(x1, x2, fill = disc), alpha = .5) +
geom_point(aes(shape = as.factor(y)), size = 4) +
coord_cartesian(c(-2.5, 3), c(-2.5, 3)) +
- scale_fill_viridis_b(n.breaks = 6, alpha = .5, name = bquote(delta[1] - delta[0])) +
- ggtitle("Polynomial lda") +
- theme(legend.position = "bottom", legend.key.width = unit(1.5, "cm"))
-plot_grid(g0, g1)
+ scale_fill_manual(values = c(orange, blue), labels = c("0", "1")) +
+ theme(
+ legend.position = "bottom", legend.title = element_blank(),
+ legend.key.width = unit(2, "cm")
+ )
```
-A linear decision boundary in the higher-dimensional space corresponds to a non-linear decision boundary in low dimensions.
+## Choosing $k$ is very important
+```{r}
+#| code-fold: true
+#| fig-width: 16
+#| fig-height: 5
+set.seed(406406406)
+ks <- c(1, 2, 5, 10, 20)
+nn <- map(ks, ~ as_tibble(knn(dat1[, -1], gr[, 1:2], dat1$y, .x)) |>
+ set_names(sprintf("k = %02s", .x))) |>
+ list_cbind() |>
+ bind_cols(gr)
+pg <- pivot_longer(nn, starts_with("k ="), names_to = "k", values_to = "knn")
+
+ggplot(pg, aes(x1, x2)) +
+ geom_raster(aes(fill = knn), alpha = .6) +
+ facet_wrap(~ k) +
+ scale_fill_manual(values = c(orange, green), labels = c("0", "1")) +
+ geom_point(data = dat1, mapping = aes(x1, x2, shape = as.factor(y)), size = 4) +
+ theme_bw(base_size = 18) +
+ scale_shape_manual(values = c("0", "1"), guide = "none") +
+ coord_cartesian(c(-2.5, 3), c(-2.5, 3)) +
+ theme(
+ legend.title = element_blank(),
+ legend.key.height = unit(3, "cm")
+ )
+```
+
+* How should we choose $k$?
+
+* Scaling is also very important. "Nearness" is determined by distance, so it's better to standardize your data first (see the sketch below).
+
+* If there are ties, break randomly. So even $k$ is strange.
+
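
A minimal standardization sketch using `dat1` and `gr` from the chunks above; the important detail is that the grid/test points are scaled with the *training* means and SDs (here the simulated features are already on similar scales, so this mostly matters for real data):

```r
x_train <- scale(dat1[, -1])  # center and scale the training features
x_test <- scale(gr[, 1:2],
                center = attr(x_train, "scaled:center"),
                scale = attr(x_train, "scaled:scale"))
knn_scaled <- knn(x_train, x_test, dat1$y, k = 3)
```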
+
+## `knn.cv()` (leave one out)
+
+```{r}
+kmax <- 20
+err <- map_dbl(1:kmax, ~ mean(knn.cv(dat1[, -1], dat1$y, k = .x) != dat1$y))
+```
+
+```{r}
+#| echo: false
+ggplot(data.frame(k = 1:kmax, error = err), aes(k, error)) +
+ geom_point(color = orange) +
+ geom_line(color = orange)
+```
+
+I would use the _largest_ (odd) `k` that is close to the minimum.
+This produces simpler, smoother decision boundaries.
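
One way to make "close to the minimum" concrete (the tolerance is arbitrary; a sketch using `err` from the chunk above):

```r
tol <- 0.01                                 # "close" = within 0.01 of the best CV error
candidates <- which(err <= min(err) + tol)
odd_candidates <- candidates[candidates %% 2 == 1]
k_choice <- max(odd_candidates)             # assumes at least one odd k qualifies
```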
+
+
+
+## Final version
+
+
+::: flex
+::: w-50
+
+```{r}
+#| code-fold: true
+#| fig-height: 6
+#| fig-width: 6
+kopt <- max(which(err == min(err)))
+kopt <- kopt + 1 * (kopt %% 2 == 0)
+gr$opt <- knn(dat1[, -1], gr[, 1:2], dat1$y, k = kopt)
+tt <- table(knn(dat1[, -1], dat1[, -1], dat1$y, k = kopt), dat1$y, dnn = c("predicted", "truth"))
+ggplot(dat1, aes(x1, x2)) +
+ theme_bw(base_size = 24) +
+ scale_shape_manual(values = c("0", "1"), guide = "none") +
+ geom_raster(data = gr, aes(x1, x2, fill = opt), alpha = .6) +
+ geom_point(aes(shape = y), size = 4) +
+ coord_cartesian(c(-2.5, 3), c(-2.5, 3)) +
+ scale_fill_manual(values = c(orange, green), labels = c("0", "1")) +
+ theme(
+ legend.position = "bottom", legend.title = element_blank(),
+ legend.key.width = unit(2, "cm")
+ )
+```
+
+:::
+
+::: w-50
+
+* Best $k$: `r kopt`
+
+* Misclassification error: `r 1-sum(diag(tt))/sum(tt)`
+
+* Confusion matrix:
+
+```{r echo=FALSE}
+tt
+```
+
+:::
+:::
+
+
+
+>>>>>>> 1c36b39 (Update last of classification slides)
## Trees
::: flex
@@ -202,8 +322,29 @@ $J$ non-overlapping regions $R_1, \ldots, R_J$
[Important:]{.hand}
* Trees can only make rectangular regions that are aligned with the coordinate axis.
-* The fit is _greedy_, which means that after a split is made, all further decisions are conditional on that split.
+* We use a *greedy* (not optimal) algorithm to fit the tree
+
+
+## Flashback: Constructing Trees for Regression
+
+* While ($\mathtt{depth} \ne \mathtt{max.depth}$):
+ * For each existing region $R_k$
+ * For a given *splitting variable* $j$ and *split value* $s$,
+ define
+ $$
+ \begin{align}
+ R_k^> &= \{x \in R_k : x^{(j)} > s\} \\
+      R_k^< &= \{x \in R_k : x^{(j)} \leq s\}
+ \end{align}
+ $$
+    * Choose $j$ and $s$
+      to *maximize quality of fit* (a small split-search sketch follows below); i.e.
+      $$\min_{j, s}\; |R_k^>| \cdot \widehat{Var}(R_k^>) + |R_k^<| \cdot \widehat{Var}(R_k^<)$$
+
+. . .
+
+[We have to change this last line for classification]{.secondary}
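
A compact sketch of the inner split search for a single feature within one region (illustrative only, not the slides' code; `x` holds the feature values and `y` the responses in $R_k$):

```r
best_split <- function(x, y) {
  rss <- function(v) sum((v - mean(v))^2)  # sum of squared deviations = |R| * Var-hat(R) (1/|R| variance)
  s_grid <- sort(unique(x))
  s_grid <- s_grid[-length(s_grid)]        # drop the max so both sides stay non-empty
  crit <- sapply(s_grid, function(s) rss(y[x > s]) + rss(y[x <= s]))
  list(s = s_grid[which.min(crit)], criterion = min(crit))
}
```

A full tree repeats this over every feature and every current region, keeps the best $(j, s)$, and recurses until `max.depth`.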
@@ -224,14 +365,12 @@ region that are from the $k^{th}$ class.
Both Gini and cross-entropy measure the purity of the classifier (small if all $p_{mk}$ are near zero or 1).
-These are preferred over the classification error rate.
-
Classification error is hard to optimize.
We build a classifier by growing a tree that minimizes $G$ or $D$.
-
+
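
A quick numerical illustration, using the usual definitions $G = \sum_k \hat p_{mk}(1-\hat p_{mk})$ and $D = -\sum_k \hat p_{mk}\log \hat p_{mk}$ (the class proportions here are made up):

```r
gini <- function(p) sum(p * (1 - p))
entropy <- function(p) -sum(ifelse(p > 0, p * log(p), 0))
pure <- c(0.99, 0.01)   # node almost entirely one class
mixed <- c(0.5, 0.5)    # evenly split node
c(gini(pure), gini(mixed))        # 0.0198 vs 0.5
c(entropy(pure), entropy(mixed))  # ~0.056 vs ~0.693
```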
@@ -271,169 +410,3 @@ To fix these last two, we can try to grow many trees and average their performan
. . .
We do this next module
-
-
-## KNN classifiers
-
-* We saw $k$-nearest neighbors in the last module.
-
-```{r}
-library(class)
-knn3 <- knn(dat1[, -1], gr, dat1$y, k = 3)
-```
-
-```{r}
-#| code-fold: true
-#| fig-width: 8
-#| fig-height: 4
-gr$nn03 <- knn3
-ggplot(dat1, aes(x1, x2)) +
- scale_shape_manual(values = c("0", "1"), guide = "none") +
- geom_raster(data = tibble(gr, disc = knn3), aes(x1, x2, fill = disc), alpha = .5) +
- geom_point(aes(shape = as.factor(y)), size = 4) +
- coord_cartesian(c(-2.5, 3), c(-2.5, 3)) +
- scale_fill_manual(values = c(orange, blue), labels = c("0", "1")) +
- theme(
- legend.position = "bottom", legend.title = element_blank(),
- legend.key.width = unit(2, "cm")
- )
-```
-
-
-## Choosing $k$ is very important
-
-
-```{r}
-#| code-fold: true
-#| fig-width: 16
-#| fig-height: 5
-set.seed(406406406)
-ks <- c(1, 2, 5, 10, 20)
-nn <- map(ks, ~ as_tibble(knn(dat1[, -1], gr[, 1:2], dat1$y, .x)) |>
- set_names(sprintf("k = %02s", .x))) |>
- list_cbind() |>
- bind_cols(gr)
-pg <- pivot_longer(nn, starts_with("k ="), names_to = "k", values_to = "knn")
-
-ggplot(pg, aes(x1, x2)) +
- geom_raster(aes(fill = knn), alpha = .6) +
- facet_wrap(~ k) +
- scale_fill_manual(values = c(orange, green), labels = c("0", "1")) +
- geom_point(data = dat1, mapping = aes(x1, x2, shape = as.factor(y)), size = 4) +
- theme_bw(base_size = 18) +
- scale_shape_manual(values = c("0", "1"), guide = "none") +
- coord_cartesian(c(-2.5, 3), c(-2.5, 3)) +
- theme(
- legend.title = element_blank(),
- legend.key.height = unit(3, "cm")
- )
-```
-
-* How should we choose $k$?
-
-* Scaling is also very important. "Nearness" is determined by distance, so better to standardize your data first.
-
-* If there are ties, break randomly. So even $k$ is strange.
-
-
-## `knn.cv()` (leave one out)
-
-```{r}
-kmax <- 20
-err <- map_dbl(1:kmax, ~ mean(knn.cv(dat1[, -1], dat1$y, k = .x) != dat1$y))
-```
-
-```{r}
-#| echo: false
-ggplot(data.frame(k = 1:kmax, error = err), aes(k, error)) +
- geom_point(color = orange) +
- geom_line(color = orange)
-```
-
-I would use the _largest_ (odd) `k` that is close to the minimum.
-This produces simpler, smoother, decision boundaries.
-
-
-## Alternative (using deviance loss, I think this is right)
-
-```{r}
-#| code-fold: true
-dev <- function(y, prob, prob_min = 1e-5) {
- y <- as.numeric(as.factor(y)) - 1 # 0/1 valued
- m <- mean(y)
- prob_max <- 1 - prob_min
- prob <- pmin(pmax(prob, prob_min), prob_max)
- lp <- (1 - y) * log(1 - prob) + y * log(prob)
- ly <- (1 - y) * log(1 - m) + y * log(m)
- 2 * (ly - lp)
-}
-knn.cv_probs <- function(train, cl, k = 1) {
- o <- knn.cv(train, cl, k = k, prob = TRUE)
- p <- attr(o, "prob")
- o <- as.numeric(as.factor(o)) - 1
- p[o == 0] <- 1 - p[o == 0]
- p
-}
-dev_err <- map_dbl(1:kmax, ~ mean(dev(dat1$y, knn.cv_probs(dat1[, -1], dat1$y, k = .x))))
-```
-
-```{r}
-#| echo: false
-ggplot(data.frame(k = 1:kmax, error = dev_err), aes(k, error)) +
- geom_point(color = orange) +
- geom_line(color = orange)
-```
-
-
-
-
-## Final version
-
-
-::: flex
-::: w-50
-
-```{r}
-#| code-fold: true
-#| fig-height: 6
-#| fig-width: 6
-kopt <- max(which(err == min(err)))
-kopt <- kopt + 1 * (kopt %% 2 == 0)
-gr$opt <- knn(dat1[, -1], gr[, 1:2], dat1$y, k = kopt)
-tt <- table(knn(dat1[, -1], dat1[, -1], dat1$y, k = kopt), dat1$y, dnn = c("predicted", "truth"))
-ggplot(dat1, aes(x1, x2)) +
- theme_bw(base_size = 24) +
- scale_shape_manual(values = c("0", "1"), guide = "none") +
- geom_raster(data = gr, aes(x1, x2, fill = opt), alpha = .6) +
- geom_point(aes(shape = y), size = 4) +
- coord_cartesian(c(-2.5, 3), c(-2.5, 3)) +
- scale_fill_manual(values = c(orange, green), labels = c("0", "1")) +
- theme(
- legend.position = "bottom", legend.title = element_blank(),
- legend.key.width = unit(2, "cm")
- )
-```
-
-:::
-
-::: w-50
-
-* Best $k$: `r kopt`
-
-* Misclassification error: `r 1-sum(diag(tt))/sum(tt)`
-
-* Confusion matrix:
-
-```{r echo=FALSE}
-tt
-```
-
-:::
-:::
-
-# Next time ... {background-image="https://i1.wp.com/bdtechtalks.com/wp-content/uploads/2018/12/artificial-intelligence-deep-learning-neural-networks-ai.jpg?w=1392&ssl=1" background-opacity=.4}
-
-
-[Module 4]{.secondary}
-
-[boosting, bagging, random forests, and neural nets]{.secondary}