LinearRegression.html

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />


<title>Linear Regression</title>

<script src="site_libs/header-attrs-2.14/header-attrs.js"></script>
<script src="site_libs/jquery-3.6.0/jquery-3.6.0.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/cerulean.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<!-- Base typography: fixed pixel sizes for h1–h6 (the page title h1.title is
     slightly larger than a regular h1), inline code that inherits the
     surrounding text color on a faint translucent background, and a white
     background for <pre> blocks that carry no class attribute. -->
<style>h1 {font-size: 34px;}
       h1.title {font-size: 38px;}
       h2 {font-size: 30px;}
       h3 {font-size: 24px;}
       h4 {font-size: 18px;}
       h5 {font-size: 16px;}
       h6 {font-size: 12px;}
       code {color: inherit; background-color: rgba(0, 0, 0, 0.04);}
       pre:not([class]) { background-color: white }</style>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<script src="site_libs/navigation-1.1/codefolding.js"></script>
<script src="site_libs/htmlwidgets-1.5.4/htmlwidgets.js"></script>
<script src="site_libs/plotly-binding-4.10.0/plotly.js"></script>
<script src="site_libs/typedarray-0.1/typedarray.min.js"></script>
<link href="site_libs/crosstalk-1.2.0/css/crosstalk.min.css" rel="stylesheet" />
<script src="site_libs/crosstalk-1.2.0/js/crosstalk.min.js"></script>
<link href="site_libs/plotly-htmlwidgets-css-2.5.1/plotly-htmlwidgets.css" rel="stylesheet" />
<script src="site_libs/plotly-main-2.5.1/plotly-latest.min.js"></script>

<!-- Small utility classes: wrapping code, small-caps and underline spans,
     50%-wide inline-block columns, hanging indents, and bullet-less task
     lists. -->
<style type="text/css">
  code{white-space: pre-wrap;}
  span.smallcaps{font-variant: small-caps;}
  span.underline{text-decoration: underline;}
  div.column{display: inline-block; vertical-align: top; width: 50%;}
  div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
  ul.task-list{list-style: none;}
    </style>


<!-- Keeps whitespace inside <code> exactly as written (no wrapping) and lets
     .sourceCode content overflow its box. The `code` selector here has the
     same specificity as the earlier `code{white-space: pre-wrap;}` rule and
     appears later in the document, so this declaration is the one that
     applies. -->
<style type="text/css">
  code {
    white-space: pre;
  }
  .sourceCode {
    overflow: visible;
  }
</style>
<style type="text/css" data-origin="pandoc">
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
  { counter-reset: source-line 0; }
pre.numberSource code > span
  { position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
  { content: counter(source-line);
    position: relative; left: -1em; text-align: right; vertical-align: baseline;
    border: none; display: inline-block;
    -webkit-touch-callout: none; -webkit-user-select: none;
    -khtml-user-select: none; -moz-user-select: none;
    -ms-user-select: none; user-select: none;
    padding: 0 4px; width: 4em;
    color: #aaaaaa;
  }
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa;  padding-left: 4px; }
div.sourceCode
  {  background-color: #f8f8f8; }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ef2929; } /* Alert */
code span.an { color: #8f5902; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #c4a000; } /* Attribute */
code span.bn { color: #0000cf; } /* BaseN */
code span.cf { color: #204a87; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4e9a06; } /* Char */
code span.cn { color: #000000; } /* Constant */
code span.co { color: #8f5902; font-style: italic; } /* Comment */
code span.cv { color: #8f5902; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #8f5902; font-weight: bold; font-style: italic; } /* Documentation */
code span.dt { color: #204a87; } /* DataType */
code span.dv { color: #0000cf; } /* DecVal */
code span.er { color: #a40000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #0000cf; } /* Float */
code span.fu { color: #000000; } /* Function */
code span.im { } /* Import */
code span.in { color: #8f5902; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #204a87; font-weight: bold; } /* Keyword */
code span.op { color: #ce5c00; font-weight: bold; } /* Operator */
code span.ot { color: #8f5902; } /* Other */
code span.pp { color: #8f5902; font-style: italic; } /* Preprocessor */
code span.sc { color: #000000; } /* SpecialChar */
code span.ss { color: #4e9a06; } /* SpecialString */
code span.st { color: #4e9a06; } /* String */
code span.va { color: #000000; } /* Variable */
code span.vs { color: #4e9a06; } /* VerbatimString */
code span.wa { color: #8f5902; font-weight: bold; font-style: italic; } /* Warning */

.sourceCode .row {
  width: 100%;
}
.sourceCode {
  overflow-x: auto;
}
.code-folding-btn {
  margin-right: -30px;
}
</style>
<script>
// apply pandoc div.sourceCode style to pre.sourceCode instead
(function () {
  // Walk every stylesheet in the document, but only rewrite the one whose
  // <style> element is tagged data-origin="pandoc".
  var allSheets = document.styleSheets;
  for (var s = 0; s < allSheets.length; s++) {
    var sheet = allSheets[s];
    if (sheet.ownerNode.dataset.origin !== "pandoc") continue;

    // Reading cssRules may throw; such sheets are skipped.
    var ruleList;
    try {
      ruleList = sheet.cssRules;
    } catch (err) {
      continue;
    }

    for (var r = 0; r < ruleList.length; r++) {
      var current = ruleList[r];

      // Only plain style rules whose selector is exactly "div.sourceCode".
      var isDivSourceCode =
        current.type === current.STYLE_RULE &&
        current.selectorText === "div.sourceCode";
      if (!isDivSourceCode) continue;

      var declarations = current.style.cssText;

      // Rules that set neither color nor background-color stay on the div.
      var setsNoColor =
        current.style.color === '' && current.style.backgroundColor === '';
      if (setsNoColor) continue;

      // Swap the rule in place (same index) for an equivalent pre.sourceCode rule.
      sheet.deleteRule(r);
      sheet.insertRule('pre.sourceCode{' + declarations + '}', r);
    }
  }
})();
</script>


<link rel="stylesheet" href="styles.css" type="text/css" />


<!-- Page layout tweaks: centers the main container at a maximum width of
     940px, keeps images from overflowing their parent, adds spacing around
     tab panes and HTML widgets, removes the focus outline from the
     code-folding button, and normalizes <summary> / <pre><code> rendering. -->
<style type = "text/css">
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
img {
  max-width:100%;
}
.tabbed-pane {
  padding-top: 12px;
}
.html-widget {
  margin-bottom: 20px;
}
button.code-folding-btn:focus {
  outline: none;
}
summary {
  display: list-item;
}
details > summary > p:only-child {
  display: inline;
}
pre code {
  padding: 0;
}
</style>


<!-- Nested dropdown menus (.dropdown-submenu): the child menu is positioned
     to the right of its parent item (or to the left with .pull-left) and is
     shown on hover. The a:after rule draws a small right-pointing triangle
     out of borders (zero-size box, only the left border colored) as the
     "has submenu" indicator. -->
<style type="text/css">
.dropdown-submenu {
  position: relative;
}
.dropdown-submenu>.dropdown-menu {
  top: 0;
  left: 100%;
  margin-top: -6px;
  margin-left: -1px;
  border-radius: 0 6px 6px 6px;
}
.dropdown-submenu:hover>.dropdown-menu {
  display: block;
}
.dropdown-submenu>a:after {
  display: block;
  content: " ";
  float: right;
  width: 0;
  height: 0;
  border-color: transparent;
  border-style: solid;
  border-width: 5px 0 5px 5px;
  border-left-color: #cccccc;
  margin-top: 5px;
  margin-right: -10px;
}
.dropdown-submenu:hover>a:after {
  border-left-color: #adb5bd;
}
.dropdown-submenu.pull-left {
  float: none;
}
.dropdown-submenu.pull-left>.dropdown-menu {
  left: -100%;
  margin-left: 10px;
  border-radius: 6px 0 6px 6px;
}
</style>

<script type="text/javascript">
// manage active state of menu based on current page
$(document).ready(function () {
  // Work out the file name of the current page (the part of the path after
  // the last '/'). `href` is declared locally: the previous version assigned
  // it without `var`, which created an implicit global and would throw in
  // strict mode.
  var path = window.location.pathname;
  var href = path.substring(path.lastIndexOf('/') + 1);
  // A path ending in '/' has no file name; treat it as the index page.
  if (href === "") {
    href = "index.html";
  }
  // Find the link(s) whose href attribute is exactly that file name.
  var menuAnchor = $('a[href="' + href + '"]');

  // mark it active
  menuAnchor.tab('show');

  // if it's got a parent navbar menu mark it active as well
  menuAnchor.closest('li.dropdown').addClass('active');

  // Navbar adjustments: measure the first .navbar and add 15px of breathing room.
  var navHeight = $(".navbar").first().height() + 15;
  var style = document.createElement('style');
  var pt = "padding-top: " + navHeight + "px; ";
  var mt = "margin-top: -" + navHeight + "px; ";
  var css = "";
  // Offset scroll position for anchor links (for fixed navbar): each section
  // heading gets padding-top and an equal negative margin-top, so its layout
  // position is unchanged while its box starts one navbar-height higher.
  for (var i = 1; i <= 6; i++) {
    css += ".section h" + i + "{ " + pt + mt + "}\n";
  }
  // Push the body content down by the same amount and append the generated rules.
  style.innerHTML = "body {" + pt + "padding-bottom: 40px; }\n" + css;
  document.head.appendChild(style);
});
</script>

<!-- tabsets -->

<style type="text/css">
/* Tabsets rendered as a dropdown: the tab list collapses to a bordered box
   that shows only the active tab; adding .nav-tabs-open expands the list. */
.tabset-dropdown > .nav-tabs {
  display: inline-table;
  max-height: 500px;
  min-height: 44px;
  overflow-y: auto;
  border: 1px solid #ddd;
  border-radius: 4px;
}

/* Glyph shown in front of the active tab while the list is collapsed.
   NOTE(review): the original string held a raw private-use character
   (expected to be U+E259 from the Glyphicons Halflings font); it is written
   as a CSS escape here so it survives re-encoding. Confirm the code point. */
.tabset-dropdown > .nav-tabs > li.active:before {
  content: "\e259";
  font-family: 'Glyphicons Halflings';
  display: inline-block;
  padding: 10px;
  border-right: 1px solid #ddd;
}

/* While the list is open the active tab gets a different glyph. This must be
   a CSS escape: HTML character references such as "&#xe258;" are not decoded
   inside <style>, so the browser would render the reference as literal text. */
.tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
  content: "\e258";
  border: none;
}

/* Glyph shown at the top of the expanded list (same code point as the
   collapsed-state glyph; see the review note above). */
.tabset-dropdown > .nav-tabs.nav-tabs-open:before {
  content: "\e259";
  font-family: 'Glyphicons Halflings';
  display: inline-block;
  padding: 10px;
  border-right: 1px solid #ddd;
}

.tabset-dropdown > .nav-tabs > li.active {
  display: block;
}

/* Strip the regular tab chrome from the links inside the dropdown. */
.tabset-dropdown > .nav-tabs > li > a,
.tabset-dropdown > .nav-tabs > li > a:focus,
.tabset-dropdown > .nav-tabs > li > a:hover {
  border: none;
  display: inline-block;
  border-radius: 4px;
  background-color: transparent;
}

/* Open list: every tab is shown, stacked vertically. */
.tabset-dropdown > .nav-tabs.nav-tabs-open > li {
  display: block;
  float: none;
}

/* Collapsed list: tabs are hidden (the more specific li.active rule above
   keeps the active one visible). */
.tabset-dropdown > .nav-tabs > li {
  display: none;
}
</style>

<!-- code folding -->
<style type="text/css">
.code-folding-btn { margin-bottom: 4px; }
</style>


</head>

<body>


<div class="container-fluid main-container">


<div class="navbar navbar-default  navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-bs-toggle="collapse" data-target="#navbar" data-bs-target="#navbar">
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="index.html"><img src='Images/snlogo.png' alt='Statistics Notebook Logo' style='height: 30px; margin: -5px 0px'></a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" data-bs-toggle="dropdown" aria-expanded="false">
    R Help
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li>
      <a href="RCommands.html">R Commands</a>
    </li>
    <li>
      <a href="RMarkdownHints.html">R Markdown Hints</a>
    </li>
    <li>
      <a href="RCheatSheetsAndNotes.html">R Cheatsheets &amp; Notes</a>
    </li>
    <li>
      <a href="DataSources.html">Data Sources</a>
    </li>
  </ul>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" data-bs-toggle="dropdown" aria-expanded="false">
    Describing Data
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li>
      <a href="GraphicalSummaries.html">Graphical Summaries</a>
    </li>
    <li>
      <a href="NumericalSummaries.html">Numerical Summaries</a>
    </li>
  </ul>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" data-bs-toggle="dropdown" aria-expanded="false">
    Making Inference
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li>
      <a href="MakingInference.html">Making Inference</a>
    </li>
    <li>
      <a href="tTests.html">t Tests</a>
    </li>
    <li>
      <a href="WilcoxonTests.html">Wilcoxon Tests</a>
    </li>
    <li>
      <a href="Kruskal.html">Kruskal-Wallis Test</a>
    </li>
    <li>
      <a href="ANOVA.html">ANOVA</a>
    </li>
    <li>
      <a href="LinearRegression.html">Linear Regression</a>
    </li>
    <li>
      <a href="LogisticRegression.html">Logistic Regression</a>
    </li>
    <li>
      <a href="ChiSquaredTests.html">Chi Squared Tests</a>
    </li>
    <li>
      <a href="PermutationTests.html">Randomization</a>
    </li>
  </ul>
</li>
<li class="dropdown">
  <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" data-bs-toggle="dropdown" aria-expanded="false">
    Analyses
     
    <span class="caret"></span>
  </a>
  <ul class="dropdown-menu" role="menu">
    <li>
      <a href="./Analyses/StudentHousing.html">Good Example Analysis</a>
    </li>
    <li>
      <a href="./Analyses/StudentHousingPOOR.html">Poor Example Analysis</a>
    </li>
    <li>
      <a href="./Analyses/Rent.html">Rent</a>
    </li>
    <li>
      <a href="./Analyses/Stephanie.html">Stephanie</a>
    </li>
    <li>
      <a href="./Analyses/t Tests/HighSchoolSeniors.html">High School Seniors</a>
    </li>
    <li>
      <a href="./Analyses/Wilcoxon Tests/RecallingWords.html">Recalling Words</a>
    </li>
    <li>
      <a href="./Analyses/ANOVA/MyTwoWayANOVA.html">My Two-way ANOVA</a>
    </li>
    <li>
      <a href="./Analyses/Kruskal-Wallis Test/Food.html">Food</a>
    </li>
    <li>
      <a href="./Analyses/Linear Regression/MySimpleLinearRegression.html">My Simple Linear Regression</a>
    </li>
    <li>
      <a href="./Analyses/Linear Regression/CarPrices.html">Car Prices</a>
    </li>
    <li>
      <a href="./Analyses/Logistic Regression/MyLogisticRegression.html">My Logistic Regression</a>
    </li>
    <li>
      <a href="./Analyses/Chi Squared Tests/MyChiSquaredTest.html">My Chi-squared Test</a>
    </li>
  </ul>
</li>
      </ul>
      <ul class="nav navbar-nav navbar-right">
        
      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->

<div id="header">

<div class="btn-group pull-right float-right">
<button type="button" class="btn btn-default btn-xs btn-secondary btn-sm dropdown-toggle" data-toggle="dropdown" data-bs-toggle="dropdown" aria-haspopup="true" aria-expanded="false"><span>Code</span> <span class="caret"></span></button>
<ul class="dropdown-menu dropdown-menu-right" style="min-width: 50px;">
<li><a id="rmd-show-all-code" href="#">Show All Code</a></li>
<li><a id="rmd-hide-all-code" href="#">Hide All Code</a></li>
</ul>
</div>


<h1 class="title toc-ignore">Linear Regression</h1>

</div>


<script type="text/javascript">
 /**
  * Toggle the element with the given id between shown and hidden.
  *
  * The element is hidden only when its inline `display` is exactly 'block';
  * any other value (including the initial 'none' or an empty string) switches
  * it to 'block'. An id that matches no element is ignored instead of
  * raising a TypeError.
  *
  * This function is invoked from `javascript:` link targets, so it must not
  * return a value: a non-undefined result would replace the page content.
  *
  * @param {string} id - id attribute of the element to toggle.
  */
 function showhide(id) {
    var e = document.getElementById(id);
    if (!e) {
        return; // unknown id: nothing to toggle
    }
    e.style.display = (e.style.display == 'block') ? 'none' : 'block';
 }
 
 /**
  * Show one tab panel and mark the control that triggered the event as active.
  *
  * @param {Event} evt - event whose `currentTarget` is the clicked tab control.
  * @param {string} tabName - id of the panel element to display.
  */
 function openTab(evt, tabName) {
    // Hide every panel first.
    var panes = document.getElementsByClassName("tabcontent");
    Array.prototype.forEach.call(panes, function (pane) {
        pane.style.display = "none";
    });

    // Drop the " active" marker from every tab control.
    var controls = document.getElementsByClassName("tablinks");
    Array.prototype.forEach.call(controls, function (control) {
        control.className = control.className.replace(" active", "");
    });

    // Reveal the requested panel, then flag the control that was used.
    document.getElementById(tabName).style.display = "block";
    evt.currentTarget.className += " active";
 }
</script>
<hr />
<p>Determine which explanatory variables have a significant effect on
the mean of the quantitative response variable.</p>
<hr />
<div id="simple-linear-regression"
class="section level2 tabset tabset-fade tabset-pills">
<h2 class="tabset tabset-fade tabset-pills">Simple Linear
Regression</h2>
<div style="float:left;width:125px;" align="center">
<p><img src="Images/QuantYQuantX.png" width="58" alt="" /></p>
</div>
<p>Simple linear regression is a good analysis technique when the data
consists of a single quantitative response variable <span
class="math inline">\(Y\)</span> and a single quantitative explanatory
variable <span class="math inline">\(X\)</span>.</p>
<div id="overview" class="section level3 tabset">
<h3 class="tabset">Overview</h3>
<div style="padding-left:125px;">
<p><strong>Mathematical Model</strong></p>
<p>The true regression model assumed by a regression analysis is given
by</p>
<div
style="float:right;font-size:.8em;background-color:lightgray;padding:5px;border-radius:4px;">
<a style="color:darkgray;" href="javascript:showhide('simplelinearlatexrcode')">Math
Code</a>
</div>
<div id="simplelinearlatexrcode" style="display:none;">
<pre><code>$$
  \underbrace{Y_i}_\text{Some Label} = \overbrace{\beta_0}^\text{y-int} + \overbrace{\beta_1}^\text{slope} \underbrace{X_i}_\text{Some Label} + \epsilon_i \quad \text{where} \ \epsilon_i \sim N(0, \sigma^2)
$$</code></pre>
</div>
<center>
<span class="tooltipr"> <span class="math inline">\(Y_i\)</span> <span
class="tooltiprtext">The response variable. The “i” denotes that this is
the y-value for individual “i”, where “i” is 1, 2, 3,… and so on up to
<span class="math inline">\(n\)</span>, the sample size.</span>
</span><span class="tooltipr"> <span class="math inline">\(=\)</span>
<span class="tooltiprtext">This states that we are assuming <span
class="math inline">\(Y_i\)</span> was created, or is “equal to” the
formula that will follow on the right-hand-side of the equation.</span>
</span><span class="tooltipr"> <span
class="math inline">\(\underbrace{\overbrace{\beta_0}^\text{y-intercept}
+ \overbrace{\beta_1}^\text{slope} X_i \ }_\text{true regression
relation}\)</span> <span class="tooltiprtext">The true regression
relation is a line, a line that is typically unknown in real life. It
can be likened to “God’s Law” or “Natural Law”. Something that governs
the way the data behaves, but is unknown to us.</span> </span><span
class="tooltipr"> <span class="math inline">\(+\)</span> <span
class="tooltiprtext">This plus sign emphasizes that the actual data, the
<span class="math inline">\(Y_i\)</span>, is created by adding together
the value from the true line <span class="math inline">\(\beta_0 +
\beta_1 X_i\)</span> and an individual error term <span
class="math inline">\(\epsilon_i\)</span>, which allows each dot in the
regression to be off of the line by a certain amount called <span
class="math inline">\(\epsilon_i\)</span>.</span> </span><span
class="tooltipr"> <span
class="math inline">\(\overbrace{\epsilon_i}^\text{error term}\)</span>
<span class="tooltiprtext">Error term for each individual <span
class="math inline">\(i\)</span>. The error terms are “random” and
unique for each individual. This provides the statistical relationship
of the regression. It is what allows each dot to be different, while
still coming from the same line, or underlying law.</span> </span><span
class="tooltipr"> <span class="math inline">\(\quad
\text{where}\)</span> <span class="tooltiprtext">Some extra comments are
needed about <span class="math inline">\(\epsilon_i\)</span>…</span>
</span><span class="tooltipr"> <span class="math inline">\(\
\overbrace{\epsilon_i \sim N(0, \sigma^2)}^\text{error term normally
distributed}\)</span> <span class="tooltiprtext">The error terms <span
class="math inline">\(\epsilon_i\)</span> are assumed to be normally
distributed with constant variance. Pay special note that the <span
class="math inline">\(\sigma\)</span> does not have an <span
class="math inline">\(i\)</span> in it, so it is the same for each
individual. In other words, the variance is constant. The mean of the
errors is zero, which causes the dots to be spread out symmetrically
both above and below the line.</span> </span>
</center>
<p><br/></p>
<p>The estimated regression line obtained from a regression analysis,
pronounced “y-hat”, is written as</p>
<div
style="float:right;font-size:.8em;background-color:lightgray;padding:5px;border-radius:4px;">
<a style="color:darkgray;" href="javascript:showhide('simplelinearlatexrcodeyhat')">Math
Code</a>
</div>
<div id="simplelinearlatexrcodeyhat" style="display:none;">
<pre><code>$$
  \underbrace{\hat{Y}_i}_\text{Some Label} = \overbrace{b_0}^\text{est. y-int} + \overbrace{b_1}^\text{est. slope} \underbrace{X_i}_\text{Some Label}
$$</code></pre>
</div>
<center>
<span class="tooltipr"> <span class="math inline">\(\hat{Y}_i\)</span>
<span class="tooltiprtext">The estimated average y-value for individual
<span class="math inline">\(i\)</span> is denoted by <span
class="math inline">\(\hat{Y}_i\)</span>. It is important to recognize
that <span class="math inline">\(Y_i\)</span> is the actual value for
individual <span class="math inline">\(i\)</span>, and <span
class="math inline">\(\hat{Y}_i\)</span> is the average y-value for all
individuals with the same <span class="math inline">\(X_i\)</span>
value.</span> </span><span class="tooltipr"> <span
class="math inline">\(=\)</span> <span class="tooltiprtext">The formula
for the average y-value, <span class="math inline">\(\hat{Y}_i\)</span>
is equal to what follows…</span> </span><span class="tooltipr"> <span
class="math inline">\(\underbrace{\overbrace{\ b_0 \
}^\text{y-intercept} + \overbrace{b_1}^\text{slope} X_i \
}_\text{estimated regression relation}\)</span> <span
class="tooltiprtext">Two things are important to notice about this
equation. First, it uses <span class="math inline">\(b_0\)</span> and
<span class="math inline">\(b_1\)</span> instead of <span
class="math inline">\(\beta_0\)</span> and <span
class="math inline">\(\beta_1\)</span>. This is because <span
class="math inline">\(b_0\)</span> and <span
class="math inline">\(b_1\)</span> are the estimated y-intercept and
slope, respectively, not the true y-intercept <span
class="math inline">\(\beta_0\)</span> and true slope <span
class="math inline">\(\beta_1\)</span>. Second, this equation does not
include <span class="math inline">\(\epsilon_i\)</span>. In other words,
it is the estimated regression line, so it only describes the average
y-values, not the actual y-values.</span> </span>
</center>
<p><br/></p>
<div style="font-size:0.8em;">
<p>Note: see the <strong>Explanation</strong> tab <strong>The
Mathematical Model</strong> for details about these equations.</p>
</div>
<p><strong>Hypotheses</strong></p>
<div
style="float:right;font-size:.8em;background-color:lightgray;padding:5px;border-radius:4px;">
<a style="color:darkgray;" href="javascript:showhide('simplelinearhypecodeslope')">Math
Code</a>
</div>
<div id="simplelinearhypecodeslope" style="display:none;">
<pre><code>$$
\left.\begin{array}{ll}
H_0: \beta_1 = 0 \\  
H_a: \beta_1 \neq 0
\end{array}
\right\} \ \text{Slope Hypotheses}
$$

$$
\left.\begin{array}{ll}
H_0: \beta_0 = 0 \\  
H_a: \beta_0 \neq 0
\end{array}
\right\} \ \text{Intercept Hypotheses}
$$</code></pre>
</div>
<div style="clear:right;">

</div>
<p><span class="math display">\[
\left.\begin{array}{ll}
H_0: \beta_1 = 0 \\  
H_a: \beta_1 \neq 0
\end{array}
\right\} \ \text{Slope Hypotheses}^{\quad \text{(most
common)}}\quad\quad
\]</span></p>
<p><span class="math display">\[
\left.\begin{array}{ll}
H_0: \beta_0 = 0 \\  
H_a: \beta_0 \neq 0
\end{array}
\right\} \ \text{Intercept Hypotheses}^{\quad\text{(sometimes useful)}}
\]</span></p>
<p><br/></p>
<p>If <span class="math inline">\(\beta_1 = 0\)</span>, then the model
reduces to <span class="math inline">\(Y_i = \beta_0 +
\epsilon_i\)</span>, which is a flat line. This means <span
class="math inline">\(X\)</span> does not improve our understanding of
the mean of <span class="math inline">\(Y\)</span> if the null
hypothesis is true.</p>
<p>If <span class="math inline">\(\beta_0 = 0\)</span>, then the model
reduces to <span class="math inline">\(Y_i = \beta_1 X_i +
\epsilon_i\)</span>, a line going through the origin. This means the
average <span class="math inline">\(Y\)</span>-value is <span
class="math inline">\(0\)</span> when <span
class="math inline">\(X=0\)</span> if the null hypothesis is true.</p>
<p><strong>Assumptions</strong></p>
<p>This regression model is appropriate for the data when five
assumptions can be made.</p>
<ol style="list-style-type: decimal">
<li><p><strong>Linear Relation</strong>: the true regression relation
between <span class="math inline">\(Y\)</span> and <span
class="math inline">\(X\)</span> is linear.</p></li>
<li><p><strong>Normal Errors</strong>: the error terms <span
class="math inline">\(\epsilon_i\)</span> are normally distributed with
a mean of zero.</p></li>
<li><p><strong>Constant Variance</strong>: the variance <span
class="math inline">\(\sigma^2\)</span> of the error terms is constant
(the same) over all <span class="math inline">\(X_i\)</span>
values.</p></li>
<li><p><strong>Fixed X</strong>: the <span
class="math inline">\(X_i\)</span> values can be considered fixed and
measured without error.</p></li>
<li><p><strong>Independent Errors</strong>: the error terms <span
class="math inline">\(\epsilon_i\)</span> are independent.</p></li>
</ol>
<div style="font-size:0.8em;">
<p>Note: see the <strong>Explanation</strong> tab <strong>Residual Plots
&amp; Regression Assumptions</strong> for details about checking the
regression assumptions.</p>
</div>
<p><strong>Interpretation</strong></p>
<p>The slope is interpreted as, “the change in the average y-value for a
one unit change in the x-value.” It <strong>is not</strong> the average
change in y. <strong>It is</strong> the change in the average
y-value.</p>
<p>The y-intercept is interpreted as, “the average y-value when x is
zero.” It is often not meaningful, but is sometimes useful. It just
depends if x being zero is meaningful or not within the context of your
analysis. For example, knowing the average price of a car with zero
miles is useful. However, pretending to know the average height of adult
males that weigh zero pounds, is not useful.</p>
<hr />
</div>
</div>
<div id="r-instructions" class="section level3">
<h3>R Instructions</h3>
<div style="padding-left:125px;">
<p><strong>Console</strong> Help Command: <code>?lm()</code></p>
<p><strong>Perform the Regression</strong></p>
<a href="javascript:showhide('simplelinearrcode')">
<div class="hoverchunk">
<p><span class="tooltipr"> mylm <span class="tooltiprtext">This is some
name you come up with that will become the R object that stores the
results of your linear regression <code>lm(...)</code> command.</span>
</span><span class="tooltipr">  &lt;-  <span class="tooltiprtext">This
is the “left arrow” assignment operator that stores the results of your
<code>lm()</code> code under the name <code>mylm</code>.</span> </span><span
class="tooltipr"> lm( <span class="tooltiprtext">lm(…) is an R function
that stands for “Linear Model”. It performs a linear regression analysis
for Y ~ X.</span> </span><span class="tooltipr"> Y  <span
class="tooltiprtext">Y is your quantitative response variable. It is the
name of one of the columns in your data set.</span> </span><span
class="tooltipr"> ~  <span class="tooltiprtext">The tilde symbol ~ is
used to tell R that Y should be treated as the response variable that is
being explained by the explanatory variable X.</span> </span><span
class="tooltipr"> X, <span class="tooltiprtext">X is the quantitative
explanatory variable (at least it is typically quantitative but could be
qualitative) that will be used to explain the average Y-value.</span>
</span><span class="tooltipr">  data = NameOfYourDataset <span
class="tooltiprtext">NameOfYourDataset is the name of the dataset that
contains Y and X. In other words, one column of your dataset would be
your response variable Y and another column would be your explanatory
variable X.</span> </span><span class="tooltipr"> ) <span
class="tooltiprtext">Closing parenthesis for the lm(…) function.</span>
</span><br/><span class="tooltipr"> summary(mylm) <span
class="tooltiprtext">The <code>summary</code> command allows you to
print the results of your linear regression that were previously saved
under the name <code>mylm</code>.</span> </span><span class="tooltipr"
style="float:right;font-size:.8em;">  Click to Show Output  <span
class="tooltiprtext">Click to View Output.</span> </span></p>
</div>
<p></a></p>
<div id="simplelinearrcode" style="display:none;">
<p>Example output from a regression. Hover each piece to learn more.</p>
<table class="rconsole">
<tr>
<td>
<span class="tooltiprout"> Call:<br/> lm(formula = dist ~ speed, data =
cars) <span class="tooltiprouttext">This is simply a statement of your
original lm(…) “call” that you made when performing your regression. It
allows you to verify that you ran what you thought you ran in the
lm(…).</span> </span>
</td>
</tr>
</table>
<p><br/></p>
<table class="rconsole">
<tr>
<td colspan="2">
<span class="tooltiprout"> Residuals: <span
class="tooltiprouttext">Residuals are the vertical difference between
each point and the line, <span class="math inline">\(Y_i -
\hat{Y}_i\)</span>. The residuals are supposed to be normally
distributed, so a quick glance at their five-number summary can give us
insight about any skew present in the residuals. </span> </span>
</td>
</tr>
<tr>
<td align="right">
<span class="tooltiprout"> min<br/>   -29.069 <span
class="tooltiprouttext">“min” gives the value of the residual that is
furthest below the regression line. Ideally, the magnitude of this value
would be about equal to the magnitude of the largest positive residual
(the max) because the hope is that the residuals are normally
distributed around the line.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> 1Q<br/>   -9.525 <span
class="tooltiprouttext">“1Q” gives the first quartile of the residuals,
which will always be negative, and ideally would be about equal in
magnitude to the third quartile.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> Median<br/>   -2.272 <span
class="tooltiprouttext">“Median” gives the median of the residuals,
which ideally would be about equal to zero. Note that because the
regression line is the least squares line, the mean of the residuals
will ALWAYS be zero, so it is never included in the output summary. This
particular median value of -2.272 is a little further below zero than we
would hope for and suggests that the residuals are right skewed, because
the mean (0) is greater than the median (-2.272). This can also be seen
in the maximum being much larger in magnitude than the
minimum.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> 3Q<br/>   9.215 <span
class="tooltiprouttext">“3Q” gives the third quartile of the residuals,
which ideally would be about equal in magnitude to the first
quartile. In this case, it is pretty close, which helps us see that the
first quartile of residuals on either side of the line is behaving
fairly normally.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> Max</br>   43.201 <span
class="tooltiprouttext">“Max” gives the maximum positive residual,
which ideally would be about equal in magnitude to the minimum
residual. In this case, it is much larger than the minimum, which helps
us see that the residuals are likely right skewed.</span> </span>
</td>
</tr>
</table>
<p><br/></p>
<table class="rconsole">
<tr>
<td colspan="2">
<span class="tooltiprout"> Coefficients: <span
class="tooltiprouttext">Notice that in your lm(…) you used only <span
class="math inline">\(Y\)</span> and <span
class="math inline">\(X\)</span>. You did not type out any coefficients,
i.e., the <span class="math inline">\(\beta_0\)</span> or <span
class="math inline">\(\beta_1\)</span> of the regression model. These
coefficients are estimated by the lm(…) function and displayed in this
part of the output along with standard errors, t-values, and
p-values.</span> </span>
</td>
</tr>
<tr>
<td align="left">
</td>
<td align="right">
<span class="tooltiprout">   Estimate <span class="tooltiprouttext">To
learn more about the “Estimates” of the “Coefficients” see the
“Explanation” tab, “Estimating the Model Parameters” section for
details.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   Std. Error <span class="tooltiprouttext">To
learn more about the “Standard Errors” of the “Coefficients” see the
“Explanation” tab, “Inference for the Model Parameters” section.</span>
</span>
</td>
<td align="right">
<span class="tooltiprout">   t value <span class="tooltiprouttext">To
learn more about the “t value” of the “Coefficients” see the
“Explanation” tab, “Inference for the Model Parameters” section.</span>
</span>
</td>
<td align="right">
<span class="tooltiprout">   Pr(&gt;|t|) <span
class="tooltiprouttext">The “Pr” stands for “Probability” and the “(&gt;
|t|)” stands for “more extreme than the observed t-value”. Thus, this is
the p-value for the hypothesis test of each coefficient being zero.<br/>
To learn more about the “p-value” of the “Coefficients” see the
“Explanation” tab, “Inference for the Model Parameters” section. </span>
</span>
</td>
</tr>
<tr>
<td align="left">
<span class="tooltiprout"> (Intercept) <span
class="tooltiprouttext">This always says “Intercept” for any lm(…) you
run in R. That is because R always assumes there is a y-intercept for
your regression function.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   -17.5791 <span class="tooltiprouttext">This
is the estimate of the y-intercept, <span
class="math inline">\(\beta_0\)</span>. It is called <span
class="math inline">\(b_0\)</span>. It is the average y-value when X is
zero.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   6.7584 <span class="tooltiprouttext">This
is the standard error of <span class="math inline">\(b_0\)</span>. It
tells you how much <span class="math inline">\(b_0\)</span> varies from
sample to sample. The closer to zero, the better.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> -2.601 <span class="tooltiprouttext">This is
the test statistic t for the test of <span class="math inline">\(\beta_0
= 0\)</span>. It is calculated by dividing the “Estimate” of the
intercept (-17.5791) by its standard error (6.7584). It gives the
“number of standard errors” away from zero that the “estimate” has
landed. In this case, the estimate of -17.5791 is -2.601 standard errors
(6.7584) from zero, which is a fairly surprising distance as shown by
the p-value.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> 0.0123 <span class="tooltiprouttext">This is
the p-value of the test of the hypothesis that <span
class="math inline">\(\beta_0 = 0\)</span>. It measures the probability
of observing a t-value as extreme as the one observed. To compute it
yourself in R, use
<code>pt(-abs(your t-value), df of your regression)*2</code>.</span>
</span>
</td>
<td align="left">
<span class="tooltiprout"> * <span class="tooltiprouttext">This is
called a “star”. One star means significant at the 0.05 level of <span
class="math inline">\(\alpha\)</span>.</span> </span>
</td>
</tr>
<tr>
<td align="left">
<span class="tooltiprout"> speed <span class="tooltiprouttext">This is
always the name of your X-variable in your lm(Y ~ X, …).</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   3.9324 <span class="tooltiprouttext">This
is the estimate of the slope, <span
class="math inline">\(\beta_1\)</span>. It is called <span
class="math inline">\(b_1\)</span>. It is the change in the average
y-value as X is increased by 1 unit.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   0.4155 <span class="tooltiprouttext">This
is the standard error of <span class="math inline">\(b_1\)</span>. It
tells you how much <span class="math inline">\(b_1\)</span> varies from
sample to sample. The closer to zero, the better.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> 9.464 <span class="tooltiprouttext">This is
the test statistic t for the test of <span class="math inline">\(\beta_1
= 0\)</span>. It is calculated by dividing the “Estimate” of the slope
(3.9324) by its standard error (0.4155). It gives the “number of
standard errors” away from zero that the “estimate” has landed. In this
case, the estimate of 3.9324 is 9.464 standard errors (0.4155) from
zero, which is a really surprising distance as shown by the smallness of
the p-value.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> 1.49e-12 <span class="tooltiprouttext">This
is the p-value of the test of the hypothesis that <span
class="math inline">\(\beta_1 = 0\)</span>. To compute it yourself in R,
use <code>pt(-abs(your t-value), df of your regression)*2</code></span>
</span>
</td>
<td align="left">
<span class="tooltiprout"> *** <span class="tooltiprouttext">This is
called a “star”. Three stars means significant at the 0.001 level of
<span class="math inline">\(\alpha\)</span>.</span> </span>
</td>
</tr>
</table>
<table class="rconsole">
<tr>
<td>
<span> --- </span>
</td>
</tr>
</table>
<table class="rconsole">
<tr>
<td>
<span class="tooltiprout"> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ’*’
0.05 ‘.’ 0.1 ‘ ’ 1 <span class="tooltiprouttext">These “codes” explain
what significance level the p-value is smaller than based on how many
“stars” * the p-value is labeled with in the Coefficients table
above.</span> </span>
</td>
</tr>
</table>
<p><br/></p>
<table class="rconsole">
<tr>
<td>
<span class="tooltiprout"> Residual standard error: <span
class="tooltiprouttext">This is the estimate of <span
class="math inline">\(\sigma\)</span> in the regression model <span
class="math inline">\(Y_i = \beta_0 + \beta_1 X_i + \epsilon_i\)</span>
where <span class="math inline">\(\epsilon_i \sim
N(0,\sigma^2)\)</span>. It is the square root of the MSE.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">  15.38 <span class="tooltiprouttext">For this
particular regression, the estimate of <span
class="math inline">\(\sigma\)</span> is 15.38. Squaring this number
gives you the MSE, which is the estimate of <span
class="math inline">\(\sigma^2\)</span>.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">  on 48 degrees of freedom <span
class="tooltiprouttext">This is <span class="math inline">\(n-p\)</span>
where <span class="math inline">\(n\)</span> is the sample size and
<span class="math inline">\(p\)</span> is the number of parameters in
the regression model. In this case, there is a sample size of 50 and two
parameters, <span class="math inline">\(\beta_0\)</span> and <span
class="math inline">\(\beta_1\)</span>, so 50-2 = 48.</span> </span>
</td>
</tr>
</table>
<table class="rconsole">
<tr>
<td>
<span class="tooltiprout"> Multiple R-squared: <span
class="tooltiprouttext">This is <span
class="math inline">\(R^2\)</span>, the percentage of variation in <span
class="math inline">\(Y\)</span> that is explained by the regression
model. It is equal to the SSR/SSTO or, equivalently, 1 -
SSE/SSTO.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">  0.6511, <span class="tooltiprouttext">In
this particular regression, 65.11% of the variation in stopping distance
<code>dist</code> is explained by the regression model using speed of
the car.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">  Adjusted R-squared: <span
class="tooltiprouttext">The adjusted R-squared will always be at least
slightly smaller than <span class="math inline">\(R^2\)</span>. The
closer to R-squared that it is, the better. When it differs dramatically
from <span class="math inline">\(R^2\)</span>, it is a sign that the
regression model is over-fitting the data.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">  0.6438 <span class="tooltiprouttext">In this
case, the value of 0.6438 is quite close to the original <span
class="math inline">\(R^2\)</span> value, so there is no fear of
over-fitting with this particular model. That is good.</span> </span>
</td>
</tr>
</table>
<table class="rconsole">
<tr>
<td>
<span class="tooltiprout"> F-statistic: <span
class="tooltiprouttext">The F-statistic is found as the ratio of the
MSR/MSE where MSR = SSR/(p-1) and MSE = SSE/(n-p) where n is the sample
size and p is the number of parameters in the regression model.</span>
</span>
</td>
<td align="right">
<span class="tooltiprout">  89.57 <span class="tooltiprouttext">This is
the value of the F-statistic for the lm(dist ~ speed, data=cars)
regression. Note that SSE = sum( cars.lm$res^2 ) = 11353.52 with n - p =
50 - 2 = 48 degrees of freedom for this data. Further, SSR = sum(
(cars.lm$fit - mean(cars$dist))^2 ) = 21185.46 with p - 1 = 1 degree of
freedom. So MSR = 21185.46 and MSE = 11353.52 / 48 = 236.5317. So MSR /
MSE = 21185.46 / 236.5317 = 89.56711.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">  on 1 and 48 DF, <span
class="tooltiprouttext">The 1 degree of freedom is the SSR degrees of
freedom (p-1). The 48 is the SSE degrees of freedom (n-p).</span>
</span>
</td>
<td align="right">
<span class="tooltiprout">  p-value: 1.49e-12 <span
class="tooltiprouttext">The p-value for an F-statistic is found by the
code pf(89.56711, 1, 48, lower.tail=FALSE), which gives the probability
of being more extreme than the observed F-statistic in an F distribution
with 1 and 48 degrees of freedom.</span> </span>
</td>
</tr>
</table>
</div>
<p><br/></p>
<p><strong>Check Assumptions 1, 2, 3, and 5</strong></p>
<a href="javascript:showhide('assumptionplots')">
<div class="hoverchunk">
<p><span class="tooltipr"> par( <span class="tooltiprtext">The par(…)
command stands for “Graphical PARameters”. It allows you to control
various aspects of graphics in Base R.</span> </span><span
class="tooltipr"> mfrow= <span class="tooltiprtext">This stands for
“multiple frames filled by row”, which means, put lots of plots on the
same row, starting with the plot on the left, then working towards the
right as more plots are created.</span> </span><span class="tooltipr">
c( <span class="tooltiprtext">The combine function c(…) is used to
specify how many rows and columns of graphics should be placed
together.</span> </span><span class="tooltipr"> 1, <span
class="tooltiprtext">This specifies that 1 row of graphics should be
produced.</span> </span><span class="tooltipr"> 3 <span
class="tooltiprtext">This states that 3 columns of graphics should be
produced.</span> </span><span class="tooltipr"> ) <span
class="tooltiprtext">Closing parenthesis for c(…) function.</span>
</span><span class="tooltipr"> ) <span class="tooltiprtext">Closing
parenthesis for par(…) function.</span> </span><br/><span
class="tooltipr"> plot( <span class="tooltiprtext">This version of
plot(…) will actually create several regression diagnostic plots by
default.</span> </span><span class="tooltipr"> mylm, <span
class="tooltiprtext">This is the name of an lm object that you created
previously.</span> </span><span class="tooltipr"> which= <span
class="tooltiprtext">This allows you to select “which” regression
diagnostic plots should be drawn.</span> </span><span class="tooltipr">
1 <span class="tooltiprtext">Selecting 1, would give the residuals
vs. fitted values plot only.</span> </span><span class="tooltipr"> :
<span class="tooltiprtext">The colon allows you to select more than just
one plot.</span> </span><span class="tooltipr"> 2 <span
class="tooltiprtext">Selecting 2 also gives the Q-Q Plot of residuals.
If you wanted to, you could instead use which=1 to get the residuals
vs fitted values plot, then you could use qqPlot(mylm$residuals) to
create a fancier Q-Q Plot of the residuals.</span> </span><span
class="tooltipr"> ) <span class="tooltiprtext">Closing parenthesis for
plot(…) function.</span> </span><br/><span class="tooltipr"> plot( <span
class="tooltiprtext">This version of plot(…) will be used to create a
time-ordered plot of the residuals. The order of the residuals is the
original order of the x-values in the original data set. If the original
data set doesn’t have an order, then this plot is not
interesting.</span> </span><span class="tooltipr"> mylm <span
class="tooltiprtext">The lm object that you created previously.</span>
</span><span class="tooltipr"> $ <span class="tooltiprtext">This allows
you to access various elements from the regression that was
performed.</span> </span><span class="tooltipr"> residuals <span
class="tooltiprtext">This grabs the residuals for each observation in
the regression.</span> </span><span class="tooltipr"> ) <span
class="tooltiprtext">Closing parenthesis for plot(…) function.</span>
</span><span class="tooltipr" style="float:right;font-size:.8em;">
 Click to Show Output  <span class="tooltiprtext">Click to View
Output.</span> </span></p>
</div>
<p></a></p>
<div id="assumptionplots" style="display:none;">
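<p><img src="LinearRegression_files/figure-html/unnamed-chunk-2-1.png" alt="Regression diagnostic plots: residuals versus fitted values, normal Q-Q plot of the residuals, and residuals in their original data order" width="672" /></p>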
</div>
<p><br/></p>
<p><strong>Plotting the Regression Line</strong></p>
<div class="tab">
<p><button class="tablinks" onclick="openTab(event, 'BaseScatterplot')">Base
R</button>
<button class="tablinks" onclick="openTab(event, 'ggplotScatterplot')">ggplot2</button></p>
</div>
<div id="BaseScatterplot" class="tabcontent">
<p>
<p>To add the regression line to a scatterplot use the
<code>abline(...)</code> command:</p>
<a href="javascript:showhide('regressionline')">
<div class="hoverchunk">
<p><span class="tooltipr"> plot( <span class="tooltiprtext">The plot(…)
function is used to create a scatterplot with a y-axis (the vertical
axis) and an x-axis (the horizontal axis).</span> </span><span
class="tooltipr"> Y  <span class="tooltiprtext">This is the “response
variable” of your regression. The thing you are interested in
predicting. This is the name of a “numeric” column of data from the data
set called YourDataSet.</span> </span><span class="tooltipr"> ~  <span
class="tooltiprtext">The tilde “~” is used to relate Y to X and can be
found on the top-left key of your keyboard.</span> </span><span
class="tooltipr"> X,  <span class="tooltiprtext">This is the explanatory
variable of your regression. It is the name of a “numeric” column of
data from YourDataSet.</span> </span><span class="tooltipr"> data=
<span class="tooltiprtext">The data= statement is used to specify the
name of the data set where the columns of “X” and “Y” are
located.</span> </span><span class="tooltipr"> YourDataSet <span
class="tooltiprtext">This is the name of your data set, like KidsFeet or
cars or airquality.</span> </span><span class="tooltipr"> ) <span
class="tooltiprtext">Closing parenthesis for plot(…) function.</span>
</span><br/><span class="tooltipr"> abline( <span
class="tooltiprtext">This stands for “a” (intercept) “b” (slope) line.
It is a function that allows you to add a line to a plot by specifying
just the intercept and slope of the line.</span> </span><span
class="tooltipr"> mylm <span class="tooltiprtext">This is the name of an
lm(…) that you created previously. Since mylm contains the slope and
intercept of the estimated line, the abline(…) function will locate
these two values from within mylm and use them to add a line to your
current plot(…).</span> </span><span class="tooltipr"> ) <span
class="tooltiprtext">Closing parenthesis for abline(…) function.</span>
</span><span class="tooltipr" style="float:right;font-size:.8em;">
 Click to Show Output  <span class="tooltiprtext">Click to View
Output.</span> </span></p>
</div>
<p></a></p>
<div id="regressionline" style="display:none;">
<pre><code>mylm &lt;- lm(dist ~ speed, data = cars)
plot(dist ~ speed, data = cars)
abline(mylm)</code></pre>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-3-1.png" alt="Scatterplot of dist versus speed from the cars data with the fitted regression line added" width="672" /></p>
</div>
<p>You can customize the look of the regression line with</p>
<a href="javascript:showhide('regressionlinecolor')">
<div class="hoverchunk">
<p><span class="tooltipr"> abline( <span class="tooltiprtext">This
stands for “a” (intercept) “b” (slope) line. It is a function that
allows you to add a line to a plot by specifying just the intercept and
slope of the line.</span> </span><span class="tooltipr"> mylm, <span
class="tooltiprtext">This is the name of an lm(…) that you created
previously. Since mylm contains the slope and intercept of the
estimated line, the abline(…) function will locate these two values from
within mylm and use them to add a line to your current plot(…).</span>
</span><span class="tooltipr"> lty= <span class="tooltiprtext">The lty=
stands for “line type” and allows you to select between 0=blank, 1=solid
(default), 2=dashed, 3=dotted, 4=dotdash, 5=longdash, 6=twodash.</span>
</span><span class="tooltipr"> 1, <span class="tooltiprtext">This
creates a solid line. Remember, other options include: 0=blank, 1=solid
(default), 2=dashed, 3=dotted, 4=dotdash, 5=longdash, 6=twodash.</span>
</span><span class="tooltipr"> lwd= <span class="tooltiprtext">The lwd=
allows you to specify the width of the line. The default width is 1.
Using lwd=2 would double the thickness, and so on. Any positive value is
allowed.</span> </span><span class="tooltipr"> 1, <span
class="tooltiprtext">Default line width. To make a thicker line, use 2 or
3… To make a thinner line, try 0.5, but 1 is already pretty thin.</span>
</span><span class="tooltipr"> col= <span class="tooltiprtext">This
allows you to specify the color of the line using either a name of a
color or rgb(.5,.2,.3,.2) where the format is rgb(percentage red,
percentage green, percentage blue, percent opaque).</span> </span><span
class="tooltipr"> “someColor” <span class="tooltiprtext">Type colors()
in R for options.</span> </span><span class="tooltipr"> ) <span
class="tooltiprtext">Closing parenthesis for abline(…) function.</span>
</span><span class="tooltipr" style="float:right;font-size:.8em;">
 Click to Show Output  <span class="tooltiprtext">Click to View
Output.</span> </span></p>
</div>
<p></a></p>
<div id="regressionlinecolor" style="display:none;">
<pre><code>mylm &lt;- lm(dist ~ speed, data = cars)
plot(dist ~ speed, data = cars)
abline(mylm, lty=1, lwd=1, col=&quot;firebrick&quot;)</code></pre>
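<p><img src="LinearRegression_files/figure-html/unnamed-chunk-4-1.png" alt="Scatterplot of dist versus speed from the cars data with a solid firebrick-colored regression line" width="672" /></p>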
</div>
<p>You can add points to the regression with…</p>
<a href="javascript:showhide('regressionaddpoints')">
<div class="hoverchunk">
<p><span class="tooltipr"> points( <span class="tooltiprtext">This is
like plot(…) but adds points to the current plot(…) instead of creating
a new plot.</span> </span><span class="tooltipr"> newY  <span
class="tooltiprtext">newY should be a column of values from some data
set. Or, use points(newX, newY) to add a single point to a graph.</span>
</span><span class="tooltipr"> ~  <span class="tooltiprtext">This links
Y to X in the plot.</span> </span><span class="tooltipr"> newX,  <span
class="tooltiprtext">newX should be a column of values from some data
set. It should be the same length as newY. If just a single value, use
points(newX, newY) instead.</span> </span><span class="tooltipr">
data=YourDataSet,  <span class="tooltiprtext">If newY and newX come from
a dataset, then use data= to tell the points(…) function what data set
they come from. If newY and newX are just single values, then data= is
not needed.</span> </span><span class="tooltipr"> col=“skyblue”, <span
class="tooltiprtext">This allows you to specify the color of the points
using either a name of a color or rgb(.5,.2,.3,.2) where the format is
rgb(percentage red, percentage green, percentage blue, percent
opaque).</span> </span><span class="tooltipr"> pch=16 <span
class="tooltiprtext">This allows you to specify the type of plotting
symbol to be used for the points. Type ?pch and scroll half way down in
the help file that appears to learn about other possible symbols.</span>
</span><span class="tooltipr"> ) <span class="tooltiprtext">Closing
parenthesis for points(…) function.</span> </span><span class="tooltipr"
style="float:right;font-size:.8em;">  Click to Show Output  <span
class="tooltiprtext">Click to View Output.</span> </span></p>
</div>
<p></a></p>
<div id="regressionaddpoints" style="display:none;">
<pre><code>mylm &lt;- lm(dist ~ speed, data = cars)
plot(dist ~ speed, data = cars)
points(7,40, pch=16, col=&quot;skyblue&quot;, cex=2)
text(7,40, &quot;New Dot&quot;, pos=3, cex=0.5)
points(dist ~ speed, data=filter(cars, mylm$res &gt; 2), cex=.8, col=&quot;red&quot;)
abline(mylm, lty=1, lwd=1, col=&quot;firebrick&quot;)</code></pre>
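<p><img src="LinearRegression_files/figure-html/unnamed-chunk-5-1.png" alt="Scatterplot of dist versus speed from the cars data with an added sky blue point labeled New Dot, points with residuals above 2 highlighted in red, and a firebrick-colored regression line" width="672" /></p>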
</div>
</p>
</div>
<div id="ggplotScatterplot" class="tabcontent">
<p>
<p>To add the regression line to a scatterplot using the ggplot2
approach, first ensure:</p>
<p><code>library(ggplot2)</code> or <code>library(tidyverse)</code></p>
<p>is loaded. Then, use the <code>geom_smooth(method = lm)</code>
command:</p>
<a href="javascript:showhide('ggplot')">
<div class="hoverchunk">
<p><span class="tooltipr"> ggplot( <span class="tooltiprtext">Every
ggplot2 graphic begins with the ggplot() command, which creates a
framework, or coordinate system, that you can add layers to. Without
adding any layers, ggplot() produces a blank graphic.</span>
</span><span class="tooltipr"> YourDataSet,  <span
class="tooltiprtext">This is simply the name of your data set, like
KidsFeet or starwars.</span> </span><span class="tooltipr"> aes( <span
class="tooltiprtext">aes stands for aesthetic. Inside of aes(), you
place elements that you want to map to the coordinate system, like x and
y variables.</span> </span><span class="tooltipr"> x =  <span
class="tooltiprtext">“x = ” declares which variable will become the
x-axis of the graphic, your explanatory variable. Both “x= ” and “y= ”
are optional phrases in the ggplot2 syntax.</span> </span><span
class="tooltipr"> X, <span class="tooltiprtext">This is the explanatory
variable of the regression: the variable used to <em>explain</em> the
mean of y. It is the name of the “numeric” column of YourDataSet.</span>
</span><span class="tooltipr">  y =  <span class="tooltiprtext">“y= ”
declares which variable will become the y-axis of the graphic.</span>
</span><span class="tooltipr"> Y <span class="tooltiprtext">This is the
response variable of the regression: the variable that you are
interested in predicting. It is the name of a “numeric” column of
YourDataSet.</span> </span><span class="tooltipr"> ) <span
class="tooltiprtext">Closing parenthesis for aes(…) function.</span>
</span><span class="tooltipr"> ) <span class="tooltiprtext">Closing
parenthesis for ggplot(…) function.</span> </span><span
class="tooltipr"> + <span class="tooltiprtext">The + allows you to add
more layers to the framework provided by ggplot(). In this case, you use
+ to add a geom_point() layer on the next line.</span> </span><br/><span
class="tooltipr">   geom_point() <span class="tooltiprtext">geom_point()
allows you to add a layer of points, a scatterplot, over the ggplot()
framework. The x and y coordinates are received from the previously
specified x and y variables declared in the ggplot() aesthetic.</span>
</span><span class="tooltipr"> + <span class="tooltiprtext">Here the +
is used to add yet another layer to ggplot().</span> </span><br/><span
class="tooltipr">   geom_smooth( <span
class="tooltiprtext">geom_smooth() is a smoothing function that you can
use to add different lines or curves to ggplot(). In this case, you will
use it to add the least-squares regression line to the
scatterplot.</span> </span><span class="tooltipr"> method =  <span
class="tooltiprtext">Use “method = ” to tell geom_smooth() that you are
going to declare a specific smoothing function, or method, to alter the
line or curve.</span> </span><span class="tooltipr"> “lm”, <span
class="tooltiprtext">lm stands for linear model. Using method = “lm”
tells geom_smooth() to fit a least-squares regression line onto the
graphic. The regression line is modeled using y ~ x, which variables
were declared in the initial ggplot() aesthetic. There are several other
methods that could be used here.</span> </span><span class="tooltipr">
 formula = y~x, <span class="tooltiprtext">This tells geom_smooth to
place a simple linear regression line on the plot. Other formula
statements can be used in the same way as lm(…) to place more
complicated models on the plot.</span> </span><span class="tooltipr">
 se = FALSE <span class="tooltiprtext">se stands for “standard error”.
Specifying FALSE turns this feature off. When TRUE, a gray band showing
the “confidence band” for the regression is shown. Unless you know how
to interpret this confidence band, leave it turned off.</span>
</span><span class="tooltipr"> ) <span class="tooltiprtext">Closing
parenthesis for the geom_smooth() function.</span> </span><span
class="tooltipr" style="float:right;font-size:.8em;">  Click to Show
Output  <span class="tooltiprtext">Click to View Output.</span>
</span></p>
</div>
<p></a></p>
<div id="ggplot" style="display:none;">
<pre><code>ggplot(cars, aes(x = speed, y = dist)) +
  geom_point() +
  geom_smooth(method = &quot;lm&quot;, formula=y~x, se=FALSE)</code></pre>
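<p><img src="LinearRegression_files/figure-html/unnamed-chunk-6-1.png" alt="ggplot2 scatterplot of dist versus speed from the cars data with a least-squares regression line and no confidence band" width="672" /></p>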
</div>
<p>There are a number of ways to customize the appearance of the
regression line:</p>
<a href="javascript:showhide('ggplotline')">
<div class="hoverchunk">
<p><span class="tooltipr"> ggplot( <span class="tooltiprtext">Every
ggplot2 graphic begins with the ggplot() command, which creates a
framework, or coordinate system, that you can add layers to. Without
adding any layers, ggplot() produces a blank graphic.</span>
</span><span class="tooltipr"> cars,  <span class="tooltiprtext">This is
simply the name of your data set, like KidsFeet or starwars.</span>
</span><span class="tooltipr"> aes( <span class="tooltiprtext">aes
stands for aesthetic. Inside of aes(), you place elements that you want
to map to the coordinate system, like x and y variables.</span>
</span><span class="tooltipr"> x =  <span class="tooltiprtext">“x = ”
declares which variable will become the x-axis of the graphic, your
explanatory variable. Both “x= ” and “y= ” are optional phrases in the
ggplot2 syntax.</span> </span><span class="tooltipr"> speed,  <span
class="tooltiprtext">This is the explanatory variable of the regression:
the variable used to <em>explain</em> the mean of y. It is the name of
the “numeric” column of YourDataSet.</span> </span><span
class="tooltipr"> y =  <span class="tooltiprtext">“y= ” declares which
variable will become the y-axis of the graphic.</span> </span><span
class="tooltipr"> dist <span class="tooltiprtext">This is the response
variable of the regression: the variable that you are interested in
predicting. It is the name of a “numeric” column of YourDataSet.</span>
</span><span class="tooltipr"> ) <span class="tooltiprtext">Closing
parenthesis for aes(…) function.</span> </span><span class="tooltipr"> )
<span class="tooltiprtext">Closing parenthesis for ggplot(…)
function.</span> </span><span class="tooltipr">  + <span
class="tooltiprtext">The + allows you to add more layers to the
framework provided by ggplot(). In this case, you use + to add a
geom_point() layer on the next line.</span> </span><br/><span
class="tooltipr">   geom_point() <span class="tooltiprtext">geom_point()
allows you to add a layer of points, a scatterplot, over the ggplot()
framework. The x and y coordinates are received from the previously
specified x and y variables declared in the ggplot() aesthetic.</span>
</span><span class="tooltipr">  + <span class="tooltiprtext">Here the +
is used to add yet another layer to ggplot().</span> </span><br/><span
class="tooltipr">   geom_smooth( <span
class="tooltiprtext">geom_smooth() is a smoothing function that you can
use to add different lines or curves to ggplot(). In this case, you will
use it to add the least-squares regression line to the
scatterplot.</span> </span><span class="tooltipr"> method =  <span
class="tooltiprtext">Use “method = ” to tell geom_smooth() that you are
going to declare a specific smoothing function, or method, to alter the
line or curve.</span> </span><span class="tooltipr"> “lm”, <span
class="tooltiprtext">lm stands for linear model. Using method = “lm”
tells geom_smooth() to fit a least-squares regression line onto the
graphic. The regression line is modeled using y ~ x, which variables
were declared in the initial ggplot() aesthetic.</span> </span><span
class="tooltipr">  formula = y~x, <span class="tooltiprtext">This tells
geom_smooth to place a simple linear regression line on the plot. Other
formula statements can be used in the same way as lm(…) to place more
complicated models on the plot.</span> </span><span class="tooltipr">
 se = FALSE, <span class="tooltiprtext">se stands for “standard error”.
Specifying FALSE turns this feature off. When TRUE, a gray band showing
the “confidence band” for the regression is shown. Unless you know how
to interpret this confidence band, leave it turned off.</span>
</span><span class="tooltipr">  size = 2, <span class="tooltiprtext">Use
<em>size = 2</em> to adjust the thickness of the line to size 2.</span>
</span><span class="tooltipr">  color = “orange”, <span
class="tooltiprtext">Use <em>color = “orange”</em> to change the color
of the line to orange.</span> </span><br><span class="tooltipr">
  linetype = “dashed” <span class="tooltiprtext">Use <em>linetype =
“dashed”</em> to change the solid line to a dashed line. Some linetype
options include “dashed”, “dotted”, “longdash”, “dotdash”, etc.</span>
</span><span class="tooltipr"> ) <span class="tooltiprtext">Closing
parenthesis for the geom_smooth() function.</span> </span><span
class="tooltipr" style="float:right;font-size:.8em;">  Click to Show
Output  <span class="tooltiprtext">Click to View Output.</span>
</span></p>
</div>
<p></a></p>
<div id="ggplotline" style="display:none;">
<pre><code>## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.</code></pre>
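<p><img src="LinearRegression_files/figure-html/unnamed-chunk-7-1.png" alt="ggplot2 scatterplot of dist versus speed from the cars data with a thick, dashed, orange regression line" width="672" /></p>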
</div>
<p>In addition to customizing the regression line, you can customize the
points, add points, add lines, and much more.</p>
<a href="javascript:showhide('ggplotpoints')">
<div class="hoverchunk">
<p><span class="tooltipr"> ggplot( <span class="tooltiprtext">Every
ggplot2 graphic begins with the ggplot() command, which creates a
framework, or coordinate system, that you can add layers to. Without
adding any layers, ggplot() produces a blank graphic.</span>
</span><span class="tooltipr"> cars,  <span class="tooltiprtext">This is
simply the name of your data set, like KidsFeet or starwars.</span>
</span><span class="tooltipr"> aes( <span class="tooltiprtext">aes
stands for aesthetic. Inside of aes(), you place elements that you want
to map to the coordinate system, like x and y variables.</span>
</span><span class="tooltipr"> x =  <span class="tooltiprtext">“x = ”
declares which variable will become the x-axis of the graphic, your
explanatory variable. Both “x= ” and “y= ” are optional phrases in the
ggplot2 syntax.</span> </span><span class="tooltipr"> speed,  <span
class="tooltiprtext">This is the explanatory variable of the regression:
the variable used to <em>explain</em> the mean of y. It is the name of
the “numeric” column of YourDataSet.</span> </span><span
class="tooltipr"> y =  <span class="tooltiprtext">“y= ” declares which
variable will become the y-axis of the graphic.</span> </span><span
class="tooltipr"> dist <span class="tooltiprtext">This is the response
variable of the regression: the variable that you are interested in
predicting. It is the name of a “numeric” column of YourDataSet.</span>
</span><span class="tooltipr"> ) <span class="tooltiprtext">Closing
parenthesis for aes(…) function.</span> </span><span class="tooltipr"> )
<span class="tooltiprtext">Closing parenthesis for ggplot(…)
function.</span> </span><span class="tooltipr">  + <span
class="tooltiprtext">The + allows you to add more layers to the
framework provided by ggplot(). In this case, you use + to add a
geom_point() layer on the next line.</span> </span><br/><span
class="tooltipr">   geom_point( <span class="tooltiprtext">geom_point()
allows you to add a layer of points, a scatterplot, over the ggplot()
framework. The x and y coordinates are received from the previously
specified x and y variables declared in the ggplot() aesthetic.</span>
</span><span class="tooltipr"> size = 1.5, <span
class="tooltiprtext">Use <em>size = 1.5</em> to change the size of the
points.</span> </span><span class="tooltipr">  color = “skyblue”, <span
class="tooltiprtext">Use <em>color = “skyblue”</em> to change the color
of the points to Brother Saunders’ favorite color.</span> </span><span
class="tooltipr">  alpha = 0.5 <span class="tooltiprtext">Use <em>alpha
= 0.5</em> to change the transparency of the points to 0.5.</span>
</span><span class="tooltipr"> ) <span class="tooltiprtext">Closing
parenthesis of geom_point() function. </span> </span><span
class="tooltipr">  + <span class="tooltiprtext">The + allows you to add
more layers to the framework provided by ggplot().</span>
</span><br><span class="tooltipr">   geom_smooth( <span
class="tooltiprtext">geom_smooth() is a smoothing function that you can
use to add different lines or curves to ggplot(). In this case, you will
use it to add the least-squares regression line to the
scatterplot.</span> </span><span class="tooltipr"> method =  <span
class="tooltiprtext">Use “method = ” to tell geom_smooth() that you are
going to declare a specific smoothing function, or method, to alter the
line or curve.</span> </span><span class="tooltipr"> “lm”, <span
class="tooltiprtext">lm stands for linear model. Using method = “lm”
tells geom_smooth() to fit a least-squares regression line onto the
graphic.</span> </span><span class="tooltipr">  formula = y~x, <span
class="tooltiprtext">This tells geom_smooth to place a simple linear
regression line on the plot. Other formula statements can be used in
ways similar to lm(…) to place more complicated models on the
plot.</span> </span><span class="tooltipr">  se = FALSE, <span
class="tooltiprtext">se stands for “standard error”. Specifying FALSE
turns this feature off. When TRUE, a gray band showing the “confidence
band” for the regression is shown. Unless you know how to interpret this
confidence band, leave it turned off.</span> </span><span
class="tooltipr">  color = “navy”, <span class="tooltiprtext">Use
<em>color = “navy”</em> to change the color of the line to navy
blue.</span> </span><span class="tooltipr">  size = 1.5 <span
class="tooltiprtext">Use <em>size = 1.5</em> to adjust the thickness of
the line to 1.5.</span> </span><span class="tooltipr"> ) <span
class="tooltiprtext">Closing parenthesis of geom_smooth()
function.</span> </span><span class="tooltipr">  + <span
class="tooltiprtext">The + allows you to add more layers to the
framework provided by ggplot().</span> </span><br><span
class="tooltipr">   geom_hline( <span class="tooltiprtext">Use
geom_hline() to add a horizontal line at a specified y-intercept. You
can also use geom_vline(xintercept = some_number) to add a vertical line
to the graph.</span> </span><span class="tooltipr"> yintercept = <span
class="tooltiprtext">Use “yintercept =” to tell geom_hline() that you
are going to declare a y intercept for the horizontal line.</span>
</span><span class="tooltipr">  75 <span class="tooltiprtext">75 is the
value of the y-intercept.</span> </span><span class="tooltipr"> , color
= “firebrick” <span class="tooltiprtext">Use <em>color =
“firebrick”</em> to change the color of the horizontal line to firebrick
red.</span> </span><span class="tooltipr"> , size = 1, <span
class="tooltiprtext">Use <em>size = 1</em> to adjust the thickness of
the horizontal line to size 1.</span> </span><br><span class="tooltipr">
             linetype = “longdash” <span class="tooltiprtext">Use
<em>linetype = “longdash”</em> to change the solid line to a dashed line
with longer dashes. Some linetype options include “dashed”, “dotted”,
“longdash”, “dotdash”, etc.</span> </span><span class="tooltipr"> ,
alpha = 0.5 <span class="tooltiprtext">Use <em>alpha = 0.5</em> to
change the transparency of the horizontal line to 0.5.</span>
</span><span class="tooltipr"> ) <span class="tooltiprtext">Closing
parenthesis of geom_hline function.</span> </span><span
class="tooltipr">  + <span class="tooltiprtext">The + allows you to add
more layers to the framework provided by ggplot().</span>
</span><br><span class="tooltipr">   geom_segment( <span
class="tooltiprtext">geom_segment() allows you to add a line segment to
ggplot() by using specified start and end points.</span> </span><span
class="tooltipr"> x = <span class="tooltiprtext">“x =” tells
geom_segment() that you are going to declare the x-coordinate for the
starting point of the line segment.</span> </span><span
class="tooltipr">  14, <span class="tooltiprtext">14 is a number on the
x-axis of your graph. It is the x-coordinate of the starting point of
the line segment.</span> </span><span class="tooltipr"> &nbsp;y =
<span class="tooltiprtext">“y =” tells geom_segment() that you are going
to declare the y-coordinate for the starting point of the line
segment.</span> </span><span class="tooltipr">  75, <span
class="tooltiprtext">75 is a number on the y-axis of your graph. It is
the y-coordinate of the starting point of the line segment.</span>
</span><span class="tooltipr">  xend = <span class="tooltiprtext">“xend
=” tells geom_segment() that you are going to declare the x-coordinate
for the end point of the line segment.</span> </span><span
class="tooltipr">  14, <span class="tooltiprtext">14 is a number on the
x-axis of your graph. It is the x-coordinate of the end point of the
line segment.</span> </span><span class="tooltipr">  yend = <span
class="tooltiprtext">“yend =” tells geom_segment() that you are going to
declare the y-coordinate for the end point of the line segment.</span>
</span><span class="tooltipr">  38, <span class="tooltiprtext">38 is a
number on the y-axis of your graph. It is the y-coordinate of the end
point of the line segment.</span> </span><br><span class="tooltipr">
               size = 1 <span class="tooltiprtext">Use <em>size = 1</em>
to adjust the thickness of the line segment.</span> </span><span
class="tooltipr"> , color = “lightgray” <span class="tooltiprtext">Use
<em>color = “lightgray”</em> to change the color of the line segment to
light gray.</span> </span><span class="tooltipr"> , linetype =
“longdash” <span class="tooltiprtext">Use <em>linetype = “longdash”</em> to
change the solid line segment to a dashed one. Some linetype options
include “dashed”, “dotted”, “longdash”, “dotdash”, etc.</span>
</span><span class="tooltipr"> ) <span class="tooltiprtext">Closing
parenthesis for geom_segment() function.</span> </span><span
class="tooltipr">  + <span class="tooltiprtext">The + allows you to add
more layers to the framework provided by ggplot().</span>
</span><br><span class="tooltipr">   geom_point( <span
class="tooltiprtext">geom_point() can also be used to add individual
points to the graph. Simply declare the x and y coordinates of the point
you want to plot.</span> </span><span class="tooltipr"> x = <span
class="tooltiprtext">“x =” tells geom_point() that you are going to
declare the x-coordinate for the point.</span> </span><span
class="tooltipr">  14, <span class="tooltiprtext">14 is a number on the
x-axis of your graph. It is the x-coordinate of the point.</span>
</span><span class="tooltipr">  y = <span class="tooltiprtext">“y =”
tells geom_point() that you are going to declare the y-coordinate for
the point.</span> </span><span class="tooltipr">  75 <span
class="tooltiprtext">75 is a number on the y-axis of your graph. It is
the y-coordinate of the point.</span> </span><span class="tooltipr"> ,
size = 3 <span class="tooltiprtext">Use <em>size = 3</em> to make the
point stand out more.</span> </span><span class="tooltipr"> , color =
“firebrick” <span class="tooltiprtext">Use <em>color = “firebrick”</em>
to change the color of the point to firebrick red.</span> </span><span
class="tooltipr"> ) <span class="tooltiprtext">Closing parenthesis of
the geom_point() function.</span> </span><span class="tooltipr">  +
<span class="tooltiprtext">The + allows you to add more layers to the
framework provided by ggplot().</span> </span><br><span
class="tooltipr">   geom_text( <span class="tooltiprtext">geom_text()
allows you to add customized text anywhere on the graph. It is very
similar to the base R equivalent, text(…).</span> </span><span
class="tooltipr"> x = <span class="tooltiprtext">“x =” tells geom_text()
that you are going to declare the x-coordinate for the text.</span>
</span><span class="tooltipr">  14, <span class="tooltiprtext">14 is a
number on the x-axis of your graph. It is the x-coordinate of the
text.</span> </span><span class="tooltipr">  y = <span
class="tooltiprtext">“y =” tells geom_text() that you are going to
declare the y-coordinate for the text.</span> </span><span
class="tooltipr">  84, <span class="tooltiprtext">84 is a number on the
y-axis of your graph. It is the y-coordinate of the text.</span>
</span><span class="tooltipr">  label = <span
class="tooltiprtext">“label =” tells geom_text() that you are going to
give it the label.</span> </span><span class="tooltipr">  “My Point (14,
75)”, <span class="tooltiprtext"><em>“My Point (14, 75)”</em> is the
text that will appear on the graph.</span> </span><br><span
class="tooltipr">             color = “navy” <span
class="tooltiprtext">Use <em>color = “navy”</em> to change the color of
the text to navy blue.</span> </span><span class="tooltipr"> , size = 3
<span class="tooltiprtext">Use <em>size = 3</em> to change the size of
the text.</span> </span><span class="tooltipr"> ) <span
class="tooltiprtext">Closing parenthesis of the geom_text()
function.</span> </span><span class="tooltipr">  + <span
class="tooltiprtext">The + allows you to add more layers to the
framework provided by ggplot().</span> </span><br><span
class="tooltipr">   theme_minimal() <span class="tooltiprtext">Add a
minimalistic theme to the graph. There are many other themes that you
can try out.</span> </span><span class="tooltipr"
style="float:right;font-size:.8em;">  Click to Show Output  <span
class="tooltiprtext">Click to View Output.</span> </span></p>
</div>
<p></a></p>
<div id="ggplotpoints" style="display:none;">
<pre><code>## `geom_smooth()` using formula = &#39;y ~ x&#39;</code></pre>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-8-1.png" width="672" /></p>
</div>
</p>
</div>
<p><br/></p>
<p><strong>Accessing Parts of the Regression</strong></p>
<p>Finally, note that the <code>mylm</code> object contains the
<code>names(mylm)</code> of</p>
<a href="javascript:showhide('coeff')">
<div class="hoverchunk">
<p><span class="tooltipr"> mylm$coefficients <span
class="tooltiprtext">Contains two values. The first is the estimated
<span class="math inline">\(y\)</span>-intercept. The second is the
estimated slope.</span> </span></p>
</div>
<p></a></p>
<div id="coeff" style="display:none;">
<pre><code>## (Intercept)       speed 
##  -17.579095    3.932409</code></pre>
</div>
<a href="javascript:showhide('resid')">
<div class="hoverchunk">
<p><span class="tooltipr"> mylm$residuals <span
class="tooltiprtext">Contains the residuals from the regression in the
same order as the actual dataset.</span> </span></p>
</div>
<p></a></p>
<div id="resid" style="display:none;">
<pre><code>##          1          2          3          4          5          6          7 
##   3.849460  11.849460  -5.947766  12.052234   2.119825  -7.812584  -3.744993 
##          8          9         10         11         12         13         14 
##   4.255007  12.255007  -8.677401   2.322599 -15.609810  -9.609810  -5.609810 
##         15         16         17         18         19         20         21 
##  -1.609810  -7.542219   0.457781   0.457781  12.457781 -11.474628  -1.474628 
##         22         23         24         25         26         27         28 
##  22.525372  42.525372 -21.407036 -15.407036  12.592964 -13.339445  -5.339445 
##         29         30         31         32         33         34         35 
## -17.271854  -9.271854   0.728146 -11.204263   2.795737  22.795737  30.795737 
##         36         37         38         39         40         41         42 
## -21.136672 -11.136672  10.863328 -29.069080 -13.069080  -9.069080  -5.069080 
##         43         44         45         46         47         48         49 
##   2.930920  -2.933898 -18.866307  -6.798715  15.201285  16.201285  43.201285 
##         50 
##   4.268876</code></pre>
</div>
<a href="javascript:showhide('fit')">
<div class="hoverchunk">
<p><span class="tooltipr"> mylm$fitted.values <span
class="tooltiprtext">The values of <span
class="math inline">\(\hat{Y}\)</span> in the same order as the original
dataset.</span> </span></p>
</div>
<p></a></p>
<div id="fit" style="display:none;">
<pre><code>##         1         2         3         4         5         6         7         8 
## -1.849460 -1.849460  9.947766  9.947766 13.880175 17.812584 21.744993 21.744993 
##         9        10        11        12        13        14        15        16 
## 21.744993 25.677401 25.677401 29.609810 29.609810 29.609810 29.609810 33.542219 
##        17        18        19        20        21        22        23        24 
## 33.542219 33.542219 33.542219 37.474628 37.474628 37.474628 37.474628 41.407036 
##        25        26        27        28        29        30        31        32 
## 41.407036 41.407036 45.339445 45.339445 49.271854 49.271854 49.271854 53.204263 
##        33        34        35        36        37        38        39        40 
## 53.204263 53.204263 53.204263 57.136672 57.136672 57.136672 61.069080 61.069080 
##        41        42        43        44        45        46        47        48 
## 61.069080 61.069080 61.069080 68.933898 72.866307 76.798715 76.798715 76.798715 
##        49        50 
## 76.798715 80.731124</code></pre>
</div>
<div class="hoverchunk">
<p><span class="tooltipr"> mylm$… <span class="tooltiprtext">several
other things that will not be explained here.</span> </span></p>
</div>
<p><br/></p>
<p><strong>Making Predictions</strong></p>
<a href="javascript:showhide('prediction')">
<div class="hoverchunk">
<p><span class="tooltipr"> predict( <span class="tooltiprtext">The R
function predict(…) allows you to use an lm(…) object to make
predictions for specified x-values.</span> </span><span
class="tooltipr"> mylm, <span class="tooltiprtext">This is the name of a
previously performed lm(…) that was saved into the name
<code>mylm &lt;- lm(...)</code>.</span> </span><span class="tooltipr">
 data.frame( <span class="tooltiprtext">To specify the values of <span
class="math inline">\(x\)</span> that you want to use in the prediction,
you have to put those x-values into a data set, or more specifically, a
data.frame(…).</span> </span><span class="tooltipr"> X= <span
class="tooltiprtext">The value for <code>X=</code> should be whatever
x-variable name was used in the original regression. For example, if
<code>mylm &lt;- lm(dist ~ speed, data=cars)</code> was the original
regression, then this code would read <code>speed =</code> instead of
<code>X=</code>… Further, the value of <span
class="math inline">\(Xh\)</span> should be some specific number, like
<code>speed=12</code> for example.</span> </span><span class="tooltipr">
Xh <span class="tooltiprtext">The value of <span
class="math inline">\(Xh\)</span> should be some specific number, like
<code>12</code>, as in <code>speed=12</code> for example.</span>
</span><span class="tooltipr"> ) <span class="tooltiprtext">Closing
parenthesis for the data.frame(…) function.</span> </span><span
class="tooltipr"> ) <span class="tooltiprtext">Closing parenthesis for
the predict(…) function.</span> </span></p>
</div>
<p></a></p>
<div id="prediction" style="display:none;">
<p><code>mylm &lt;- lm(dist ~ speed, data = cars)</code></p>
<p><code>predict(mylm, data.frame(speed = 12))</code></p>
<table class="rconsole">
<tr>
<td align="right">
<span class="tooltiprout"> 1<br/>   29.60981 <span
class="tooltiprouttext">The value given is the “fitted-value” or
“predicted-value” for the specified x-value. In this case, a car with a
speed of 12 is predicted to have a stopping distance of 29.60981
feet.</span> </span>
</td>
</tr>
</table>
</div>
<a href="javascript:showhide('predictionInterval')">
<div class="hoverchunk">
<p><span class="tooltipr"> predict( <span class="tooltiprtext">The R
function predict(…) allows you to use an lm(…) object to make
predictions for specified x-values.</span> </span><span
class="tooltipr"> mylm, <span class="tooltiprtext">This is the name of a
previously performed lm(…) that was saved into the name
<code>mylm &lt;- lm(...)</code>.</span> </span><span class="tooltipr">
 data.frame( <span class="tooltiprtext">To specify the values of <span
class="math inline">\(x\)</span> that you want to use in the prediction,
you have to put those x-values into a data set, or more specifically, a
data.frame(…).</span> </span><span class="tooltipr"> X= <span
class="tooltiprtext">The value for <code>X=</code> should be whatever
x-variable name was used in the original regression. For example, if
<code>mylm &lt;- lm(dist ~ speed, data=cars)</code> was the original
regression, then this code would read <code>speed =</code> instead of
<code>X=</code>… Further, the value of <span
class="math inline">\(Xh\)</span> should be some specific number, like
<code>speed=12</code> for example.</span> </span><span class="tooltipr">
Xh <span class="tooltiprtext">The value of <span
class="math inline">\(Xh\)</span> should be some specific number, like
<code>12</code>, as in <code>speed=12</code> for example.</span>
</span><span class="tooltipr"> ), <span class="tooltiprtext">Closing
parenthesis for the data.frame(…) function.</span> </span><span
class="tooltipr">  interval= <span class="tooltiprtext">This optional
command allows you to specify if the predicted value should be
accompanied by either a confidence interval or a prediction
interval.</span> </span><span class="tooltipr"> “prediction” <span
class="tooltiprtext">This specifies that a prediction interval will be
included with the predicted value. A 95% prediction interval gives you a
range that captures 95% of the data, or <span
class="math inline">\(Y_i\)</span> values for the specific <span
class="math inline">\(X\)</span>-value specified in the
prediction.</span> </span><span class="tooltipr"> ) <span
class="tooltiprtext">Closing parenthesis of the predict(…)
function.</span> </span></p>
</div>
<p></a></p>
<div id="predictionInterval" style="display:none;">
<p><code>mylm &lt;- lm(dist ~ speed, data = cars)</code></p>
<p><code>predict(mylm, data.frame(speed = 12), interval = "prediction")</code></p>
<table class="rconsole">
<tr>
<td align="right">
<span class="tooltiprout">   fit <span class="tooltiprouttext">The “fit”
is the predicted value.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   lwr <span class="tooltiprouttext">The “lwr”
is the lower bound.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   upr <span class="tooltiprouttext">The “upr”
is the upper bound.</span> </span>
</td>
</tr>
<tr>
<td align="left">
<span class="tooltiprout"> 1 29.60981 <span class="tooltiprouttext">In
this case, a car with a speed of 12 mph is predicted to have a stopping
distance of 29.60981 feet. However, we are wise enough to recognize that
the stopping distance for individual cars will vary anywhere from
-1.749529 (or 0 because distance can’t go negative) feet to 60.96915
feet.</span> </span>
</td>
<td align="left">
<span class="tooltiprout"> -1.749529 <span class="tooltiprouttext">This
is the lower bound of the prediction interval. While we predict a
stopping distance of 29.60981 feet, this prediction interval reminds us
the stopping distance could be as short as -1.749529 feet (or 0 because
distance can’t go negative).</span> </span>
</td>
<td align="left">
<span class="tooltiprout"> 60.96915 <span class="tooltiprouttext">This
is the upper bound of the prediction interval. While we predict a
stopping distance of 29.60981 feet, this prediction interval reminds us
that the actual stopping distance could be as high as 60.96915
feet.</span> </span>
</td>
</tr>
</table>
</div>
<a href="javascript:showhide('predictionConfidence')">
<div class="hoverchunk">
<p><span class="tooltipr"> predict( <span class="tooltiprtext">The R
function predict(…) allows you to use an lm(…) object to make
predictions for specified x-values.</span> </span><span
class="tooltipr"> mylm, <span class="tooltiprtext">This is the name of a
previously performed lm(…) that was saved into the name
<code>mylm &lt;- lm(...)</code>.</span> </span><span class="tooltipr">
 data.frame( <span class="tooltiprtext">To specify the values of <span
class="math inline">\(x\)</span> that you want to use in the prediction,
you have to put those x-values into a data set, or more specifically, a
data.frame(…).</span> </span><span class="tooltipr"> X= <span
class="tooltiprtext">The value for <code>X=</code> should be whatever
x-variable name was used in the original regression. For example, if
<code>mylm &lt;- lm(dist ~ speed, data=cars)</code> was the original
regression, then this code would read <code>speed =</code> instead of
<code>X=</code>… Further, the value of <span
class="math inline">\(Xh\)</span> should be some specific number, like
<code>speed=12</code> for example.</span> </span><span class="tooltipr">
Xh <span class="tooltiprtext">The value of <span
class="math inline">\(Xh\)</span> should be some specific number, like
<code>12</code>, as in <code>speed=12</code> for example.</span>
</span><span class="tooltipr"> ), <span class="tooltiprtext">Closing
parenthesis for the data.frame(…) function.</span> </span><span
class="tooltipr">  interval= <span class="tooltiprtext">This optional
command allows you to specify if the predicted value should be
accompanied by either a confidence interval or a prediction
interval.</span> </span><span class="tooltipr"> “confidence” <span
class="tooltiprtext">This specifies that a confidence interval for the
prediction should be provided. This is of use whenever your interest is
in just estimating the average y-value, not the actual y-values.</span>
</span><span class="tooltipr"> ) <span class="tooltiprtext">Closing
parenthesis of the predict(…) function.</span> </span></p>
</div>
<p></a></p>
<div id="predictionConfidence" style="display:none;">
<p><code>mylm &lt;- lm(dist ~ speed, data = cars)</code></p>
<p><code>predict(mylm, data.frame(speed = 12), interval = "confidence")</code></p>
<table class="rconsole">
<tr>
<td align="right">
<span class="tooltiprout">   fit <span class="tooltiprouttext">The “fit”
is the predicted value.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   lwr <span class="tooltiprouttext">The “lwr”
is the lower bound.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   upr <span class="tooltiprouttext">The “upr”
is the upper bound.</span> </span>
</td>
</tr>
<tr>
<td align="left">
<span class="tooltiprout"> 1 29.60981 <span class="tooltiprouttext">In
this case, cars with a speed of 12 mph are predicted to have an average
stopping distance of 29.60981 feet, where the average could be anywhere
from 24.39514 feet to 34.82448 feet.</span> </span>
</td>
<td align="left">
<span class="tooltiprout"> 24.39514 <span class="tooltiprouttext">This
is the lower bound of the confidence interval. We are 95% confident that
the average stopping distance of cars going 12 mph is greater than this
value.</span> </span>
</td>
<td align="left">
<span class="tooltiprout"> 34.82448 <span class="tooltiprouttext">This
is the upper bound of the confidence interval. We are 95% confident that
the average stopping distance of cars going 12 mph is less than this
value.</span> </span>
</td>
</tr>
</table>
</div>
<p><br/></p>
<p><strong>Finding Confidence Intervals for Model
Parameters</strong></p>
<a href="javascript:showhide('confint')">
<div class="hoverchunk">
<p><span class="tooltipr"> confint( <span class="tooltiprtext">The R
function confint(…) allows you to use an lm(…) object to compute
confidence intervals for one or more parameters (like <span
class="math inline">\(\beta_0\)</span> or <span
class="math inline">\(\beta_1\)</span>) in your model.</span>
</span><span class="tooltipr"> mylm, <span class="tooltiprtext">This is
the name of a previously performed lm(…) that was saved into the name
<code>mylm &lt;- lm(...)</code>.</span> </span><span class="tooltipr">
 level = <span class="tooltiprtext">“level =” tells the confint(…)
function that you are going to declare at what level of confidence you
want the interval. The default is “level = 0.95.” If you want to find
95% confidence intervals for your parameters, then just run
<code>confint(mylm)</code>.</span> </span><span class="tooltipr">
 someConfidenceLevel <span class="tooltiprtext">someConfidenceLevel is
simply a confidence level you choose when you want something other than
a 95% confidence interval. Some examples of appropriate levels include
0.90 and 0.99.</span> </span><span class="tooltipr"> ) <span
class="tooltiprtext">Closing parenthesis for confint(…)
function.</span> </span></p>
</div>
<p></a></p>
<div id="confint" style="display:none;">
<p><code>mylm &lt;- lm(dist ~ speed, data = cars)</code></p>
<p><code>confint(mylm, level = 0.90)</code></p>
<table class="rconsole">
<tr>
<td align="left">
 
</td>
<td align="right">
<span class="tooltiprout">   5 % <span class="tooltiprouttext">The lower
bound of a 90% confidence interval occurs at the 5th percentile. This is
because at 90% confidence, 10% is left in the tails, with 5% on each
end. The upper bound will thus end at the 95th percentile, hence the 5%
and 95% as the column names.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   95 % <span class="tooltiprouttext">The
upper bound of a 90% confidence interval ends at the 95th
percentile.</span> </span>
</td>
</tr>
<tr>
<td align="left">
<span class="tooltiprout"> (Intercept) <span
class="tooltiprouttext">This row of output specifies a confidence
interval for <span class="math inline">\(\beta_0\)</span>, the true
y-intercept.</span> </span>
</td>
<td align="left">
<span class="tooltiprout"> -28.914514 <span class="tooltiprouttext">This
is the lower bound for the confidence interval of the y-intercept, <span
class="math inline">\(\beta_0\)</span>. In this example, the confidence
interval for the y-intercept does not make sense because you cannot have
negative distance.</span> </span>
</td>
<td align="left">
<span class="tooltiprout"> -6.243676 <span class="tooltiprouttext">This
is the upper bound for the confidence interval for <span
class="math inline">\(\beta_0\)</span>, the y-intercept. In this
example, the confidence interval for the y-intercept does not make sense
because you cannot have negative distance.</span> </span>
</td>
</tr>
<tr>
<td align="left">
<span class="tooltiprout"> speed <span class="tooltiprouttext">This row
of the output provides the upper and lower bound for the confidence
interval for <span class="math inline">\(\beta_1\)</span>, the true
slope. In this case, you can be 90% confident that the true slope lies
between 3.235501 and 4.629317.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> 3.235501 <span class="tooltiprouttext">This
is the lower bound of the confidence interval. In this case, you can be
90% confident that the slope lies between 3.235501 and 4.629317.</span>
</span>
</td>
<td align="right">
<span class="tooltiprout"> 4.629317 <span class="tooltiprouttext">This
is the upper bound of the confidence interval. In this case, you can be
90% confident that the slope lies between 3.235501 and 4.629317.</span>
</span>
</td>
</tr>
</table>
<p><br/> <br/></p>
<p><code>mylm &lt;- lm(dist ~ speed, data = cars)</code></p>
<p><code>confint(mylm, level = 0.95)</code></p>
<table class="rconsole">
<tr>
<td align="left">
 
</td>
<td align="right">
<span class="tooltiprout">   2.5 % <span class="tooltiprouttext">The
lower bound of a 95% confidence interval occurs at the 2.5th percentile.
This is because at 95% confidence, 5% is left in the tails, with 2.5% on
each end. The upper bound will thus end at the 97.5th percentile, hence
the 2.5% and 97.5% as the column names for the lower and upper bounds,
respectively.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   97.5 % <span class="tooltiprouttext">The
upper bound of a 95% confidence interval ends at the 97.5th
percentile.</span> </span>
</td>
</tr>
<tr>
<td align="left">
<span class="tooltiprout"> (Intercept) <span
class="tooltiprouttext">This row of output specifies a confidence
interval for <span class="math inline">\(\beta_0\)</span>, the true
y-intercept.</span> </span>
</td>
<td align="left">
<span class="tooltiprout"> -31.167850 <span class="tooltiprouttext">This
is the lower bound for the confidence interval of the y-intercept, <span
class="math inline">\(\beta_0\)</span>. In this example, the confidence
interval for the y-intercept does not make sense because you cannot have
negative distance.</span> </span>
</td>
<td align="left">
<span class="tooltiprout"> -3.990340 <span class="tooltiprouttext">This
is the upper bound for the confidence interval for <span
class="math inline">\(\beta_0\)</span>, the y-intercept. In this
example, the confidence interval for the y-intercept does not make sense
because you cannot have negative distance.</span> </span>
</td>
</tr>
<tr>
<td align="left">
<span class="tooltiprout"> speed <span class="tooltiprouttext">This row
of the output provides the upper and lower bound for the confidence
interval for <span class="math inline">\(\beta_1\)</span>, the true
slope. In this case, you can be 95% confident that the true slope lies
between 3.096964 and 4.767853.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> 3.096964 <span class="tooltiprouttext">This
is the lower bound of the confidence interval. In this case, you can be
95% confident that the slope lies between 3.096964 and 4.767853.</span>
</span>
</td>
<td align="right">
<span class="tooltiprout"> 4.767853 <span class="tooltiprouttext">This
is the upper bound of the confidence interval. In this case, you can be
95% confident that the slope lies between 3.096964 and 4.767853.</span>
</span>
</td>
</tr>
</table>
</div>
<hr />
</div>
</div>
<div id="explanation" class="section level3">
<h3>Explanation</h3>
<div style="padding-left:125px;">

<p>Linear regression has a rich mathematical theory behind it. This is
because it uses a mathematical function and a random error term to
describe the regression relation between a response variable <span
class="math inline">\(Y\)</span> and an explanatory variable called
<span class="math inline">\(X\)</span>.</p>
<div style="padding-left:30px;color:darkgray;">
<p>Expand each element below to learn more.</p>
</div>
<p><span
style="color:steelblue;font-size:.8em;padding-left:160px;">Regression
Cheat Sheet</span>
<a href="javascript:showhide('regressioncheatsheet')" style="font-size:.6em;color:skyblue;">(Expand)</a></p>
<div id="regressioncheatsheet" style="display:none;font-size:.7em;">
<table>
<colgroup>
<col width="13%" />
<col width="34%" />
<col width="15%" />
<col width="17%" />
<col width="19%" />
</colgroup>
<thead>
<tr class="header">
<th>Term</th>
<th>Pronunciation</th>
<th>Meaning</th>
<th>Math</th>
<th>R Code</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><span class="tooltipr"><span class="math inline">\(Y_i\)</span><span
class="tooltiprtext"> <code>$Y_i$</code></span>
</span><span class="tooltipr"></td>
<td>“why-eye”</td>
<td>The data</td>
<td><span class="tooltipr"> <span class="math inline">\(Y_i = \beta_0 +
\beta_1 X_i + \epsilon_i \quad \text{where} \ \epsilon_i \sim N(0,
\sigma^2)\)</span><span class="tooltiprtext">
<code>$Y_i = \beta_0 + \beta_1 X_i +</code>
<code>\epsilon_i \quad \text{where} \</code>
<code>\epsilon_i \sim N(0, \sigma^2)$</code></span>
</span><span class="tooltipr"></td>
<td><code>YourDataSet$YourYvariable</code></td>
</tr>
<tr class="even">
<td><span class="tooltipr"><span
class="math inline">\(\hat{Y}_i\)</span><span class="tooltiprtext">
<code>$\hat{Y}_i$</code></span> </span><span class="tooltipr"></td>
<td>“why-hat-eye”</td>
<td>The fitted line</td>
<td><span class="tooltipr"> <span class="math inline">\(\hat{Y}_i = b_0
+ b_1 X_i\)</span><span class="tooltiprtext">
<code>$\hat{Y}_i = b_0 + b_1 X_i$</code></span></td>
<td><code>lmObject$fitted.values</code></td>
</tr>
<tr class="odd">
<td><span class="tooltipr"><span
class="math inline">\(E\{Y_i\}\)</span><span class="tooltiprtext">
<code>$E\{Y_i\}$</code></span> </span><span class="tooltipr"></td>
<td>“expected value of why-eye”</td>
<td>True mean y-value</td>
<td><span class="tooltipr"><span class="math inline">\(E\{Y_i\} =
\beta_0 + \beta_1 X_i\)</span><span class="tooltiprtext">
<code>$E\{Y_i\} = \beta_0 + \beta_1 X_i$</code></span></td>
<td><code>&lt;none&gt;</code></td>
</tr>
<tr class="even">
<td><span class="tooltipr"><span
class="math inline">\(\beta_0\)</span><span class="tooltiprtext">
<code>$\beta_0$</code></span> </span><span class="tooltipr"></td>
<td>“beta-zero”</td>
<td>True y-intercept</td>
<td><code>&lt;none&gt;</code></td>
<td><code>&lt;none&gt;</code></td>
</tr>
<tr class="odd">
<td><span class="tooltipr"><span
class="math inline">\(\beta_1\)</span><span class="tooltiprtext">
<code>$\beta_1$</code></span> </span><span class="tooltipr"></td>
<td>“beta-one”</td>
<td>True slope</td>
<td><code>&lt;none&gt;</code></td>
<td><code>&lt;none&gt;</code></td>
</tr>
<tr class="even">
<td><span class="tooltipr"><span class="math inline">\(b_0\)</span><span
class="tooltiprtext"> <code>$b_0$</code></span>
</span><span class="tooltipr"></td>
<td>“b-zero”</td>
<td>Estimated y-intercept</td>
<td><span class="tooltipr"><span class="math inline">\(b_0 = \bar{Y} -
b_1\bar{X}\)</span><span class="tooltiprtext">
<code>$b_0 = \bar{Y} - b_1\bar{X}$</code></span></td>
<td><code>b_0 &lt;- mean(Y) - b_1*mean(X)</code></td>
</tr>
<tr class="odd">
<td><span class="tooltipr"><span class="math inline">\(b_1\)</span><span
class="tooltiprtext"> <code>$b_1$</code></span>
</span><span class="tooltipr"></td>
<td>“b-one”</td>
<td>Estimated slope</td>
<td><span class="tooltipr"><span class="math inline">\(b_1 = \frac{\sum
X_i(Y_i - \bar{Y})}{\sum(X_i - \bar{X})^2}\)</span><span
class="tooltiprtext"> <code>$b_1 = \frac{\sum X_i(Y_i - \bar{Y})}</code>
<code>{\sum(X_i - \bar{X})^2}$</code></span></td>
<td><code>b_1 &lt;- sum( X*(Y - mean(Y)) ) / sum( (X - mean(X))^2 )</code></td>
</tr>
<tr class="even">
<td><span class="tooltipr"><span
class="math inline">\(\epsilon_i\)</span><span class="tooltiprtext">
<code>$\epsilon_i$</code></span> </span><span class="tooltipr"></td>
<td>“epsilon-eye”</td>
<td>Distance of dot to true line</td>
<td><span class="tooltipr"><span class="math inline">\(\epsilon_i = Y_i
- E\{Y_i\}\)</span><span class="tooltiprtext">
<code>$\epsilon_i = Y_i - E\{Y_i\}$</code></span></td>
<td><code>&lt;none&gt;</code></td>
</tr>
<tr class="odd">
<td><span class="tooltipr"><span class="math inline">\(r_i\)</span><span
class="tooltiprtext"> <code>$r_i$</code></span>
</span><span class="tooltipr"></td>
<td>“r-eye” or “residual-eye”</td>
<td>Distance of dot to estimated line</td>
<td><span class="tooltipr"><span class="math inline">\(r_i = Y_i -
\hat{Y}_i\)</span><span class="tooltiprtext">
<code>$r_i = Y_i - \hat{Y}_i$</code></span></td>
<td><code>lmObject$residuals</code></td>
</tr>
<tr class="even">
<td><span class="tooltipr"><span
class="math inline">\(\sigma^2\)</span><span class="tooltiprtext">
<code>$\sigma^2$</code></span> </span><span class="tooltipr"></td>
<td>“sigma-squared”</td>
<td>Variance of the <span class="math inline">\(\epsilon_i\)</span></td>
<td><span class="tooltipr"><span class="math inline">\(Var\{\epsilon_i\}
= \sigma^2\)</span><span
class="tooltiprtext"><code>$Var\{\epsilon_i\} = \sigma^2$</code></span></td>
<td><code>&lt;none&gt;</code></td>
</tr>
<tr class="odd">
<td><span class="tooltipr"><span class="math inline">\(MSE\)</span><span
class="tooltiprtext"> <code>$MSE$</code></span>
</span><span class="tooltipr"></td>
<td>“mean squared error”</td>
<td>Estimate of <span class="math inline">\(\sigma^2\)</span></td>
<td><span class="tooltipr"><span class="math inline">\(MSE =
\frac{SSE}{n-p}\)</span><span
class="tooltiprtext"><code>$MSE = \frac{SSE}{n-p}$</code></span></td>
<td><code>sum( lmObject$res^2 ) / (n - p)</code></td>
</tr>
<tr class="even">
<td><span class="tooltipr"><span class="math inline">\(SSE\)</span><span
class="tooltiprtext"> <code>$SSE$</code></span>
</span><span class="tooltipr"></td>
<td>“sum of squared error” (residuals)</td>
<td>Measure of the dots’ total deviation from the line</td>
<td><span class="tooltipr"><span class="math inline">\(SSE =
\sum_{i=1}^n (Y_i - \hat{Y}_i)^2\)</span><span
class="tooltiprtext"><code>$SSE = \sum_{i=1}^n</code>
<code>(Y_i - \hat{Y}_i)^2$</code></span></td>
<td><code>sum( lmObject$res^2 )</code></td>
</tr>
<tr class="odd">
<td><span class="tooltipr"><span class="math inline">\(SSR\)</span><span
class="tooltiprtext"> <code>$SSR$</code></span>
</span><span class="tooltipr"></td>
<td>“sum of squared regression error”</td>
<td>Measure of line’s deviation from y-bar</td>
<td><span class="tooltipr"> <span class="math inline">\(SSR =
\sum_{i=1}^n (\hat{Y}_i - \bar{Y})^2\)</span><span
class="tooltiprtext"><code>$SSR = \sum_{i=1}^n</code>
<code>(\hat{Y}_i - \bar{Y})^2$</code></span></td>
<td><code>sum( (lmObject$fit - mean(YourData$Y))^2 )</code></td>
</tr>
<tr class="even">
<td><span class="tooltipr"><span
class="math inline">\(SSTO\)</span><span class="tooltiprtext">
<code>$SSTO$</code></span> </span><span class="tooltipr"></td>
<td>“total sum of squares”</td>
<td>Measure of total variation in Y</td>
<td><span class="tooltipr"><span class="math inline">\(SSR + SSE = SSTO
= \sum_{i=1}^n (Y_i - \bar{Y})^2\)</span><span
class="tooltiprtext"><code>$SSR + SSE = SSTO = \sum_{i=1}^n</code>
<code>(Y_i - \bar{Y})^2$</code></span></td>
<td><code>sum( (YourData$Y - mean(YourData$Y))^2 )</code></td>
</tr>
<tr class="odd">
<td><span class="tooltipr"><span class="math inline">\(R^2\)</span><span
class="tooltiprtext"> <code>$R^2$</code></span>
</span><span class="tooltipr"></td>
<td>“R-squared”</td>
<td>Proportion of variation in Y explained by the regression</td>
<td><span class="tooltipr"><span class="math inline">\(R^2 =
\frac{SSR}{SSTO} = 1 - \frac{SSE}{SSTO}\)</span><span
class="tooltiprtext"><code>$R^2 = \frac{SSR}{SSTO} = 1</code>
<code>- \frac{SSE}{SSTO}$</code></span></td>
<td><code>SSR/SSTO</code></td>
</tr>
<tr class="even">
<td><span class="tooltipr"><span class="math inline">\(r\)</span><span
class="tooltiprtext"> <code>$r$</code></span>
</span><span class="tooltipr"></td>
<td>“r”</td>
<td>Correlation between X and Y.</td>
<td><span class="tooltipr"><span class="math inline">\(r =
\pm\sqrt{R^2}\)</span> (sign matches <span class="math inline">\(b_1\)</span>)<span
class="tooltiprtext"><code>$r = \pm\sqrt{R^2}$</code></span></td>
<td><code>sign(b_1)*sqrt(R^2)</code></td>
</tr>
<tr class="odd">
<td><span class="tooltipr"><span
class="math inline">\(\hat{Y}_h\)</span><span class="tooltiprtext">
<code>$\hat{Y}_h$</code></span></span></td>
<td>“why-hat-aitch”</td>
<td>Estimated mean y-value for some x-value called <span
class="math inline">\(X_h\)</span></td>
<td><span class="tooltipr"><span class="math inline">\(\hat{Y}_h = b_0 +
b_1 X_h\)</span><span
class="tooltiprtext"><code>$\hat{Y}_h = b_0 + b_1 X_h$</code></span></span></td>
<td><code>predict(lmObject, data.frame(XvarName=#))</code></td>
</tr>
<tr class="even">
<td><span class="tooltipr"><span class="math inline">\(X_h\)</span><span
class="tooltiprtext"> <code>$X_h$</code></span> </span></td>
<td>“ex-aitch”</td>
<td>Some x-value, not necessarily one of the <span
class="math inline">\(X_i\)</span> values used in the regression</td>
<td><span class="tooltipr"><span class="math inline">\(X_h =\)</span>
some number<span
class="tooltiprtext"><code>$X_h = $</code></span></span></td>
<td><code>Xh = #</code></td>
</tr>
<tr class="odd">
<td>Confidence Interval</td>
<td>“confidence interval”</td>
<td>Estimated bounds at a certain level of confidence for a
parameter</td>
<td><span class="tooltipr"><span class="math inline">\(b_0 \pm t^* \cdot
s_{b_0}\)</span><span
class="tooltiprtext"><code>$b_0 \pm t^* \cdot s_{b_0}$</code></span></span>
or <span class="tooltipr"><span class="math inline">\(b_1 \pm t^* \cdot
s_{b_1}\)</span><span
class="tooltiprtext"><code>$b_1 \pm t^* \cdot s_{b_1}$</code></span></span></td>
<td><code>confint(mylm, level = someConfidenceLevel)</code></td>
</tr>
</tbody>
</table>
<table>
<thead>
<tr class="header">
<th>Parameter</th>
<th>Estimate</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><span class="math inline">\(\beta_0\)</span></td>
<td><span class="math inline">\(b_0\)</span></td>
</tr>
<tr class="even">
<td><span class="math inline">\(\beta_1\)</span></td>
<td><span class="math inline">\(b_1\)</span></td>
</tr>
<tr class="odd">
<td><span class="math inline">\(\epsilon_i\)</span></td>
<td><span class="math inline">\(r_i\)</span></td>
</tr>
<tr class="even">
<td><span class="math inline">\(\sigma^2\)</span></td>
<td><span class="math inline">\(MSE\)</span></td>
</tr>
<tr class="odd">
<td><span class="math inline">\(\sigma\)</span></td>
<td><span class="math inline">\(\sqrt{MSE}\)</span>, the Residual
standard error</td>
</tr>
</tbody>
</table>
</div>
<p><br /></p>
<div id="the-mathematical-model-expand" class="section level4">
<h4>The Mathematical Model
<a href="javascript:showhide('mathmodel1')" style="font-size:.6em;color:skyblue;">(Expand)</a></h4>
<p><span class="expand-caption"><span
class="math inline">\(Y_i\)</span>, <span
class="math inline">\(\hat{Y}_i\)</span>, and <span
class="math inline">\(E\{Y_i\}\)</span>…</span></p>
<div id="mathmodel1" style="display:none;">
<p>There are three main elements to the mathematical model of
regression. Each of these three elements is pictured below in the
“Regression Relation Diagram.”</p>
<div style="padding-left:60px;color:darkgray;font-size:.8em;">
<p>Study both the three bullet points and their visual representations
in the plot below for a clearer understanding.</p>
</div>
<ol style="list-style-type: decimal">
<li>The <strong>true line</strong>, i.e., the regression relation:</li>
</ol>
<div style="padding-left:60px;color:darkgray;">
<div style="color:steelblue;">
<p><span class="math inline">\(\underbrace{E\{Y\}}_{\substack{\text{true
mean} \\ \text{y-value}}} =
\underbrace{\overbrace{\beta_0}^\text{y-intercept} +
\overbrace{\beta_1}^\text{slope} X}_\text{equation of a
line}\)</span></p>
</div>
<p><a href="javascript:showhide('readmoretrueline')" style="font-size:.8em;color:skyblue;">(Read
more…)</a></p>
<div id="readmoretrueline" style="display:none;">
<p>The true line is shown by the dotted line in the graph pictured
below. This is typically unobservable. Think of it as “natural law” or
“God’s law”. It is some true line that is unknown to us.</p>
<p>The regression relation <span class="math inline">\(E\{Y\} = \beta_0
+ \beta_1 X\)</span> creates the line of regression where <span
class="math inline">\(\beta_0\)</span> is the <span
class="math inline">\(y\)</span>-intercept of the line and <span
class="math inline">\(\beta_1\)</span> is the slope of the line. The
regression relationship provides the average <span
class="math inline">\(Y\)</span>-value, denoted <span
class="math inline">\(E\{Y_i\}\)</span>, for a given <span
class="math inline">\(X\)</span>-value, denoted by <span
class="math inline">\(X_i\)</span>.</p>
<p>Note: <span class="math inline">\(E\{Y\}\)</span> is pronounced “the
expected value of y” because, well… the mean is the typical, average, or
“expected” value.</p>
</div>
</div>
<ol start="2" style="list-style-type: decimal">
<li>The <strong>dots</strong>, i.e., the regression relation plus an
error term:</li>
</ol>
<div style="padding-left:60px;color:darkgray;">
<div style="color:steelblue;">
<p><span class="math inline">\(Y_i = \underbrace{\beta_0 + \beta_1
X_i}_{E\{Y_i\}} + \underbrace{\epsilon_i}_\text{error term} \quad
\text{where} \ \epsilon_i\sim N(0,\sigma^2)\)</span></p>
</div>
<p><a href="javascript:showhide('readmoredots')" style="font-size:.8em;color:skyblue;">(Read
more…)</a></p>
<div id="readmoredots" style="display:none;">
<p>This is shown by the dots in the graph below. This is the data. In
regression, the assumption is that the y-value for individual <span
class="math inline">\(i\)</span>, denoted by <span
class="math inline">\(Y_i\)</span>, was “created” by adding an error
term <span class="math inline">\(\epsilon_i\)</span> to each
individual’s “expected” value <span class="math inline">\(\beta_0 +
\beta_1 X_i\)</span>. Note the “order of creation” would require first
knowing an individual’s x-value, <span class="math inline">\(X_i\)</span>,
then their expected value from the regression relation <span
class="math inline">\(E\{Y_i\} = \beta_0 + \beta_1 X_i\)</span> and then
adding their <span class="math inline">\(\epsilon_i\)</span> value to
the result. The <span class="math inline">\(\epsilon_i\)</span> allows
each individual to deviate from the line. Some individuals deviate
dramatically, some deviate only a little, but all dots vary some
distance <span class="math inline">\(\epsilon_i\)</span> from the
line.</p>
<p>Note: <span class="math inline">\(Y_i\)</span> is pronounced
“why-eye” because it is the y-value for individual <span
class="math inline">\(i\)</span>. Sometimes also called “why-sub-eye”
because <span class="math inline">\(i\)</span> is in the subscript of
<span class="math inline">\(Y\)</span>.</p>
</div>
</div>
<ol start="3" style="list-style-type: decimal">
<li>The <strong>estimated line</strong>, i.e., the line we get from a
sample of data.</li>
</ol>
<div style="padding-left:60px;color:darkgray;">
<div style="color:steelblue;">
<p><span
class="math inline">\(\underbrace{\hat{Y}_i}_{\substack{\text{estimated
mean} \\ \text{y-value}}} = \underbrace{b_0 + b_1 X_i}_\text{estimated
regression equation}\)</span></p>
</div>
<p><a href="javascript:showhide('readmoreestimatedline')" style="font-size:.8em;color:skyblue;">(Read
more…)</a></p>
<div id="readmoreestimatedline" style="display:none;">
<p>The estimated line is shown by the solid line in the graph below.
<span class="math inline">\(\hat{Y}\)</span> is the estimated regression
equation obtained from the sample of data. It is the estimator of the
true regression equation <span class="math inline">\(E\{Y\}\)</span>. So
<span class="math inline">\(\hat{Y}\)</span> is interpreted as the
estimated average (or mean) <span class="math inline">\(Y\)</span>-value
for any given <span class="math inline">\(X\)</span>-value. Thus, <span
class="math inline">\(b_0\)</span> is the estimated y-intercept and
<span class="math inline">\(b_1\)</span> is the estimated slope. The b’s
are sample statistics, like <span class="math inline">\(\bar{x}\)</span>,
and the <span class="math inline">\(\beta\)</span>’s are population
parameters like <span class="math inline">\(\mu\)</span>. The <span
class="math inline">\(b\)</span>’s estimate the <span
class="math inline">\(\beta\)</span>’s.</p>
<p>Note: <span class="math inline">\(\hat{Y}_i\)</span> is pronounced
“why-hat-eye” and is known as the “estimated y-value” or “fitted
y-value” because it is the y-value you get from <span
class="math inline">\(b_0 + b_1 X_i\)</span>. It is almost always different
from <span class="math inline">\(Y_i\)</span> because dots are rarely if
ever exactly on the estimated regression line.</p>
</div>
</div>
<p>This graphic depicts the true, but typically unknown, regression
relation (dotted line). It also shows how a sample of data from the true
regression relation (the dots) can be used to obtain an estimated
regression equation (solid line) that is fairly close to the truth
(dotted line).</p>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-12-1.png" width="672" /></p>
<p>Something to ponder: The true line, when coupled with the error
terms, “creates” the data. The estimated (or fitted) line uses the
sampled data to try to “re-create” the true line.</p>
<p>We could loosely call this the “order of creation” as shown by the
following diagram.</p>
<div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mfrow=</span><span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">3</span>), <span class="at">mai=</span><span class="fu">c</span>(.<span class="dv">2</span>,.<span class="dv">2</span>,.<span class="dv">4</span>,.<span class="dv">1</span>))</span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(y <span class="sc">~</span> x, <span class="at">col=</span><span class="st">&quot;white&quot;</span>,  <span class="at">main=</span><span class="st">&quot;A Law is Given&quot;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>)</span>
<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(beta0 <span class="sc">+</span> beta1<span class="sc">*</span>x, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">lty=</span><span class="dv">2</span>)</span>
<span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(y <span class="sc">~</span> x, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">main=</span><span class="st">&quot;Data is Created&quot;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>)</span>
<span id="cb13-5"><a href="#cb13-5" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(beta0 <span class="sc">+</span> beta1<span class="sc">*</span>x, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">lty=</span><span class="dv">2</span>)</span>
<span id="cb13-6"><a href="#cb13-6" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(y <span class="sc">~</span> x, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">main=</span><span class="st">&quot;The Law is Estimated&quot;</span>)</span>
<span id="cb13-7"><a href="#cb13-7" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(xylm<span class="sc">$</span>coef[<span class="dv">1</span>] <span class="sc">+</span> xylm<span class="sc">$</span>coef[<span class="dv">2</span>]<span class="sc">*</span>x, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>)</span>
<span id="cb13-8"><a href="#cb13-8" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(beta0 <span class="sc">+</span> beta1<span class="sc">*</span>x, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">lty=</span><span class="dv">2</span>)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-13-1.png" width="672" /></p>
<table>
<colgroup>
<col width="29%" />
<col width="31%" />
<col width="38%" />
</colgroup>
<thead>
<tr class="header">
<th>A Law is Given</th>
<th>Data is Created</th>
<th>The Law is Estimated</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><span class="math inline">\(E\{Y_i\} = \beta_0 + \beta_1
X_i\)</span></td>
<td><span class="math inline">\(Y_i = E\{Y_i\} +
\epsilon_i\)</span></td>
<td><span class="math inline">\(\hat{Y}_i = b_0 + b_1 X_i\)</span></td>
</tr>
<tr class="even">
<td>The true line is the “law”.</td>
<td>The <span class="math inline">\(Y_i\)</span> are created by adding
<span class="math inline">\(\epsilon_i\)</span> to <span
class="math inline">\(E\{Y_i\}\)</span> where <span
class="math inline">\(E\{Y_i\} = \beta_0 + \beta_1 X_i\)</span>.</td>
<td>The law is estimated with <span
class="math inline">\(\hat{Y}_i\)</span> which is given with
<code>lm(...)</code>.</td>
</tr>
</tbody>
</table>
<p>Click open the “Code” button below to the right to find code that
runs a simulation demonstrating this “order of creation”.</p>
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="do">## Simulating Data from a Regression Model</span></span>
<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a><span class="do">## This R-chunk is meant to be played in your R Console.</span></span>
<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a><span class="do">## It allows you to explore how the various elements</span></span>
<span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a><span class="do">## of the regression model combine together to &quot;create&quot;</span></span>
<span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a><span class="do">## data and then use the data to &quot;re-create&quot; the line.</span></span>
<span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">101</span>) <span class="co">#Allows us to always get the same &quot;random&quot; sample</span></span>
<span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a>              <span class="co">#Change to a new number to get a new sample</span></span>
<span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a>  n <span class="ot">&lt;-</span> <span class="dv">3</span> <span class="co">#set the sample size</span></span>
<span id="cb14-11"><a href="#cb14-11" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-12"><a href="#cb14-12" aria-hidden="true" tabindex="-1"></a>  X_i <span class="ot">&lt;-</span> <span class="fu">runif</span>(n, <span class="dv">15</span>, <span class="dv">45</span>) <span class="co">#Gives n random values from a uniform distribution between 15 and 45.</span></span>
<span id="cb14-13"><a href="#cb14-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-14"><a href="#cb14-14" aria-hidden="true" tabindex="-1"></a>  beta0 <span class="ot">&lt;-</span> <span class="dv">3</span> <span class="co">#Our choice for the y-intercept. </span></span>
<span id="cb14-15"><a href="#cb14-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-16"><a href="#cb14-16" aria-hidden="true" tabindex="-1"></a>  beta1 <span class="ot">&lt;-</span> .<span class="dv">1</span> <span class="co">#Our choice for the slope. </span></span>
<span id="cb14-17"><a href="#cb14-17" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-18"><a href="#cb14-18" aria-hidden="true" tabindex="-1"></a>  sigma <span class="ot">&lt;-</span> <span class="fl">12.5</span> <span class="co">#Our choice for the std. deviation of the error terms.</span></span>
<span id="cb14-19"><a href="#cb14-19" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-20"><a href="#cb14-20" aria-hidden="true" tabindex="-1"></a>  epsilon_i <span class="ot">&lt;-</span> <span class="fu">rnorm</span>(n, <span class="dv">0</span>, sigma) <span class="co">#Gives n random values from a normal distribution with mean = 0, st. dev. = sigma.</span></span>
<span id="cb14-21"><a href="#cb14-21" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-22"><a href="#cb14-22" aria-hidden="true" tabindex="-1"></a>  Y_i <span class="ot">&lt;-</span> beta0 <span class="sc">+</span> beta1<span class="sc">*</span>X_i <span class="sc">+</span> epsilon_i <span class="co">#Create Y using the normal error regression model</span></span>
<span id="cb14-23"><a href="#cb14-23" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-24"><a href="#cb14-24" aria-hidden="true" tabindex="-1"></a>  fabData <span class="ot">&lt;-</span> <span class="fu">data.frame</span>(<span class="at">y=</span>Y_i, <span class="at">x=</span>X_i) <span class="co">#Store the data as a data frame</span></span>
<span id="cb14-25"><a href="#cb14-25" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-26"><a href="#cb14-26" aria-hidden="true" tabindex="-1"></a>  <span class="fu">View</span>(fabData) </span>
<span id="cb14-27"><a href="#cb14-27" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb14-28"><a href="#cb14-28" aria-hidden="true" tabindex="-1"></a>  <span class="co">#In the real world, we begin with data (like fabData) and try to recover the model that (we assume) was used to create it.</span></span>
<span id="cb14-29"><a href="#cb14-29" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-30"><a href="#cb14-30" aria-hidden="true" tabindex="-1"></a>  fab.lm <span class="ot">&lt;-</span> <span class="fu">lm</span>(y <span class="sc">~</span> x, <span class="at">data=</span>fabData) <span class="co">#Fit an estimated regression model to the fabData.</span></span>
<span id="cb14-31"><a href="#cb14-31" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-32"><a href="#cb14-32" aria-hidden="true" tabindex="-1"></a>  <span class="fu">summary</span>(fab.lm) <span class="co">#Summarize your model. </span></span>
<span id="cb14-33"><a href="#cb14-33" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-34"><a href="#cb14-34" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(y <span class="sc">~</span> x, <span class="at">data=</span>fabData) <span class="co">#Plot the data.</span></span>
<span id="cb14-35"><a href="#cb14-35" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-36"><a href="#cb14-36" aria-hidden="true" tabindex="-1"></a>  <span class="fu">abline</span>(fab.lm) <span class="co">#Add the estimated regression line to your plot.</span></span>
<span id="cb14-37"><a href="#cb14-37" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-38"><a href="#cb14-38" aria-hidden="true" tabindex="-1"></a><span class="co"># Now for something you can&#39;t do in real life... but since we created the data...</span></span>
<span id="cb14-39"><a href="#cb14-39" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-40"><a href="#cb14-40" aria-hidden="true" tabindex="-1"></a>  <span class="fu">abline</span>(beta0, beta1, <span class="at">lty=</span><span class="dv">2</span>) <span class="co">#Add the true regression line to your plot using a dashed line (lty=2). </span></span>
<span id="cb14-41"><a href="#cb14-41" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-42"><a href="#cb14-42" aria-hidden="true" tabindex="-1"></a>  <span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">legend=</span><span class="fu">c</span>(<span class="st">&quot;True Line&quot;</span>, <span class="st">&quot;Estimated Line&quot;</span>), <span class="at">lty=</span><span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">1</span>), <span class="at">bty=</span><span class="st">&quot;n&quot;</span>) <span class="co">#Add a legend to your plot specifying which line is which.</span></span></code></pre></div>
</div>
<p><br /></p>
</div>
<div id="interpreting-the-model-parameters-expand"
class="section level4">
<h4>Interpreting the Model Parameters
<a href="javascript:showhide('interpretingparameters')" style="font-size:.6em;color:skyblue;">(Expand)</a></h4>
<p><span class="expand-caption"><span
class="math inline">\(\beta_0\)</span> (intercept) and <span
class="math inline">\(\beta_1\)</span> (slope), estimated by <span
class="math inline">\(b_0\)</span> and <span
class="math inline">\(b_1\)</span>, interpreted as…</span></p>
<div id="interpretingparameters" style="display:none;">
<p>The interpretation of <span class="math inline">\(\beta_0\)</span> is
only meaningful if <span class="math inline">\(X=0\)</span> is in the
scope of the model. If <span class="math inline">\(X=0\)</span> is in
the scope of the model, then the intercept is interpreted as the average
y-value, denoted <span class="math inline">\(E\{Y\}\)</span>, when <span
class="math inline">\(X=0\)</span>.</p>
<p>The interpretation of <span class="math inline">\(\beta_1\)</span> is
the amount of increase (or decrease) in the average y-value, denoted
<span class="math inline">\(E\{Y\}\)</span>, per unit change in <span
class="math inline">\(X\)</span>. It is often misunderstood to be the
“average change in y” or just “the change in y” but it is more correctly
referred to as the “change in the average y”.</p>
<p>To better see this, consider the three graphics shown below.</p>
<div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mfrow=</span><span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">3</span>))</span>
<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a><span class="fu">hist</span>(mtcars<span class="sc">$</span>mpg, <span class="at">main=</span><span class="st">&quot;Gas Mileage of mtcars Vehicles&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Number of Vehicles&quot;</span>, <span class="at">xlab=</span><span class="st">&quot;Gas Mileage (mpg)&quot;</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>)</span>
<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a><span class="fu">boxplot</span>(mpg <span class="sc">~</span> cyl, <span class="at">data=</span>mtcars, <span class="at">border=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">boxwex=</span><span class="fl">0.5</span>, <span class="at">main=</span><span class="st">&quot;Gas Mileage of mtcars Vehicles&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Gas Mileage (mpg)&quot;</span>, <span class="at">xlab=</span><span class="st">&quot;Number of Cylinders of Engine (cyl)&quot;</span>)</span>
<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(mpg <span class="sc">~</span> qsec, <span class="at">data=</span><span class="fu">subset</span>(mtcars, am<span class="sc">==</span><span class="dv">0</span>), <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">main=</span><span class="st">&quot;Gas Mileage of mtcars Vehicles&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Gas Mileage (mpg)&quot;</span>, <span class="at">xlab=</span><span class="st">&quot;Quarter Mile Time (qsec)&quot;</span>)</span>
<span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(<span class="fu">lm</span>(mpg <span class="sc">~</span> qsec, <span class="at">data=</span><span class="fu">subset</span>(mtcars, am<span class="sc">==</span><span class="dv">0</span>)), <span class="at">col=</span><span class="st">&quot;darkgray&quot;</span>)</span>
<span id="cb15-6"><a href="#cb15-6" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="at">side=</span><span class="dv">3</span>, <span class="at">text=</span><span class="st">&quot;Automatic Transmissions Only (am==0)&quot;</span>, <span class="at">cex=</span><span class="fl">0.5</span>)</span>
<span id="cb15-7"><a href="#cb15-7" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(<span class="at">v =</span> <span class="fu">seq</span>(<span class="dv">16</span>,<span class="dv">22</span>,<span class="dv">2</span>), <span class="at">h=</span><span class="fu">seq</span>(<span class="dv">10</span>,<span class="dv">30</span>,<span class="dv">5</span>), <span class="at">lty=</span><span class="dv">3</span>, <span class="at">col=</span><span class="st">&quot;gray&quot;</span>)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-14-1.png" width="672" /></p>
<table>
<colgroup>
<col width="31%" />
<col width="31%" />
<col width="36%" />
</colgroup>
<thead>
<tr class="header">
<th>The Histogram</th>
<th>The Boxplot</th>
<th>The Scatterplot</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>The <strong>histogram</strong> on the left shows gas mileages of
vehicles from the mtcars data set. The average gas mileage is
20.09.</td>
<td>The <strong>boxplot</strong> in the middle shows that if we look at
gas mileage for 4, 6, and 8 cylinder vehicles separately, we find the
means to be 26.66, 19.74, and 15.1, respectively. If we wanted to, we
could talk about the change in the means across cylinders, and would see
that the mean is decreasing, first by <span class="math inline">\(26.66
- 19.74 = 6.92\)</span> mpg, then by <span class="math inline">\(19.74 -
15.1 = 4.64\)</span> mpg.</td>
<td>The <strong>scatterplot</strong> on the right shows that the average
gas mileage (for just automatic transmission vehicles) increases by
1.44 mpg (the slope) for each 1 second increase in quarter mile time. In other
words, the line gives the average y-value for any x-value. Thus, the
slope of the line is the change in the average y-value.</td>
</tr>
</tbody>
</table>
</div>
<p><br /></p>
</div>
<div id="residuals-and-errors-expand" class="section level4">
<h4>Residuals and Errors
<a href="javascript:showhide('residualsanderrors')" style="font-size:.6em;color:skyblue;">(Expand)</a></h4>
<p><span class="expand-caption"><span
class="math inline">\(r_i\)</span>, the residual, estimates <span
class="math inline">\(\epsilon_i\)</span>, the true error…</span></p>
<div id="residualsanderrors" style="display:none;">
<p>Residuals are the difference between the observed value of <span
class="math inline">\(Y_i\)</span> (the point) and the predicted, or
estimated, value for that point, called <span
class="math inline">\(\hat{Y}_i\)</span>. The errors are the true
distances between the observed <span class="math inline">\(Y_i\)</span>
and the actual regression relation for that point, <span
class="math inline">\(E\{Y_i\}\)</span>.</p>
<p>We will denote a <strong>residual</strong> for individual <span
class="math inline">\(i\)</span> by <span
class="math inline">\(r_i\)</span>, <span class="math display">\[
  r_i = \underbrace{Y_i}_{\substack{\text{Observed} \\ \text{Y-value}}}
- \underbrace{\hat{Y}_i}_{\substack{\text{Predicted} \\ \text{Y-value}}}
\quad \text{(residual)}
\]</span> The residual <span class="math inline">\(r_i\)</span>
estimates the true <strong>error</strong> for individual <span
class="math inline">\(i\)</span>, <span
class="math inline">\(\epsilon_i\)</span>, <span class="math display">\[
  \epsilon_i = \underbrace{Y_i}_{\substack{\text{Observed} \\
\text{Y-value}}} - \underbrace{E\{Y_i\}}_{\substack{\text{True Mean} \\
\text{Y-value}}} \quad \text{(error)}
\]</span></p>
<p>In summary…</p>
<div style="padding-left:30px;">
<table>
<colgroup>
<col width="44%" />
<col width="55%" />
</colgroup>
<thead>
<tr class="header">
<th>Residual <span class="math inline">\(r_i\)</span></th>
<th>Error <span class="math inline">\(\epsilon_i\)</span></th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Distance between the dot <span class="math inline">\(Y_i\)</span>
and the estimated line <span
class="math inline">\(\hat{Y}_i\)</span></td>
<td>Distance between the dot <span class="math inline">\(Y_i\)</span>
and the true line <span class="math inline">\(E\{Y_i\}\)</span>.</td>
</tr>
<tr class="even">
<td><span class="math inline">\(r_i = Y_i - \hat{Y}_i\)</span></td>
<td><span class="math inline">\(\epsilon_i = Y_i -
E\{Y_i\}\)</span></td>
</tr>
<tr class="odd">
<td>Known</td>
<td>Typically Unknown</td>
</tr>
</tbody>
</table>
</div>
<p>As shown in the graph below, the residuals are known values and they
estimate the unknown (but true) error terms.</p>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-15-1.png" width="672" /></p>
<p>Keep in mind the idea that the errors <span
class="math inline">\(\epsilon_i\)</span> “created” the data and that
the residuals <span class="math inline">\(r_i\)</span> are computed
after using the data to “re-create” the line.</p>
<p>Residuals have many uses in regression analysis. They allow us to</p>
<ol style="list-style-type: decimal">
<li>diagnose the regression assumptions,</li>
</ol>
<div style="padding-left:60px;color:darkgray;font-size:.8em;">
<p>See the “Assumptions” section below for more details.</p>
</div>
<ol start="2" style="list-style-type: decimal">
<li>estimate the regression relation,</li>
</ol>
<div style="padding-left:60px;color:darkgray;font-size:.8em;">
<p>See the “Estimating the Model Parameters” section below for more
details.</p>
</div>
<ol start="3" style="list-style-type: decimal">
<li>estimate the variance of the error terms,</li>
</ol>
<div style="padding-left:60px;color:darkgray;font-size:.8em;">
<p>See the “Estimating the Model Variance” section below for more
details.</p>
</div>
<ol start="4" style="list-style-type: decimal">
<li>and assess the fit of the regression relation.</li>
</ol>
<div style="padding-left:60px;color:darkgray;font-size:.8em;">
<p>See the “Assessing the Fit of a Regression” section below for more
details.</p>
</div>
</div>
<p><br /></p>
</div>
<div id="assessing-the-fit-of-a-regression-expand"
class="section level4">
<h4>Assessing the Fit of a Regression
<a href="javascript:showhide('assessingthefit')" style="font-size:.6em;color:skyblue;">(Expand)</a></h4>
<p><span class="expand-caption"><span
class="math inline">\(R^2\)</span>, SSTO, SSR, and SSE…</span></p>
<div id="assessingthefit" style="display:none;">
<p>Not all regressions are created equal, as the three plots below
show. Sometimes the dots are clustered very tightly around the line. At
other times, the dots spread out fairly dramatically from the line.</p>
<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mfrow=</span><span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">3</span>), <span class="at">mai=</span><span class="fu">c</span>(.<span class="dv">1</span>,.<span class="dv">1</span>,.<span class="dv">5</span>,.<span class="dv">1</span>))</span>
<span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">2</span>)</span>
<span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a>x <span class="ot">&lt;-</span> <span class="fu">runif</span>(<span class="dv">30</span>,<span class="dv">0</span>,<span class="dv">20</span>)</span>
<span id="cb16-4"><a href="#cb16-4" aria-hidden="true" tabindex="-1"></a>y1 <span class="ot">&lt;-</span> <span class="dv">2</span> <span class="sc">+</span> <span class="fl">3.5</span><span class="sc">*</span>x <span class="sc">+</span> <span class="fu">rnorm</span>(<span class="dv">30</span>,<span class="dv">0</span>,<span class="dv">2</span>)</span>
<span id="cb16-5"><a href="#cb16-5" aria-hidden="true" tabindex="-1"></a>y2 <span class="ot">&lt;-</span> <span class="dv">2</span> <span class="sc">+</span> <span class="fl">3.5</span><span class="sc">*</span>x <span class="sc">+</span> <span class="fu">rnorm</span>(<span class="dv">30</span>,<span class="dv">0</span>,<span class="dv">8</span>)</span>
<span id="cb16-6"><a href="#cb16-6" aria-hidden="true" tabindex="-1"></a>y3 <span class="ot">&lt;-</span> <span class="dv">2</span> <span class="sc">+</span> <span class="fl">3.5</span><span class="sc">*</span>x <span class="sc">+</span> <span class="fu">rnorm</span>(<span class="dv">30</span>,<span class="dv">0</span>,<span class="dv">27</span>)</span>
<span id="cb16-7"><a href="#cb16-7" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(y1 <span class="sc">~</span> x, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;darkgray&quot;</span>, <span class="at">xlim=</span><span class="fu">c</span>(<span class="sc">-</span><span class="dv">1</span>,<span class="dv">21</span>), <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">ylim=</span><span class="fu">c</span>(<span class="sc">-</span><span class="dv">10</span>,<span class="dv">100</span>), <span class="at">main=</span><span class="st">&quot;Excellent Fit&quot;</span>)</span>
<span id="cb16-8"><a href="#cb16-8" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(<span class="fu">lm</span>(y1 <span class="sc">~</span> x), <span class="at">col=</span><span class="st">&quot;gray&quot;</span>)</span>
<span id="cb16-9"><a href="#cb16-9" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(y2 <span class="sc">~</span> x, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;darkgray&quot;</span>, <span class="at">xlim=</span><span class="fu">c</span>(<span class="sc">-</span><span class="dv">1</span>,<span class="dv">21</span>), <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">ylim=</span><span class="fu">c</span>(<span class="sc">-</span><span class="dv">10</span>,<span class="dv">100</span>), <span class="at">main=</span><span class="st">&quot;Good Fit&quot;</span>)</span>
<span id="cb16-10"><a href="#cb16-10" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(<span class="fu">lm</span>(y2 <span class="sc">~</span> x), <span class="at">col=</span><span class="st">&quot;gray&quot;</span>)</span>
<span id="cb16-11"><a href="#cb16-11" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(y3 <span class="sc">~</span> x, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;darkgray&quot;</span>, <span class="at">xlim=</span><span class="fu">c</span>(<span class="sc">-</span><span class="dv">1</span>,<span class="dv">21</span>), <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">ylim=</span><span class="fu">c</span>(<span class="sc">-</span><span class="dv">10</span>,<span class="dv">100</span>), <span class="at">main=</span><span class="st">&quot;Poor Fit&quot;</span>)</span>
<span id="cb16-12"><a href="#cb16-12" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(<span class="fu">lm</span>(y3 <span class="sc">~</span> x), <span class="at">col=</span><span class="st">&quot;gray&quot;</span>)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-16-1.png" width="672" /></p>
<p>A common way to measure the fit of a regression is with <a
href="NumericalSummaries.html#correlation">correlation</a>. While this
can be a useful measurement, there is greater insight in using the
square of the correlation, called <span
class="math inline">\(R^2\)</span>. (If you are a Math 325 student, just
stick with correlation for now and skip on to the next section of this
Explanation tab. If you are a Math 425 student, it is critical that you
come to understand <span class="math inline">\(R^2\)</span> deeply, so
read on.)</p>
<p>Before you can understand <span class="math inline">\(R^2\)</span>,
you must understand three important “sums of squares”.</p>
<div style="padding-left:30px;">
<p><a href="javascript:showhide('sumsofsquaresread')" style="font-size:.8em;color:skyblue;">(Read
more about sums…)</a></p>
<div id="sumsofsquaresread" style="display:none;">
<p>A sum is just a fancy word for adding things together. <span
class="math display">\[
  1 + 2 + 3 + 4 + 5 + 6 = 21
\]</span></p>
<p>Long sums get tedious to write out by hand. So we use the symbol
<span class="math inline">\(\Sigma\)</span> to denote the word “sum”.
Further, we use a subscript <span
class="math inline">\(\Sigma_{i=1}\)</span> to state what value the sum
is beginning with, and a superscript <span
class="math inline">\(\Sigma_{i=1}^6\)</span> to state the value we are
ending at. This gives <span class="math display">\[
  \sum_{i=1}^6 i = 1 + 2 + 3 + 4 + 5 + 6 = 21
\]</span></p>
<p>Test your knowledge: do you see why the sum below equals 6?
<span class="math display">\[
  \sum_{i=1}^3 i = 6
\]</span></p>
<p>Computing sums in R is fairly easy. Type the following codes in your
R Console.</p>
<p><code>sum(1:6) #gives the answer of 21</code></p>
<p><code>sum(1:3) #gives the answer of 6</code></p>
<p>However, sums really become useful when used with a data set.</p>
<p>Each row of a data set represents an “individual’s” data. We can
reference each individual with a row number. In the data below,
individual 3, denoted by <span class="math inline">\(i=3\)</span>, has a
<code>speed</code> of 7 and a <code>dist</code> of 4.</p>
<div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="fu">pander</span>(<span class="fu">head</span>(<span class="fu">cbind</span>(<span class="at">Individual =</span> <span class="dv">1</span><span class="sc">:</span><span class="dv">50</span>, cars), <span class="dv">6</span>), <span class="at">emphasize.strong.rows=</span><span class="dv">3</span>)</span></code></pre></div>
<table style="width:40%;">
<colgroup>
<col width="18%" />
<col width="11%" />
<col width="11%" />
</colgroup>
<thead>
<tr class="header">
<th align="center">Individual</th>
<th align="center">speed</th>
<th align="center">dist</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center">1</td>
<td align="center">4</td>
<td align="center">2</td>
</tr>
<tr class="even">
<td align="center">2</td>
<td align="center">4</td>
<td align="center">10</td>
</tr>
<tr class="odd">
<td align="center"><strong>3</strong></td>
<td align="center"><strong>7</strong></td>
<td align="center"><strong>4</strong></td>
</tr>
<tr class="even">
<td align="center">4</td>
<td align="center">7</td>
<td align="center">22</td>
</tr>
<tr class="odd">
<td align="center">5</td>
<td align="center">8</td>
<td align="center">16</td>
</tr>
<tr class="even">
<td align="center">6</td>
<td align="center">9</td>
<td align="center">10</td>
</tr>
</tbody>
</table>
<p>To compute the sum of the <strong>speed</strong> column, use
<code>sum(speed)</code>. If we divided this sum by 6 (the number of rows
shown above), we would get the
mean of speed <code>mean(speed)</code>. In fact, the two most used
statistics <code>mean(...)</code> and <code>sd(...)</code> both use
sums. Take a moment to review the formulas for <a
href="NumericalSummaries.html#mean">mean</a> and <a
href="NumericalSummaries.html#standard-deviation">standard
deviation</a>. It is strongly recommended that you study the Explanation
tab for both as well. We’ll wait. See you back here shortly.</p>
<p>…</p>
<p>Welcome back.</p>
<p>Suppose we let <code>X = speed</code> and <code>Y = dist</code>. Then
<span class="math inline">\(X_3 = 7\)</span> and <span
class="math inline">\(Y_3 = 4\)</span> because we are accessing row 3 of
both the <span class="math inline">\(X\)</span> (or speed) column and
<span class="math inline">\(Y\)</span> (or dist) column. (Remember from
the above discussion that for individual #3, the speed was 7 and the
dist was 4.) Further, <code>sum(speed)</code> would be written
mathematically as <span class="math inline">\(\sum_{i=1}^6 X_i\)</span>
and <code>sum(dist)</code> would be written as <span
class="math inline">\(\sum_{i=1}^6 Y_i\)</span>.</p>
</div>
</div>
<table>
<colgroup>
<col width="32%" />
<col width="36%" />
<col width="30%" />
</colgroup>
<thead>
<tr class="header">
<th><strong>Sum of Squared Errors</strong></th>
<th><strong>Sum of Squares Regression</strong></th>
<th><strong>Total Sum of Squares</strong></th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><span class="math inline">\(\text{SSE} = \sum_{i=1}^n \left(Y_i -
\hat{Y}_i\right)^2\)</span></td>
<td><span class="math inline">\(\text{SSR} = \sum_{i=1}^n
\left(\hat{Y}_i - \bar{Y}\right)^2\)</span></td>
<td><span class="math inline">\(\text{SSTO} = \sum_{i=1}^n \left(Y_i -
\bar{Y}\right)^2\)</span></td>
</tr>
<tr class="even">
<td>Measures how much the y-values deviate from the regression line.</td>
<td>Measures how much the regression line deviates from the average
y-value.</td>
<td>Measures how much the y-values deviate from the average
y-value.</td>
</tr>
<tr class="odd">
<td>Equals SSTO - SSR</td>
<td>Equals SSTO - SSE</td>
<td>Equals SSE + SSR</td>
</tr>
<tr class="even">
<td><code>sum( (Y - mylm$fit)^2 )</code></td>
<td><code>sum( (mylm$fit - mean(Y))^2 )</code></td>
<td><code>sum( (Y - mean(Y))^2 )</code></td>
</tr>
<tr class="odd">
<td></td>
<td></td>
<td></td>
</tr>
</tbody>
</table>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-18-1.png" width="672" /></p>
<hr style="border-color:#d5d5d5; border-style:solid;"/>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-19-1.png" width="672" /></p>
<p>It is important to remember that SSE and SSR split up SSTO, so that
<span class="math display">\[
  \text{SSTO} = \text{SSE} + \text{SSR}
\]</span> This implies that if SSE is large (close to SSTO) then SSR is
small (close to zero) and vice versa. The following three graphics
demonstrate how this works.</p>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-20-1.png" width="672" /></p>
<p>The above graphs reveal that the idea of correlation is tightly
linked with sums of squares. In fact, the correlation squared is equal
to SSR/SSTO. And this fraction, SSR/SSTO is called <span
class="math inline">\(R^2\)</span> (“r-squared”).</p>
<p><strong>R-Squared (<span class="math inline">\(R^2\)</span>)</strong>
<span class="math display">\[
  \underbrace{R^2 = \frac{SSR}{SSTO} = 1 -
\frac{SSE}{SSTO}}_\text{Interpretation: Proportion of variation in Y
explained by the regression.}
\]</span></p>
<p>The smallest <span class="math inline">\(R^2\)</span> can be is zero,
and the largest it can be is 1. This is because <span
class="math inline">\(SSR\)</span> must be between 0 and SSTO,
inclusive.</p>
</div>
<p><br /></p>
</div>
<div id="residual-plots-regression-assumptions-expand"
class="section level4">
<h4>Residual Plots &amp; Regression Assumptions
<a href="javascript:showhide('assumptions1')" style="font-size:.6em;color:skyblue;">(Expand)</a></h4>
<p><span class="expand-caption">Residuals vs. fitted-values, Q-Q Plot of
the residuals, and residuals vs. order plots…</span></p>
<div id="assumptions1" style="display:none;">
<p>There are five assumptions that should be met for the mathematical
model of simple linear regression to be appropriate.</p>
<div style="padding-left:60px;color:darkgray;font-size:.8em;">
<p>Each assumption is labeled in the regression equation below.</p>
</div>
<ol style="list-style-type: decimal">
<li>The regression relation between <span
class="math inline">\(Y\)</span> and <span
class="math inline">\(X\)</span> is linear.</li>
<li>The error terms are normally distributed with <span
class="math inline">\(E\{\epsilon_i\}=0\)</span>.</li>
<li>The variance of the error terms is constant over all <span
class="math inline">\(X\)</span> values.</li>
<li>The <span class="math inline">\(X\)</span> values can be considered
fixed and measured without error.</li>
<li>The error terms are independent.</li>
</ol>
<p><span style="color:darkgray;">Regression Equation</span> <span
class="math display">\[
  Y_i = \underbrace{\beta_0 + \beta_1
\overbrace{X_i}^\text{#4}}_{\text{#1}} + \epsilon_i \quad \text{where} \
\overbrace{\epsilon_i \sim}^\text{#5} \overbrace{N(0}^\text{#2},
\overbrace{\sigma^2}^\text{#3})
\]</span></p>
<p>Residuals are used to diagnose departures from the regression
assumptions.</p>
<p><a href="javascript:showhide('moreassumptionsdetail')" style="font-size:.8em;color:skyblue;">(Read
more…)</a></p>
<div id="moreassumptionsdetail" style="display:none;">
<p>As shown above, the regression equation makes several claims, or
assumptions, about the error terms <span
class="math inline">\(\epsilon_i\)</span>. Specifically, numbers 2, 3, and 5 of
the regression assumptions are hidden inside the statement <span
class="math inline">\(\epsilon_i \sim N(0, \sigma^2)\)</span> as shown
here <span class="math display">\[
  \epsilon_i \underbrace{\sim}_{\substack{\text{Independent} \\
\text{Errors}}} \overbrace{N}^{\substack{\text{Normally} \\
\text{distributed}}}(\underbrace{0}_{\substack{\text{mean of} \\
\text{zero}}}, \underbrace{\sigma^2}_{\substack{\text{Constant} \\
\text{Variance}}})
\]</span></p>
<p>While the actual error terms (<span
class="math inline">\(\epsilon_i\)</span>) are unknown in real life, the
residuals (<span class="math inline">\(r_i\)</span>) are known. Thus, we
can use the residuals to check if the assumptions of the regression
appear to be satisfied or not.</p>
</div>
<p><br /></p>
<div style="padding-left:15px;">
<div id="residuals-versus-fitted-values-plot-checks-assumptions-1-and-3"
class="section level5">
<h5>Residuals versus Fitted-values Plot: Checks Assumptions #1 and
#3</h5>
<table width="90%">
<tr>
<td width="15%">
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-21-1.png" alt="Example residuals versus fitted-values plot" width="144" /></p>
</td>
<td width="75%">
<p>The linear relationship and constant variance assumptions can be
diagnosed using a residuals versus fitted-values plot. The fitted values
are the <span class="math inline">\(\hat{Y}_i\)</span>. The residuals
are the <span class="math inline">\(r_i\)</span>. This plot compares each
residual to the magnitude of its fitted value. Ideally, the plot shows no
discernible pattern.</p>
<p>|
<a href="javascript:showhide('residualsvsfittedvalues')" style="font-size:.8em;color:steelblue;">Show
Examples</a> |</p>
</td>
</tr>
</table>
<div id="residualsvsfittedvalues" style="display:none;">
<p><a href="javascript:showhide('residualsvsfittedvaluesread')" style="font-size:.8em;color:skyblue;">(Read
more…)</a></p>
<div id="residualsvsfittedvaluesread" style="display:none;">
<p>The residuals versus fitted values plot checks for departures from
the linear relation assumption and the constant variance assumption.</p>
<ul>
<li><p>The linear relation assumption is considered satisfied if there are no
apparent trends in the plot.</p></li>
<li><p>The constant variance assumption is considered satisfied if
the vertical spread of the residuals remains roughly consistent across
all fitted values.</p></li>
</ul>
<p>The left column of plots below shows scenarios that would be
considered not linear. The right column of plots shows scenarios that
would be considered linear, but lacking constant variance. The middle
column of plots shows scenarios that would satisfy both assumptions,
linear and constant variance.</p>
</div>
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">2</span>)</span>
<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a>X <span class="ot">&lt;-</span> <span class="fu">rnorm</span>(<span class="dv">30</span>,<span class="dv">15</span>,<span class="dv">3</span>)</span>
<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a>notLin <span class="ot">&lt;-</span> <span class="fu">data.frame</span>(<span class="at">X =</span> X, <span class="at">Y =</span> <span class="dv">500</span><span class="sc">-</span>X<span class="sc">^</span><span class="dv">2</span><span class="sc">+</span><span class="fu">rnorm</span>(<span class="dv">30</span>,<span class="dv">1</span>,<span class="dv">8</span>))</span>
<span id="cb18-4"><a href="#cb18-4" aria-hidden="true" tabindex="-1"></a>notLin.lm <span class="ot">&lt;-</span> <span class="fu">lm</span>(Y<span class="sc">~</span>X, <span class="at">data=</span>notLin)</span>
<span id="cb18-5"><a href="#cb18-5" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">15</span>)</span>
<span id="cb18-6"><a href="#cb18-6" aria-hidden="true" tabindex="-1"></a>Lin <span class="ot">&lt;-</span> <span class="fu">data.frame</span>(<span class="at">X=</span>X, <span class="at">Y =</span> <span class="dv">5</span><span class="fl">+1.8</span><span class="sc">*</span>X<span class="sc">+</span><span class="fu">rnorm</span>(<span class="dv">30</span>,<span class="dv">2</span>,<span class="fl">1.3</span>))</span>
<span id="cb18-7"><a href="#cb18-7" aria-hidden="true" tabindex="-1"></a>Lin.lm <span class="ot">&lt;-</span> <span class="fu">lm</span>(Y<span class="sc">~</span>X, <span class="at">data=</span>Lin)</span>
<span id="cb18-8"><a href="#cb18-8" aria-hidden="true" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mfrow=</span><span class="fu">c</span>(<span class="dv">3</span>,<span class="dv">3</span>),  <span class="at">mai=</span><span class="fu">c</span>(.<span class="dv">25</span>,.<span class="dv">25</span>,.<span class="dv">25</span>,.<span class="dv">25</span>), <span class="at">mgp=</span><span class="fu">c</span>(<span class="dv">1</span>,.<span class="dv">75</span>,<span class="dv">0</span>))</span>
<span id="cb18-9"><a href="#cb18-9" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(notLin.lm<span class="sc">$</span>fitted.values,notLin.lm<span class="sc">$</span>residuals, <span class="at">pch=</span><span class="dv">20</span>,</span>
<span id="cb18-10"><a href="#cb18-10" aria-hidden="true" tabindex="-1"></a>       <span class="at">xlab=</span><span class="st">&quot;Fitted Values&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>, </span>
<span id="cb18-11"><a href="#cb18-11" aria-hidden="true" tabindex="-1"></a>       <span class="at">main=</span><span class="st">&quot;Not Linear&quot;</span>, <span class="at">cex.main=</span><span class="fl">0.95</span>,</span>
<span id="cb18-12"><a href="#cb18-12" aria-hidden="true" tabindex="-1"></a>       <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span>
<span id="cb18-13"><a href="#cb18-13" aria-hidden="true" tabindex="-1"></a>  mycurve <span class="ot">&lt;-</span> <span class="fu">lowess</span>(notLin.lm<span class="sc">$</span>fitted.values,notLin.lm<span class="sc">$</span>residuals)</span>
<span id="cb18-14"><a href="#cb18-14" aria-hidden="true" tabindex="-1"></a>  mycurveOrder <span class="ot">&lt;-</span> <span class="fu">order</span>(mycurve<span class="sc">$</span>x)</span>
<span id="cb18-15"><a href="#cb18-15" aria-hidden="true" tabindex="-1"></a>  mycurve<span class="sc">$</span>x <span class="ot">&lt;-</span> mycurve<span class="sc">$</span>x[mycurveOrder]</span>
<span id="cb18-16"><a href="#cb18-16" aria-hidden="true" tabindex="-1"></a>  mycurve<span class="sc">$</span>y <span class="ot">&lt;-</span> mycurve<span class="sc">$</span>y[mycurveOrder]</span>
<span id="cb18-17"><a href="#cb18-17" aria-hidden="true" tabindex="-1"></a>  <span class="fu">polygon</span>(<span class="fu">c</span>(mycurve<span class="sc">$</span>x,<span class="fu">rev</span>(mycurve<span class="sc">$</span>x)), <span class="fu">c</span>(mycurve<span class="sc">$</span>y<span class="sc">+</span><span class="dv">10</span>, <span class="fu">rev</span>(mycurve<span class="sc">$</span>y<span class="dv">-10</span>)), <span class="at">col=</span><span class="fu">rgb</span>(.<span class="dv">7</span>,.<span class="dv">7</span>,.<span class="dv">7</span>,.<span class="dv">2</span>), <span class="at">border=</span><span class="cn">NA</span>) </span>
<span id="cb18-18"><a href="#cb18-18" aria-hidden="true" tabindex="-1"></a>  <span class="fu">abline</span>(<span class="at">h=</span><span class="dv">0</span>)</span>
<span id="cb18-19"><a href="#cb18-19" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(Lin.lm<span class="sc">$</span>fitted.values,Lin.lm<span class="sc">$</span>residuals, <span class="at">pch=</span><span class="dv">20</span>, </span>
<span id="cb18-20"><a href="#cb18-20" aria-hidden="true" tabindex="-1"></a>       <span class="at">xlab=</span><span class="st">&quot;Fitted Values&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>, </span>
<span id="cb18-21"><a href="#cb18-21" aria-hidden="true" tabindex="-1"></a>       <span class="at">main=</span><span class="st">&quot;Good: Linear, Constant Variance&quot;</span>, </span>
<span id="cb18-22"><a href="#cb18-22" aria-hidden="true" tabindex="-1"></a>       <span class="at">cex.main=</span><span class="fl">0.95</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>)</span>
<span id="cb18-23"><a href="#cb18-23" aria-hidden="true" tabindex="-1"></a>  <span class="fu">abline</span>(<span class="at">h=</span><span class="dv">0</span>)</span>
<span id="cb18-24"><a href="#cb18-24" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-25"><a href="#cb18-25" aria-hidden="true" tabindex="-1"></a>  <span class="fu">set.seed</span>(<span class="dv">6</span>)</span>
<span id="cb18-26"><a href="#cb18-26" aria-hidden="true" tabindex="-1"></a>notCon <span class="ot">&lt;-</span> <span class="fu">data.frame</span>(<span class="at">X =</span> X, <span class="at">Y =</span> <span class="dv">5</span><span class="fl">+1.8</span><span class="sc">*</span>X <span class="sc">+</span> <span class="fu">rnorm</span>(<span class="dv">30</span>,<span class="dv">2</span>,X<span class="sc">^</span><span class="fl">1.5</span>))</span>
<span id="cb18-27"><a href="#cb18-27" aria-hidden="true" tabindex="-1"></a>notCon.lm <span class="ot">&lt;-</span> <span class="fu">lm</span>(Y<span class="sc">~</span>X, <span class="at">data=</span>notCon)</span>
<span id="cb18-28"><a href="#cb18-28" aria-hidden="true" tabindex="-1"></a>LinO <span class="ot">&lt;-</span> <span class="fu">data.frame</span>(<span class="at">X=</span>X, <span class="at">Y =</span> <span class="dv">5</span><span class="fl">+1.8</span><span class="sc">*</span>X<span class="sc">+</span><span class="fu">rnorm</span>(<span class="dv">30</span>,<span class="dv">2</span>,<span class="fl">1.3</span>))</span>
<span id="cb18-29"><a href="#cb18-29" aria-hidden="true" tabindex="-1"></a>LinO[<span class="dv">1</span>] <span class="ot">&lt;-</span> LinO[<span class="dv">1</span>]<span class="sc">^</span><span class="dv">2</span></span>
<span id="cb18-30"><a href="#cb18-30" aria-hidden="true" tabindex="-1"></a>LinO.lm <span class="ot">&lt;-</span> <span class="fu">lm</span>(Y<span class="sc">~</span>X, <span class="at">data=</span>LinO)</span>
<span id="cb18-31"><a href="#cb18-31" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(notCon.lm<span class="sc">$</span>fitted.values,notCon.lm<span class="sc">$</span>residuals, <span class="at">pch=</span><span class="dv">20</span>, <span class="at">xlab=</span><span class="st">&quot;Fitted Values&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>, <span class="at">main=</span><span class="st">&quot;Unconstant Variance&quot;</span>, <span class="at">cex.main=</span><span class="fl">0.95</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span>
<span id="cb18-32"><a href="#cb18-32" aria-hidden="true" tabindex="-1"></a>  <span class="fu">polygon</span>(<span class="fu">c</span>(<span class="fu">rep</span>(<span class="fu">min</span>(notCon.lm<span class="sc">$</span>fit),<span class="dv">2</span>), <span class="fu">rep</span>(<span class="fu">max</span>(notCon.lm<span class="sc">$</span>fit), <span class="dv">2</span>)), <span class="fu">c</span>(<span class="sc">-</span><span class="dv">30</span>,<span class="dv">30</span>,<span class="fl">1.2</span><span class="sc">*</span><span class="fu">max</span>(notCon.lm<span class="sc">$</span>res),<span class="fl">1.2</span><span class="sc">*</span><span class="fu">min</span>(notCon.lm<span class="sc">$</span>res)), <span class="at">col=</span><span class="fu">rgb</span>(.<span class="dv">7</span>,.<span class="dv">7</span>,.<span class="dv">7</span>,.<span class="dv">2</span>), <span class="at">border=</span><span class="cn">NA</span>) </span>
<span id="cb18-33"><a href="#cb18-33" aria-hidden="true" tabindex="-1"></a>  <span class="fu">abline</span>(<span class="at">h=</span><span class="dv">0</span>)</span>
<span id="cb18-34"><a href="#cb18-34" aria-hidden="true" tabindex="-1"></a><span class="co">#  plot(LinO.lm$fitted.values,LinO.lm$residuals, pch=20, xlab=&quot;Fitted Values&quot;, ylab=&quot;Residuals&quot;, main=&quot;Outliers&quot;, cex.main=0.95)</span></span>
<span id="cb18-35"><a href="#cb18-35" aria-hidden="true" tabindex="-1"></a><span class="co">#  abline(h=0)</span></span>
<span id="cb18-36"><a href="#cb18-36" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-37"><a href="#cb18-37" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb18-38"><a href="#cb18-38" aria-hidden="true" tabindex="-1"></a>  tmp <span class="ot">&lt;-</span> <span class="fu">lm</span>(height <span class="sc">~</span> age, <span class="at">data=</span>Loblolly)</span>
<span id="cb18-39"><a href="#cb18-39" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(tmp<span class="sc">$</span>residuals <span class="sc">~</span> tmp<span class="sc">$</span>fitted.values, <span class="at">pch=</span><span class="dv">20</span>,</span>
<span id="cb18-40"><a href="#cb18-40" aria-hidden="true" tabindex="-1"></a>       <span class="at">xlab=</span><span class="st">&quot;Fitted Values&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>, </span>
<span id="cb18-41"><a href="#cb18-41" aria-hidden="true" tabindex="-1"></a>       <span class="at">main=</span><span class="st">&quot;&quot;</span>, <span class="at">cex.main=</span><span class="fl">0.95</span>,</span>
<span id="cb18-42"><a href="#cb18-42" aria-hidden="true" tabindex="-1"></a>       <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span>
<span id="cb18-43"><a href="#cb18-43" aria-hidden="true" tabindex="-1"></a>  mycurve <span class="ot">&lt;-</span> <span class="fu">lowess</span>(tmp<span class="sc">$</span>fitted.values,tmp<span class="sc">$</span>residuals)</span>
<span id="cb18-44"><a href="#cb18-44" aria-hidden="true" tabindex="-1"></a>  mycurveOrder <span class="ot">&lt;-</span> <span class="fu">order</span>(mycurve<span class="sc">$</span>x)</span>
<span id="cb18-45"><a href="#cb18-45" aria-hidden="true" tabindex="-1"></a>  mycurve<span class="sc">$</span>x <span class="ot">&lt;-</span> mycurve<span class="sc">$</span>x[mycurveOrder]</span>
<span id="cb18-46"><a href="#cb18-46" aria-hidden="true" tabindex="-1"></a>  mycurve<span class="sc">$</span>y <span class="ot">&lt;-</span> mycurve<span class="sc">$</span>y[mycurveOrder]</span>
<span id="cb18-47"><a href="#cb18-47" aria-hidden="true" tabindex="-1"></a>  <span class="fu">polygon</span>(<span class="fu">c</span>(mycurve<span class="sc">$</span>x,<span class="fu">rev</span>(mycurve<span class="sc">$</span>x)), <span class="fu">c</span>(mycurve<span class="sc">$</span>y<span class="sc">+</span><span class="dv">3</span>, <span class="fu">rev</span>(mycurve<span class="sc">$</span>y<span class="dv">-1</span>)), <span class="at">col=</span><span class="fu">rgb</span>(.<span class="dv">7</span>,.<span class="dv">7</span>,.<span class="dv">7</span>,.<span class="dv">2</span>), <span class="at">border=</span><span class="cn">NA</span>) </span>
<span id="cb18-48"><a href="#cb18-48" aria-hidden="true" tabindex="-1"></a>  <span class="fu">abline</span>(<span class="at">h=</span><span class="dv">0</span>)</span>
<span id="cb18-49"><a href="#cb18-49" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb18-50"><a href="#cb18-50" aria-hidden="true" tabindex="-1"></a>  tmp <span class="ot">&lt;-</span> <span class="fu">lm</span>(Girth <span class="sc">~</span> Volume, <span class="at">data=</span>trees[<span class="sc">-</span><span class="dv">31</span>,])</span>
<span id="cb18-51"><a href="#cb18-51" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(tmp<span class="sc">$</span>residuals <span class="sc">~</span> tmp<span class="sc">$</span>fitted.values, <span class="at">pch=</span><span class="dv">20</span>,</span>
<span id="cb18-52"><a href="#cb18-52" aria-hidden="true" tabindex="-1"></a>       <span class="at">xlab=</span><span class="st">&quot;Fitted Values&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>, </span>
<span id="cb18-53"><a href="#cb18-53" aria-hidden="true" tabindex="-1"></a>       <span class="at">main=</span><span class="st">&quot;&quot;</span>, <span class="at">cex.main=</span><span class="fl">0.95</span>,</span>
<span id="cb18-54"><a href="#cb18-54" aria-hidden="true" tabindex="-1"></a>       <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>)</span>
<span id="cb18-55"><a href="#cb18-55" aria-hidden="true" tabindex="-1"></a>  <span class="fu">abline</span>(<span class="at">h=</span><span class="dv">0</span>)</span>
<span id="cb18-56"><a href="#cb18-56" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-57"><a href="#cb18-57" aria-hidden="true" tabindex="-1"></a>  tmp <span class="ot">&lt;-</span> <span class="fu">lm</span>(Height <span class="sc">~</span> Volume, <span class="at">data=</span>trees)</span>
<span id="cb18-58"><a href="#cb18-58" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(tmp<span class="sc">$</span>residuals <span class="sc">~</span> tmp<span class="sc">$</span>fitted.values, <span class="at">pch=</span><span class="dv">20</span>,</span>
<span id="cb18-59"><a href="#cb18-59" aria-hidden="true" tabindex="-1"></a>       <span class="at">xlab=</span><span class="st">&quot;Fitted Values&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>, </span>
<span id="cb18-60"><a href="#cb18-60" aria-hidden="true" tabindex="-1"></a>       <span class="at">main=</span><span class="st">&quot;&quot;</span>, <span class="at">cex.main=</span><span class="fl">0.95</span>,</span>
<span id="cb18-61"><a href="#cb18-61" aria-hidden="true" tabindex="-1"></a>       <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span>
<span id="cb18-62"><a href="#cb18-62" aria-hidden="true" tabindex="-1"></a>  <span class="fu">polygon</span>(<span class="fu">c</span>(<span class="fu">rep</span>(<span class="fu">min</span>(tmp<span class="sc">$</span>fit), <span class="dv">2</span>), <span class="fu">max</span>(tmp<span class="sc">$</span>fit)), <span class="fu">c</span>(<span class="fl">1.3</span><span class="sc">*</span><span class="fu">max</span>(tmp<span class="sc">$</span>res),<span class="fl">1.2</span><span class="sc">*</span><span class="fu">min</span>(tmp<span class="sc">$</span>res),<span class="dv">0</span>), <span class="at">col=</span><span class="fu">rgb</span>(.<span class="dv">8</span>,.<span class="dv">8</span>,.<span class="dv">8</span>,.<span class="dv">2</span>), <span class="at">border=</span><span class="cn">NA</span>) </span>
<span id="cb18-63"><a href="#cb18-63" aria-hidden="true" tabindex="-1"></a>  <span class="fu">abline</span>(<span class="at">h=</span><span class="dv">0</span>)</span>
<span id="cb18-64"><a href="#cb18-64" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb18-65"><a href="#cb18-65" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb18-66"><a href="#cb18-66" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb18-67"><a href="#cb18-67" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb18-68"><a href="#cb18-68" aria-hidden="true" tabindex="-1"></a>  tmp <span class="ot">&lt;-</span> <span class="fu">lm</span>(mpg <span class="sc">~</span> disp, <span class="at">data=</span>mtcars)</span>
<span id="cb18-69"><a href="#cb18-69" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(tmp<span class="sc">$</span>residuals <span class="sc">~</span> tmp<span class="sc">$</span>fitted.values, <span class="at">pch=</span><span class="dv">20</span>,</span>
<span id="cb18-70"><a href="#cb18-70" aria-hidden="true" tabindex="-1"></a>       <span class="at">xlab=</span><span class="st">&quot;Fitted Values&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>, </span>
<span id="cb18-71"><a href="#cb18-71" aria-hidden="true" tabindex="-1"></a>       <span class="at">main=</span><span class="st">&quot;&quot;</span>, <span class="at">cex.main=</span><span class="fl">0.95</span>,</span>
<span id="cb18-72"><a href="#cb18-72" aria-hidden="true" tabindex="-1"></a>       <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span>
<span id="cb18-73"><a href="#cb18-73" aria-hidden="true" tabindex="-1"></a>  mycurve <span class="ot">&lt;-</span> <span class="fu">lowess</span>(tmp<span class="sc">$</span>fitted.values,tmp<span class="sc">$</span>residuals, <span class="at">f=</span>.<span class="dv">4</span>)</span>
<span id="cb18-74"><a href="#cb18-74" aria-hidden="true" tabindex="-1"></a>  mycurveOrder <span class="ot">&lt;-</span> <span class="fu">order</span>(mycurve<span class="sc">$</span>x)</span>
<span id="cb18-75"><a href="#cb18-75" aria-hidden="true" tabindex="-1"></a>  mycurve<span class="sc">$</span>x <span class="ot">&lt;-</span> mycurve<span class="sc">$</span>x[mycurveOrder]</span>
<span id="cb18-76"><a href="#cb18-76" aria-hidden="true" tabindex="-1"></a>  mycurve<span class="sc">$</span>y <span class="ot">&lt;-</span> mycurve<span class="sc">$</span>y[mycurveOrder]</span>
<span id="cb18-77"><a href="#cb18-77" aria-hidden="true" tabindex="-1"></a>  <span class="fu">polygon</span>(<span class="fu">c</span>(mycurve<span class="sc">$</span>x,<span class="fu">rev</span>(mycurve<span class="sc">$</span>x)), <span class="fu">c</span>(mycurve<span class="sc">$</span>y<span class="fl">+3.5</span>, <span class="fu">rev</span>(mycurve<span class="sc">$</span>y<span class="dv">-2</span>)), <span class="at">col=</span><span class="fu">rgb</span>(.<span class="dv">7</span>,.<span class="dv">7</span>,.<span class="dv">7</span>,.<span class="dv">2</span>), <span class="at">border=</span><span class="cn">NA</span>) </span>
<span id="cb18-78"><a href="#cb18-78" aria-hidden="true" tabindex="-1"></a>  <span class="fu">abline</span>(<span class="at">h=</span><span class="dv">0</span>) </span>
<span id="cb18-79"><a href="#cb18-79" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb18-80"><a href="#cb18-80" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb18-81"><a href="#cb18-81" aria-hidden="true" tabindex="-1"></a>  tmp <span class="ot">&lt;-</span> <span class="fu">lm</span>(weight <span class="sc">~</span> repwt, <span class="at">data=</span>Davis[<span class="sc">-</span><span class="dv">12</span>,])</span>
<span id="cb18-82"><a href="#cb18-82" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(tmp<span class="sc">$</span>residuals <span class="sc">~</span> tmp<span class="sc">$</span>fitted.values, <span class="at">pch=</span><span class="dv">20</span>,</span>
<span id="cb18-83"><a href="#cb18-83" aria-hidden="true" tabindex="-1"></a>       <span class="at">xlab=</span><span class="st">&quot;Fitted Values&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>, </span>
<span id="cb18-84"><a href="#cb18-84" aria-hidden="true" tabindex="-1"></a>       <span class="at">main=</span><span class="st">&quot;&quot;</span>, <span class="at">cex.main=</span><span class="fl">0.95</span>,</span>
<span id="cb18-85"><a href="#cb18-85" aria-hidden="true" tabindex="-1"></a>       <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>)</span>
<span id="cb18-86"><a href="#cb18-86" aria-hidden="true" tabindex="-1"></a>  <span class="fu">abline</span>(<span class="at">h=</span><span class="dv">0</span>) </span>
<span id="cb18-87"><a href="#cb18-87" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-88"><a href="#cb18-88" aria-hidden="true" tabindex="-1"></a>  tmp <span class="ot">&lt;-</span> <span class="fu">lm</span>(weight <span class="sc">~</span> repht, <span class="at">data=</span>Davis[<span class="sc">-</span><span class="dv">12</span>,])</span>
<span id="cb18-89"><a href="#cb18-89" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(tmp<span class="sc">$</span>residuals <span class="sc">~</span> tmp<span class="sc">$</span>fitted.values, <span class="at">pch=</span><span class="dv">20</span>,</span>
<span id="cb18-90"><a href="#cb18-90" aria-hidden="true" tabindex="-1"></a>       <span class="at">xlab=</span><span class="st">&quot;Fitted Values&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>, </span>
<span id="cb18-91"><a href="#cb18-91" aria-hidden="true" tabindex="-1"></a>       <span class="at">main=</span><span class="st">&quot;&quot;</span>, <span class="at">cex.main=</span><span class="fl">0.95</span>,</span>
<span id="cb18-92"><a href="#cb18-92" aria-hidden="true" tabindex="-1"></a>       <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span>
<span id="cb18-93"><a href="#cb18-93" aria-hidden="true" tabindex="-1"></a>  <span class="fu">polygon</span>(<span class="fu">c</span>(<span class="fu">min</span>(tmp<span class="sc">$</span>fit),<span class="fu">rep</span>(<span class="fu">max</span>(tmp<span class="sc">$</span>fit), <span class="dv">2</span>)), <span class="fu">c</span>(<span class="dv">2</span>,<span class="fu">max</span>(tmp<span class="sc">$</span>res),<span class="fl">1.6</span><span class="sc">*</span><span class="fu">min</span>(tmp<span class="sc">$</span>res)), <span class="at">col=</span><span class="fu">rgb</span>(.<span class="dv">85</span>,.<span class="dv">85</span>,.<span class="dv">85</span>,.<span class="dv">2</span>), <span class="at">border=</span><span class="cn">NA</span>) </span>
<span id="cb18-94"><a href="#cb18-94" aria-hidden="true" tabindex="-1"></a>  <span class="fu">abline</span>(<span class="at">h=</span><span class="dv">0</span>) </span></code></pre></div>
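<p><img src="LinearRegression_files/figure-html/unnamed-chunk-22-1.png" alt="Three-by-three grid of residuals versus fitted-values plots: the left column shows nonlinear patterns, the middle column shows plots satisfying both assumptions, and the right column shows nonconstant variance" width="672" /></p>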
</div>
<p><br /></p>
</div>
<div id="q-q-plot-of-the-residuals-checks-assumption-2"
class="section level5">
<h5>Q-Q Plot of the Residuals: Checks Assumption #2</h5>
<table width="90%">
<tr>
<td width="15%">
<img src="LinearRegression_files/figure-html/unnamed-chunk-23-1.png" alt="Example Q-Q plot of the residuals" width="144" />
</td>
<td width="75%">
<p>The normality of the error terms can be assessed by considering a
normal probability plot (Q-Q Plot) of the residuals. If the residuals
appear to be normal, then the error terms are also considered to be
normal. If the residuals do not appear to be normal, then the error
terms are also assumed to violate the normality assumption.</p>
<p>|
<a href="javascript:showhide('qqplots')" style="font-size:.8em;color:steelblue;">Show
Examples</a> |</p>
</td>
</tr>
</table>
<div id="qqplots" style="display:none;">
<p><a href="javascript:showhide('qqplotsread')" style="font-size:.8em;color:skyblue;">(Read
more…)</a></p>
<div id="qqplotsread" style="display:none;">
<p>There are four main trends that occur in a normal probability plot.
Examples of each are plotted below with a histogram of the data next to
the normal probability plot.</p>
<p>Often the plot is called a Q-Q Plot, which stands for
quantile-quantile plot. The idea is to compare the observed distribution
of data to what the distribution should look like in theory if it were
normal. Q-Q Plots are more general than normal probability plots because
they can be used with any theoretical distribution, not just the normal
distribution.</p>
</div>
<div class="sourceCode" id="cb19"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mfrow=</span><span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">2</span>),  <span class="at">mai=</span><span class="fu">c</span>(.<span class="dv">5</span>,.<span class="dv">5</span>,.<span class="dv">25</span>,.<span class="dv">25</span>), <span class="at">mgp=</span><span class="fu">c</span>(<span class="dv">1</span>,.<span class="dv">75</span>,<span class="dv">0</span>))</span>
<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">123</span>)</span>
<span id="cb19-4"><a href="#cb19-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb19-5"><a href="#cb19-5" aria-hidden="true" tabindex="-1"></a>  tmp <span class="ot">&lt;-</span> <span class="fu">rnorm</span>(<span class="dv">100</span>)</span>
<span id="cb19-6"><a href="#cb19-6" aria-hidden="true" tabindex="-1"></a>  <span class="fu">qqnorm</span>(tmp, <span class="at">pch=</span><span class="dv">20</span>, <span class="at">ylab=</span><span class="st">&quot;Observed&quot;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>)</span>
<span id="cb19-7"><a href="#cb19-7" aria-hidden="true" tabindex="-1"></a>  <span class="fu">qqline</span>(tmp)</span>
<span id="cb19-8"><a href="#cb19-8" aria-hidden="true" tabindex="-1"></a>  <span class="fu">hist</span>(tmp, <span class="at">xlab=</span><span class="st">&quot;&quot;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">main=</span><span class="st">&quot;Normal&quot;</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>)</span>
<span id="cb19-9"><a href="#cb19-9" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb19-10"><a href="#cb19-10" aria-hidden="true" tabindex="-1"></a>  tmp <span class="ot">&lt;-</span> Davis<span class="sc">$</span>weight</span>
<span id="cb19-11"><a href="#cb19-11" aria-hidden="true" tabindex="-1"></a>  <span class="fu">qqnorm</span>(tmp, <span class="at">pch=</span><span class="dv">20</span>, <span class="at">ylab=</span><span class="st">&quot;Observed&quot;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span>
<span id="cb19-12"><a href="#cb19-12" aria-hidden="true" tabindex="-1"></a>  <span class="fu">qqline</span>(tmp)</span>
<span id="cb19-13"><a href="#cb19-13" aria-hidden="true" tabindex="-1"></a>  <span class="fu">hist</span>(tmp, <span class="at">xlab=</span><span class="st">&quot;&quot;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">main=</span><span class="st">&quot;Right-skewed&quot;</span>,</span>
<span id="cb19-14"><a href="#cb19-14" aria-hidden="true" tabindex="-1"></a>       <span class="at">breaks=</span><span class="dv">15</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span></code></pre></div>
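<p><img src="LinearRegression_files/figure-html/unnamed-chunk-24-1.png" alt="Q-Q plots with accompanying histograms for normally distributed data and for right-skewed data" width="672" /></p>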
<div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mfrow=</span><span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">2</span>),  <span class="at">mai=</span><span class="fu">c</span>(.<span class="dv">5</span>,.<span class="dv">5</span>,.<span class="dv">25</span>,.<span class="dv">25</span>), <span class="at">mgp=</span><span class="fu">c</span>(<span class="dv">1</span>,.<span class="dv">75</span>,<span class="dv">0</span>))</span>
<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a>  tmp <span class="ot">&lt;-</span> <span class="fu">rbeta</span>(<span class="dv">100</span>, <span class="dv">5</span>,<span class="dv">1</span>)</span>
<span id="cb20-4"><a href="#cb20-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">qqnorm</span>(tmp, <span class="at">pch=</span><span class="dv">20</span>, <span class="at">ylab=</span><span class="st">&quot;Observed&quot;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span>
<span id="cb20-5"><a href="#cb20-5" aria-hidden="true" tabindex="-1"></a>  <span class="fu">qqline</span>(tmp)</span>
<span id="cb20-6"><a href="#cb20-6" aria-hidden="true" tabindex="-1"></a>  <span class="fu">hist</span>(tmp, <span class="at">xlab=</span><span class="st">&quot;&quot;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">main=</span><span class="st">&quot;Left-skewed&quot;</span>,</span>
<span id="cb20-7"><a href="#cb20-7" aria-hidden="true" tabindex="-1"></a>       <span class="at">breaks=</span><span class="fu">seq</span>(<span class="fu">min</span>(tmp),<span class="fu">max</span>(tmp), <span class="at">length.out=</span><span class="dv">13</span>), <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span>
<span id="cb20-8"><a href="#cb20-8" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb20-9"><a href="#cb20-9" aria-hidden="true" tabindex="-1"></a>  tmp <span class="ot">&lt;-</span> <span class="fu">rbeta</span>(<span class="dv">100</span>,<span class="dv">2</span>,<span class="dv">2</span>)</span>
<span id="cb20-10"><a href="#cb20-10" aria-hidden="true" tabindex="-1"></a>  <span class="fu">qqnorm</span>(tmp, <span class="at">pch=</span><span class="dv">20</span>, <span class="at">ylab=</span><span class="st">&quot;Observed&quot;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span>
<span id="cb20-11"><a href="#cb20-11" aria-hidden="true" tabindex="-1"></a>  <span class="fu">qqline</span>(tmp)</span>
<span id="cb20-12"><a href="#cb20-12" aria-hidden="true" tabindex="-1"></a>  <span class="fu">hist</span>(tmp, <span class="at">xlab=</span><span class="st">&quot;&quot;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">main=</span><span class="st">&quot;Heavy-tailed&quot;</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span></code></pre></div>
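<p><img src="LinearRegression_files/figure-html/unnamed-chunk-24-2.png" alt="Normal Q-Q plots paired with histograms for a left-skewed sample and a sample titled Heavy-tailed" width="672" /></p>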
</div>
<p><br /></p>
</div>
<div id="residuals-versus-order-plot-checks-assumption-5"
class="section level5">
<h5>Residuals versus Order Plot: Checks Assumption #5</h5>
<table width="90%">
<tr>
<td width="15%">
<img src="LinearRegression_files/figure-html/unnamed-chunk-25-1.png" width="144" />
</td>
<td width="75%">
<p>When the data is collected in a specific order, or has some other
important ordering to it, then the independence of the error terms can
be assessed. This is typically done by plotting the residuals against
their order of occurrence. If any dramatic trends are visible in the
plot, then the independence assumption is violated.</p>
<p>|
<a href="javascript:showhide('resorderplots')" style="font-size:.8em;color:#5CACEE;">Show
Examples</a> |</p>
</td>
</tr>
</table>
<div id="resorderplots" style="display:none;">
<p><a href="javascript:showhide('resorderplotsread')" style="font-size:.8em;color:skyblue;">(Read
more…)</a></p>
<div id="resorderplotsread" style="display:none;">
<p>Plotting the residuals against the order in which the data was
collected provides insight as to whether or not the observations can be
considered independent. If the plot shows no trend, then the error terms
are considered independent and the regression assumption satisfied. If
there is a visible trend in the plot, then the regression assumption is
likely violated.</p>
</div>
<div class="sourceCode" id="cb21"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mfrow=</span><span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">2</span>),  <span class="at">mai=</span><span class="fu">c</span>(.<span class="dv">5</span>,.<span class="dv">5</span>,.<span class="dv">25</span>,.<span class="dv">25</span>), <span class="at">mgp=</span><span class="fu">c</span>(<span class="dv">1</span>,.<span class="dv">75</span>,<span class="dv">0</span>))</span>
<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a>  tmp <span class="ot">&lt;-</span> <span class="fu">lm</span>(mpg <span class="sc">~</span> disp, <span class="at">data=</span>mtcars)</span>
<span id="cb21-4"><a href="#cb21-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(tmp<span class="sc">$</span>residuals, <span class="at">pch=</span><span class="dv">20</span>,</span>
<span id="cb21-5"><a href="#cb21-5" aria-hidden="true" tabindex="-1"></a>       <span class="at">xlab=</span><span class="st">&quot;Order&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>, </span>
<span id="cb21-6"><a href="#cb21-6" aria-hidden="true" tabindex="-1"></a>       <span class="at">main=</span><span class="st">&quot;Good: No Trend&quot;</span>, <span class="at">cex.main=</span><span class="fl">0.95</span>,</span>
<span id="cb21-7"><a href="#cb21-7" aria-hidden="true" tabindex="-1"></a>       <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>)</span>
<span id="cb21-8"><a href="#cb21-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb21-9"><a href="#cb21-9" aria-hidden="true" tabindex="-1"></a>  tmp <span class="ot">&lt;-</span> <span class="fu">lm</span>(height <span class="sc">~</span> age, <span class="at">data=</span>Loblolly)</span>
<span id="cb21-10"><a href="#cb21-10" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(tmp<span class="sc">$</span>residuals, <span class="at">pch=</span><span class="dv">20</span>,</span>
<span id="cb21-11"><a href="#cb21-11" aria-hidden="true" tabindex="-1"></a>       <span class="at">xlab=</span><span class="st">&quot;Order&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>, </span>
<span id="cb21-12"><a href="#cb21-12" aria-hidden="true" tabindex="-1"></a>       <span class="at">main=</span><span class="st">&quot;Questionable: General Trend&quot;</span>, <span class="at">cex.main=</span><span class="fl">0.95</span>,</span>
<span id="cb21-13"><a href="#cb21-13" aria-hidden="true" tabindex="-1"></a>       <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;orangered&quot;</span>)</span>
<span id="cb21-14"><a href="#cb21-14" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb21-15"><a href="#cb21-15" aria-hidden="true" tabindex="-1"></a>  tmp <span class="ot">&lt;-</span> <span class="fu">lm</span>(hp <span class="sc">~</span> qsec, <span class="at">data=</span>mtcars)</span>
<span id="cb21-16"><a href="#cb21-16" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(tmp<span class="sc">$</span>residuals, <span class="at">pch=</span><span class="dv">20</span>,</span>
<span id="cb21-17"><a href="#cb21-17" aria-hidden="true" tabindex="-1"></a>       <span class="at">xlab=</span><span class="st">&quot;Order&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>, </span>
<span id="cb21-18"><a href="#cb21-18" aria-hidden="true" tabindex="-1"></a>       <span class="at">main=</span><span class="st">&quot;Questionable: Interesting Patterns&quot;</span>, <span class="at">cex.main=</span><span class="fl">0.95</span>,</span>
<span id="cb21-19"><a href="#cb21-19" aria-hidden="true" tabindex="-1"></a>       <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;orangered&quot;</span>)</span>
<span id="cb21-20"><a href="#cb21-20" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb21-21"><a href="#cb21-21" aria-hidden="true" tabindex="-1"></a>  tmp <span class="ot">&lt;-</span> <span class="fu">lm</span>(hp <span class="sc">~</span> drat, <span class="at">data=</span>mtcars[<span class="fu">order</span>(mtcars<span class="sc">$</span>cyl),])</span>
<span id="cb21-22"><a href="#cb21-22" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(tmp<span class="sc">$</span>residuals, <span class="at">pch=</span><span class="dv">20</span>,</span>
<span id="cb21-23"><a href="#cb21-23" aria-hidden="true" tabindex="-1"></a>       <span class="at">xlab=</span><span class="st">&quot;Order&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>, </span>
<span id="cb21-24"><a href="#cb21-24" aria-hidden="true" tabindex="-1"></a>       <span class="at">main=</span><span class="st">&quot;Bad: Obvious Trend&quot;</span>, <span class="at">cex.main=</span><span class="fl">0.95</span>,</span>
<span id="cb21-25"><a href="#cb21-25" aria-hidden="true" tabindex="-1"></a>       <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span></code></pre></div>
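<p><img src="LinearRegression_files/figure-html/unnamed-chunk-26-1.png" alt="Four residuals versus order plots titled Good: No Trend, Questionable: General Trend, Questionable: Interesting Patterns, and Bad: Obvious Trend" width="672" /></p>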
</div>
</div>
</div>
<p><br /></p>
<div id="problems-from-failed-assumptions" class="section level5">
<h5>Problems from Failed Assumptions</h5>
<p>There are various problems that can arise when some of the
regression assumptions are not satisfied.</p>
<p><strong>Lack of Linearity</strong></p>
<p>When the linearity assumption is violated, pretty much everything we
obtain from the regression summary is no longer meaningful.</p>
<ul>
<li><p>The y-intercept estimate can be drastically off from its actual
true value.</p></li>
<li><p>Important model information is lost by trying to use a simple
slope term <span class="math inline">\(\beta_1\)</span> to describe the
model with respect to <span class="math inline">\(X\)</span>.</p></li>
<li><p>The residual standard error will be much higher than it otherwise
would be because of curvature patterns in the data that the line cannot
capture. Thus, R-squared will be lower than it otherwise should
be.</p></li>
<li><p>P-values can become non-significant, when in fact there is a
strong pattern in the data, but that pattern just cannot be captured by
a simple line.</p></li>
</ul>
<p>*Normality of the errors is often put into question as well when a
simplified line is used to try to capture a more complicated curved
model.</p>
<p>The plots below demonstrate these difficulties.</p>
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Create Data from a True Model</span></span>
<span id="cb22-2"><a href="#cb22-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-3"><a href="#cb22-3" aria-hidden="true" tabindex="-1"></a>n <span class="ot">&lt;-</span> <span class="dv">30</span>                           <span class="co">#sample size</span></span>
<span id="cb22-4"><a href="#cb22-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-5"><a href="#cb22-5" aria-hidden="true" tabindex="-1"></a>beta_0 <span class="ot">&lt;-</span> <span class="fl">14.2</span>                    <span class="co">#True y-intercept</span></span>
<span id="cb22-6"><a href="#cb22-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-7"><a href="#cb22-7" aria-hidden="true" tabindex="-1"></a>beta_1 <span class="ot">&lt;-</span> <span class="fl">7.5</span>                     <span class="co">#True slope</span></span>
<span id="cb22-8"><a href="#cb22-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-9"><a href="#cb22-9" aria-hidden="true" tabindex="-1"></a>beta_2 <span class="ot">&lt;-</span> <span class="sc">-</span><span class="fl">0.25</span>                   <span class="co">#True bend</span></span>
<span id="cb22-10"><a href="#cb22-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-11"><a href="#cb22-11" aria-hidden="true" tabindex="-1"></a>X_i <span class="ot">&lt;-</span> <span class="fu">runif</span>(n, <span class="dv">0</span>, <span class="dv">20</span>)            <span class="co">#Sample of X-values</span></span>
<span id="cb22-12"><a href="#cb22-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-13"><a href="#cb22-13" aria-hidden="true" tabindex="-1"></a>sigma <span class="ot">&lt;-</span> <span class="fl">2.5</span>                      <span class="co">#True standard deviation</span></span>
<span id="cb22-14"><a href="#cb22-14" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-15"><a href="#cb22-15" aria-hidden="true" tabindex="-1"></a>epsilon_i <span class="ot">&lt;-</span> <span class="fu">rnorm</span>(n, <span class="dv">0</span>, sigma)   <span class="co">#normally distributed errors</span></span>
<span id="cb22-16"><a href="#cb22-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-17"><a href="#cb22-17" aria-hidden="true" tabindex="-1"></a>Y_i <span class="ot">&lt;-</span> beta_0 <span class="sc">+</span> beta_1<span class="sc">*</span>X_i <span class="sc">+</span> beta_2<span class="sc">*</span>X_i<span class="sc">^</span><span class="dv">2</span> <span class="sc">+</span> epsilon_i </span>
<span id="cb22-18"><a href="#cb22-18" aria-hidden="true" tabindex="-1"></a>                                  <span class="co">#Sample of Y-values from model</span></span>
<span id="cb22-19"><a href="#cb22-19" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-20"><a href="#cb22-20" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-21"><a href="#cb22-21" aria-hidden="true" tabindex="-1"></a><span class="co"># Plot the Data and Fitted Model</span></span>
<span id="cb22-22"><a href="#cb22-22" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-23"><a href="#cb22-23" aria-hidden="true" tabindex="-1"></a>mylm <span class="ot">&lt;-</span> <span class="fu">lm</span>(Y_i <span class="sc">~</span> X_i)            <span class="co">#Fit Model to Data</span></span>
<span id="cb22-24"><a href="#cb22-24" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-25"><a href="#cb22-25" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-26"><a href="#cb22-26" aria-hidden="true" tabindex="-1"></a><span class="fu">layout</span>(<span class="fu">matrix</span>(<span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">2</span>,<span class="dv">3</span>,<span class="dv">4</span>), <span class="dv">2</span>, <span class="dv">3</span>, <span class="at">byrow =</span> <span class="cn">TRUE</span>), </span>
<span id="cb22-27"><a href="#cb22-27" aria-hidden="true" tabindex="-1"></a>   <span class="at">widths=</span><span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">2</span>,<span class="dv">2</span>), <span class="at">heights=</span><span class="fu">c</span>(<span class="dv">4</span>,<span class="dv">2</span>,<span class="dv">2</span>))</span>
<span id="cb22-28"><a href="#cb22-28" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#create plot panel</span></span>
<span id="cb22-29"><a href="#cb22-29" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-30"><a href="#cb22-30" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-31"><a href="#cb22-31" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Y_i <span class="sc">~</span> X_i,                  <span class="co">#Plot the data</span></span>
<span id="cb22-32"><a href="#cb22-32" aria-hidden="true" tabindex="-1"></a>     <span class="at">pch=</span><span class="dv">16</span>, </span>
<span id="cb22-33"><a href="#cb22-33" aria-hidden="true" tabindex="-1"></a>     <span class="at">col=</span><span class="st">&quot;darkgray&quot;</span>, </span>
<span id="cb22-34"><a href="#cb22-34" aria-hidden="true" tabindex="-1"></a>     <span class="at">xlim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">20</span>), </span>
<span id="cb22-35"><a href="#cb22-35" aria-hidden="true" tabindex="-1"></a>     <span class="at">ylim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">100</span>),</span>
<span id="cb22-36"><a href="#cb22-36" aria-hidden="true" tabindex="-1"></a>     <span class="at">main=</span><span class="st">&quot;Non-Linear Relation&quot;</span>)</span>
<span id="cb22-37"><a href="#cb22-37" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-38"><a href="#cb22-38" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(mylm, <span class="at">col=</span><span class="st">&quot;gray&quot;</span>)         <span class="co">#Add fitted line to plot</span></span>
<span id="cb22-39"><a href="#cb22-39" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-40"><a href="#cb22-40" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(beta_0 <span class="sc">+</span> beta_1<span class="sc">*</span>x <span class="sc">+</span> beta_2<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span>, <span class="at">col=</span><span class="st">&quot;gray&quot;</span>, <span class="at">lty=</span><span class="dv">2</span>, <span class="at">add=</span><span class="cn">TRUE</span>) </span>
<span id="cb22-41"><a href="#cb22-41" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Add True line to plot</span></span>
<span id="cb22-42"><a href="#cb22-42" aria-hidden="true" tabindex="-1"></a>      </span>
<span id="cb22-43"><a href="#cb22-43" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-44"><a href="#cb22-44" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Add summary to plot</span></span>
<span id="cb22-45"><a href="#cb22-45" aria-hidden="true" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">legend=</span><span class="fu">c</span>(<span class="fu">paste</span>(<span class="st">&quot;Y-Intercept:&quot;</span>, <span class="fu">round</span>(mylm<span class="sc">$</span>coef[[<span class="dv">1</span>]], <span class="dv">3</span>), <span class="st">&quot;  (True value:&quot;</span>, beta_0, <span class="st">&quot;)&quot;</span>),</span>
<span id="cb22-46"><a href="#cb22-46" aria-hidden="true" tabindex="-1"></a>                           <span class="fu">paste</span>(<span class="st">&quot;Slope:&quot;</span>, <span class="fu">round</span>(mylm<span class="sc">$</span>coef[[<span class="dv">2</span>]], <span class="dv">3</span>), <span class="st">&quot;  (True value:&quot;</span>, beta_1, <span class="st">&quot;)&quot;</span>),</span>
<span id="cb22-47"><a href="#cb22-47" aria-hidden="true" tabindex="-1"></a>                           <span class="fu">paste</span>(<span class="st">&quot;Sigma:&quot;</span>, <span class="fu">round</span>(<span class="fu">summary</span>(mylm)<span class="sc">$</span>sigma, <span class="dv">3</span>), <span class="st">&quot;  (True value:&quot;</span>, sigma, <span class="st">&quot;)&quot;</span>)), <span class="at">bty=</span><span class="st">&#39;n&#39;</span>)</span>
<span id="cb22-48"><a href="#cb22-48" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-49"><a href="#cb22-49" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Draw diagnostic plots</span></span>
<span id="cb22-50"><a href="#cb22-50" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(mylm, <span class="at">which=</span><span class="dv">1</span><span class="sc">:</span><span class="dv">2</span>)</span>
<span id="cb22-51"><a href="#cb22-51" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(mylm<span class="sc">$</span>residuals, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>)</span>
<span id="cb22-52"><a href="#cb22-52" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="st">&quot;Residuals vs Order&quot;</span>, <span class="at">side=</span><span class="dv">3</span>)</span></code></pre></div>
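<p><img src="LinearRegression_files/figure-html/unnamed-chunk-27-1.png" alt="Scatterplot titled Non-Linear Relation showing the fitted line and the true curve, followed by residuals versus fitted, normal Q-Q, and residuals versus order plots" width="672" /></p>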
<p><br/></p>
<p><strong>Non-normal Error Terms</strong></p>
<p>When the normality assumption for the error terms is violated, not
all is lost. In fact, the estimates of the slope and intercept are still
often fairly meaningful. However, it is unwise to put too much trust in
the residual standard error as an estimate of the standard deviation
<span class="math inline">\(\sigma\)</span>. This is because the
standard deviation in skewed distributions does not carry the same
meaning it has in normal distributions.</p>
<div class="sourceCode" id="cb23"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Create Data from a True Model</span></span>
<span id="cb23-2"><a href="#cb23-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-3"><a href="#cb23-3" aria-hidden="true" tabindex="-1"></a>n <span class="ot">&lt;-</span> <span class="dv">30</span>                           <span class="co">#sample size</span></span>
<span id="cb23-4"><a href="#cb23-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-5"><a href="#cb23-5" aria-hidden="true" tabindex="-1"></a>beta_0 <span class="ot">&lt;-</span> <span class="fl">14.2</span>                    <span class="co">#True y-intercept</span></span>
<span id="cb23-6"><a href="#cb23-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-7"><a href="#cb23-7" aria-hidden="true" tabindex="-1"></a>beta_1 <span class="ot">&lt;-</span> <span class="fl">3.5</span>                     <span class="co">#True slope</span></span>
<span id="cb23-8"><a href="#cb23-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-9"><a href="#cb23-9" aria-hidden="true" tabindex="-1"></a>X_i <span class="ot">&lt;-</span> <span class="fu">runif</span>(n, <span class="dv">0</span>, <span class="dv">20</span>)            <span class="co">#Sample of X-values</span></span>
<span id="cb23-10"><a href="#cb23-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-11"><a href="#cb23-11" aria-hidden="true" tabindex="-1"></a>sigma <span class="ot">&lt;-</span> <span class="fl">2.5</span>                      <span class="co">#True standard deviation</span></span>
<span id="cb23-12"><a href="#cb23-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-13"><a href="#cb23-13" aria-hidden="true" tabindex="-1"></a>epsilon_i <span class="ot">&lt;-</span> <span class="fu">rchisq</span>(n, <span class="dv">1</span>)<span class="sc">*</span><span class="dv">3</span> <span class="sc">-</span> <span class="dv">1</span> <span class="co">#non-normally distributed errors</span></span>
<span id="cb23-14"><a href="#cb23-14" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-15"><a href="#cb23-15" aria-hidden="true" tabindex="-1"></a>Y_i <span class="ot">&lt;-</span> beta_0 <span class="sc">+</span> beta_1<span class="sc">*</span>X_i <span class="sc">+</span> epsilon_i </span>
<span id="cb23-16"><a href="#cb23-16" aria-hidden="true" tabindex="-1"></a>                                  <span class="co">#Sample of Y-values from model</span></span>
<span id="cb23-17"><a href="#cb23-17" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-18"><a href="#cb23-18" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-19"><a href="#cb23-19" aria-hidden="true" tabindex="-1"></a><span class="co"># Plot the Data and Fitted Model</span></span>
<span id="cb23-20"><a href="#cb23-20" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-21"><a href="#cb23-21" aria-hidden="true" tabindex="-1"></a>mylm <span class="ot">&lt;-</span> <span class="fu">lm</span>(Y_i <span class="sc">~</span> X_i)            <span class="co">#Fit Model to Data</span></span>
<span id="cb23-22"><a href="#cb23-22" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-23"><a href="#cb23-23" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-24"><a href="#cb23-24" aria-hidden="true" tabindex="-1"></a><span class="fu">layout</span>(<span class="fu">matrix</span>(<span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">2</span>,<span class="dv">3</span>,<span class="dv">4</span>), <span class="dv">2</span>, <span class="dv">3</span>, <span class="at">byrow =</span> <span class="cn">TRUE</span>), </span>
<span id="cb23-25"><a href="#cb23-25" aria-hidden="true" tabindex="-1"></a>   <span class="at">widths=</span><span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">2</span>,<span class="dv">2</span>), <span class="at">heights=</span><span class="fu">c</span>(<span class="dv">4</span>,<span class="dv">2</span>,<span class="dv">2</span>))</span>
<span id="cb23-26"><a href="#cb23-26" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#create plot panel</span></span>
<span id="cb23-27"><a href="#cb23-27" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-28"><a href="#cb23-28" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-29"><a href="#cb23-29" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Y_i <span class="sc">~</span> X_i,                  <span class="co">#Plot the data</span></span>
<span id="cb23-30"><a href="#cb23-30" aria-hidden="true" tabindex="-1"></a>     <span class="at">pch=</span><span class="dv">16</span>, </span>
<span id="cb23-31"><a href="#cb23-31" aria-hidden="true" tabindex="-1"></a>     <span class="at">col=</span><span class="st">&quot;darkgray&quot;</span>, </span>
<span id="cb23-32"><a href="#cb23-32" aria-hidden="true" tabindex="-1"></a>     <span class="at">xlim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">20</span>), </span>
<span id="cb23-33"><a href="#cb23-33" aria-hidden="true" tabindex="-1"></a>     <span class="at">ylim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">100</span>),</span>
<span id="cb23-34"><a href="#cb23-34" aria-hidden="true" tabindex="-1"></a>     <span class="at">main=</span><span class="st">&quot;Normality Assumption Violated&quot;</span>)</span>
<span id="cb23-35"><a href="#cb23-35" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-36"><a href="#cb23-36" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(mylm, <span class="at">col=</span><span class="st">&quot;gray&quot;</span>)         <span class="co">#Add fitted line to plot</span></span>
<span id="cb23-37"><a href="#cb23-37" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-38"><a href="#cb23-38" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(beta_0, beta_1,           <span class="co">#Add True line to plot</span></span>
<span id="cb23-39"><a href="#cb23-39" aria-hidden="true" tabindex="-1"></a>       <span class="at">col=</span><span class="st">&quot;gray&quot;</span>, <span class="at">lty=</span><span class="dv">2</span>)</span>
<span id="cb23-40"><a href="#cb23-40" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-41"><a href="#cb23-41" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Add summary to plot</span></span>
<span id="cb23-42"><a href="#cb23-42" aria-hidden="true" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">legend=</span><span class="fu">c</span>(<span class="fu">paste</span>(<span class="st">&quot;Y-Intercept:&quot;</span>, <span class="fu">round</span>(mylm<span class="sc">$</span>coef[[<span class="dv">1</span>]], <span class="dv">3</span>), <span class="st">&quot;  (&quot;</span>, beta_0, <span class="st">&quot;)&quot;</span>),</span>
<span id="cb23-43"><a href="#cb23-43" aria-hidden="true" tabindex="-1"></a>                           <span class="fu">paste</span>(<span class="st">&quot;Slope:&quot;</span>, <span class="fu">round</span>(mylm<span class="sc">$</span>coef[[<span class="dv">2</span>]], <span class="dv">3</span>), <span class="st">&quot;  (&quot;</span>, beta_1, <span class="st">&quot;)&quot;</span>),</span>
<span id="cb23-44"><a href="#cb23-44" aria-hidden="true" tabindex="-1"></a>                           <span class="fu">paste</span>(<span class="st">&quot;Sigma:&quot;</span>, <span class="fu">round</span>(<span class="fu">summary</span>(mylm)<span class="sc">$</span>sigma, <span class="dv">3</span>), <span class="st">&quot;  (&quot;</span>, sigma, <span class="st">&quot;)&quot;</span>)), <span class="at">bty=</span><span class="st">&#39;n&#39;</span>)</span>
<span id="cb23-45"><a href="#cb23-45" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-46"><a href="#cb23-46" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Draw diagnostic plots</span></span>
<span id="cb23-47"><a href="#cb23-47" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(mylm, <span class="at">which=</span><span class="dv">1</span><span class="sc">:</span><span class="dv">2</span>)</span>
<span id="cb23-48"><a href="#cb23-48" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(mylm<span class="sc">$</span>residuals, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>)</span>
<span id="cb23-49"><a href="#cb23-49" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="st">&quot;Residuals vs Order&quot;</span>, <span class="at">side=</span><span class="dv">3</span>)</span></code></pre></div>
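<p><img src="LinearRegression_files/figure-html/unnamed-chunk-28-1.png" alt="Scatterplot titled Normality Assumption Violated showing the fitted and true lines, followed by residuals versus fitted, normal Q-Q, and residuals versus order plots" width="672" /></p>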
<p><strong>Non-constant Variance</strong></p>
<p>When the variance of the error terms changes across the regression,
the regression approximates the “average variance” of the errors because
it still assumes the variance is constant throughout. The estimates of
the slope and intercept are still typically quite good, and can be used
for interpretation. The residual standard error, however, should not be
considered meaningful, as it will be too large on one end of the
regression and too small on the other end.</p>
<div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Create Data from a True Model</span></span>
<span id="cb24-2"><a href="#cb24-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-3"><a href="#cb24-3" aria-hidden="true" tabindex="-1"></a>n <span class="ot">&lt;-</span> <span class="dv">30</span>                           <span class="co">#sample size</span></span>
<span id="cb24-4"><a href="#cb24-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-5"><a href="#cb24-5" aria-hidden="true" tabindex="-1"></a>beta_0 <span class="ot">&lt;-</span> <span class="fl">14.2</span>                    <span class="co">#True y-intercept</span></span>
<span id="cb24-6"><a href="#cb24-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-7"><a href="#cb24-7" aria-hidden="true" tabindex="-1"></a>beta_1 <span class="ot">&lt;-</span> <span class="fl">3.5</span>                     <span class="co">#True slope</span></span>
<span id="cb24-8"><a href="#cb24-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-9"><a href="#cb24-9" aria-hidden="true" tabindex="-1"></a>X_i <span class="ot">&lt;-</span> <span class="fu">runif</span>(n, <span class="dv">0</span>, <span class="dv">20</span>)            <span class="co">#Sample of X-values</span></span>
<span id="cb24-10"><a href="#cb24-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-11"><a href="#cb24-11" aria-hidden="true" tabindex="-1"></a>sigma <span class="ot">&lt;-</span> <span class="fl">2.5</span>                      <span class="co">#True standard deviation</span></span>
<span id="cb24-12"><a href="#cb24-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-13"><a href="#cb24-13" aria-hidden="true" tabindex="-1"></a>epsilon_i <span class="ot">&lt;-</span> <span class="fu">rnorm</span>(n, <span class="dv">0</span>, sigma <span class="sc">+</span> X_i)   </span>
<span id="cb24-14"><a href="#cb24-14" aria-hidden="true" tabindex="-1"></a>                                  <span class="co">#normally distributed errors</span></span>
<span id="cb24-15"><a href="#cb24-15" aria-hidden="true" tabindex="-1"></a>                                  <span class="co">#with increasing variance</span></span>
<span id="cb24-16"><a href="#cb24-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-17"><a href="#cb24-17" aria-hidden="true" tabindex="-1"></a>Y_i <span class="ot">&lt;-</span> beta_0 <span class="sc">+</span> beta_1<span class="sc">*</span>X_i <span class="sc">+</span> epsilon_i </span>
<span id="cb24-18"><a href="#cb24-18" aria-hidden="true" tabindex="-1"></a>                                  <span class="co">#Sample of Y-values from model</span></span>
<span id="cb24-19"><a href="#cb24-19" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-20"><a href="#cb24-20" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-21"><a href="#cb24-21" aria-hidden="true" tabindex="-1"></a><span class="co"># Plot the Data and Fitted Model</span></span>
<span id="cb24-22"><a href="#cb24-22" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-23"><a href="#cb24-23" aria-hidden="true" tabindex="-1"></a>mylm <span class="ot">&lt;-</span> <span class="fu">lm</span>(Y_i <span class="sc">~</span> X_i)            <span class="co">#Fit Model to Data</span></span>
<span id="cb24-24"><a href="#cb24-24" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-25"><a href="#cb24-25" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-26"><a href="#cb24-26" aria-hidden="true" tabindex="-1"></a><span class="fu">layout</span>(<span class="fu">matrix</span>(<span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">2</span>,<span class="dv">3</span>,<span class="dv">4</span>), <span class="dv">2</span>, <span class="dv">3</span>, <span class="at">byrow =</span> <span class="cn">TRUE</span>), </span>
<span id="cb24-27"><a href="#cb24-27" aria-hidden="true" tabindex="-1"></a>   <span class="at">widths=</span><span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">2</span>,<span class="dv">2</span>), <span class="at">heights=</span><span class="fu">c</span>(<span class="dv">4</span>,<span class="dv">2</span>,<span class="dv">2</span>))</span>
<span id="cb24-28"><a href="#cb24-28" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#create plot panel</span></span>
<span id="cb24-29"><a href="#cb24-29" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-30"><a href="#cb24-30" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-31"><a href="#cb24-31" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Y_i <span class="sc">~</span> X_i,                  <span class="co">#Plot the data</span></span>
<span id="cb24-32"><a href="#cb24-32" aria-hidden="true" tabindex="-1"></a>     <span class="at">pch=</span><span class="dv">16</span>, </span>
<span id="cb24-33"><a href="#cb24-33" aria-hidden="true" tabindex="-1"></a>     <span class="at">col=</span><span class="st">&quot;darkgray&quot;</span>, </span>
<span id="cb24-34"><a href="#cb24-34" aria-hidden="true" tabindex="-1"></a>     <span class="at">xlim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">20</span>), </span>
<span id="cb24-35"><a href="#cb24-35" aria-hidden="true" tabindex="-1"></a>     <span class="at">ylim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">100</span>),</span>
<span id="cb24-36"><a href="#cb24-36" aria-hidden="true" tabindex="-1"></a>     <span class="at">main=</span><span class="st">&quot;Variance Varies (Non-Constant)&quot;</span>)</span>
<span id="cb24-37"><a href="#cb24-37" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-38"><a href="#cb24-38" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(mylm, <span class="at">col=</span><span class="st">&quot;gray&quot;</span>)         <span class="co">#Add fitted line to plot</span></span>
<span id="cb24-39"><a href="#cb24-39" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-40"><a href="#cb24-40" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(beta_0, beta_1,           <span class="co">#Add True line to plot</span></span>
<span id="cb24-41"><a href="#cb24-41" aria-hidden="true" tabindex="-1"></a>       <span class="at">col=</span><span class="st">&quot;gray&quot;</span>, <span class="at">lty=</span><span class="dv">2</span>)</span>
<span id="cb24-42"><a href="#cb24-42" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-43"><a href="#cb24-43" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Summarize the Model Fit</span></span>
<span id="cb24-44"><a href="#cb24-44" aria-hidden="true" tabindex="-1"></a><span class="fu">pander</span>(<span class="fu">rbind</span>(<span class="st">`</span><span class="at">Y-Intercept</span><span class="st">`</span> <span class="ot">=</span> <span class="fu">c</span>(<span class="at">True =</span> beta_0, <span class="at">Estimated =</span> mylm<span class="sc">$</span>coef[[<span class="dv">1</span>]]),</span>
<span id="cb24-45"><a href="#cb24-45" aria-hidden="true" tabindex="-1"></a>      <span class="at">Slope =</span> <span class="fu">c</span>(<span class="at">True =</span> beta_1, <span class="at">Estimated =</span> mylm<span class="sc">$</span>coef[[<span class="dv">2</span>]]),</span>
<span id="cb24-46"><a href="#cb24-46" aria-hidden="true" tabindex="-1"></a>      <span class="at">Sigma =</span> <span class="fu">c</span>(<span class="at">True =</span> sigma, <span class="at">Estimated =</span> <span class="fu">summary</span>(mylm)<span class="sc">$</span>sigma)))</span></code></pre></div>
<table style="width:51%;">
<colgroup>
<col width="25%" />
<col width="9%" />
<col width="16%" />
</colgroup>
<thead>
<tr class="header">
<th align="center"> </th>
<th align="center">True</th>
<th align="center">Estimated</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center"><strong>Y-Intercept</strong></td>
<td align="center">14.2</td>
<td align="center">13.82</td>
</tr>
<tr class="even">
<td align="center"><strong>Slope</strong></td>
<td align="center">3.5</td>
<td align="center">3.768</td>
</tr>
<tr class="odd">
<td align="center"><strong>Sigma</strong></td>
<td align="center">2.5</td>
<td align="center">13.02</td>
</tr>
</tbody>
</table>
<div class="sourceCode" id="cb25"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb25-1"><a href="#cb25-1" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Add summary to plot</span></span>
<span id="cb25-2"><a href="#cb25-2" aria-hidden="true" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">legend=</span><span class="fu">c</span>(<span class="fu">paste</span>(<span class="st">&quot;Y-Intercept:&quot;</span>, <span class="fu">round</span>(mylm<span class="sc">$</span>coef[[<span class="dv">1</span>]], <span class="dv">3</span>), <span class="st">&quot;  (&quot;</span>, beta_0, <span class="st">&quot;)&quot;</span>),</span>
<span id="cb25-3"><a href="#cb25-3" aria-hidden="true" tabindex="-1"></a>                           <span class="fu">paste</span>(<span class="st">&quot;Slope:&quot;</span>, <span class="fu">round</span>(mylm<span class="sc">$</span>coef[[<span class="dv">2</span>]], <span class="dv">3</span>), <span class="st">&quot;  (&quot;</span>, beta_1, <span class="st">&quot;)&quot;</span>),</span>
<span id="cb25-4"><a href="#cb25-4" aria-hidden="true" tabindex="-1"></a>                           <span class="fu">paste</span>(<span class="st">&quot;Sigma:&quot;</span>, <span class="fu">round</span>(<span class="fu">summary</span>(mylm)<span class="sc">$</span>sigma, <span class="dv">3</span>), <span class="st">&quot;  (&quot;</span>, <span class="fu">round</span>(<span class="fu">mean</span>(sigma <span class="sc">+</span> X_i), <span class="dv">2</span>), <span class="st">&quot;, mean)&quot;</span>)), <span class="at">bty=</span><span class="st">&#39;n&#39;</span>)</span>
<span id="cb25-5"><a href="#cb25-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb25-6"><a href="#cb25-6" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Draw diagnostic plots</span></span>
<span id="cb25-7"><a href="#cb25-7" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(mylm, <span class="at">which=</span><span class="dv">1</span><span class="sc">:</span><span class="dv">2</span>)</span>
<span id="cb25-8"><a href="#cb25-8" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(mylm<span class="sc">$</span>residuals, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>)</span>
<span id="cb25-9"><a href="#cb25-9" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="st">&quot;Residuals vs Order&quot;</span>, <span class="at">side=</span><span class="dv">3</span>)</span></code></pre></div>
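<p><img src="LinearRegression_files/figure-html/unnamed-chunk-29-1.png" alt="Scatterplot of simulated data titled Variance Varies (Non-Constant) with the fitted and true regression lines, above three diagnostic plots: residuals vs fitted, normal Q-Q, and residuals vs order" width="672" /></p>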
<p><strong>Normality Violated</strong></p>
<p>As silly as it sounds, if the only problem with the regression is the
lack of normality of the error terms, it isn’t all that big of a
problem. Depending on how non-normal the residuals appear, there could
be some skewing to the residual standard error, but otherwise, the slope
and intercept are still interpretable and meaningful.</p>
<div class="sourceCode" id="cb26"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb26-1"><a href="#cb26-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Create Data from a True Model</span></span>
<span id="cb26-2"><a href="#cb26-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-3"><a href="#cb26-3" aria-hidden="true" tabindex="-1"></a>n <span class="ot">&lt;-</span> <span class="dv">30</span>                           <span class="co">#sample size</span></span>
<span id="cb26-4"><a href="#cb26-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-5"><a href="#cb26-5" aria-hidden="true" tabindex="-1"></a>beta_0 <span class="ot">&lt;-</span> <span class="fl">14.2</span>                    <span class="co">#True y-intercept</span></span>
<span id="cb26-6"><a href="#cb26-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-7"><a href="#cb26-7" aria-hidden="true" tabindex="-1"></a>beta_1 <span class="ot">&lt;-</span> <span class="fl">3.5</span>                     <span class="co">#True slope</span></span>
<span id="cb26-8"><a href="#cb26-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-9"><a href="#cb26-9" aria-hidden="true" tabindex="-1"></a>X_i <span class="ot">&lt;-</span> <span class="fu">runif</span>(n, <span class="dv">0</span>, <span class="dv">20</span>)            <span class="co">#Sample of X-values</span></span>
<span id="cb26-10"><a href="#cb26-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-11"><a href="#cb26-11" aria-hidden="true" tabindex="-1"></a>sigma <span class="ot">&lt;-</span> <span class="fl">2.5</span>                      <span class="co">#True standard deviation</span></span>
<span id="cb26-12"><a href="#cb26-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-13"><a href="#cb26-13" aria-hidden="true" tabindex="-1"></a>epsilon_i <span class="ot">&lt;-</span> <span class="fu">runif</span>(n, <span class="sc">-</span><span class="fu">sqrt</span>(<span class="dv">12</span><span class="sc">*</span>sigma<span class="sc">^</span><span class="dv">2</span>)<span class="sc">/</span><span class="dv">2</span>, <span class="fu">sqrt</span>(<span class="dv">12</span><span class="sc">*</span>sigma<span class="sc">^</span><span class="dv">2</span>)<span class="sc">/</span><span class="dv">2</span>) <span class="co">#non-normally distributed errors</span></span>
<span id="cb26-14"><a href="#cb26-14" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-15"><a href="#cb26-15" aria-hidden="true" tabindex="-1"></a>Y_i <span class="ot">&lt;-</span> beta_0 <span class="sc">+</span> beta_1<span class="sc">*</span>X_i <span class="sc">+</span> epsilon_i </span>
<span id="cb26-16"><a href="#cb26-16" aria-hidden="true" tabindex="-1"></a>                                  <span class="co">#Sample of Y-values from model</span></span>
<span id="cb26-17"><a href="#cb26-17" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-18"><a href="#cb26-18" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-19"><a href="#cb26-19" aria-hidden="true" tabindex="-1"></a><span class="co"># Plot the Data and Fitted Model</span></span>
<span id="cb26-20"><a href="#cb26-20" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-21"><a href="#cb26-21" aria-hidden="true" tabindex="-1"></a>mylm <span class="ot">&lt;-</span> <span class="fu">lm</span>(Y_i <span class="sc">~</span> X_i)            <span class="co">#Fit Model to Data</span></span>
<span id="cb26-22"><a href="#cb26-22" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-23"><a href="#cb26-23" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-24"><a href="#cb26-24" aria-hidden="true" tabindex="-1"></a><span class="fu">layout</span>(<span class="fu">matrix</span>(<span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">2</span>,<span class="dv">3</span>,<span class="dv">4</span>), <span class="dv">2</span>, <span class="dv">3</span>, <span class="at">byrow =</span> <span class="cn">TRUE</span>), </span>
<span id="cb26-25"><a href="#cb26-25" aria-hidden="true" tabindex="-1"></a>   <span class="at">widths=</span><span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">2</span>,<span class="dv">2</span>), <span class="at">heights=</span><span class="fu">c</span>(<span class="dv">4</span>,<span class="dv">2</span>,<span class="dv">2</span>))</span>
<span id="cb26-26"><a href="#cb26-26" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#create plot panel</span></span>
<span id="cb26-27"><a href="#cb26-27" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-28"><a href="#cb26-28" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-29"><a href="#cb26-29" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Y_i <span class="sc">~</span> X_i,                  <span class="co">#Plot the data</span></span>
<span id="cb26-30"><a href="#cb26-30" aria-hidden="true" tabindex="-1"></a>     <span class="at">pch=</span><span class="dv">16</span>, </span>
<span id="cb26-31"><a href="#cb26-31" aria-hidden="true" tabindex="-1"></a>     <span class="at">col=</span><span class="st">&quot;darkgray&quot;</span>, </span>
<span id="cb26-32"><a href="#cb26-32" aria-hidden="true" tabindex="-1"></a>     <span class="at">xlim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">20</span>), </span>
<span id="cb26-33"><a href="#cb26-33" aria-hidden="true" tabindex="-1"></a>     <span class="at">ylim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">100</span>),</span>
<span id="cb26-34"><a href="#cb26-34" aria-hidden="true" tabindex="-1"></a>     <span class="at">main=</span><span class="st">&quot;Normality Assumption Violated&quot;</span>)</span>
<span id="cb26-35"><a href="#cb26-35" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-36"><a href="#cb26-36" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(mylm, <span class="at">col=</span><span class="st">&quot;gray&quot;</span>)         <span class="co">#Add fitted line to plot</span></span>
<span id="cb26-37"><a href="#cb26-37" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-38"><a href="#cb26-38" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(beta_0, beta_1,           <span class="co">#Add True line to plot</span></span>
<span id="cb26-39"><a href="#cb26-39" aria-hidden="true" tabindex="-1"></a>       <span class="at">col=</span><span class="st">&quot;gray&quot;</span>, <span class="at">lty=</span><span class="dv">2</span>)</span>
<span id="cb26-40"><a href="#cb26-40" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb26-41"><a href="#cb26-41" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Summarize the Model Fit</span></span>
<span id="cb26-42"><a href="#cb26-42" aria-hidden="true" tabindex="-1"></a><span class="fu">pander</span>(<span class="fu">rbind</span>(<span class="st">`</span><span class="at">Y-Intercept</span><span class="st">`</span> <span class="ot">=</span> <span class="fu">c</span>(<span class="at">True =</span> beta_0, <span class="at">Estimated =</span> mylm<span class="sc">$</span>coef[[<span class="dv">1</span>]]),</span>
<span id="cb26-43"><a href="#cb26-43" aria-hidden="true" tabindex="-1"></a>      <span class="at">Slope =</span> <span class="fu">c</span>(<span class="at">True =</span> beta_1, <span class="at">Estimated =</span> mylm<span class="sc">$</span>coef[[<span class="dv">2</span>]]),</span>
<span id="cb26-44"><a href="#cb26-44" aria-hidden="true" tabindex="-1"></a>      <span class="at">Sigma =</span> <span class="fu">c</span>(<span class="at">True =</span> sigma, <span class="at">Estimated =</span> <span class="fu">summary</span>(mylm)<span class="sc">$</span>sigma)))</span></code></pre></div>
<table style="width:51%;">
<colgroup>
<col width="25%" />
<col width="9%" />
<col width="16%" />
</colgroup>
<thead>
<tr class="header">
<th align="center"> </th>
<th align="center">True</th>
<th align="center">Estimated</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center"><strong>Y-Intercept</strong></td>
<td align="center">14.2</td>
<td align="center">14.7</td>
</tr>
<tr class="even">
<td align="center"><strong>Slope</strong></td>
<td align="center">3.5</td>
<td align="center">3.449</td>
</tr>
<tr class="odd">
<td align="center"><strong>Sigma</strong></td>
<td align="center">2.5</td>
<td align="center">2.317</td>
</tr>
</tbody>
</table>
<div class="sourceCode" id="cb27"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb27-1"><a href="#cb27-1" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Add summary to plot</span></span>
<span id="cb27-2"><a href="#cb27-2" aria-hidden="true" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">legend=</span><span class="fu">c</span>(<span class="fu">paste</span>(<span class="st">&quot;Y-Intercept:&quot;</span>, <span class="fu">round</span>(mylm<span class="sc">$</span>coef[[<span class="dv">1</span>]], <span class="dv">3</span>), <span class="st">&quot;  (&quot;</span>, beta_0, <span class="st">&quot;)&quot;</span>),</span>
<span id="cb27-3"><a href="#cb27-3" aria-hidden="true" tabindex="-1"></a>                           <span class="fu">paste</span>(<span class="st">&quot;Slope:&quot;</span>, <span class="fu">round</span>(mylm<span class="sc">$</span>coef[[<span class="dv">2</span>]], <span class="dv">3</span>), <span class="st">&quot;  (&quot;</span>, beta_1, <span class="st">&quot;)&quot;</span>),</span>
<span id="cb27-4"><a href="#cb27-4" aria-hidden="true" tabindex="-1"></a>                           <span class="fu">paste</span>(<span class="st">&quot;Sigma:&quot;</span>, <span class="fu">round</span>(<span class="fu">summary</span>(mylm)<span class="sc">$</span>sigma, <span class="dv">3</span>), <span class="st">&quot;  (&quot;</span>, sigma, <span class="st">&quot;)&quot;</span>)), <span class="at">bty=</span><span class="st">&#39;n&#39;</span>)</span>
<span id="cb27-5"><a href="#cb27-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb27-6"><a href="#cb27-6" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Draw diagnostic plots</span></span>
<span id="cb27-7"><a href="#cb27-7" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(mylm, <span class="at">which=</span><span class="dv">1</span><span class="sc">:</span><span class="dv">2</span>)</span>
<span id="cb27-8"><a href="#cb27-8" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(mylm<span class="sc">$</span>residuals, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>)</span>
<span id="cb27-9"><a href="#cb27-9" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="st">&quot;Residuals vs Order&quot;</span>, <span class="at">side=</span><span class="dv">3</span>)</span></code></pre></div>
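<p><img src="LinearRegression_files/figure-html/unnamed-chunk-30-1.png" alt="Scatterplot of simulated data titled Normality Assumption Violated with the fitted and true regression lines, above three diagnostic plots: residuals vs fitted, normal Q-Q, and residuals vs order" width="672" /></p>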
<p><strong>Independence Assumption Violated</strong></p>
<p>While the slope and intercept are often still meaningful when the
independence assumption is violated, the residual standard error is
noticeably larger than the true value of sigma in this case.</p>
<div class="sourceCode" id="cb28"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Create Data from a True Model</span></span>
<span id="cb28-2"><a href="#cb28-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-3"><a href="#cb28-3" aria-hidden="true" tabindex="-1"></a>n <span class="ot">&lt;-</span> <span class="dv">30</span>                           <span class="co">#sample size</span></span>
<span id="cb28-4"><a href="#cb28-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-5"><a href="#cb28-5" aria-hidden="true" tabindex="-1"></a>beta_0 <span class="ot">&lt;-</span> <span class="fl">14.2</span>                    <span class="co">#True y-intercept</span></span>
<span id="cb28-6"><a href="#cb28-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-7"><a href="#cb28-7" aria-hidden="true" tabindex="-1"></a>beta_1 <span class="ot">&lt;-</span> <span class="fl">3.5</span>                     <span class="co">#True slope</span></span>
<span id="cb28-8"><a href="#cb28-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-9"><a href="#cb28-9" aria-hidden="true" tabindex="-1"></a>X_i <span class="ot">&lt;-</span> <span class="fu">runif</span>(n, <span class="dv">0</span>, <span class="dv">20</span>)            <span class="co">#Sample of X-values</span></span>
<span id="cb28-10"><a href="#cb28-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-11"><a href="#cb28-11" aria-hidden="true" tabindex="-1"></a>sigma <span class="ot">&lt;-</span> <span class="fl">2.5</span>                      <span class="co">#True standard deviation</span></span>
<span id="cb28-12"><a href="#cb28-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-13"><a href="#cb28-13" aria-hidden="true" tabindex="-1"></a>epsilon_i <span class="ot">&lt;-</span> <span class="fu">rnorm</span>(n, <span class="dv">0</span>, <span class="fl">2.5</span>) <span class="sc">+</span> (<span class="dv">1</span><span class="sc">:</span>n <span class="sc">-</span>n<span class="sc">/</span><span class="dv">2</span>)<span class="sc">*</span>.<span class="dv">5</span></span>
<span id="cb28-14"><a href="#cb28-14" aria-hidden="true" tabindex="-1"></a>                                <span class="co">#normal, but correlated errors</span></span>
<span id="cb28-15"><a href="#cb28-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-16"><a href="#cb28-16" aria-hidden="true" tabindex="-1"></a>Y_i <span class="ot">&lt;-</span> beta_0 <span class="sc">+</span> beta_1<span class="sc">*</span>X_i <span class="sc">+</span> epsilon_i </span>
<span id="cb28-17"><a href="#cb28-17" aria-hidden="true" tabindex="-1"></a>                                  <span class="co">#Sample of Y-values from model</span></span>
<span id="cb28-18"><a href="#cb28-18" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-19"><a href="#cb28-19" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-20"><a href="#cb28-20" aria-hidden="true" tabindex="-1"></a><span class="co"># Plot the Data and Fitted Model</span></span>
<span id="cb28-21"><a href="#cb28-21" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-22"><a href="#cb28-22" aria-hidden="true" tabindex="-1"></a>mylm <span class="ot">&lt;-</span> <span class="fu">lm</span>(Y_i <span class="sc">~</span> X_i)            <span class="co">#Fit Model to Data</span></span>
<span id="cb28-23"><a href="#cb28-23" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-24"><a href="#cb28-24" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-25"><a href="#cb28-25" aria-hidden="true" tabindex="-1"></a><span class="fu">layout</span>(<span class="fu">matrix</span>(<span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">2</span>,<span class="dv">3</span>,<span class="dv">4</span>), <span class="dv">2</span>, <span class="dv">3</span>, <span class="at">byrow =</span> <span class="cn">TRUE</span>), </span>
<span id="cb28-26"><a href="#cb28-26" aria-hidden="true" tabindex="-1"></a>   <span class="at">widths=</span><span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">2</span>,<span class="dv">2</span>), <span class="at">heights=</span><span class="fu">c</span>(<span class="dv">4</span>,<span class="dv">2</span>,<span class="dv">2</span>))</span>
<span id="cb28-27"><a href="#cb28-27" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#create plot panel</span></span>
<span id="cb28-28"><a href="#cb28-28" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-29"><a href="#cb28-29" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-30"><a href="#cb28-30" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Y_i <span class="sc">~</span> X_i,                  <span class="co">#Plot the data</span></span>
<span id="cb28-31"><a href="#cb28-31" aria-hidden="true" tabindex="-1"></a>     <span class="at">pch=</span><span class="dv">16</span>, </span>
<span id="cb28-32"><a href="#cb28-32" aria-hidden="true" tabindex="-1"></a>     <span class="at">col=</span><span class="st">&quot;darkgray&quot;</span>, </span>
<span id="cb28-33"><a href="#cb28-33" aria-hidden="true" tabindex="-1"></a>     <span class="at">xlim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">20</span>), </span>
<span id="cb28-34"><a href="#cb28-34" aria-hidden="true" tabindex="-1"></a>     <span class="at">ylim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">100</span>),</span>
<span id="cb28-35"><a href="#cb28-35" aria-hidden="true" tabindex="-1"></a>     <span class="at">main=</span><span class="st">&quot;Independence Assumption Violated&quot;</span>)</span>
<span id="cb28-36"><a href="#cb28-36" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-37"><a href="#cb28-37" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(mylm, <span class="at">col=</span><span class="st">&quot;gray&quot;</span>)         <span class="co">#Add fitted line to plot</span></span>
<span id="cb28-38"><a href="#cb28-38" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-39"><a href="#cb28-39" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(beta_0, beta_1,           <span class="co">#Add True line to plot</span></span>
<span id="cb28-40"><a href="#cb28-40" aria-hidden="true" tabindex="-1"></a>       <span class="at">col=</span><span class="st">&quot;gray&quot;</span>, <span class="at">lty=</span><span class="dv">2</span>)</span>
<span id="cb28-41"><a href="#cb28-41" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb28-42"><a href="#cb28-42" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Summarize the Model Fit</span></span>
<span id="cb28-43"><a href="#cb28-43" aria-hidden="true" tabindex="-1"></a><span class="fu">pander</span>(<span class="fu">rbind</span>(<span class="st">`</span><span class="at">Y-Intercept</span><span class="st">`</span> <span class="ot">=</span> <span class="fu">c</span>(<span class="at">True =</span> beta_0, <span class="at">Estimated =</span> mylm<span class="sc">$</span>coef[[<span class="dv">1</span>]]),</span>
<span id="cb28-44"><a href="#cb28-44" aria-hidden="true" tabindex="-1"></a>      <span class="at">Slope =</span> <span class="fu">c</span>(<span class="at">True =</span> beta_1, <span class="at">Estimated =</span> mylm<span class="sc">$</span>coef[[<span class="dv">2</span>]]),</span>
<span id="cb28-45"><a href="#cb28-45" aria-hidden="true" tabindex="-1"></a>      <span class="at">Sigma =</span> <span class="fu">c</span>(<span class="at">True =</span> sigma, <span class="at">Estimated =</span> <span class="fu">summary</span>(mylm)<span class="sc">$</span>sigma)))</span></code></pre></div>
<table style="width:51%;">
<colgroup>
<col width="25%" />
<col width="9%" />
<col width="16%" />
</colgroup>
<thead>
<tr class="header">
<th align="center"> </th>
<th align="center">True</th>
<th align="center">Estimated</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center"><strong>Y-Intercept</strong></td>
<td align="center">14.2</td>
<td align="center">16.47</td>
</tr>
<tr class="even">
<td align="center"><strong>Slope</strong></td>
<td align="center">3.5</td>
<td align="center">3.296</td>
</tr>
<tr class="odd">
<td align="center"><strong>Sigma</strong></td>
<td align="center">2.5</td>
<td align="center">4.819</td>
</tr>
</tbody>
</table>
<div class="sourceCode" id="cb29"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb29-1"><a href="#cb29-1" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Add summary to plot</span></span>
<span id="cb29-2"><a href="#cb29-2" aria-hidden="true" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">legend=</span><span class="fu">c</span>(<span class="fu">paste</span>(<span class="st">&quot;Y-Intercept:&quot;</span>, <span class="fu">round</span>(mylm<span class="sc">$</span>coef[[<span class="dv">1</span>]], <span class="dv">3</span>), <span class="st">&quot;  (&quot;</span>, beta_0, <span class="st">&quot;)&quot;</span>),</span>
<span id="cb29-3"><a href="#cb29-3" aria-hidden="true" tabindex="-1"></a>                           <span class="fu">paste</span>(<span class="st">&quot;Slope:&quot;</span>, <span class="fu">round</span>(mylm<span class="sc">$</span>coef[[<span class="dv">2</span>]], <span class="dv">3</span>), <span class="st">&quot;  (&quot;</span>, beta_1, <span class="st">&quot;)&quot;</span>),</span>
<span id="cb29-4"><a href="#cb29-4" aria-hidden="true" tabindex="-1"></a>                           <span class="fu">paste</span>(<span class="st">&quot;Sigma:&quot;</span>, <span class="fu">round</span>(<span class="fu">summary</span>(mylm)<span class="sc">$</span>sigma, <span class="dv">3</span>), <span class="st">&quot;  (&quot;</span>, sigma, <span class="st">&quot;)&quot;</span>)), <span class="at">bty=</span><span class="st">&#39;n&#39;</span>)</span>
<span id="cb29-5"><a href="#cb29-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb29-6"><a href="#cb29-6" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Draw diagnostic plots</span></span>
<span id="cb29-7"><a href="#cb29-7" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(mylm, <span class="at">which=</span><span class="dv">1</span><span class="sc">:</span><span class="dv">2</span>)</span>
<span id="cb29-8"><a href="#cb29-8" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(mylm<span class="sc">$</span>residuals, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>)</span>
<span id="cb29-9"><a href="#cb29-9" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="st">&quot;Residuals vs Order&quot;</span>, <span class="at">side=</span><span class="dv">3</span>)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-31-1.png" width="672" /></p>
<p><strong>Outliers Present</strong></p>
<p>While outliers do not violate any of the regression assumptions, they
do pose substantial difficulties for the least squares regression
estimates of the slope and intercept.</p>
<div class="sourceCode" id="cb30"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Create Data from a True Model</span></span>
<span id="cb30-2"><a href="#cb30-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-3"><a href="#cb30-3" aria-hidden="true" tabindex="-1"></a>n <span class="ot">&lt;-</span> <span class="dv">30</span>                           <span class="co">#sample size</span></span>
<span id="cb30-4"><a href="#cb30-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-5"><a href="#cb30-5" aria-hidden="true" tabindex="-1"></a>beta_0 <span class="ot">&lt;-</span> <span class="fl">14.2</span>                    <span class="co">#True y-intercept</span></span>
<span id="cb30-6"><a href="#cb30-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-7"><a href="#cb30-7" aria-hidden="true" tabindex="-1"></a>beta_1 <span class="ot">&lt;-</span> <span class="fl">3.5</span>                     <span class="co">#True slope</span></span>
<span id="cb30-8"><a href="#cb30-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-9"><a href="#cb30-9" aria-hidden="true" tabindex="-1"></a>X_i <span class="ot">&lt;-</span> <span class="fu">runif</span>(n, <span class="dv">0</span>, <span class="dv">20</span>)            <span class="co">#Sample of X-values</span></span>
<span id="cb30-10"><a href="#cb30-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-11"><a href="#cb30-11" aria-hidden="true" tabindex="-1"></a>sigma <span class="ot">&lt;-</span> <span class="fl">2.5</span>                      <span class="co">#True standard deviation</span></span>
<span id="cb30-12"><a href="#cb30-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-13"><a href="#cb30-13" aria-hidden="true" tabindex="-1"></a>epsilon_i <span class="ot">&lt;-</span> <span class="fu">rnorm</span>(n, <span class="dv">0</span>, sigma)   <span class="co">#normally distributed errors</span></span>
<span id="cb30-14"><a href="#cb30-14" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-15"><a href="#cb30-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-16"><a href="#cb30-16" aria-hidden="true" tabindex="-1"></a>epsilon_i[<span class="dv">3</span>] <span class="ot">&lt;-</span> <span class="fu">ifelse</span>(X_i[<span class="dv">3</span>] <span class="sc">&lt;</span> <span class="dv">10</span>, <span class="fu">runif</span>(<span class="dv">1</span>,<span class="dv">25</span>,<span class="dv">35</span>), <span class="sc">-</span><span class="fu">runif</span>(<span class="dv">1</span>,<span class="dv">25</span>,<span class="dv">35</span>))</span>
<span id="cb30-17"><a href="#cb30-17" aria-hidden="true" tabindex="-1"></a>                                  <span class="co">#create outlier</span></span>
<span id="cb30-18"><a href="#cb30-18" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-19"><a href="#cb30-19" aria-hidden="true" tabindex="-1"></a>Y_i <span class="ot">&lt;-</span> beta_0 <span class="sc">+</span> beta_1<span class="sc">*</span>X_i <span class="sc">+</span> epsilon_i </span>
<span id="cb30-20"><a href="#cb30-20" aria-hidden="true" tabindex="-1"></a>                                  <span class="co">#Sample of Y-values from model</span></span>
<span id="cb30-21"><a href="#cb30-21" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-22"><a href="#cb30-22" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-23"><a href="#cb30-23" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-24"><a href="#cb30-24" aria-hidden="true" tabindex="-1"></a><span class="co"># Plot the Data and Fitted Model</span></span>
<span id="cb30-25"><a href="#cb30-25" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-26"><a href="#cb30-26" aria-hidden="true" tabindex="-1"></a>mylm <span class="ot">&lt;-</span> <span class="fu">lm</span>(Y_i <span class="sc">~</span> X_i)            <span class="co">#Fit Model to Data</span></span>
<span id="cb30-27"><a href="#cb30-27" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-28"><a href="#cb30-28" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-29"><a href="#cb30-29" aria-hidden="true" tabindex="-1"></a><span class="fu">layout</span>(<span class="fu">matrix</span>(<span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">2</span>,<span class="dv">3</span>,<span class="dv">4</span>), <span class="dv">2</span>, <span class="dv">3</span>, <span class="at">byrow =</span> <span class="cn">TRUE</span>), </span>
<span id="cb30-30"><a href="#cb30-30" aria-hidden="true" tabindex="-1"></a>   <span class="at">widths=</span><span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">2</span>,<span class="dv">2</span>), <span class="at">heights=</span><span class="fu">c</span>(<span class="dv">4</span>,<span class="dv">2</span>,<span class="dv">2</span>))</span>
<span id="cb30-31"><a href="#cb30-31" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#create plot panel</span></span>
<span id="cb30-32"><a href="#cb30-32" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-33"><a href="#cb30-33" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-34"><a href="#cb30-34" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Y_i <span class="sc">~</span> X_i,                  <span class="co">#Plot the data</span></span>
<span id="cb30-35"><a href="#cb30-35" aria-hidden="true" tabindex="-1"></a>     <span class="at">pch=</span><span class="dv">16</span>, </span>
<span id="cb30-36"><a href="#cb30-36" aria-hidden="true" tabindex="-1"></a>     <span class="at">col=</span><span class="st">&quot;darkgray&quot;</span>, </span>
<span id="cb30-37"><a href="#cb30-37" aria-hidden="true" tabindex="-1"></a>     <span class="at">xlim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">20</span>), </span>
<span id="cb30-38"><a href="#cb30-38" aria-hidden="true" tabindex="-1"></a>     <span class="at">ylim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">100</span>),</span>
<span id="cb30-39"><a href="#cb30-39" aria-hidden="true" tabindex="-1"></a>     <span class="at">main=</span><span class="st">&quot;An Outlier Present&quot;</span>)</span>
<span id="cb30-40"><a href="#cb30-40" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-41"><a href="#cb30-41" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(mylm, <span class="at">col=</span><span class="st">&quot;gray&quot;</span>)         <span class="co">#Add fitted line to plot</span></span>
<span id="cb30-42"><a href="#cb30-42" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-43"><a href="#cb30-43" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(beta_0, beta_1,           <span class="co">#Add True line to plot</span></span>
<span id="cb30-44"><a href="#cb30-44" aria-hidden="true" tabindex="-1"></a>       <span class="at">col=</span><span class="st">&quot;gray&quot;</span>, <span class="at">lty=</span><span class="dv">2</span>)</span>
<span id="cb30-45"><a href="#cb30-45" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-46"><a href="#cb30-46" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Summarize the Model Fit</span></span>
<span id="cb30-47"><a href="#cb30-47" aria-hidden="true" tabindex="-1"></a><span class="fu">pander</span>(<span class="fu">rbind</span>(<span class="st">`</span><span class="at">Y-Intercept</span><span class="st">`</span> <span class="ot">=</span> <span class="fu">c</span>(<span class="at">True =</span> beta_0, <span class="at">Estimated =</span> mylm<span class="sc">$</span>coef[[<span class="dv">1</span>]]),</span>
<span id="cb30-48"><a href="#cb30-48" aria-hidden="true" tabindex="-1"></a>      <span class="at">Slope =</span> <span class="fu">c</span>(<span class="at">True =</span> beta_1, <span class="at">Estimated =</span> mylm<span class="sc">$</span>coef[[<span class="dv">2</span>]]),</span>
<span id="cb30-49"><a href="#cb30-49" aria-hidden="true" tabindex="-1"></a>      <span class="at">Sigma =</span> <span class="fu">c</span>(<span class="at">True =</span> sigma, <span class="at">Estimated =</span> <span class="fu">summary</span>(mylm)<span class="sc">$</span>sigma)))</span></code></pre></div>
<table style="width:51%;">
<colgroup>
<col width="25%" />
<col width="9%" />
<col width="16%" />
</colgroup>
<thead>
<tr class="header">
<th align="center"> </th>
<th align="center">True</th>
<th align="center">Estimated</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center"><strong>Y-Intercept</strong></td>
<td align="center">14.2</td>
<td align="center">13.18</td>
</tr>
<tr class="even">
<td align="center"><strong>Slope</strong></td>
<td align="center">3.5</td>
<td align="center">3.447</td>
</tr>
<tr class="odd">
<td align="center"><strong>Sigma</strong></td>
<td align="center">2.5</td>
<td align="center">6.389</td>
</tr>
</tbody>
</table>
<div class="sourceCode" id="cb31"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb31-1"><a href="#cb31-1" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Add summary to plot</span></span>
<span id="cb31-2"><a href="#cb31-2" aria-hidden="true" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">legend=</span><span class="fu">c</span>(<span class="fu">paste</span>(<span class="st">&quot;Y-Intercept:&quot;</span>, <span class="fu">round</span>(mylm<span class="sc">$</span>coef[[<span class="dv">1</span>]], <span class="dv">3</span>), <span class="st">&quot;  (&quot;</span>, beta_0, <span class="st">&quot;)&quot;</span>),</span>
<span id="cb31-3"><a href="#cb31-3" aria-hidden="true" tabindex="-1"></a>                           <span class="fu">paste</span>(<span class="st">&quot;Slope:&quot;</span>, <span class="fu">round</span>(mylm<span class="sc">$</span>coef[[<span class="dv">2</span>]], <span class="dv">3</span>), <span class="st">&quot;  (&quot;</span>, beta_1, <span class="st">&quot;)&quot;</span>),</span>
<span id="cb31-4"><a href="#cb31-4" aria-hidden="true" tabindex="-1"></a>                           <span class="fu">paste</span>(<span class="st">&quot;Sigma:&quot;</span>, <span class="fu">round</span>(<span class="fu">summary</span>(mylm)<span class="sc">$</span>sigma, <span class="dv">3</span>), <span class="st">&quot;  (&quot;</span>, sigma, <span class="st">&quot;)&quot;</span>)), <span class="at">bty=</span><span class="st">&#39;n&#39;</span>)</span>
<span id="cb31-5"><a href="#cb31-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb31-6"><a href="#cb31-6" aria-hidden="true" tabindex="-1"></a>                                 <span class="co">#Draw diagnostic plots</span></span>
<span id="cb31-7"><a href="#cb31-7" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(mylm, <span class="at">which=</span><span class="dv">1</span><span class="sc">:</span><span class="dv">2</span>)</span>
<span id="cb31-8"><a href="#cb31-8" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(mylm<span class="sc">$</span>residuals, <span class="at">ylab=</span><span class="st">&quot;Residuals&quot;</span>)</span>
<span id="cb31-9"><a href="#cb31-9" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="st">&quot;Residuals vs Order&quot;</span>, <span class="at">side=</span><span class="dv">3</span>)</span></code></pre></div>
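<p><img src="LinearRegression_files/figure-html/unnamed-chunk-32-1.png" width="672" alt="Scatterplot titled An Outlier Present showing the data with the fitted and true regression lines, above three diagnostic plots: residuals vs fitted, normal Q-Q, and residuals vs order" /></p>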
</div>
</div>
<p><br /></p>
<hr />
<p><br/></p>
<p><em>The material below this section is meant for Math 425 students
only.</em></p>
<p><br/></p>
</div>
<div id="estimating-the-model-parameters-expand" class="section level4">
<h4>Estimating the Model Parameters
<a href="javascript:showhide('estimatingparameters')" style="font-size:.6em;color:skyblue;" id="estMod">(Expand)</a></h4>
<p><span class="expand-caption">How to get <span
class="math inline">\(b_0\)</span> and <span
class="math inline">\(b_1\)</span>: least squares &amp; maximum
likelihood…</span></p>
<div id="estimatingparameters" style="display:none;">
<p>There are two approaches to estimating the parameters <span
class="math inline">\(\beta_0\)</span> and <span
class="math inline">\(\beta_1\)</span> in the regression model. The
oldest and most traditional approach is using the idea of least
squares. A more general approach uses the idea of maximum likelihood
(see below). Fortunately, for simple linear regression, the estimates
for <span class="math inline">\(\beta_0\)</span> and <span
class="math inline">\(\beta_1\)</span> obtained from either method are
identical. The estimates for the true parameter values <span
class="math inline">\(\beta_0\)</span> and <span
class="math inline">\(\beta_1\)</span> are typically denoted by <span
class="math inline">\(b_0\)</span> and <span
class="math inline">\(b_1\)</span>, respectively, and are given by the
following formulas.</p>
<table>
<colgroup>
<col width="40%" />
<col width="44%" />
<col width="16%" />
</colgroup>
<thead>
<tr class="header">
<th>Parameter Estimate</th>
<th>Mathematical Formula</th>
<th>R Code</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Slope</td>
<td><span class="math inline">\(b_1 = \frac{\sum
X_i(Y_i-\bar{Y})}{\sum(X_i-\bar{X})^2}\)</span></td>
<td><code>b_1 &lt;- sum( X*(Y - mean(Y)) ) / sum( (X - mean(X))^2 )</code></td>
</tr>
<tr class="even">
<td>Intercept</td>
<td><span class="math inline">\(b_0 = \bar{Y} - b_1\bar{X}\)</span></td>
<td><code>b_0 &lt;- mean(Y) - b_1*mean(X)</code></td>
</tr>
</tbody>
</table>
<p>It is important to note that these estimates are entirely determined
from the observed data <span class="math inline">\(X\)</span> and <span
class="math inline">\(Y\)</span>. When the regression equation is
written using the estimates instead of the parameters, we use the
notation <span class="math inline">\(\hat{Y}\)</span>, which is the
estimator of <span class="math inline">\(E\{Y\}\)</span>. Thus, we write
<span class="math display">\[\begin{equation}
  \hat{Y}_i = b_0 + b_1 X_i
\end{equation}\]</span> which is directly comparable to the true, but
unknown values <span class="math display">\[\begin{equation}
  E\{Y_i\} = \beta_0 + \beta_1 X_i.
  \label{exp}
\end{equation}\]</span></p>
<div id="leastSquares" class="section level5">
<h5>Least Squares</h5>
<p>To estimate the model parameters <span
class="math inline">\(\beta_0\)</span> and <span
class="math inline">\(\beta_1\)</span> using least squares, we start by
defining the function <span class="math inline">\(Q\)</span> as the sum
of the squared errors, <span class="math inline">\(\epsilon_i\)</span>.
<span class="math display">\[
  Q = \sum_{i=1}^n \epsilon_i^2 = \sum_{i=1}^n (Y_i - (\beta_0 + \beta_1
X_i))^2
\]</span> Then we use the function Q as if it were a function of <span
class="math inline">\(\beta_0\)</span> and <span
class="math inline">\(\beta_1\)</span>. Ironically, the values of <span
class="math inline">\(Y\)</span> and <span
class="math inline">\(X\)</span> are considered fixed. However, this
makes sense because once a particular data set has been observed, these
values are all known for that data set. What we don’t know are the
values of <span class="math inline">\(\beta_0\)</span> and <span
class="math inline">\(\beta_1\)</span>.</p>
<p>This <a
href="https://phet.colorado.edu/sims/html/least-squares-regression/latest/least-squares-regression_en.html">least
squares applet</a> is a good way to explore how various choices of the
slope and intercept yield different values of the “sum of squared
residuals”. But it turns out that there is one “best” choice of the
slope and intercept that yields a “smallest” value of the “sum of
squared residuals.” This best choice can actually be found using
calculus by taking the partial derivatives of <span
class="math inline">\(Q\)</span> with respect to both <span
class="math inline">\(\beta_0\)</span> and <span
class="math inline">\(\beta_1\)</span>. <span class="math display">\[
  \frac{\partial Q}{\partial \beta_0} = -2\sum (Y_i - \beta_0 -
\beta_1X_i)
\]</span> <span class="math display">\[
  \frac{\partial Q}{\partial \beta_1} = -2\sum
X_i(Y_i-\beta_0-\beta_1X_i)
\]</span> Setting these partial derivatives to zero, and solving the
resulting system of equations provides the values of the parameters
which minimize <span class="math inline">\(Q\)</span> for a given set of
data. After all the calculations are completed we find the values of the
parameter estimators <span class="math inline">\(b_0\)</span> and <span
class="math inline">\(b_1\)</span> (of <span
class="math inline">\(\beta_0\)</span> and <span
class="math inline">\(\beta_1\)</span>, respectively) are as stated
previously.</p>
</div>
<div id="mle" class="section level5">
<h5>Maximum Likelihood</h5>
<p>The idea of maximum likelihood estimation is opposite that of least
squares. Instead of choosing those values of <span
class="math inline">\(\beta_0\)</span> and <span
class="math inline">\(\beta_1\)</span> which minimize the least squares
<span class="math inline">\(Q\)</span> function, we choose the values of
<span class="math inline">\(\beta_0\)</span> and <span
class="math inline">\(\beta_1\)</span> which maximize the likelihood
function. The likelihood function is created by first determining the
joint distribution of the <span class="math inline">\(Y_i\)</span> for
all observations <span class="math inline">\(i=1,\ldots,n\)</span>. We
can do this rather simply by using the assumption that the errors, <span
class="math inline">\(\epsilon_i\)</span>, are independently normally
distributed. When events are independent, their joint probability is
simply the product of their individual probabilities. Thus, if <span
class="math inline">\(f(Y_i)\)</span> denotes the probability density
function for <span class="math inline">\(Y_i\)</span>, then the joint
probability density for all <span class="math inline">\(Y_i\)</span>,
<span class="math inline">\(f(Y_1,\ldots,Y_n)\)</span> is given by <span
class="math display">\[
  f(Y_1,\ldots,Y_n) = \prod_{i=1}^n f(Y_i)
\]</span> Since each <span class="math inline">\(Y_i\)</span> is assumed
to be normally distributed with mean <span class="math inline">\(\beta_0
+ \beta_1 X_i\)</span> and variance <span
class="math inline">\(\sigma^2\)</span> (see model (<span
class="math inline">\(\ref{model}\)</span>)) we have that <span
class="math display">\[
  f(Y_i) =
\frac{1}{\sqrt{2\pi}\sigma}\exp{\left[-\frac{1}{2}\left(\frac{Y_i-\beta_0-\beta_1X_i}{\sigma}\right)^2\right]}
\]</span> which provides the joint probability as <span
class="math display">\[
  f(Y_1,\ldots,Y_n) = \prod_{i=1}^n f(Y_i) =
\frac{1}{(2\pi\sigma^2)^{n/2}}\exp{\left[-\frac{1}{2\sigma^2}\sum_{i=1}^n(Y_i-\beta_0-\beta_1X_i)^2\right]}
\]</span> The likelihood function <span class="math inline">\(L\)</span>
is then given by considering the <span class="math inline">\(Y_i\)</span>
and <span class="math inline">\(X_i\)</span> fixed and the parameters
<span class="math inline">\(\beta_0\)</span>, <span
class="math inline">\(\beta_1\)</span> and <span
class="math inline">\(\sigma^2\)</span> as the variables in the
function. <span class="math display">\[
  L(\beta_0,\beta_1,\sigma^2) =
\frac{1}{(2\pi\sigma^2)^{n/2}}\exp{\left[-\frac{1}{2\sigma^2}\sum_{i=1}^n(Y_i-\beta_0-\beta_1X_i)^2\right]}
\]</span> Instead of taking partial derivatives of <span
class="math inline">\(L\)</span> directly (with respect to all
parameters) we take the partial derivatives of the <span
class="math inline">\(\log\)</span> of <span
class="math inline">\(L\)</span>, which is easier to work with. In a
calculation similar to, but more difficult than, that of minimizing <span
class="math inline">\(Q\)</span>, we obtain the values of <span
class="math inline">\(\beta_0\)</span>, <span
class="math inline">\(\beta_1\)</span>, and <span
class="math inline">\(\sigma^2\)</span> which maximize the log of <span
class="math inline">\(L\)</span>, and which therefore maximize <span
class="math inline">\(L\)</span>. (This is not an obvious result, but
can be verified after some intense calculations.) The additional result
that maximum likelihood estimation provides that the least squares
estimates did not give us is the estimate <span
class="math inline">\(\hat{\sigma}^2\)</span> of <span
class="math inline">\(\sigma^2\)</span>. <span class="math display">\[
  \hat{\sigma}^2 = \frac{\sum(Y_i-\hat{Y}_i)^2}{n}
\]</span></p>
</div>
</div>
<p><br /></p>
</div>
<div id="estimating-the-model-variance-expand" class="section level4">
<h4>Estimating the Model Variance
<a href="javascript:showhide('estimatingvariance')" style="font-size:.6em;color:skyblue;" id="varEst">(Expand)</a></h4>
<p><span class="expand-caption">Estimating <span
class="math inline">\(\sigma^2\)</span> with MSE…</span></p>
<div id="estimatingvariance" style="display:none;">
<p>As shown previously in the “Estimating Model Parameters” section of
this page, we can obtain estimates for the model parameters <span
class="math inline">\(\beta_0\)</span> and <span
class="math inline">\(\beta_1\)</span> by using either least squares
estimation or maximum likelihood estimation. Those estimates were given
by the formulas</p>
<p><span class="math display">\[
b_1 = \frac{\sum X_i(Y_i-\bar{Y})}{\sum(X_i-\bar{X})^2} \quad
\text{(Unbiased Estimate of $\beta_1$)}
\]</span></p>
<p><span class="math display">\[
b_0 = \bar{Y} - b_1\bar{X} \quad \text{(Unbiased Estimate of $\beta_0$)}
\]</span></p>
<p>It turns out that these estimates for <span
class="math inline">\(\beta_0\)</span> and <span
class="math inline">\(\beta_1\)</span> are nice in the sense that on
average they provide the correct estimate of the true parameter, i.e.,
they are unbiased estimators. Unfortunately, this is not the case for
the maximum likelihood estimate <span
class="math inline">\(\widehat{\sigma}^2\)</span> of the model variance
<span class="math inline">\(\sigma^2\)</span>. This estimate turns out
to be a biased estimator. This means that, on average, it systematically
underestimates <span class="math inline">\(\sigma^2\)</span>. If we
left the estimator alone, our estimates for <span
class="math inline">\(\sigma^2\)</span> would tend to be too small. This is
bad. Fortunately, there is a way to fix it, and this corrected version
of the estimator is what we will actually use in practice to estimate
<span class="math inline">\(\sigma^2\)</span>.</p>
<p>Without going into all the details, to “fix” the biased estimator of
<span class="math inline">\(\sigma^2\)</span> that is given to us
through maximum likelihood estimation, we need to correct its
denominator so that it properly represents the degrees of freedom
associated with the numerator, which it does not currently. To find the
correct degrees of freedom, we have to notice that the <span
class="math inline">\(\hat{Y}_i\)</span> in the numerator of <span
class="math inline">\(\widehat{\sigma}^2\)</span> is defined by <span
class="math display">\[\begin{equation}
  \widehat{Y}_i = b_0 + b_1X_i
  \label{hatY}
\end{equation}\]</span> From this equation, we notice that two means,
<span class="math inline">\(\bar{X}\)</span> and <span
class="math inline">\(\bar{Y}\)</span>, were estimated from the data in
order to obtain <span class="math inline">\(\hat{Y}_i\)</span>. (See the
formulas for <span class="math inline">\(b_0\)</span> and <span
class="math inline">\(b_1\)</span> above, and note how they use both
<span class="math inline">\(\bar{X}\)</span> and <span
class="math inline">\(\bar{Y}\)</span> in their calculation.) Anytime a
mean is estimated from the data we lose a degree of freedom. Hence, the
denominator for <span class="math inline">\(\hat{\sigma}^2\)</span>
should be <span class="math inline">\(n-2\)</span> instead of <span
class="math inline">\(n\)</span>. Some incredibly long calculations will
show that the “fixed” estimator <span
class="math display">\[\begin{equation}
  s^2 = MSE = \frac{\sum(Y_i-\hat{Y}_i)^2}{n-2} \quad \text{(Unbiased
Estimator of $\sigma^2$)}
\end{equation}\]</span> is an unbiased estimator of <span
class="math inline">\(\sigma^2\)</span>. Here <span
class="math inline">\(MSE\)</span> stands for <strong>m</strong>ean
<strong>s</strong>quared <strong>e</strong>rror, which is the most
obvious name for a formula that squares the errors <span
class="math inline">\(Y_i-\hat{Y}_i\)</span> then adds them up and
divides by their degrees of freedom. Similarly, we call the numerator
<span class="math inline">\(\sum(Y_i-\hat{Y}_i)^2\)</span> the sum of
the squared errors, denoted by <span class="math inline">\(SSE\)</span>.
It is also important to note that the errors are often denoted by <span
class="math inline">\(r_i = Y_i-\hat{Y}_i\)</span>, the residuals.
Putting this all together we get the following equivalent statements for
<span class="math inline">\(MSE\)</span>. <span
class="math display">\[\begin{equation}
  s^2 = MSE = \frac{SSE}{n-2} = \frac{\sum(Y_i-\widehat{Y}_i)^2}{n-2} =
\frac{\sum r_i^2}{n-2}
\end{equation}\]</span> As a final note, even though the expected value
<span class="math inline">\(E\{MSE\} = \sigma^2\)</span>, which shows
<span class="math inline">\(MSE\)</span> is an unbiased estimator of
<span class="math inline">\(\sigma^2\)</span>, it unfortunately isn’t
true that <span class="math inline">\(\sqrt{MSE}\)</span> is an unbiased
estimator of <span class="math inline">\(\sigma\)</span>. This presents
a few problems later on, but these are minimal enough that we can
overlook the issue and move forward.</p>
</div>
<p><br /></p>
</div>
<div id="transformations-expand" class="section level4">
<h4>Transformations
<a href="javascript:showhide('transformations')" style="font-size:.6em;color:skyblue;">(Expand)</a></h4>
<p><span class="expand-caption"><span
class="math inline">\(Y&#39;\)</span>, <span
class="math inline">\(X&#39;\)</span>, and returning to the original
space…</span></p>
<div id="transformations" style="display:none;">
<p>Y transformations are denoted by y-prime, written <span
class="math inline">\(Y&#39;\)</span>, and consist of raising <span
class="math inline">\(Y\)</span> to some power called <span
class="math inline">\(\lambda\)</span>.</p>
<p><span class="math display">\[
  Y&#39; = Y^\lambda \quad \text{(Y Transformation)}
\]</span></p>
<table>
<colgroup>
<col width="35%" />
<col width="42%" />
<col width="21%" />
</colgroup>
<thead>
<tr class="header">
<th align="center">Value of <span
class="math inline">\(\lambda\)</span></th>
<th>Transformation to Use</th>
<th>R Code</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center">-2</td>
<td><span class="math inline">\(Y&#39; = Y^{-2} = 1/Y^2\)</span></td>
<td><code>lm(Y^-2 ~ X)</code></td>
</tr>
<tr class="even">
<td align="center">-1</td>
<td><span class="math inline">\(Y&#39; = Y^{-1} = 1/Y\)</span></td>
<td><code>lm(Y^-1 ~ X)</code></td>
</tr>
<tr class="odd">
<td align="center">0</td>
<td><span class="math inline">\(Y&#39; = \log(Y)\)</span></td>
<td><code>lm(log(Y) ~ X)</code></td>
</tr>
<tr class="even">
<td align="center">0.25</td>
<td><span class="math inline">\(Y&#39; = \sqrt{\sqrt{Y}}\)</span></td>
<td><code>lm(sqrt(sqrt(Y)) ~ X)</code></td>
</tr>
<tr class="odd">
<td align="center">0.5</td>
<td><span class="math inline">\(Y&#39; = \sqrt{Y}\)</span></td>
<td><code>lm(sqrt(Y) ~ X)</code></td>
</tr>
<tr class="even">
<td align="center">1</td>
<td><span class="math inline">\(Y&#39; = Y\)</span></td>
<td><code>lm(Y ~ X)</code></td>
</tr>
<tr class="odd">
<td align="center">2</td>
<td><span class="math inline">\(Y&#39; = Y^2\)</span></td>
<td><code>lm(Y^2 ~ X)</code></td>
</tr>
</tbody>
</table>
<p>Using “maximum-likelihood” estimation, the Box-Cox procedure can
actually automatically detect the “optimal” value of <span
class="math inline">\(\lambda\)</span> to consider for a
Y-transformation. Keep in mind, however, that simply accepting a
suggested Y-transformation without considering the scatterplot and
diagnostic plots first is unwise.
<div class="tab">
<p><button class="tablinks" onclick="openTab(event, 'ScatterplotView')">Scatterplot
Recognition</button>
<button class="tablinks" onclick="openTab(event, 'BoxCoxView')">Box-Cox
Suggestion</button>
<button class="tablinks" onclick="openTab(event, 'YTransExample')">An
Example</button></p>
</div>
<div id="ScatterplotView" class="tabcontent" style="display:block;">
<p>
<div id="scatterplot-recognition" class="section level6">
<h6>Scatterplot Recognition</h6>
<p>The following panel of scatterplots can give you a good feel for when
to try different values of <span
class="math inline">\(\lambda\)</span>.</p>
<div class="sourceCode" id="cb32"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb32-1"><a href="#cb32-1" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">15</span>)</span>
<span id="cb32-2"><a href="#cb32-2" aria-hidden="true" tabindex="-1"></a>N <span class="ot">&lt;-</span> <span class="dv">300</span></span>
<span id="cb32-3"><a href="#cb32-3" aria-hidden="true" tabindex="-1"></a>X <span class="ot">&lt;-</span> <span class="fu">runif</span>(N, <span class="dv">5</span>, <span class="dv">50</span>)</span>
<span id="cb32-4"><a href="#cb32-4" aria-hidden="true" tabindex="-1"></a>Y <span class="ot">&lt;-</span> <span class="dv">25</span> <span class="sc">+</span> <span class="fl">3.5</span><span class="sc">*</span>X <span class="sc">+</span> <span class="fu">rnorm</span>(N, <span class="dv">0</span>, <span class="dv">20</span>)</span>
<span id="cb32-5"><a href="#cb32-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb32-6"><a href="#cb32-6" aria-hidden="true" tabindex="-1"></a>Ya <span class="ot">&lt;-</span> <span class="dv">1</span><span class="sc">/</span><span class="fu">sqrt</span>(Y)   <span class="co">#1/Y^2   Lam = -2</span></span>
<span id="cb32-7"><a href="#cb32-7" aria-hidden="true" tabindex="-1"></a>Yb <span class="ot">&lt;-</span> <span class="dv">1</span><span class="sc">/</span>Y         <span class="co">#1/Y     Lam = -1</span></span>
<span id="cb32-8"><a href="#cb32-8" aria-hidden="true" tabindex="-1"></a>Yc <span class="ot">&lt;-</span> <span class="fu">exp</span>(.<span class="dv">02</span><span class="sc">*</span>Y)  <span class="co">#log(Y)  Lam =  0</span></span>
<span id="cb32-9"><a href="#cb32-9" aria-hidden="true" tabindex="-1"></a>Yd <span class="ot">&lt;-</span> Y<span class="sc">^</span><span class="dv">2</span>         <span class="co">#sqrt(Y) Lam =  0.5</span></span>
<span id="cb32-10"><a href="#cb32-10" aria-hidden="true" tabindex="-1"></a>Ye <span class="ot">&lt;-</span> Y           <span class="co">#Y       Lam =  1</span></span>
<span id="cb32-11"><a href="#cb32-11" aria-hidden="true" tabindex="-1"></a>Yf <span class="ot">&lt;-</span> <span class="fu">sqrt</span>(Y)     <span class="co">#Y^2     Lam =  2</span></span>
<span id="cb32-12"><a href="#cb32-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb32-13"><a href="#cb32-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb32-14"><a href="#cb32-14" aria-hidden="true" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mfrow=</span><span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">3</span>), <span class="at">mai=</span><span class="fu">c</span>(.<span class="dv">4</span>,.<span class="dv">4</span>,.<span class="dv">3</span>,.<span class="dv">2</span>), <span class="at">mgp=</span><span class="fu">c</span>(<span class="fl">0.5</span>,<span class="fl">0.5</span>,<span class="dv">0</span>))</span>
<span id="cb32-15"><a href="#cb32-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb32-16"><a href="#cb32-16" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Ya <span class="sc">~</span> X, <span class="at">main=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, lambda <span class="sc">==</span> <span class="sc">-</span><span class="dv">2</span>)), <span class="at">ylab=</span><span class="st">&quot;Y in Original Units&quot;</span>, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;gray45&quot;</span>, <span class="at">cex=</span><span class="fl">0.9</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xlab=</span><span class="st">&quot;X in Original Units&quot;</span>)</span>
<span id="cb32-17"><a href="#cb32-17" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(<span class="fu">lm</span>(Ya<span class="sc">^-</span><span class="dv">2</span> <span class="sc">~</span> X))</span>
<span id="cb32-18"><a href="#cb32-18" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(<span class="dv">1</span><span class="sc">/</span><span class="fu">sqrt</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x), <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb32-19"><a href="#cb32-19" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb32-20"><a href="#cb32-20" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb32-21"><a href="#cb32-21" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Yb <span class="sc">~</span> X, <span class="at">main=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, lambda <span class="sc">==</span> <span class="sc">-</span><span class="dv">1</span>)), <span class="at">ylab=</span><span class="st">&quot;Y in Original Units&quot;</span>, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;gray45&quot;</span>, <span class="at">cex=</span><span class="fl">0.9</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xlab=</span><span class="st">&quot;X in Original Units&quot;</span>)</span>
<span id="cb32-22"><a href="#cb32-22" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(<span class="fu">lm</span>(Yb<span class="sc">^-</span><span class="dv">1</span> <span class="sc">~</span> X))</span>
<span id="cb32-23"><a href="#cb32-23" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(<span class="dv">1</span><span class="sc">/</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x), <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb32-24"><a href="#cb32-24" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb32-25"><a href="#cb32-25" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Yc <span class="sc">~</span> X, <span class="at">main=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, lambda <span class="sc">==</span> <span class="dv">0</span>, <span class="st">&quot; i.e., log(...)&quot;</span>)), <span class="at">ylab=</span><span class="st">&quot;Y in Original Units&quot;</span>, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;gray45&quot;</span>, <span class="at">cex=</span><span class="fl">0.9</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xlab=</span><span class="st">&quot;X in Original Units&quot;</span>)</span>
<span id="cb32-26"><a href="#cb32-26" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(<span class="fu">lm</span>(<span class="fu">log</span>(Yc) <span class="sc">~</span> X))</span>
<span id="cb32-27"><a href="#cb32-27" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(<span class="fu">exp</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x), <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb32-28"><a href="#cb32-28" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb32-29"><a href="#cb32-29" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb32-30"><a href="#cb32-30" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Yd <span class="sc">~</span> X, <span class="at">main=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, lambda <span class="sc">==</span> <span class="fl">0.5</span>)), <span class="at">ylab=</span><span class="st">&quot;Y in Original Units&quot;</span>, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;gray45&quot;</span>, <span class="at">cex=</span><span class="fl">0.9</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xlab=</span><span class="st">&quot;X in Original Units&quot;</span>)</span>
<span id="cb32-31"><a href="#cb32-31" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(<span class="fu">lm</span>(<span class="fu">sqrt</span>(Yd) <span class="sc">~</span> X))</span>
<span id="cb32-32"><a href="#cb32-32" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>((b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x)<span class="sc">^</span><span class="dv">2</span>, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb32-33"><a href="#cb32-33" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb32-34"><a href="#cb32-34" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Ye <span class="sc">~</span> X, <span class="at">main=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, lambda <span class="sc">==</span> <span class="dv">1</span>, <span class="st">&quot; (No Transformation)&quot;</span>)), <span class="at">ylab=</span><span class="st">&quot;Y in Original Units&quot;</span>, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;gray45&quot;</span>, <span class="at">cex=</span><span class="fl">0.9</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xlab=</span><span class="st">&quot;X in Original Units&quot;</span>)</span>
<span id="cb32-35"><a href="#cb32-35" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(<span class="fu">lm</span>(Ye <span class="sc">~</span> X))</span>
<span id="cb32-36"><a href="#cb32-36" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>((b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x), <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb32-37"><a href="#cb32-37" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb32-38"><a href="#cb32-38" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Yf <span class="sc">~</span> X, <span class="at">main=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, lambda <span class="sc">==</span> <span class="dv">2</span>)), </span>
<span id="cb32-39"><a href="#cb32-39" aria-hidden="true" tabindex="-1"></a><span class="at">ylab=</span><span class="st">&quot;Y in Original Units&quot;</span>, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;gray45&quot;</span>, <span class="at">cex=</span><span class="fl">0.9</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xlab=</span><span class="st">&quot;X in Original Units&quot;</span>)</span>
<span id="cb32-40"><a href="#cb32-40" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(<span class="fu">lm</span>(Yf<span class="sc">^</span><span class="dv">2</span> <span class="sc">~</span> X))</span>
<span id="cb32-41"><a href="#cb32-41" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(<span class="fu">sqrt</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x), <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-33-1.png" width="672" /></p>
</p>
</div>
</div>
<div id="BoxCoxView" class="tabcontent">
<p>
<div id="box-cox-suggestion" class="section level6">
<h6>Box-Cox Suggestion</h6>
<p>The <code>boxCox(...)</code> function in <code>library(car)</code>
can also be helpful in finding values of <span
class="math inline">\(\lambda\)</span> to try.</p>
<div class="sourceCode" id="cb33"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb33-1"><a href="#cb33-1" aria-hidden="true" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mfrow=</span><span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">3</span>), <span class="at">mai=</span><span class="fu">c</span>(.<span class="dv">4</span>,.<span class="dv">4</span>,.<span class="dv">3</span>,.<span class="dv">2</span>), <span class="at">mgp=</span><span class="fu">c</span>(<span class="dv">2</span>,<span class="fl">0.5</span>,<span class="dv">0</span>))</span>
<span id="cb33-2"><a href="#cb33-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb33-3"><a href="#cb33-3" aria-hidden="true" tabindex="-1"></a><span class="fu">boxCox</span>(<span class="fu">lm</span>(Ya <span class="sc">~</span> X))</span>
<span id="cb33-4"><a href="#cb33-4" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="at">side=</span><span class="dv">3</span>, <span class="at">text=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, lambda <span class="sc">==</span> <span class="sc">-</span><span class="dv">2</span>)), <span class="at">line=</span>.<span class="dv">5</span>)</span>
<span id="cb33-5"><a href="#cb33-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb33-6"><a href="#cb33-6" aria-hidden="true" tabindex="-1"></a><span class="fu">boxCox</span>(<span class="fu">lm</span>(Yb <span class="sc">~</span> X))</span>
<span id="cb33-7"><a href="#cb33-7" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="at">side=</span><span class="dv">3</span>, <span class="at">text=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, lambda <span class="sc">==</span> <span class="sc">-</span><span class="dv">1</span>)), <span class="at">line=</span>.<span class="dv">5</span>)</span>
<span id="cb33-8"><a href="#cb33-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb33-9"><a href="#cb33-9" aria-hidden="true" tabindex="-1"></a><span class="fu">boxCox</span>(<span class="fu">lm</span>(Yc <span class="sc">~</span> X))</span>
<span id="cb33-10"><a href="#cb33-10" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="at">side=</span><span class="dv">3</span>, <span class="at">text=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, lambda <span class="sc">==</span> <span class="dv">0</span>, <span class="st">&quot; i.e., log(...)&quot;</span>)), <span class="at">line=</span>.<span class="dv">5</span>)</span>
<span id="cb33-11"><a href="#cb33-11" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb33-12"><a href="#cb33-12" aria-hidden="true" tabindex="-1"></a><span class="fu">boxCox</span>(<span class="fu">lm</span>(Yd <span class="sc">~</span> X))</span>
<span id="cb33-13"><a href="#cb33-13" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="at">side=</span><span class="dv">3</span>, <span class="at">text=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, lambda <span class="sc">==</span> <span class="fl">0.5</span>)), <span class="at">line=</span>.<span class="dv">5</span>)</span>
<span id="cb33-14"><a href="#cb33-14" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb33-15"><a href="#cb33-15" aria-hidden="true" tabindex="-1"></a><span class="fu">boxCox</span>(<span class="fu">lm</span>(Ye <span class="sc">~</span> X))</span>
<span id="cb33-16"><a href="#cb33-16" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="at">side=</span><span class="dv">3</span>, <span class="at">text=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, lambda <span class="sc">==</span> <span class="dv">1</span>, <span class="st">&quot; (No Transformation)&quot;</span>)), <span class="at">line=</span>.<span class="dv">5</span>)</span>
<span id="cb33-17"><a href="#cb33-17" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb33-18"><a href="#cb33-18" aria-hidden="true" tabindex="-1"></a><span class="fu">boxCox</span>(<span class="fu">lm</span>(Yf <span class="sc">~</span> X))</span>
<span id="cb33-19"><a href="#cb33-19" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="at">side=</span><span class="dv">3</span>, <span class="at">text=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, lambda <span class="sc">==</span> <span class="dv">2</span>)), <span class="at">line=</span>.<span class="dv">5</span>)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-34-1.png" width="672" /></p>
</p>
</div>
</div>
<div id="YTransExample" class="tabcontent">
<p>
<div id="an-example" class="section level6">
<h6>An Example</h6>
<p>Suppose we were running a simple linear regression on the
<code>cars</code> dataset.</p>
<p>This would be done with the code</p>
<p><code>cars.lm &lt;- lm(dist ~ speed, data=cars)</code></p>
<p><code>summary(cars.lm)</code></p>
<p>Notice the line doesn’t quite fit the data as well as we would hope.
Instead, the data looks a little curved.</p>
<div class="sourceCode" id="cb34"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb34-1"><a href="#cb34-1" aria-hidden="true" tabindex="-1"></a>cars.lm <span class="ot">&lt;-</span><span class="fu">lm</span>(dist <span class="sc">~</span> speed,<span class="at">data=</span>cars)</span>
<span id="cb34-2"><a href="#cb34-2" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(dist <span class="sc">~</span> speed, <span class="at">data=</span>cars, <span class="at">pch=</span><span class="dv">20</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>, <span class="at">cex=</span><span class="fl">1.2</span>, <span class="at">las=</span><span class="dv">1</span>,</span>
<span id="cb34-3"><a href="#cb34-3" aria-hidden="true" tabindex="-1"></a>     <span class="at">xlab=</span><span class="st">&quot;Speed of the Vehicle (mph) </span><span class="sc">\n</span><span class="st"> the Moment the Brakes were Applied&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Distance (ft) it took the Vehicle to Stop&quot;</span>,</span>
<span id="cb34-4"><a href="#cb34-4" aria-hidden="true" tabindex="-1"></a>     <span class="at">main=</span><span class="st">&quot;Don&#39;t Step in front of a Moving 1920&#39;s Vehicle...&quot;</span>)</span>
<span id="cb34-5"><a href="#cb34-5" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="at">side=</span><span class="dv">3</span>, <span class="at">text=</span><span class="st">&quot;...they take a few feet to stop.&quot;</span>, <span class="at">cex=</span><span class="fl">0.7</span>, <span class="at">line=</span>.<span class="dv">5</span>)</span>
<span id="cb34-6"><a href="#cb34-6" aria-hidden="true" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">legend=</span><span class="st">&quot;Stopping Distance Experiment&quot;</span>, <span class="at">bty=</span><span class="st">&quot;n&quot;</span>)</span>
<span id="cb34-7"><a href="#cb34-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb34-8"><a href="#cb34-8" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(cars.lm, <span class="at">col=</span><span class="st">&quot;gray&quot;</span>)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-35-1.png" width="672" /></p>
<p>Using the <code>boxCox(...)</code> function from
<code>library(car)</code> we would compute the following to determine
which Y-transformation would be most meaningful.</p>
<p><code>library(car)</code></p>
<p><code>boxCox(cars.lm)</code></p>
<p>The output from the <code>boxCox(...)</code> function looks as
follows.</p>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-36-1.png" width="672" /></p>
<p>This plot tells us to use the <span class="math inline">\(\lambda =
0.5\)</span> transformation, so that <span class="math inline">\(Y&#39;
= Y^{0.5} = \sqrt{Y}\)</span>. (To see this yourself, click on the
“Box-Cox Suggestion” tab above, as well as on the “Scatterplot
Recognition” tab.)</p>
<p>Now, a transformation regression is performed using
<code>sqrt(Y)</code> in place of <code>Y</code> as follows:</p>
<p><code>cars.lm.t &lt;- lm(sqrt(dist) ~ speed, data=cars)</code></p>
<p><code>summary(cars.lm.t)</code></p>
<table>
<thead>
<tr class="header">
<th> </th>
<th>Estimate</th>
<th>Std. Error</th>
<th>t value</th>
<th>Pr(&gt;|t|)</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><strong>(Intercept)</strong></td>
<td>1.277</td>
<td>0.4844</td>
<td>2.636</td>
<td>0.01126</td>
</tr>
<tr class="even">
<td><strong>speed</strong></td>
<td>0.3224</td>
<td>0.02978</td>
<td>10.83</td>
<td>1.773e-14</td>
</tr>
</tbody>
</table>
<p>Then,</p>
<p><span class="math display">\[
  \widehat{Y}_i&#39; = 1.277 + 0.3224 X_i
\]</span></p>
<p>And replacing <span class="math inline">\(\hat{Y}_i&#39; =
\sqrt{\hat{Y}_i}\)</span> we have</p>
<p><span class="math display">\[
  \sqrt{\widehat{Y}_i} = 1.277 + 0.3224 X_i
\]</span></p>
<p>Solving for <span class="math inline">\(\hat{Y}_i\)</span> gives</p>
<p><span class="math display">\[
  \widehat{Y}_i = (1.277 + 0.3224 X_i)^2
\]</span></p>
<p>Which, using <code>curve((1.277 + 0.3224*x)^2, add=TRUE)</code> (see
code for details) looks like this:</p>
<div class="sourceCode" id="cb35"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb35-1"><a href="#cb35-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(dist <span class="sc">~</span> speed, <span class="at">data=</span>cars, <span class="at">pch=</span><span class="dv">20</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>, <span class="at">cex=</span><span class="fl">1.2</span>, <span class="at">las=</span><span class="dv">1</span>,</span>
<span id="cb35-2"><a href="#cb35-2" aria-hidden="true" tabindex="-1"></a>     <span class="at">xlab=</span><span class="st">&quot;Speed of the Vehicle (mph) </span><span class="sc">\n</span><span class="st"> the Moment the Brakes were Applied&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Distance (ft) it took the Vehicle to Stop&quot;</span>,</span>
<span id="cb35-3"><a href="#cb35-3" aria-hidden="true" tabindex="-1"></a>     <span class="at">main=</span><span class="st">&quot;Don&#39;t Step in front of a Moving 1920&#39;s Vehicle...&quot;</span>)</span>
<span id="cb35-4"><a href="#cb35-4" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="at">side=</span><span class="dv">3</span>, <span class="at">text=</span><span class="st">&quot;...they take a few feet to stop.&quot;</span>, <span class="at">cex=</span><span class="fl">0.7</span>, <span class="at">line=</span>.<span class="dv">5</span>)</span>
<span id="cb35-5"><a href="#cb35-5" aria-hidden="true" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">legend=</span><span class="st">&quot;Stopping Distance Experiment&quot;</span>, <span class="at">bty=</span><span class="st">&quot;n&quot;</span>)</span>
<span id="cb35-6"><a href="#cb35-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb35-7"><a href="#cb35-7" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>( (<span class="fl">1.277</span> <span class="sc">+</span> <span class="fl">0.3224</span><span class="sc">*</span>x)<span class="sc">^</span><span class="dv">2</span>, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-37-1.png" width="672" /></p>
</p>
</div>
</div>
<p><br /></p>
<div id="x-transformations" class="section level5">
<h5>X-Transformations</h5>
<p>X-transformations are more difficult to recognize than
Y-transformations. This is partially because there is no Box-Cox method
to automatically search for them.</p>
<p>The best indicator that you should consider an X-transformation is
when the variance of the residuals is constant across all fitted values,
but linearity is clearly violated.</p>
<p>The following panel of scatterplots can give you a good feel for when
to try each kind of X-transformation.</p>
<div class="sourceCode" id="cb36"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb36-1"><a href="#cb36-1" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">15</span>)</span>
<span id="cb36-2"><a href="#cb36-2" aria-hidden="true" tabindex="-1"></a>N <span class="ot">&lt;-</span> <span class="dv">300</span></span>
<span id="cb36-3"><a href="#cb36-3" aria-hidden="true" tabindex="-1"></a>X <span class="ot">&lt;-</span> <span class="fu">runif</span>(N, <span class="dv">5</span>, <span class="dv">50</span>)</span>
<span id="cb36-4"><a href="#cb36-4" aria-hidden="true" tabindex="-1"></a>Y <span class="ot">&lt;-</span> <span class="dv">25</span> <span class="sc">+</span> <span class="fl">3.5</span><span class="sc">*</span>X <span class="sc">+</span> <span class="fu">rnorm</span>(N, <span class="dv">0</span>, <span class="dv">20</span>)</span>
<span id="cb36-5"><a href="#cb36-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb36-6"><a href="#cb36-6" aria-hidden="true" tabindex="-1"></a>Xa <span class="ot">&lt;-</span> <span class="dv">1</span><span class="sc">/</span><span class="fu">sqrt</span>(X)   <span class="co">#1/X^2   Lam = -2</span></span>
<span id="cb36-7"><a href="#cb36-7" aria-hidden="true" tabindex="-1"></a>Xb <span class="ot">&lt;-</span> <span class="dv">1</span><span class="sc">/</span>X         <span class="co">#1/X     Lam = -1</span></span>
<span id="cb36-8"><a href="#cb36-8" aria-hidden="true" tabindex="-1"></a>Xc <span class="ot">&lt;-</span> <span class="fu">exp</span>(.<span class="dv">02</span><span class="sc">*</span>X)  <span class="co">#log(X)  Lam =  0</span></span>
<span id="cb36-9"><a href="#cb36-9" aria-hidden="true" tabindex="-1"></a>Xd <span class="ot">&lt;-</span> X<span class="sc">^</span><span class="dv">2</span>         <span class="co">#sqrt(X) Lam =  0.5</span></span>
<span id="cb36-10"><a href="#cb36-10" aria-hidden="true" tabindex="-1"></a>Xe <span class="ot">&lt;-</span> X           <span class="co">#X       Lam =  1</span></span>
<span id="cb36-11"><a href="#cb36-11" aria-hidden="true" tabindex="-1"></a>Xf <span class="ot">&lt;-</span> <span class="fu">sqrt</span>(X)     <span class="co">#X^2     Lam =  2</span></span>
<span id="cb36-12"><a href="#cb36-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb36-13"><a href="#cb36-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb36-14"><a href="#cb36-14" aria-hidden="true" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mfrow=</span><span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">3</span>), <span class="at">mai=</span><span class="fu">c</span>(.<span class="dv">4</span>,.<span class="dv">4</span>,.<span class="dv">3</span>,.<span class="dv">2</span>), <span class="at">mgp=</span><span class="fu">c</span>(<span class="fl">0.5</span>,<span class="fl">0.5</span>,<span class="dv">0</span>))</span>
<span id="cb36-15"><a href="#cb36-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb36-16"><a href="#cb36-16" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Y <span class="sc">~</span> Xa, <span class="at">main=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, X<span class="sc">*</span>minute <span class="sc">==</span> X<span class="sc">^-</span><span class="dv">2</span>)), <span class="at">ylab=</span><span class="st">&quot;Y in Original Units&quot;</span>, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;gray45&quot;</span>, <span class="at">cex=</span><span class="fl">0.9</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xlab=</span><span class="st">&quot;X in Original Units&quot;</span>)</span>
<span id="cb36-17"><a href="#cb36-17" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(<span class="fu">lm</span>(Y <span class="sc">~</span> <span class="fu">I</span>(Xa<span class="sc">^-</span><span class="dv">2</span>)))</span>
<span id="cb36-18"><a href="#cb36-18" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x<span class="sc">^-</span><span class="dv">2</span>, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb36-19"><a href="#cb36-19" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb36-20"><a href="#cb36-20" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb36-21"><a href="#cb36-21" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Y <span class="sc">~</span> Xb, <span class="at">main=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, X<span class="sc">*</span>minute <span class="sc">==</span> X<span class="sc">^-</span><span class="dv">1</span>)), <span class="at">ylab=</span><span class="st">&quot;Y in Original Units&quot;</span>, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;gray45&quot;</span>, <span class="at">cex=</span><span class="fl">0.9</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xlab=</span><span class="st">&quot;X in Original Units&quot;</span>)</span>
<span id="cb36-22"><a href="#cb36-22" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(<span class="fu">lm</span>(Y <span class="sc">~</span> <span class="fu">I</span>(Xb<span class="sc">^-</span><span class="dv">1</span>)))</span>
<span id="cb36-23"><a href="#cb36-23" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x<span class="sc">^-</span><span class="dv">1</span>, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb36-24"><a href="#cb36-24" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb36-25"><a href="#cb36-25" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Y <span class="sc">~</span> Xc, <span class="at">main=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, X<span class="sc">*</span>minute <span class="sc">==</span> <span class="fu">log</span>(X))), <span class="at">ylab=</span><span class="st">&quot;Y in Original Units&quot;</span>, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;gray45&quot;</span>, <span class="at">cex=</span><span class="fl">0.9</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xlab=</span><span class="st">&quot;X in Original Units&quot;</span>)</span>
<span id="cb36-26"><a href="#cb36-26" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(<span class="fu">lm</span>(Y <span class="sc">~</span> <span class="fu">log</span>(Xc)))</span>
<span id="cb36-27"><a href="#cb36-27" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span><span class="fu">log</span>(x), <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb36-28"><a href="#cb36-28" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb36-29"><a href="#cb36-29" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb36-30"><a href="#cb36-30" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Y <span class="sc">~</span> Xd, <span class="at">main=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, X<span class="sc">*</span>minute <span class="sc">==</span> <span class="fu">sqrt</span>(X))), <span class="at">ylab=</span><span class="st">&quot;Y in Original Units&quot;</span>, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;gray45&quot;</span>, <span class="at">cex=</span><span class="fl">0.9</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xlab=</span><span class="st">&quot;X in Original Units&quot;</span>)</span>
<span id="cb36-31"><a href="#cb36-31" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(<span class="fu">lm</span>(Y <span class="sc">~</span> <span class="fu">sqrt</span>(Xd)))</span>
<span id="cb36-32"><a href="#cb36-32" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span><span class="fu">sqrt</span>(x), <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb36-33"><a href="#cb36-33" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb36-34"><a href="#cb36-34" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Y <span class="sc">~</span> Xe, <span class="at">main=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, X<span class="sc">*</span>minute <span class="sc">==</span> X, <span class="st">&quot; (No Transformation)&quot;</span>)), <span class="at">ylab=</span><span class="st">&quot;Y in Original Units&quot;</span>, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;gray45&quot;</span>, <span class="at">cex=</span><span class="fl">0.9</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xlab=</span><span class="st">&quot;X in Original Units&quot;</span>)</span>
<span id="cb36-35"><a href="#cb36-35" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(<span class="fu">lm</span>(Y <span class="sc">~</span> Xe))</span>
<span id="cb36-36"><a href="#cb36-36" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>((b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x), <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb36-37"><a href="#cb36-37" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb36-38"><a href="#cb36-38" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Y <span class="sc">~</span> Xf, <span class="at">main=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Use &quot;</span>, X<span class="sc">*</span>minute <span class="sc">==</span> X<span class="sc">^</span><span class="dv">2</span>)), </span>
<span id="cb36-39"><a href="#cb36-39" aria-hidden="true" tabindex="-1"></a><span class="at">ylab=</span><span class="st">&quot;Y in Original Units&quot;</span>, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;gray45&quot;</span>, <span class="at">cex=</span><span class="fl">0.9</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xlab=</span><span class="st">&quot;X in Original Units&quot;</span>)</span>
<span id="cb36-40"><a href="#cb36-40" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(<span class="fu">lm</span>(Y <span class="sc">~</span> <span class="fu">I</span>(Xf<span class="sc">^</span><span class="dv">2</span>)))</span>
<span id="cb36-41"><a href="#cb36-41" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span>, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-38-1.png" width="672" /></p>
</div>
</div>
<p><br /></p>
</div>
<div id="inference-for-the-model-parameters-expand"
class="section level4">
<h4>Inference for the Model Parameters
<a href="javascript:showhide('inference1')" style="font-size:.6em;color:skyblue;" id="infModelParam">(Expand)</a></h4>
<p><span class="expand-caption">t test formulas, sampling distributions,
confidence intervals, and F tests…</span></p>
<div id="inference1" style="display:none;">

<p>When fitting the regression model given by the equation</p>
<p><span class="math display">\[
  Y_i = \beta_0 + \beta_1 X_i + \epsilon_i \quad \text{where} \
\epsilon_i \sim N(0, \sigma^2)
\]</span> to a sample of data, we typically test hypotheses about the
parameters <span class="math inline">\(\beta_0\)</span>, <span
class="math inline">\(\beta_1\)</span>, or both.</p>
<table class="fancytable">
<tr>
<th>
<strong>Hypotheses</strong>
</th>
<th>
<strong>Test Statistic</strong>
</th>
<th>
<strong>P-value</strong>
</th>
</tr>
<tr>
<td style="text-align:center;width:25%;">
<p><span class="math inline">\(H_0: \beta_0 =\)</span> <span
class="tooltiprbold"> <span class="math inline">\(\underbrace{0}_\text{a
number}\)</span> <span class="tooltiprtext">This could be any number,
not just 0. However, the default summary(mylm) output in R only shows the
test statistic and p-value for the test that uses 0. To test a different
value, you would need to compute the test statistic and p-value by hand
using the formula shown.</span> </span></p>
<p><span class="math inline">\(H_a: \beta_0\)</span><span
class="tooltiprbold"> <span class="math inline">\(\,\neq\,\)</span>
<span class="tooltiprtext">You could use <span
class="math inline">\(&gt;\)</span> or <span
class="math inline">\(&lt;\)</span> instead of <span
class="math inline">\(\neq\)</span> for the alternative hypothesis. By
default, the p-value from summary(mylm) in R uses <span
class="math inline">\(\neq\)</span>.</span> </span><span
class="tooltiprbold"> <span class="math inline">\(\underbrace{0}_\text{a
number}\)</span> <span class="tooltiprtext">This could be any number,
not just 0. However, the default summary(mylm) output in R only shows the
test statistic and p-value for the test that uses 0. To test a different
value, you would need to compute the test statistic and p-value by hand
using the formula shown.</span> </span></p>
</td>
<td style="text-align:center;width:25%;">
<p><span class="tooltiprbold"> <span class="math display">\[t =
\frac{b_0 - \overbrace{0}^\text{a number}}{s_{b_0}}\]</span> <span
class="tooltiprtext">This is the formula for the test statistic. It
measures how far the estimated y-intercept <span
class="math inline">\(b_0\)</span> is from the null hypothesis for <span
class="math inline">\(\beta_0\)</span> in units of “standard errors of
<span class="math inline">\(b_0\)</span>”. Thus the division by <span
class="math inline">\(s_{b_0}\)</span>. Though the hypothesized value of
<span class="math inline">\(\beta_0\)</span> is typically 0, it could be
any number.</span> </span></p>
</td>
<td style="text-align:center;width:50%;">
<p><a href="https://byuimath.com/apps/normprobwitht.html" target="_blank" title="Go to t applet"><img src="LinearRegression_files/figure-html/unnamed-chunk-39-1.png" width="672" /></a></p>
</td>
</tr>
<tr>
<td style="text-align:center;width:25%;">
<p><span class="math inline">\(H_0: \beta_1 =\)</span> <span
class="tooltiprbold"> <span class="math inline">\(\underbrace{0}_\text{a
number}\)</span> <span class="tooltiprtext">This could be any number,
not just 0. However, the default summary(mylm) output in R only shows the
test statistic and p-value for the test that uses 0. To test a different
value, you would need to compute the test statistic and p-value by hand
using the formula shown.</span> </span></p>
<p><span class="math inline">\(H_a: \beta_1\)</span><span
class="tooltiprbold"> <span class="math inline">\(\,\neq\,\)</span>
<span class="tooltiprtext">You could use <span
class="math inline">\(&gt;\)</span> or <span
class="math inline">\(&lt;\)</span> instead of <span
class="math inline">\(\neq\)</span> for the alternative hypothesis. By
default, the p-value from summary(mylm) in R uses <span
class="math inline">\(\neq\)</span>.</span> </span><span
class="tooltiprbold"> <span class="math inline">\(\underbrace{0}_\text{a
number}\)</span> <span class="tooltiprtext">This could be any number,
not just 0. However, the default summary(mylm) output in R only shows the
test statistic and p-value for the test that uses 0. To test a different
value, you would need to compute the test statistic and p-value by hand
using the formula shown.</span> </span></p>
</td>
<td style="text-align:center;width:25%;">
<p><span class="tooltiprbold"> <span class="math display">\[t =
\frac{b_1 - \overbrace{0}^\text{a number}}{s_{b_1}}\]</span> <span
class="tooltiprtext">This is the formula for the test statistic. It
measures how far the estimated slope <span
class="math inline">\(b_1\)</span> is from the null hypothesis for <span
class="math inline">\(\beta_1\)</span> in units of “standard errors of
<span class="math inline">\(b_1\)</span>”. Thus the division by <span
class="math inline">\(s_{b_1}\)</span>. Though the hypothesized value of
<span class="math inline">\(\beta_1\)</span> is typically 0, it could be
any number.</span> </span></p>
</td>
<td>
<p>Left-tailed p-value =
<code>pt(-abs(tvalue), degrees of freedom)</code>.</p>
Double it to get the two-sided p-value.
</td>
</tr>
</table>
<p><br></p>
<p>In R, these values correspond to the output summary of an lm as
follows.</p>
<p><br></p>
<p><img src="Images/summaryOutputLabeled.png"></p>
<p><a href="javascript:showhide('ttestexample')" style="font-size:.9em;color:skyblue;">(Show
Example)</a></p>
<div id="ttestexample" style="display:none;">

<p>Consider the <code>cars</code> data in R. Suppose we used the
regression model given by</p>
<p><span class="math display">\[
  \underbrace{Y_i}_\text{Feet to Stop} = \beta_0 + \beta_1
\underbrace{X_{i}}_\text{mph} + \epsilon_i \quad \text{where} \
\epsilon_i \sim N(0,\sigma^2)
\]</span> to model the feet a vehicle (from the 1920s) takes to stop
when traveling at a certain speed (in miles per hour, mph) prior to
stopping. When the regression is performed and summarized in R, it is
always testing the following two hypotheses:</p>
<p><span class="math display">\[
H_0: \beta_0 = 0 \quad\quad H_0: \beta_1 = 0 \\
H_a: \beta_0 \neq 0 \quad\quad H_a: \beta_1 \neq 0
\]</span></p>
<p>To perform the test of these hypotheses for the regression stated
above, we would run the following codes in R.</p>
<p><code>cars.lm &lt;- lm(dist ~ speed, data=cars)</code></p>
<p><code>pander(summary(cars.lm)$coefficients)</code></p>
<p>These would produce summary output like the following, but the
following output has been labeled with the math notation corresponding
to each value.</p>
<p><img src="Images/summaryOutputLabeled.png"></p>
<p>Let’s emphasize what is happening in this summary output table.</p>
<p>First, here is how the “t value” is calculated for the “(Intercept)”
in the summary table above.</p>
<p><span class="math display">\[
t = \frac{b_0-0}{s_{b_0}} = \frac{-17.58 - 0}{6.758} = -2.601
\]</span> Second, here is a visual representation of how the P-value,
the “Pr(&gt;|t|)” as it is called in the summary table above, is
calculated for this test statistic. (Click the graph to view an
interactive applet showing this calculation.) Notice both ends of the
t-distribution are being shaded to compute the P-value because the
alternative hypothesis was <span class="math inline">\(H_a: \beta_0 \neq
0\)</span>.</p>
<p><a href="https://byuimath.com/apps/normprobwitht.html?z=-2.601&amp;df=48" target="_blank" title="Click to View in t Applet"><img src="LinearRegression_files/figure-html/unnamed-chunk-40-1.png" width="672" /></a></p>
<p>To compute the P-value in R, we use the “percentile function for the
t-distribution” called <code>pt( )</code>. This function requires two
things, the t-value and the degrees of freedom, in our case
<code>pt(-2.601, 48)</code>. Note the degrees of freedom (df) are 48
because the sample size is <span class="math inline">\(n=50\)</span> and
there are two parameters (<span class="math inline">\(\beta_0\)</span>
and <span class="math inline">\(\beta_1\)</span>) in our regression
model.</p>
<p>Running this code in R gives: <code>pt(-2.601, 48)</code> =
0.00616</p>
<p>However, note that this value is only half of the actual P-value of
0.0123. To get the “two-sided” P-value (note that our alternative
hypothesis used a <span class="math inline">\(\neq\)</span> symbol) we
need to double this left-tailed P-value.</p>
<p><code>2*pt(-2.601, 48)</code> = 0.0123</p>
<p>Finally, note that the same procedure can be used to test hypotheses
that use a value other than 0 in the null and alternative. For example,
to test the hypotheses:</p>
<p><span class="math display">\[
H_0: \beta_1 = 3 \\
H_a: \beta_1 \neq 3
\]</span> Use the t-formula</p>
<p><span class="math display">\[
  t = \frac{b_1 - 3}{s_{b_1}} = \frac{\overbrace{3.932}^{b_1} -
\overbrace{3}^{H_0}}{\underbrace{0.4155}_{s_{b_1}}} = 2.243
\]</span> then the P-value is calculated in R by</p>
<p><code>2*pt(-abs(2.243), 48)</code> = 0.0295495</p>
<hr>
</div>
<p><br/></p>
<p>To obtain confidence intervals in R use
<code>confint(mylm)</code>.</p>
<table class="fancytable">
<tr>
<th>
<strong>Confidence Interval</strong>
</th>
<th>
<strong>Formula</strong>
</th>
<th>
<strong>Standard Error</strong>
</th>
</tr>
<tr>
<td style="text-align:center;">
<span class="math inline">\(\beta_0\)</span>
</td>
<td style="text-align:center;">
<span class="math inline">\(b_0 \pm\)</span><span class="tooltiprbold">
<span class="math inline">\(t^*\)</span> <span class="tooltiprtext">This
is called the “critical value” and denotes the number of standard
deviations that are needed to obtain a 95% confidence interval from a t
distribution with degrees of freedom <span
class="math inline">\(n-p\)</span>. Use <code>qt(0.975, df)</code> to
get <span class="math inline">\(t^*\)</span> in R.</span> </span><span
class="tooltiprbold"> <span class="math inline">\(\cdot\)</span> <span
class="tooltiprtext">The critical value is multiplied by the standard
error of <span class="math inline">\(b_0\)</span>.</span> </span><span
class="tooltiprbold"> <span class="math inline">\(s_{b_0}\)</span> <span
class="tooltiprtext">The standard error of <span
class="math inline">\(b_0\)</span>, denoted by <span
class="math inline">\(s_{b_0}\)</span>, is provided in the regression
summary output under the column header called “Std. Error” for the
“(Intercept)” row of the output. It is calculated using the formula
shown below.</span> </span>
</td>
<td style="text-align:center;">
<span class="tooltiprbold"> <span class="math display">\[s^2_{b_0} =
MSE\left[\frac{1}{n} +
\frac{\bar{X}^2}{\sum(X_i-\bar{X})^2}\right]\]</span> <span
class="tooltiprtext">This is called the “estimated variance of <span
class="math inline">\(b_0\)</span>”. Taking the square root of this
number gives the “standard error of <span
class="math inline">\(b_0\)</span>”.</span> </span>
</td>
</tr>
<tr>
<td style="text-align:center;">
<span class="math inline">\(\beta_1\)</span>
</td>
<td style="text-align:center;">
<span class="math inline">\(b_1 \pm\)</span><span class="tooltiprbold">
<span class="math inline">\(t^*\)</span> <span class="tooltiprtext">This
is called the “critical value” and denotes the number of standard
deviations that are needed to obtain a 95% confidence interval from a t
distribution with degrees of freedom <span
class="math inline">\(n-p\)</span> (sample size - number of parameters
in the regression model). Use <code>qt(0.975, df)</code> to get <span
class="math inline">\(t^*\)</span> in R.</span> </span><span
class="tooltiprbold"> <span class="math inline">\(\cdot\)</span> <span
class="tooltiprtext">The critical value is multiplied by the standard
error of <span class="math inline">\(b_1\)</span>.</span> </span><span
class="tooltiprbold"> <span class="math inline">\(s_{b_1}\)</span> <span
class="tooltiprtext">The standard error of <span
class="math inline">\(b_1\)</span>, denoted by <span
class="math inline">\(s_{b_1}\)</span>, is provided in the regression
summary output under the column header called “Std. Error”. It is
calculated using the formula shown below.</span> </span>
</td>
<td style="text-align:center;">
<span class="tooltiprbold"> <span class="math display">\[s^2_{b_1} =
\frac{MSE}{\sum(X_i-\bar{X})^2}\]</span> <span class="tooltiprtext">This
is called the “estimated variance of <span
class="math inline">\(b_1\)</span>”. Taking the square root of this
number gives the “standard error of <span
class="math inline">\(b_1\)</span>”.</span> </span>
</td>
</tr>
</table>
<p>To be more exact, the types of inference we are interested in are the
following.</p>
<ol style="list-style-type: decimal">
<li><p>Determine if there is evidence of a meaningful linear
relationship in the data. If <span class="math inline">\(\beta_1 =
0\)</span>, then there is no relation between <span
class="math inline">\(X\)</span> and <span
class="math inline">\(E\{Y\}\)</span>. Hence we might be interested in
testing the hypotheses <span class="math display">\[
  H_0: \beta_1 = 0
\]</span> <span class="math display">\[
  H_a: \beta_1 \neq 0
\]</span></p></li>
<li><p>Determine if the slope is greater than, less than, or different from
some other hypothesized value. In this case, we would be interested in
using hypotheses of the form <span class="math display">\[
  H_0: \beta_1 = \beta_{10}
\]</span> <span class="math display">\[
  H_a: \beta_1 \neq \beta_{10}
\]</span> where <span class="math inline">\(\beta_{10}\)</span> is some
hypothesized number.</p></li>
<li><p>To provide a confidence interval for the true value of <span
class="math inline">\(\beta_1\)</span>.</p></li>
</ol>
<p><br /></p>
<p>Before we discuss how to test the hypotheses listed above or
construct a confidence interval, we must understand the <strong>sampling
distribution</strong> of the estimate <span
class="math inline">\(b_1\)</span> of the parameter <span
class="math inline">\(\beta_1\)</span>. And, while we are at it, we may
as well come to understand the sampling distribution of the estimate
<span class="math inline">\(b_0\)</span> of the parameter <span
class="math inline">\(\beta_0\)</span>.</p>
<div style="padding-left:30px;color:darkgray;font-size:.8em;">
<p>Review <a
href="http://statistics.byuimath.com/index.php?title=Lesson_6:_Distribution_of_Sample_Means_%26_The_Central_Limit_Theorem#Introduction_to_Sampling_Distributions">sampling
distributions</a> from Math 221.</p>
</div>
<p>Since <span class="math inline">\(b_1\)</span> is an estimate, it
will vary from sample to sample, even though the truth, <span
class="math inline">\(\beta_1\)</span>, remains fixed. (The same holds
for <span class="math inline">\(b_0\)</span> and <span
class="math inline">\(\beta_0\)</span>.) It turns out that the sampling
distribution of <span class="math inline">\(b_1\)</span> (where the
<span class="math inline">\(X\)</span> values remain fixed from study to
study) is normal with mean and variance: <span class="math display">\[
  \mu_{b_1} = \beta_1
\]</span> <span class="math display">\[
  \sigma^2_{b_1} = \frac{\sigma^2}{\sum(X_i-\bar{X})^2}
\]</span></p>
<div class="sourceCode" id="cb37"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb37-1"><a href="#cb37-1" aria-hidden="true" tabindex="-1"></a><span class="do">## Simulation to Show relationship between Standard Errors</span></span>
<span id="cb37-2"><a href="#cb37-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb37-3"><a href="#cb37-3" aria-hidden="true" tabindex="-1"></a><span class="do">##-----------------------------------------------</span></span>
<span id="cb37-4"><a href="#cb37-4" aria-hidden="true" tabindex="-1"></a><span class="do">## Edit anything in this area... </span></span>
<span id="cb37-5"><a href="#cb37-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb37-6"><a href="#cb37-6" aria-hidden="true" tabindex="-1"></a>n <span class="ot">&lt;-</span> <span class="dv">100</span> <span class="co">#sample size</span></span>
<span id="cb37-7"><a href="#cb37-7" aria-hidden="true" tabindex="-1"></a>Xstart <span class="ot">&lt;-</span> <span class="dv">30</span> <span class="co">#lower-bound for x-axis</span></span>
<span id="cb37-8"><a href="#cb37-8" aria-hidden="true" tabindex="-1"></a>Xstop <span class="ot">&lt;-</span> <span class="dv">100</span> <span class="co">#upper-bound for x-axis</span></span>
<span id="cb37-9"><a href="#cb37-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb37-10"><a href="#cb37-10" aria-hidden="true" tabindex="-1"></a>beta_0 <span class="ot">&lt;-</span> <span class="dv">2</span> <span class="co">#choice of true y-intercept</span></span>
<span id="cb37-11"><a href="#cb37-11" aria-hidden="true" tabindex="-1"></a>beta_1 <span class="ot">&lt;-</span> <span class="fl">3.5</span> <span class="co">#choice of true slope</span></span>
<span id="cb37-12"><a href="#cb37-12" aria-hidden="true" tabindex="-1"></a>sigma <span class="ot">&lt;-</span> <span class="fl">13.8</span> <span class="co">#choice of st. deviation of error terms</span></span>
<span id="cb37-13"><a href="#cb37-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb37-14"><a href="#cb37-14" aria-hidden="true" tabindex="-1"></a><span class="do">## End of Editable area.</span></span>
<span id="cb37-15"><a href="#cb37-15" aria-hidden="true" tabindex="-1"></a><span class="do">##-----------------------------------------------</span></span>
<span id="cb37-16"><a href="#cb37-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb37-17"><a href="#cb37-17" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb37-18"><a href="#cb37-18" aria-hidden="true" tabindex="-1"></a><span class="co"># Create X, which will be used in the next R-chunk.</span></span>
<span id="cb37-19"><a href="#cb37-19" aria-hidden="true" tabindex="-1"></a>X <span class="ot">&lt;-</span> <span class="fu">rep</span>(<span class="fu">seq</span>(Xstart,Xstop, <span class="at">length.out=</span>n<span class="sc">/</span><span class="dv">2</span>), <span class="at">each=</span><span class="dv">2</span>) </span>
<span id="cb37-20"><a href="#cb37-20" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb37-21"><a href="#cb37-21" aria-hidden="true" tabindex="-1"></a><span class="do">## After playing this chunk, play the next chunk as well.</span></span></code></pre></div>
<p>To see that this is true, consider the regression model with values
specified for each parameter as follows.</p>
<p><span class="math display">\[
  Y_i = \overbrace{\beta_0}^{2} + \overbrace{\beta_1}^{3.5} X_i +
\epsilon_i \quad \text{where} \ \epsilon_i \sim N(0,
\overbrace{\sigma^2}^{\sigma=13.8})
\]</span></p>
<p>Using the equations above for <span
class="math inline">\(\mu_{b_1}\)</span> and <span
class="math inline">\(\sigma^2_{b_1}\)</span> we obtain that the mean of
the sampling distribution of <span class="math inline">\(b_1\)</span>
will be</p>
<p><span class="math inline">\(\mu_{b_1} = \beta_1 = 3.5\)</span></p>
<p>Further, we see that the variance of the sampling distribution of
<span class="math inline">\(b_1\)</span> will be</p>
<p><span class="math inline">\(\sigma^2_{b_1} =
\frac{\sigma^2}{\sum(X_i-\bar{X})^2} = \frac{13.8^2}{4.25\times
10^{4}}\)</span></p>
<p>Taking the square root of the variance, the standard deviation of the
sampling distribution of <span class="math inline">\(b_1\)</span> will
be</p>
<p><span class="math inline">\(\sigma_{b_1} = 0.067\)</span>.</p>
<p>That’s very nice. But to really believe it, let’s run a simulation
ourselves. The “Code” below is worth studying. It runs a simulation that
(1) takes a sample of data from the true regression relation, (2) fits
the sampled data with an estimated regression equation (gray lines in
the plot), and (3) computes the estimated values of <span
class="math inline">\(b_1\)</span> and <span
class="math inline">\(b_0\)</span> for that regression.</p>
<p>After doing this many, many times, the results of every single
regression are plotted (in gray lines, which creates a gray shaded
region because there are so many lines) in the scatterplot below.
Further, each obtained estimate of <span
class="math inline">\(b_0\)</span> is plotted in the histogram on the
left (below the scatterplot) and each obtained estimate of <span
class="math inline">\(b_1\)</span> is plotted in the histogram on the
right. Looking at the histograms carefully, it can be seen that the mean
of each histogram is very close to the true parameter value of <span
class="math inline">\(\beta_0\)</span> or <span
class="math inline">\(\beta_1\)</span>, respectively. Also, the “Std.
Error” of each histogram is incredibly close (if not exact to 3 decimal
places) to the computed value of <span
class="math inline">\(\sigma_{b_0}\)</span> and <span
class="math inline">\(\sigma_{b_1}\)</span>, respectively. Amazing!</p>
<div class="sourceCode" id="cb38"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb38-1"><a href="#cb38-1" aria-hidden="true" tabindex="-1"></a>N <span class="ot">&lt;-</span> <span class="dv">5000</span> <span class="co">#number of times to pull a random sample</span></span>
<span id="cb38-2"><a href="#cb38-2" aria-hidden="true" tabindex="-1"></a>storage_b0 <span class="ot">&lt;-</span> storage_b1 <span class="ot">&lt;-</span> storage_rmse <span class="ot">&lt;-</span> <span class="fu">rep</span>(<span class="cn">NA</span>, N)</span>
<span id="cb38-3"><a href="#cb38-3" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> (i <span class="cf">in</span> <span class="dv">1</span><span class="sc">:</span>N){</span>
<span id="cb38-4"><a href="#cb38-4" aria-hidden="true" tabindex="-1"></a>  Y <span class="ot">&lt;-</span> beta_0 <span class="sc">+</span> beta_1<span class="sc">*</span>X <span class="sc">+</span> <span class="fu">rnorm</span>(n, <span class="dv">0</span>, sigma) <span class="co">#Sample Y from true model</span></span>
<span id="cb38-5"><a href="#cb38-5" aria-hidden="true" tabindex="-1"></a>  mylm <span class="ot">&lt;-</span> <span class="fu">lm</span>(Y <span class="sc">~</span> X)</span>
<span id="cb38-6"><a href="#cb38-6" aria-hidden="true" tabindex="-1"></a>  storage_b0[i] <span class="ot">&lt;-</span> <span class="fu">coef</span>(mylm)[<span class="dv">1</span>]</span>
<span id="cb38-7"><a href="#cb38-7" aria-hidden="true" tabindex="-1"></a>  storage_b1[i] <span class="ot">&lt;-</span> <span class="fu">coef</span>(mylm)[<span class="dv">2</span>]</span>
<span id="cb38-8"><a href="#cb38-8" aria-hidden="true" tabindex="-1"></a>  storage_rmse[i] <span class="ot">&lt;-</span> <span class="fu">summary</span>(mylm)<span class="sc">$</span>sigma</span>
<span id="cb38-9"><a href="#cb38-9" aria-hidden="true" tabindex="-1"></a>}</span>
<span id="cb38-10"><a href="#cb38-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb38-11"><a href="#cb38-11" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb38-12"><a href="#cb38-12" aria-hidden="true" tabindex="-1"></a><span class="fu">layout</span>(<span class="fu">matrix</span>(<span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">2</span>,<span class="dv">3</span>), <span class="dv">2</span>, <span class="dv">2</span>, <span class="at">byrow =</span> <span class="cn">TRUE</span>), <span class="at">widths=</span><span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">2</span>), <span class="at">heights=</span><span class="fu">c</span>(<span class="dv">3</span>,<span class="dv">3</span>))</span>
<span id="cb38-13"><a href="#cb38-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb38-14"><a href="#cb38-14" aria-hidden="true" tabindex="-1"></a>Ystart <span class="ot">&lt;-</span> <span class="dv">0</span> <span class="co">#min(0,min(Y)) </span></span>
<span id="cb38-15"><a href="#cb38-15" aria-hidden="true" tabindex="-1"></a>Ystop <span class="ot">&lt;-</span> <span class="dv">500</span> <span class="co">#max(max(Y), 0)</span></span>
<span id="cb38-16"><a href="#cb38-16" aria-hidden="true" tabindex="-1"></a>Yrange <span class="ot">&lt;-</span> Ystop <span class="sc">-</span> Ystart</span>
<span id="cb38-17"><a href="#cb38-17" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb38-18"><a href="#cb38-18" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Y <span class="sc">~</span> X, <span class="at">xlim=</span><span class="fu">c</span>(<span class="fu">min</span>(<span class="dv">0</span>,Xstart<span class="dv">-2</span>), <span class="fu">max</span>(<span class="dv">0</span>,Xstop<span class="sc">+</span><span class="dv">2</span>)), </span>
<span id="cb38-19"><a href="#cb38-19" aria-hidden="true" tabindex="-1"></a>     <span class="at">ylim=</span><span class="fu">c</span>(Ystart, Ystop), <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;gray&quot;</span>,</span>
<span id="cb38-20"><a href="#cb38-20" aria-hidden="true" tabindex="-1"></a>     <span class="at">main=</span><span class="st">&quot;Regression Lines from many Samples (gray lines) </span><span class="sc">\n</span><span class="st"> Plus Residual Standard Deviation Lines (green lines)&quot;</span>)</span>
<span id="cb38-21"><a href="#cb38-21" aria-hidden="true" tabindex="-1"></a><span class="fu">text</span>(Xstart, Ystop, <span class="fu">bquote</span>(sigma <span class="sc">==</span> .(sigma)), <span class="at">pos=</span><span class="dv">1</span>)</span>
<span id="cb38-22"><a href="#cb38-22" aria-hidden="true" tabindex="-1"></a><span class="fu">text</span>(Xstart, Ystop<span class="fl">-.1</span><span class="sc">*</span>Yrange, <span class="fu">bquote</span>(<span class="fu">sum</span> ((x[i]<span class="sc">-</span><span class="fu">bar</span>(x))<span class="sc">^</span><span class="dv">2</span>, i<span class="sc">==</span><span class="dv">1</span>, n) <span class="sc">==</span> .(<span class="fu">var</span>(X)<span class="sc">*</span>(n<span class="dv">-1</span>))), <span class="at">pos=</span><span class="dv">1</span>)</span>
<span id="cb38-23"><a href="#cb38-23" aria-hidden="true" tabindex="-1"></a><span class="fu">text</span>(Xstart, Ystop<span class="fl">-.25</span><span class="sc">*</span>Yrange, <span class="fu">bquote</span>(<span class="fu">sqrt</span>(MSE) <span class="sc">==</span> .(<span class="fu">mean</span>(storage_rmse))), <span class="at">pos=</span><span class="dv">1</span>)</span>
<span id="cb38-24"><a href="#cb38-24" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb38-25"><a href="#cb38-25" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb38-26"><a href="#cb38-26" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> (i <span class="cf">in</span> <span class="dv">1</span><span class="sc">:</span>N){</span>
<span id="cb38-27"><a href="#cb38-27" aria-hidden="true" tabindex="-1"></a>  <span class="fu">abline</span>(storage_b0[i], storage_b1[i], <span class="at">col=</span><span class="st">&quot;darkgray&quot;</span>)  </span>
<span id="cb38-28"><a href="#cb38-28" aria-hidden="true" tabindex="-1"></a>}</span>
<span id="cb38-29"><a href="#cb38-29" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(beta_0, beta_1, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">3</span>)</span>
<span id="cb38-30"><a href="#cb38-30" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(beta_0<span class="sc">+</span>sigma, beta_1, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb38-31"><a href="#cb38-31" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(beta_0<span class="sc">-</span>sigma, beta_1, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb38-32"><a href="#cb38-32" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(beta_0<span class="sc">+</span><span class="dv">2</span><span class="sc">*</span>sigma, beta_1, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">1</span>)</span>
<span id="cb38-33"><a href="#cb38-33" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(beta_0<span class="dv">-2</span><span class="sc">*</span>sigma, beta_1, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span><span class="dv">1</span>)</span>
<span id="cb38-34"><a href="#cb38-34" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(beta_0<span class="sc">+</span><span class="dv">3</span><span class="sc">*</span>sigma, beta_1, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span>.<span class="dv">5</span>)</span>
<span id="cb38-35"><a href="#cb38-35" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(beta_0<span class="dv">-3</span><span class="sc">*</span>sigma, beta_1, <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">lwd=</span>.<span class="dv">5</span>)</span>
<span id="cb38-36"><a href="#cb38-36" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb38-37"><a href="#cb38-37" aria-hidden="true" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mai=</span><span class="fu">c</span>(<span class="dv">1</span>,.<span class="dv">6</span>,.<span class="dv">5</span>,.<span class="dv">01</span>))</span>
<span id="cb38-38"><a href="#cb38-38" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb38-39"><a href="#cb38-39" aria-hidden="true" tabindex="-1"></a>  addnorm <span class="ot">&lt;-</span> <span class="cf">function</span>(m,s, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>){</span>
<span id="cb38-40"><a href="#cb38-40" aria-hidden="true" tabindex="-1"></a>    <span class="fu">curve</span>(<span class="fu">dnorm</span>(x, m, s), <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span>col, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb38-41"><a href="#cb38-41" aria-hidden="true" tabindex="-1"></a>    <span class="fu">lines</span>(<span class="fu">c</span>(m,m), <span class="fu">c</span>(<span class="dv">0</span>, <span class="fu">dnorm</span>(m,m,s)), <span class="at">lwd=</span><span class="dv">2</span>, <span class="at">col=</span>col)</span>
<span id="cb38-42"><a href="#cb38-42" aria-hidden="true" tabindex="-1"></a>    <span class="fu">lines</span>(<span class="fu">rep</span>(m<span class="sc">-</span>s,<span class="dv">2</span>), <span class="fu">c</span>(<span class="dv">0</span>, <span class="fu">dnorm</span>(m<span class="sc">-</span>s, m, s)), <span class="at">lwd=</span><span class="dv">2</span>, <span class="at">col=</span>col)</span>
<span id="cb38-43"><a href="#cb38-43" aria-hidden="true" tabindex="-1"></a>    <span class="fu">lines</span>(<span class="fu">rep</span>(m<span class="dv">-2</span><span class="sc">*</span>s,<span class="dv">2</span>), <span class="fu">c</span>(<span class="dv">0</span>, <span class="fu">dnorm</span>(m<span class="dv">-2</span><span class="sc">*</span>s, m, s)), <span class="at">lwd=</span><span class="dv">2</span>, <span class="at">col=</span>col)</span>
<span id="cb38-44"><a href="#cb38-44" aria-hidden="true" tabindex="-1"></a>    <span class="fu">lines</span>(<span class="fu">rep</span>(m<span class="dv">-3</span><span class="sc">*</span>s,<span class="dv">2</span>), <span class="fu">c</span>(<span class="dv">0</span>, <span class="fu">dnorm</span>(m<span class="dv">-3</span><span class="sc">*</span>s, m, s)), <span class="at">lwd=</span><span class="dv">2</span>, <span class="at">col=</span>col)</span>
<span id="cb38-45"><a href="#cb38-45" aria-hidden="true" tabindex="-1"></a>    <span class="fu">lines</span>(<span class="fu">rep</span>(m<span class="sc">+</span>s,<span class="dv">2</span>), <span class="fu">c</span>(<span class="dv">0</span>, <span class="fu">dnorm</span>(m<span class="sc">+</span>s, m, s)), <span class="at">lwd=</span><span class="dv">2</span>, <span class="at">col=</span>col)</span>
<span id="cb38-46"><a href="#cb38-46" aria-hidden="true" tabindex="-1"></a>    <span class="fu">lines</span>(<span class="fu">rep</span>(m<span class="sc">+</span><span class="dv">2</span><span class="sc">*</span>s,<span class="dv">2</span>), <span class="fu">c</span>(<span class="dv">0</span>, <span class="fu">dnorm</span>(m<span class="sc">+</span><span class="dv">2</span><span class="sc">*</span>s, m, s)), <span class="at">lwd=</span><span class="dv">2</span>, <span class="at">col=</span>col)</span>
<span id="cb38-47"><a href="#cb38-47" aria-hidden="true" tabindex="-1"></a>    <span class="fu">lines</span>(<span class="fu">rep</span>(m<span class="sc">+</span><span class="dv">3</span><span class="sc">*</span>s,<span class="dv">2</span>), <span class="fu">c</span>(<span class="dv">0</span>, <span class="fu">dnorm</span>(m<span class="sc">+</span><span class="dv">3</span><span class="sc">*</span>s, m, s)), <span class="at">lwd=</span><span class="dv">2</span>, <span class="at">col=</span>col)</span>
<span id="cb38-48"><a href="#cb38-48" aria-hidden="true" tabindex="-1"></a>    <span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">legend=</span><span class="fu">paste</span>(<span class="st">&quot;Std. Error = &quot;</span>, <span class="fu">round</span>(s,<span class="dv">3</span>)), <span class="at">cex=</span><span class="fl">0.7</span>, <span class="at">bty=</span><span class="st">&quot;n&quot;</span>)</span>
<span id="cb38-49"><a href="#cb38-49" aria-hidden="true" tabindex="-1"></a>  }</span>
<span id="cb38-50"><a href="#cb38-50" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb38-51"><a href="#cb38-51" aria-hidden="true" tabindex="-1"></a>  h0 <span class="ot">&lt;-</span> <span class="fu">hist</span>(storage_b0, </span>
<span id="cb38-52"><a href="#cb38-52" aria-hidden="true" tabindex="-1"></a>             <span class="at">col=</span><span class="st">&quot;skyblue3&quot;</span>, </span>
<span id="cb38-53"><a href="#cb38-53" aria-hidden="true" tabindex="-1"></a>             <span class="at">main=</span><span class="st">&quot;Sampling Distribution</span><span class="sc">\n</span><span class="st"> Y-intercept&quot;</span>,</span>
<span id="cb38-54"><a href="#cb38-54" aria-hidden="true" tabindex="-1"></a>             <span class="at">xlab=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Estimates of &quot;</span>, beta[<span class="dv">0</span>], <span class="st">&quot; from each Sample&quot;</span>)),</span>
<span id="cb38-55"><a href="#cb38-55" aria-hidden="true" tabindex="-1"></a>             <span class="at">freq=</span><span class="cn">FALSE</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">ylab=</span><span class="st">&quot;&quot;</span>)</span>
<span id="cb38-56"><a href="#cb38-56" aria-hidden="true" tabindex="-1"></a>  m0 <span class="ot">&lt;-</span> <span class="fu">mean</span>(storage_b0)</span>
<span id="cb38-57"><a href="#cb38-57" aria-hidden="true" tabindex="-1"></a>  s0 <span class="ot">&lt;-</span> <span class="fu">sd</span>(storage_b0)</span>
<span id="cb38-58"><a href="#cb38-58" aria-hidden="true" tabindex="-1"></a>  <span class="fu">addnorm</span>(m0,s0, <span class="at">col=</span><span class="st">&quot;green&quot;</span>)</span>
<span id="cb38-59"><a href="#cb38-59" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb38-60"><a href="#cb38-60" aria-hidden="true" tabindex="-1"></a>  h1 <span class="ot">&lt;-</span> <span class="fu">hist</span>(storage_b1, </span>
<span id="cb38-61"><a href="#cb38-61" aria-hidden="true" tabindex="-1"></a>             <span class="at">col=</span><span class="st">&quot;skyblue3&quot;</span>, </span>
<span id="cb38-62"><a href="#cb38-62" aria-hidden="true" tabindex="-1"></a>             <span class="at">main=</span><span class="st">&quot;Sampling Distribution</span><span class="sc">\n</span><span class="st"> Slope&quot;</span>,</span>
<span id="cb38-63"><a href="#cb38-63" aria-hidden="true" tabindex="-1"></a>             <span class="at">xlab=</span><span class="fu">expression</span>(<span class="fu">paste</span>(<span class="st">&quot;Estimates of &quot;</span>, beta[<span class="dv">1</span>], <span class="st">&quot; from each Sample&quot;</span>)),</span>
<span id="cb38-64"><a href="#cb38-64" aria-hidden="true" tabindex="-1"></a>             <span class="at">freq=</span><span class="cn">FALSE</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">ylab=</span><span class="st">&quot;&quot;</span>)</span>
<span id="cb38-65"><a href="#cb38-65" aria-hidden="true" tabindex="-1"></a>  m1 <span class="ot">&lt;-</span> <span class="fu">mean</span>(storage_b1)</span>
<span id="cb38-66"><a href="#cb38-66" aria-hidden="true" tabindex="-1"></a>  s1 <span class="ot">&lt;-</span> <span class="fu">sd</span>(storage_b1)</span>
<span id="cb38-67"><a href="#cb38-67" aria-hidden="true" tabindex="-1"></a>  <span class="fu">addnorm</span>(m1,s1, <span class="at">col=</span><span class="st">&quot;green&quot;</span>)</span></code></pre></div>
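<p><img src="LinearRegression_files/figure-html/unnamed-chunk-42-1.png" alt="Scatterplot of Y against X overlaid with gray regression lines from many samples and green residual standard deviation lines, with histograms of the sampling distributions of the y-intercept and slope." width="768" /></p>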
<div style="padding-left:15px;">
<div id="tTests" class="section level5">
<h5>t Tests</h5>
<p>Using the information above about the sampling distributions of <span
class="math inline">\(b_1\)</span> and <span
class="math inline">\(b_0\)</span>, an immediate choice of statistical
test to test the hypotheses <span class="math display">\[
  H_0: \beta_1 = \beta_{10}
\]</span> <span class="math display">\[
  H_a: \beta_1 \neq \beta_{10}
\]</span> where <span class="math inline">\(\beta_{10}\)</span> can be
zero, or any other value, is a t test given by <span
class="math display">\[
  t = \frac{b_1 - \beta_{10}}{s_{b_1}}
\]</span> where <span class="math inline">\(s^2_{b_1} =
\frac{MSE}{\sum(X_i-\bar{X})^2}\)</span>. (You may want to review the
section “Estimating the Model Variance” of this file to know where MSE
came from.) With quite a bit of work it has been shown that <span
class="math inline">\(t\)</span> is distributed as a <span
class="math inline">\(t\)</span> distribution with <span
class="math inline">\(n-2\)</span> degrees of freedom. The nearly
identical test statistic for testing <span class="math display">\[
  H_0: \beta_0 = \beta_{00}
\]</span> <span class="math display">\[
  H_a: \beta_0 \neq \beta_{00}
\]</span> is given by <span class="math display">\[
  t = \frac{b_0 - \beta_{00}}{s_{b_0}}
\]</span> where <span class="math inline">\(s^2_{b_0} =
MSE\left[\frac{1}{n}+\frac{\bar{X}^2}{\sum(X_i-\bar{X})^2}\right]\)</span>.
This version of <span class="math inline">\(t\)</span> has also been
shown to be distributed as a <span class="math inline">\(t\)</span>
distribution with <span class="math inline">\(n-2\)</span> degrees of
freedom.</p>
</div>
<div id="confidence-intervals" class="section level5">
<h5>Confidence Intervals</h5>
<p>Creating a confidence interval for either <span
class="math inline">\(\beta_1\)</span> or <span
class="math inline">\(\beta_0\)</span> follows immediately from these
results using the formulas <span class="math display">\[
  b_1 \pm t^*_{n-2}\cdot s_{b_1}
\]</span> <span class="math display">\[
  b_0 \pm t^*_{n-2}\cdot s_{b_0}
\]</span> where <span class="math inline">\(t^*_{n-2}\)</span> is the
critical value from a t distribution with <span
class="math inline">\(n-2\)</span> degrees of freedom corresponding to
the chosen confidence level.</p>
<p><br /></p>
</div>
<div id="Ftests" class="section level5">
<h5>F tests</h5>
<p>Another way to test the hypotheses <span class="math display">\[
  H_0: \beta_1 = \beta_{10}  \quad\quad \text{or} \quad\quad H_0:
\beta_0 = \beta_{00}
\]</span> <span class="math display">\[
  H_a: \beta_1 \neq \beta_{10} \quad\quad \ \ \quad \quad H_a: \beta_0
\neq \beta_{00}
\]</span> is with an <span class="math inline">\(F\)</span> Test. One
downside of the F test is that we cannot construct confidence intervals.
Another is that we can only perform two-sided tests; we cannot use
one-sided alternatives with an F test. The upside is that an <span
class="math inline">\(F\)</span> test is very general and can be used in
many places that a t test cannot.</p>
<p>In its most general form, the <span class="math inline">\(F\)</span>
test partitions the sums of squared errors into different pieces and
compares the pieces to see what is accounting for the most variation in
the data. To test the hypothesis that <span
class="math inline">\(H_0:\beta_1=0\)</span> against the alternative
that <span class="math inline">\(H_a: \beta_1\neq 0\)</span>, we are
essentially comparing two models against each other. If <span
class="math inline">\(\beta_1=0\)</span>, then the corresponding model
would be <span class="math inline">\(E\{Y_i\} = \beta_0\)</span>. If
<span class="math inline">\(\beta_1\neq0\)</span>, then the model
remains <span
class="math inline">\(E\{Y_i\}=\beta_0+\beta_1X_i\)</span>. We call the
model corresponding to the null hypothesis the reduced model because it
will always have fewer parameters than the model corresponding to the
alternative hypothesis (which we call the full model). This is the first
requirement of the <span class="math inline">\(F\)</span> Test, that the
null model (reduced model) have fewer “free” parameters than the
alternative model (full model). To demonstrate what we mean by “free”
parameters, consider the following example.</p>
<p>Say we wanted to test the hypothesis that <span
class="math inline">\(H_0:\beta_1 = 2.5\)</span> against the alternative
that <span class="math inline">\(\beta_1\neq2.5\)</span>. Then the null,
or reduced model, would be <span
class="math inline">\(E\{Y_i\}=\beta_0+2.5X_i\)</span>. The alternative,
or full model, would be <span
class="math inline">\(E\{Y_i\}=\beta_0+\beta_1X_i\)</span>. Thus, the
null (reduced) model contains only one “free” parameter because <span
class="math inline">\(\beta_1\)</span> has been fixed to be 2.5 and is
no longer free to be estimated from the data. The alternative (full)
model contains two “free” parameters, both of which are to be estimated from the
data. The null (reduced) model must contain fewer free parameters than
the alternative (full) model.</p>
<p>Once the null and alternative models have been specified, the General
Linear Test is performed by appropriately partitioning the squared
errors into pieces corresponding to each model. In the first example
where we were testing <span class="math inline">\(H_0:
\beta_1=0\)</span> against <span
class="math inline">\(H_a:\beta_1\neq0\)</span> we have the partition
<span class="math display">\[
  \underbrace{Y_i-\bar{Y}}_{Total} = \underbrace{\hat{Y}_i -
\bar{Y}}_{Regression} + \underbrace{Y_i-\hat{Y}_i}_{Error}
\]</span> The reason we use <span class="math inline">\(\bar{Y}\)</span>
for the null model is that <span class="math inline">\(\bar{Y}\)</span>
is the unbiased estimator of <span
class="math inline">\(\beta_0\)</span> for the null model, <span
class="math inline">\(E\{Y_i\} = \beta_0\)</span>. Thus we would compute
the following sums of squares: <span class="math display">\[
  SSTO = \sum(Y_i-\bar{Y})^2
\]</span> <span class="math display">\[
  SSR = \sum(\hat{Y}_i-\bar{Y})^2
\]</span> <span class="math display">\[
  SSE = \sum(Y_i-\hat{Y}_i)^2
\]</span> and note that <span class="math inline">\(SSTO = SSR +
SSE\)</span>. Important to note is that <span
class="math inline">\(SSTO\)</span> uses the difference between the
observations <span class="math inline">\(Y_i\)</span> and the null
(reduced) model. The <span class="math inline">\(SSR\)</span> uses the
differences between the alternative (full) and null (reduced) model. The
<span class="math inline">\(SSE\)</span> uses the differences between
the observations <span class="math inline">\(Y_i\)</span> and the
alternative (full) model. From these we could set up a General <span
class="math inline">\(F\)</span> table of the form</p>
<table style="width:100%;">
<colgroup>
<col width="21%" />
<col width="21%" />
<col width="10%" />
<col width="23%" />
<col width="23%" />
</colgroup>
<thead>
<tr class="header">
<th> </th>
<th>Sum Sq</th>
<th>Df</th>
<th>Mean Sq</th>
<th>F Value</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Model Error</td>
<td><span class="math inline">\(SSR\)</span></td>
<td><span class="math inline">\(df_R-df_F\)</span></td>
<td><span class="math inline">\(\frac{SSR}{df_R-df_F}\)</span></td>
<td><span
class="math inline">\(\frac{SSR}{df_R-df_F}\cdot\frac{df_F}{SSE}\)</span></td>
</tr>
<tr class="even">
<td>Residual Error</td>
<td><span class="math inline">\(SSE\)</span></td>
<td><span class="math inline">\(df_F\)</span></td>
<td><span class="math inline">\(\frac{SSE}{df_F}\)</span></td>
<td></td>
</tr>
<tr class="odd">
<td>Total Error</td>
<td><span class="math inline">\(SSTO\)</span></td>
<td><span class="math inline">\(df_R\)</span></td>
<td></td>
<td></td>
</tr>
</tbody>
</table>
</div>
</div>
</div>
<p><br /></p>
</div>
<div id="prediction-and-confidence-intervals-for-haty_h-expand"
class="section level4">
<h4>Prediction and Confidence Intervals for <span
class="math inline">\(\hat{Y}_h\)</span>
<a href="javascript:showhide('predictionintervals')" style="font-size:.6em;color:skyblue;">(Expand)</a></h4>
<p><span class="expand-caption">predict(..., interval="prediction")…
</span></p>
<div id="predictionintervals" style="display:none;">
<p>It is a common mistake to assume that averages (means) describe
individuals. They do not. So, when providing predictions on individuals,
it is crucial to capture the variability of individuals around the
line.</p>
<table>
<colgroup>
<col width="21%" />
<col width="17%" />
<col width="32%" />
<col width="28%" />
</colgroup>
<thead>
<tr class="header">
<th>Interval</th>
<th>R Code</th>
<th>Math Equation</th>
<th>When to Use</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Prediction</td>
<td><span
style="font-size:.8em;"><code>predict(..., interval="prediction")</code></span></td>
<td><span class="math inline">\(\hat{Y}_h \pm t^* \cdot s_{\text{Pred}\
Y}\)</span></td>
<td>Predict an individual’s value.</td>
</tr>
<tr class="even">
<td>Confidence</td>
<td><span
style="font-size:.8em;"><code>predict(..., interval="confidence")</code></span></td>
<td><span class="math inline">\(\hat{Y}_h \pm t^* \cdot
s_{\hat{Y}_h}\)</span></td>
<td>Estimate location of the mean y-value.</td>
</tr>
</tbody>
</table>
<p><code>predict(mylm, data.frame(XvarName = number), interval=...)</code></p>
<p><br /> <br /></p>
<p>For example, consider this graph. Then
<a href="javascript:showhide('predictionintervalsgraph')" style="color:skyblue;">click
here</a> to read about the graph.</p>
<div id="predictionintervalsgraph"
style="padding-left:30px;padding-right:30px;font-size:.9em;display:none;">
<p>Notice the three dots above 15 mph in the graph. Each of these dots
shows a car that was going 15 mph when it applied the brakes. However,
stopping distances of the three individual cars differ with one at 20
feet, one at 26 feet and one at 54 feet.</p>
<p>The regression line represents the average stopping distance of cars.
In this case, cars going 15 mph are estimated to have an average
stopping distance of about 40 feet, as shown by the line. But individual
vehicles, all going the same speed of 15 mph, varied from stopping
distances of 20 feet up to 54 feet!</p>
<p>So, to predict that a car going 15 mph will take 41.4 feet to stop
doesn’t tell the whole story. Far more revealing is the complete
statement, “Cars going 15 mph are predicted to take anywhere from 10.2
to 72.6 feet to stop, with an average stopping distance of 41.4 feet.”
This is called the “prediction interval” and is shown in the graph in
blue. It is obtained in R with the code:</p>
<p><code>cars.lm &lt;- lm(dist ~ speed, data=cars)</code></p>
<p><code>predict(cars.lm, data.frame(speed=15), interval="prediction")</code></p>
<div class="sourceCode" id="cb39"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb39-1"><a href="#cb39-1" aria-hidden="true" tabindex="-1"></a>cars.lm <span class="ot">&lt;-</span> <span class="fu">lm</span>(dist <span class="sc">~</span> speed, <span class="at">data=</span>cars)</span>
<span id="cb39-2"><a href="#cb39-2" aria-hidden="true" tabindex="-1"></a><span class="fu">pander</span>(<span class="fu">predict</span>(cars.lm, <span class="fu">data.frame</span>(<span class="at">speed=</span><span class="dv">15</span>), <span class="at">interval=</span><span class="st">&quot;prediction&quot;</span>))</span></code></pre></div>
<table style="width:33%;">
<colgroup>
<col width="11%" />
<col width="11%" />
<col width="11%" />
</colgroup>
<thead>
<tr class="header">
<th align="center">fit</th>
<th align="center">lwr</th>
<th align="center">upr</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center">41.41</td>
<td align="center">10.17</td>
<td align="center">72.64</td>
</tr>
</tbody>
</table>
</div>
<div class="sourceCode" id="cb40"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb40-1"><a href="#cb40-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(dist <span class="sc">~</span> speed, <span class="at">data=</span>cars, <span class="at">pch=</span><span class="dv">20</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>, <span class="at">cex=</span><span class="fl">1.2</span>, <span class="at">las=</span><span class="dv">1</span>,</span>
<span id="cb40-2"><a href="#cb40-2" aria-hidden="true" tabindex="-1"></a>     <span class="at">xlab=</span><span class="st">&quot;Speed of the Vehicle (mph) </span><span class="sc">\n</span><span class="st"> the Moment the Brakes were Applied&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;Distance (ft) it took the Vehicle to Stop&quot;</span>,</span>
<span id="cb40-3"><a href="#cb40-3" aria-hidden="true" tabindex="-1"></a>     <span class="at">main=</span><span class="st">&quot;Don&#39;t Step in front of a Moving 1920&#39;s Vehicle...&quot;</span>)</span>
<span id="cb40-4"><a href="#cb40-4" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="at">side=</span><span class="dv">3</span>, <span class="at">text=</span><span class="st">&quot;...they take a few feet to stop.&quot;</span>, <span class="at">cex=</span><span class="fl">0.7</span>, <span class="at">line=</span>.<span class="dv">5</span>)</span>
<span id="cb40-5"><a href="#cb40-5" aria-hidden="true" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">legend=</span><span class="st">&quot;Stopping Distance Experiment&quot;</span>, <span class="at">bty=</span><span class="st">&quot;n&quot;</span>)</span>
<span id="cb40-6"><a href="#cb40-6" aria-hidden="true" tabindex="-1"></a><span class="fu">points</span>(dist <span class="sc">~</span> speed, <span class="at">data=</span><span class="fu">subset</span>(cars, speed<span class="sc">==</span><span class="dv">15</span>), <span class="at">pch=</span><span class="dv">20</span>, <span class="at">col=</span><span class="st">&quot;firebrick2&quot;</span>, <span class="at">cex=</span><span class="fl">1.5</span>)</span>
<span id="cb40-7"><a href="#cb40-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb40-8"><a href="#cb40-8" aria-hidden="true" tabindex="-1"></a>cars.lm <span class="ot">&lt;-</span> <span class="fu">lm</span>(dist <span class="sc">~</span> speed, <span class="at">data=</span>cars)</span>
<span id="cb40-9"><a href="#cb40-9" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(cars.lm, <span class="at">lwd=</span><span class="dv">2</span>, <span class="at">col=</span><span class="fu">rgb</span>(.<span class="dv">689</span>,.<span class="dv">133</span>,.<span class="dv">133</span>, .<span class="dv">3</span>))</span>
<span id="cb40-10"><a href="#cb40-10" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(<span class="at">h=</span><span class="fu">seq</span>(<span class="dv">0</span>,<span class="dv">120</span>,<span class="dv">20</span>), <span class="at">v=</span><span class="fu">seq</span>(<span class="dv">5</span>,<span class="dv">25</span>,<span class="dv">5</span>), <span class="at">lty=</span><span class="dv">2</span>, <span class="at">col=</span><span class="fu">rgb</span>(.<span class="dv">2</span>,.<span class="dv">2</span>,.<span class="dv">2</span>,.<span class="dv">2</span>))</span>
<span id="cb40-11"><a href="#cb40-11" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(<span class="at">v=</span><span class="dv">15</span>, <span class="at">lty=</span><span class="dv">2</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span>
<span id="cb40-12"><a href="#cb40-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb40-13"><a href="#cb40-13" aria-hidden="true" tabindex="-1"></a>preds <span class="ot">&lt;-</span> <span class="fu">predict</span>(cars.lm, <span class="fu">data.frame</span>(<span class="at">speed=</span><span class="dv">15</span>), <span class="at">interval=</span><span class="st">&quot;prediction&quot;</span>)</span>
<span id="cb40-14"><a href="#cb40-14" aria-hidden="true" tabindex="-1"></a><span class="fu">lines</span>(<span class="fu">c</span>(<span class="dv">15</span>,<span class="dv">15</span>), preds[<span class="dv">2</span><span class="sc">:</span><span class="dv">3</span>] <span class="sc">-</span> <span class="fu">c</span>(<span class="sc">-</span>.<span class="dv">5</span>,.<span class="dv">5</span>), <span class="at">col=</span><span class="fu">rgb</span>(.<span class="dv">529</span>,.<span class="dv">8078</span>,.<span class="dv">9216</span>,.<span class="dv">4</span>), <span class="at">lwd=</span><span class="dv">12</span>)</span>
<span id="cb40-15"><a href="#cb40-15" aria-hidden="true" tabindex="-1"></a><span class="fu">lines</span>(<span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">15</span>), preds[<span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">2</span>)], <span class="at">col=</span><span class="fu">rgb</span>(.<span class="dv">529</span>,.<span class="dv">8078</span>,.<span class="dv">9216</span>,.<span class="dv">8</span>))</span>
<span id="cb40-16"><a href="#cb40-16" aria-hidden="true" tabindex="-1"></a><span class="fu">lines</span>(<span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">15</span>), preds[<span class="fu">c</span>(<span class="dv">3</span>,<span class="dv">3</span>)], <span class="at">col=</span><span class="fu">rgb</span>(.<span class="dv">529</span>,.<span class="dv">8078</span>,.<span class="dv">9216</span>,.<span class="dv">8</span>))</span></code></pre></div>
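<p><img src="LinearRegression_files/figure-html/unnamed-chunk-44-1.png" alt="Scatterplot of stopping distance (ft) against speed (mph) for the cars data with the fitted regression line and the prediction interval at 15 mph highlighted in blue." width="672" /></p>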
<p>Now, for the details behind prediction intervals and confidence
intervals.</p>
<p>Let’s begin by recalling some details (from the section “Inference
for the Model Parameters”) about the standard error of the y-intercept,
<span class="math inline">\(b_0\)</span>. Recall that the y-intercept is
the average y-value for the given x-value of <span
class="math inline">\(x=0\)</span>. Recall further that the squared
standard error of <span class="math inline">\(b_0\)</span> is given
by the formula</p>
<p><span class="math display">\[
  s^2_{b_0} = MSE\left[\frac{1}{n} +
\frac{\bar{X}^2}{\sum(X_i-\bar{X})^2}\right]
\]</span></p>
<p>If we wanted to be more exact with this formula, we would write it
as</p>
<p><span class="math display">\[
  s^2_{b_0} = MSE\left[\frac{1}{n} +
\frac{(0-\bar{X})^2}{\sum(X_i-\bar{X})^2}\right]
\]</span></p>
<p>Did you notice the addition of <span class="math inline">\((0 -
\bar{X})^2\)</span> instead of just <span
class="math inline">\(\bar{X}^2\)</span> in the numerator of the
right-most part of the equation? This more complete statement obviously
would reduce to just <span class="math inline">\(\bar{X}^2\)</span>, but
that is only because <span class="math inline">\(X=0\)</span> when we
are working with the y-intercept, <span
class="math inline">\(b_0\)</span>. We could be working with other
values of <span class="math inline">\(X\)</span> than just zero.</p>
<div class="note">
<p>Let’s take a quick detour and talk notation for a second. Typically,
<span class="math inline">\(X_i\)</span> and <span
class="math inline">\(Y_i\)</span> are used to denote the x-value and
y-value of points that are contained in our data set. When we want to
reference a point that wasn’t within our original data set, we use the
notation <span class="math inline">\(X_h\)</span> and <span
class="math inline">\(Y_h\)</span>. (The letter h is close to i, but
different from i, so why not. There is really no other reason to use h.)
Thus, <span class="math inline">\(Y_h\)</span> is the y-value for the
<span class="math inline">\(X_h\)</span> x-value, neither of which were
included in our original regression of <span
class="math inline">\(X_i\)</span>’s and <span
class="math inline">\(Y_i\)</span>’s.</p>
</div>
<p>Now, back to the previous discussion. If <span
class="math inline">\(X_h = 0\)</span>, then <span
class="math inline">\(\hat{Y}_h\)</span> is the y-intercept, so <span
class="math inline">\(\hat{Y}_h = b_0\)</span> when <span
class="math inline">\(X_h=0\)</span>. So, we could write,</p>
<p><span class="math display">\[
  s^2_{\hat{Y}_h} = MSE\left[\frac{1}{n} +
\frac{(X_h-\bar{X})^2}{\sum(X_i-\bar{X})^2}\right]
\]</span></p>
<p>Did you notice how the <span class="math inline">\(b_0\)</span> in
<span class="math inline">\(s_{b_0}\)</span> was replaced with <span
class="math inline">\(\hat{Y}_h\)</span> to get <span
class="math inline">\(s_{\hat{Y}_h}\)</span> and the 0 in <span
class="math inline">\((0 - \bar{X})^2\)</span> was replaced with <span
class="math inline">\(X_h\)</span> to get <span
class="math inline">\((X_h - \bar{X})^2\)</span>? Interesting. We now
have a formula that would give us the standard error of <span
class="math inline">\(\hat{Y}_h\)</span> for any <span
class="math inline">\(X_h\)</span> value, not just <span
class="math inline">\(X_h = 0\)</span>, or the y-intercept, <span
class="math inline">\(b_0\)</span>. That is fantastic. It would look
like this if plotted. Notice how the gray region is showing the standard
error for each <span class="math inline">\(\hat{Y}_h\)</span> value? (It
is technically showing the confidence interval for <span
class="math inline">\(E\{Y_h\}\)</span> at every possible <span
class="math inline">\(X_h\)</span> value, but that is just <span
class="math inline">\(\hat{Y}_h \pm t^* \cdot
s_{\hat{Y}_h}\)</span>.)</p>
<div class="sourceCode" id="cb41"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb41-1"><a href="#cb41-1" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(cars, <span class="fu">aes</span>(<span class="at">x=</span>speed, <span class="at">y=</span>dist)) <span class="sc">+</span> </span>
<span id="cb41-2"><a href="#cb41-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_point</span>() <span class="sc">+</span></span>
<span id="cb41-3"><a href="#cb41-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_smooth</span>(<span class="at">method=</span><span class="st">&quot;lm&quot;</span>, <span class="at">color=</span><span class="st">&quot;skyblue&quot;</span>) <span class="sc">+</span></span>
<span id="cb41-4"><a href="#cb41-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme_bw</span>()</span></code></pre></div>
<pre><code>## `geom_smooth()` using formula = &#39;y ~ x&#39;</code></pre>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-45-1.png" width="672" /></p>
<p><strong>Confidence Interval for <span
class="math inline">\(\hat{Y}_h\)</span></strong></p>
<p><span class="math display">\[
  \hat{Y}_h \pm t^* s_{\hat{Y}_h} \quad \text{where} \ s_{\hat{Y}_h}^2 =
MSE\left[\frac{1}{n} + \frac{(X_h - \bar{X})^2}{\sum(X_i -
\bar{X})^2}\right]
\]</span></p>
<p>The confidence interval is a wonderful tool for estimating <span
class="math inline">\(E\{Y_h\}\)</span>, the “true” average y-value for
a given x-value of <span class="math inline">\(X_h\)</span>. However, it
is not valuable for predicting an individual dot, or <span
class="math inline">\(Y_h\)</span> value. Notice how few of the dots of
the regression are actually contained within the confidence interval
band in the plot? The confidence interval does not really predict where
the dots will land, just where the average y-value is located for each
x-value.</p>
<p>Remember the 68-95-99.7 Rule of the normal distribution? If not, here
is a link back to that concept in the <a
href="https://byuistats.github.io/BYUI_M221_Book/Lesson05.html#normal-probability-computations">Math
221</a> textbook. This rule states that roughly 95% of data, when
normally distributed, will be between <span
class="math inline">\(z=-2\)</span> and <span
class="math inline">\(z=2\)</span> standard deviations from the mean.
So, is going two “residual standard errors” to both sides of the
regression line enough to capture 95% of the data? The answer is, not
quite. The reason for this is because our knowledge of where the true
mean lies is uncertain. (Notice the confidence interval band shown in
the plot.) However, adding two standard errors to the edges of the
confidence band would get us in the right place. In other words, there
are two sources of variability at play here, (1) our uncertainty in
where the regression line is sitting, and (2) the natural variability of
the data points around the line. Thus, the “prediction interval”
requires accounting for both of these sources of variability to produce
the following equation.</p>
<p><strong>Prediction Interval for <span
class="math inline">\(Y_h\)</span></strong></p>
<p><span class="math display">\[
  \hat{Y}_h \pm t^* s_{Pred \hat{Y}_h} \quad \text{where} \ s_{Pred
\hat{Y}_h}^2 = MSE\left[\frac{1}{n} + 1 + \frac{(X_h -
\bar{X})^2}{\sum(X_i - \bar{X})^2}\right]
\]</span></p>
<p>This formula provides a useful band for identifying a region where we
are 95% confident that a new observation for <span
class="math inline">\(Y_h\)</span> will land, given the value of <span
class="math inline">\(X_h\)</span>.</p>
<p>It looks as follows. Notice the prediction interval is much wider
than the confidence interval. This is because data varies far more than
do means. Prediction is for where the individual data points will land;
confidence is for where the mean will land.</p>
<div class="sourceCode" id="cb43"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb43-1"><a href="#cb43-1" aria-hidden="true" tabindex="-1"></a>cars.lm <span class="ot">&lt;-</span> <span class="fu">lm</span>(dist <span class="sc">~</span> speed, <span class="at">data=</span>cars)</span>
<span id="cb43-2"><a href="#cb43-2" aria-hidden="true" tabindex="-1"></a>predy <span class="ot">&lt;-</span> <span class="fu">predict</span>(cars.lm, <span class="fu">data.frame</span>(<span class="at">speed=</span><span class="dv">15</span>), <span class="at">interval=</span><span class="st">&quot;prediction&quot;</span>)</span>
<span id="cb43-3"><a href="#cb43-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb43-4"><a href="#cb43-4" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(cars, <span class="fu">aes</span>(<span class="at">x=</span>speed, <span class="at">y=</span>dist)) <span class="sc">+</span> </span>
<span id="cb43-5"><a href="#cb43-5" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_point</span>() <span class="sc">+</span></span>
<span id="cb43-6"><a href="#cb43-6" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_smooth</span>(<span class="at">method=</span><span class="st">&quot;lm&quot;</span>, <span class="at">color=</span><span class="st">&quot;skyblue&quot;</span>) <span class="sc">+</span></span>
<span id="cb43-7"><a href="#cb43-7" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_segment</span>(<span class="fu">aes</span>(<span class="at">x=</span><span class="dv">15</span>, <span class="at">xend=</span><span class="dv">15</span>, <span class="at">y=</span>predy[<span class="dv">2</span>], <span class="at">yend=</span>predy[<span class="dv">3</span>]), <span class="at">lwd=</span><span class="dv">4</span>, <span class="at">color=</span><span class="fu">rgb</span>(.<span class="dv">5</span>,.<span class="dv">7</span>,.<span class="dv">5</span>,.<span class="dv">01</span>)) <span class="sc">+</span> </span>
<span id="cb43-8"><a href="#cb43-8" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_point</span>(<span class="fu">aes</span>(<span class="at">x=</span><span class="dv">15</span>, <span class="at">y=</span>predy[<span class="dv">1</span>]), <span class="at">cex=</span><span class="dv">2</span>, <span class="at">color=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">pch=</span><span class="dv">15</span>) <span class="sc">+</span></span>
<span id="cb43-9"><a href="#cb43-9" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme_bw</span>()</span></code></pre></div>
<pre><code>## `geom_smooth()` using formula = &#39;y ~ x&#39;</code></pre>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-46-1.png" width="672" /></p>
</div>
<p><br/></p>
</div>
<div id="lowess-and-loess-curves-expand" class="section level4">
<h4>Lowess (and Loess) Curves
<a href="javascript:showhide('lowesscurves')" style="font-size:.6em;color:skyblue;">(Expand)</a></h4>
<p><span class="expand-caption">A non-parametric approach to estimating
<span class="math inline">\(E\{Y_i\}\)</span>… </span></p>
<div id="lowesscurves" style="display:none;">
<p>Robust <strong>lo</strong>cally <strong>we</strong>ighted regression
and <strong>s</strong>moothing <strong>s</strong>catterplots (LOWESS)
is an effective way to visually model the average y-value.</p>
<hr />
<table>
<tr>
<td>
<p><strong>Using Base R</strong></p>
<div class="sourceCode" id="cb45"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb45-1"><a href="#cb45-1" aria-hidden="true" tabindex="-1"></a>air2 <span class="ot">&lt;-</span> <span class="fu">na.omit</span>(<span class="fu">select</span>(airquality, Temp, Ozone))</span>
<span id="cb45-2"><a href="#cb45-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb45-3"><a href="#cb45-3" aria-hidden="true" tabindex="-1"></a><span class="co"># Just quickly draw the lowess curve:</span></span>
<span id="cb45-4"><a href="#cb45-4" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Temp <span class="sc">~</span> Ozone, <span class="at">data=</span>air2, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;darkgray&quot;</span>)</span>
<span id="cb45-5"><a href="#cb45-5" aria-hidden="true" tabindex="-1"></a><span class="fu">lines</span>(<span class="fu">lowess</span>(air2<span class="sc">$</span>Ozone, air2<span class="sc">$</span>Temp), <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-47-1.png" width="672" /></p>
<div class="sourceCode" id="cb46"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb46-1"><a href="#cb46-1" aria-hidden="true" tabindex="-1"></a><span class="do">## OR optionally, </span></span>
<span id="cb46-2"><a href="#cb46-2" aria-hidden="true" tabindex="-1"></a><span class="do">## allow for predictions as well as the graph:</span></span>
<span id="cb46-3"><a href="#cb46-3" aria-hidden="true" tabindex="-1"></a><span class="co"># plot(Temp ~ Ozone, data=air2, pch=16, col=&quot;darkgray&quot;)</span></span>
<span id="cb46-4"><a href="#cb46-4" aria-hidden="true" tabindex="-1"></a><span class="co"># air2 &lt;- arrange(air2, desc(Ozone))</span></span>
<span id="cb46-5"><a href="#cb46-5" aria-hidden="true" tabindex="-1"></a><span class="co"># mylo &lt;- loess(Temp ~ Ozone, data=air2, degree=1)</span></span>
<span id="cb46-6"><a href="#cb46-6" aria-hidden="true" tabindex="-1"></a><span class="co"># lines(mylo$fit ~ Ozone, data=air2)</span></span></code></pre></div>
</td>
<td>
<p><strong>Using ggplot2</strong></p>
<div class="sourceCode" id="cb47"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb47-1"><a href="#cb47-1" aria-hidden="true" tabindex="-1"></a>air2 <span class="ot">&lt;-</span> <span class="fu">na.omit</span>(<span class="fu">select</span>(airquality, Temp, Ozone))</span>
<span id="cb47-2"><a href="#cb47-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb47-3"><a href="#cb47-3" aria-hidden="true" tabindex="-1"></a><span class="co"># Just quickly draw the lowess curve:</span></span>
<span id="cb47-4"><a href="#cb47-4" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(air2, <span class="fu">aes</span>(<span class="at">x=</span>Ozone, <span class="at">y=</span>Temp)) <span class="sc">+</span></span>
<span id="cb47-5"><a href="#cb47-5" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_point</span>(<span class="at">color=</span><span class="st">&quot;darkgray&quot;</span>) <span class="sc">+</span> </span>
<span id="cb47-6"><a href="#cb47-6" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_smooth</span>(<span class="at">se=</span>F, <span class="at">method=</span><span class="st">&quot;loess&quot;</span>, <span class="at">method.args =</span> <span class="fu">list</span>(<span class="at">degree=</span><span class="dv">1</span>)) <span class="sc">+</span> <span class="co">#Note, degree=2 by default.</span></span>
<span id="cb47-7"><a href="#cb47-7" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme_bw</span>()</span></code></pre></div>
<pre><code>## `geom_smooth()` using formula = &#39;y ~ x&#39;</code></pre>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-48-1.png" width="672" /></p>
<div class="sourceCode" id="cb49"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb49-1"><a href="#cb49-1" aria-hidden="true" tabindex="-1"></a><span class="do">## OR optionally, </span></span>
<span id="cb49-2"><a href="#cb49-2" aria-hidden="true" tabindex="-1"></a><span class="do">## allow for predictions as well as the graph:</span></span>
<span id="cb49-3"><a href="#cb49-3" aria-hidden="true" tabindex="-1"></a><span class="co"># air2 &lt;- arrange(air2, desc(Ozone))</span></span>
<span id="cb49-4"><a href="#cb49-4" aria-hidden="true" tabindex="-1"></a><span class="co"># mylo &lt;- loess(Temp ~ Ozone, data=air2, degree=1)</span></span>
<span id="cb49-5"><a href="#cb49-5" aria-hidden="true" tabindex="-1"></a><span class="co"># ggplot(air2, aes(x=Ozone, y=Temp)) +</span></span>
<span id="cb49-6"><a href="#cb49-6" aria-hidden="true" tabindex="-1"></a><span class="co">#   geom_point() +</span></span>
<span id="cb49-7"><a href="#cb49-7" aria-hidden="true" tabindex="-1"></a><span class="co">#   geom_line(data=air2, aes(y=mylo$fit, x=Ozone))</span></span></code></pre></div>
</td>
</tr>
</table>
<hr />
<p><br /></p>
<table>
<colgroup>
<col width="50%" />
<col width="50%" />
</colgroup>
<thead>
<tr class="header">
<th><strong>Advantages</strong></th>
<th><strong>Disadvantages</strong></th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Quick. Good at ignoring outliers. Good at capturing the general
pattern in the data. Good for making predictions within the scope of the
data.</td>
<td>No mathematical model. Not interpretable. No p-values. No adjusted
R-squared.</td>
</tr>
</tbody>
</table>
<p><strong>How it Works</strong></p>
<p>The Lowess curve localizes the regression model to a “neighborhood”
of points, and then joins these localized regressions together into a
smooth line. It minimizes the effect of outliers, and lets the data
“speak for itself”.</p>
<p>As a downside, it is not interpretable, and has no final way to write
the model mathematically. All the same, it is a very powerful tool for
identifying an appropriate model, or verifying the fit of a model, or
making predictions when no reasonable model does an adequate job.</p>
<p>Study this graphic and the explanations below to learn how it
works.</p>
<p><em>Recommendation: run the code in this “Code” chunk to the right in
your Console, and flip through the resulting graphics.</em></p>
<div class="sourceCode" id="cb50"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb50-1"><a href="#cb50-1" aria-hidden="true" tabindex="-1"></a>X <span class="ot">&lt;-</span> cars<span class="sc">$</span>speed</span>
<span id="cb50-2"><a href="#cb50-2" aria-hidden="true" tabindex="-1"></a>Y <span class="ot">&lt;-</span> cars<span class="sc">$</span>dist</span>
<span id="cb50-3"><a href="#cb50-3" aria-hidden="true" tabindex="-1"></a>X <span class="ot">&lt;-</span> X[<span class="sc">!</span><span class="fu">is.na</span>(X) <span class="sc">&amp;</span> <span class="sc">!</span><span class="fu">is.na</span>(Y)]</span>
<span id="cb50-4"><a href="#cb50-4" aria-hidden="true" tabindex="-1"></a>Y <span class="ot">&lt;-</span> Y[<span class="sc">!</span><span class="fu">is.na</span>(X) <span class="sc">&amp;</span> <span class="sc">!</span><span class="fu">is.na</span>(Y)]</span>
<span id="cb50-5"><a href="#cb50-5" aria-hidden="true" tabindex="-1"></a>f <span class="ot">&lt;-</span> <span class="dv">1</span><span class="sc">/</span><span class="dv">2</span></span>
<span id="cb50-6"><a href="#cb50-6" aria-hidden="true" tabindex="-1"></a>n <span class="ot">&lt;-</span> <span class="fu">length</span>(X)</span>
<span id="cb50-7"><a href="#cb50-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb50-8"><a href="#cb50-8" aria-hidden="true" tabindex="-1"></a>lfit <span class="ot">&lt;-</span> <span class="fu">rep</span>(<span class="cn">NA</span>,n)</span>
<span id="cb50-9"><a href="#cb50-9" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> (xh <span class="cf">in</span> <span class="dv">1</span><span class="sc">:</span>n){</span>
<span id="cb50-10"><a href="#cb50-10" aria-hidden="true" tabindex="-1"></a> xdists <span class="ot">&lt;-</span> X <span class="sc">-</span> X[xh]</span>
<span id="cb50-11"><a href="#cb50-11" aria-hidden="true" tabindex="-1"></a> nn <span class="ot">&lt;-</span> <span class="fu">floor</span>(n<span class="sc">*</span>f)</span>
<span id="cb50-12"><a href="#cb50-12" aria-hidden="true" tabindex="-1"></a> r <span class="ot">&lt;-</span> <span class="fu">sort</span>(<span class="fu">abs</span>(xdists))[nn]</span>
<span id="cb50-13"><a href="#cb50-13" aria-hidden="true" tabindex="-1"></a> xdists.nbrhd <span class="ot">&lt;-</span> <span class="fu">which</span>(<span class="fu">abs</span>(xdists) <span class="sc">&lt;</span> r)</span>
<span id="cb50-14"><a href="#cb50-14" aria-hidden="true" tabindex="-1"></a> w <span class="ot">&lt;-</span> <span class="fu">rep</span>(<span class="dv">0</span>, <span class="fu">length</span>(xdists))</span>
<span id="cb50-15"><a href="#cb50-15" aria-hidden="true" tabindex="-1"></a> w[xdists.nbrhd] <span class="ot">&lt;-</span> (<span class="dv">1</span> <span class="sc">-</span> <span class="fu">abs</span>(xdists[xdists.nbrhd]<span class="sc">/</span>r)<span class="sc">^</span><span class="dv">3</span>)<span class="sc">^</span><span class="dv">3</span></span>
<span id="cb50-16"><a href="#cb50-16" aria-hidden="true" tabindex="-1"></a> <span class="fu">plot</span>(Y <span class="sc">~</span> X, <span class="at">pch=</span><span class="dv">21</span>, <span class="at">bg=</span><span class="fu">rgb</span>(.<span class="dv">53</span>,.<span class="dv">81</span>,.<span class="dv">92</span>, w),   </span>
<span id="cb50-17"><a href="#cb50-17" aria-hidden="true" tabindex="-1"></a>      <span class="at">col=</span><span class="fu">rgb</span>(.<span class="dv">2</span>,.<span class="dv">2</span>,.<span class="dv">2</span>,.<span class="dv">3</span>), <span class="at">cex=</span><span class="fl">1.5</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xlab=</span><span class="st">&quot;&quot;</span>, <span class="at">ylab=</span><span class="st">&quot;&quot;</span>)</span>
<span id="cb50-18"><a href="#cb50-18" aria-hidden="true" tabindex="-1"></a> <span class="fu">points</span>(Y[xh] <span class="sc">~</span> X[xh], <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;orange&quot;</span>)</span>
<span id="cb50-19"><a href="#cb50-19" aria-hidden="true" tabindex="-1"></a> lmc <span class="ot">&lt;-</span> <span class="fu">lm</span>(Y <span class="sc">~</span> X, <span class="at">weights=</span>w)</span>
<span id="cb50-20"><a href="#cb50-20" aria-hidden="true" tabindex="-1"></a> <span class="fu">curve</span>(lmc<span class="sc">$</span>coef[<span class="dv">1</span>] <span class="sc">+</span> lmc<span class="sc">$</span>coef[<span class="dv">2</span>]<span class="sc">*</span>x, <span class="at">from=</span><span class="fu">min</span>(X[xdists.nbrhd]), <span class="at">to=</span><span class="fu">max</span>(X[xdists.nbrhd]), <span class="at">col=</span><span class="st">&quot;orange&quot;</span>, <span class="at">add=</span><span class="cn">TRUE</span>)</span>
<span id="cb50-21"><a href="#cb50-21" aria-hidden="true" tabindex="-1"></a> <span class="fu">lines</span>(lfit[<span class="dv">1</span><span class="sc">:</span>xh] <span class="sc">~</span> X[<span class="dv">1</span><span class="sc">:</span>xh], <span class="at">col=</span><span class="st">&quot;gray&quot;</span>)</span>
<span id="cb50-22"><a href="#cb50-22" aria-hidden="true" tabindex="-1"></a> </span>
<span id="cb50-23"><a href="#cb50-23" aria-hidden="true" tabindex="-1"></a> <span class="co">#lines(lowess(X,Y), col=rgb(0.698,0.133,0.133,.2))</span></span>
<span id="cb50-24"><a href="#cb50-24" aria-hidden="true" tabindex="-1"></a> <span class="fu">cat</span>(<span class="st">&quot;</span><span class="sc">\n\n</span><span class="st">&quot;</span>)</span>
<span id="cb50-25"><a href="#cb50-25" aria-hidden="true" tabindex="-1"></a> <span class="fu">readline</span>(<span class="at">prompt=</span><span class="fu">paste0</span>(<span class="st">&quot;Center point is point #&quot;</span>, xh, <span class="st">&quot;... Press [enter] to continue...&quot;</span>))</span>
<span id="cb50-26"><a href="#cb50-26" aria-hidden="true" tabindex="-1"></a> </span>
<span id="cb50-27"><a href="#cb50-27" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb50-28"><a href="#cb50-28" aria-hidden="true" tabindex="-1"></a> MADnotThereYet <span class="ot">&lt;-</span> <span class="cn">TRUE</span></span>
<span id="cb50-29"><a href="#cb50-29" aria-hidden="true" tabindex="-1"></a> count <span class="ot">&lt;-</span> <span class="dv">0</span></span>
<span id="cb50-30"><a href="#cb50-30" aria-hidden="true" tabindex="-1"></a> <span class="cf">while</span>(MADnotThereYet){</span>
<span id="cb50-31"><a href="#cb50-31" aria-hidden="true" tabindex="-1"></a>   </span>
<span id="cb50-32"><a href="#cb50-32" aria-hidden="true" tabindex="-1"></a>      <span class="fu">readline</span>(<span class="at">prompt=</span><span class="fu">paste0</span>(<span class="st">&quot;</span><span class="sc">\n</span><span class="st">   Adjusting line to account for outliers in the y-direction... Press [enter] to continue...&quot;</span>))   </span>
<span id="cb50-33"><a href="#cb50-33" aria-hidden="true" tabindex="-1"></a>   </span>
<span id="cb50-34"><a href="#cb50-34" aria-hidden="true" tabindex="-1"></a>   <span class="fu">curve</span>(lmc<span class="sc">$</span>coef[<span class="dv">1</span>] <span class="sc">+</span> lmc<span class="sc">$</span>coef[<span class="dv">2</span>]<span class="sc">*</span>x, <span class="at">from=</span><span class="fu">min</span>(X[xdists.nbrhd]), <span class="at">to=</span><span class="fu">max</span>(X[xdists.nbrhd]), <span class="at">col=</span><span class="st">&quot;wheat&quot;</span>, <span class="at">add=</span><span class="cn">TRUE</span>)</span>
<span id="cb50-35"><a href="#cb50-35" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb50-36"><a href="#cb50-36" aria-hidden="true" tabindex="-1"></a>   MAD <span class="ot">&lt;-</span> <span class="fu">median</span>(<span class="fu">abs</span>(lmc<span class="sc">$</span>res))</span>
<span id="cb50-37"><a href="#cb50-37" aria-hidden="true" tabindex="-1"></a>   resm <span class="ot">&lt;-</span> lmc<span class="sc">$</span>res<span class="sc">/</span>(<span class="dv">6</span><span class="sc">*</span>MAD)</span>
<span id="cb50-38"><a href="#cb50-38" aria-hidden="true" tabindex="-1"></a>   resm[resm<span class="sc">&gt;</span><span class="dv">1</span>] <span class="ot">&lt;-</span> <span class="dv">1</span></span>
<span id="cb50-39"><a href="#cb50-39" aria-hidden="true" tabindex="-1"></a>   bisq <span class="ot">&lt;-</span> (<span class="dv">1</span><span class="sc">-</span>resm<span class="sc">^</span><span class="dv">2</span>)<span class="sc">^</span><span class="dv">2</span></span>
<span id="cb50-40"><a href="#cb50-40" aria-hidden="true" tabindex="-1"></a>   w <span class="ot">&lt;-</span> w<span class="sc">*</span>bisq</span>
<span id="cb50-41"><a href="#cb50-41" aria-hidden="true" tabindex="-1"></a>   obs <span class="ot">&lt;-</span> <span class="fu">coef</span>(lmc)</span>
<span id="cb50-42"><a href="#cb50-42" aria-hidden="true" tabindex="-1"></a>   lmc <span class="ot">&lt;-</span> <span class="fu">lm</span>(Y <span class="sc">~</span> X, <span class="at">weights=</span>w)</span>
<span id="cb50-43"><a href="#cb50-43" aria-hidden="true" tabindex="-1"></a> </span>
<span id="cb50-44"><a href="#cb50-44" aria-hidden="true" tabindex="-1"></a>   <span class="fu">curve</span>(lmc<span class="sc">$</span>coef[<span class="dv">1</span>] <span class="sc">+</span> lmc<span class="sc">$</span>coef[<span class="dv">2</span>]<span class="sc">*</span>x, <span class="at">from=</span><span class="fu">min</span>(X[xdists.nbrhd]), <span class="at">to=</span><span class="fu">max</span>(X[xdists.nbrhd]), <span class="at">col=</span><span class="st">&quot;orange&quot;</span>, <span class="at">add=</span><span class="cn">TRUE</span>)</span>
<span id="cb50-45"><a href="#cb50-45" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb50-46"><a href="#cb50-46" aria-hidden="true" tabindex="-1"></a>   count <span class="ot">&lt;-</span> count <span class="sc">+</span> <span class="dv">1</span></span>
<span id="cb50-47"><a href="#cb50-47" aria-hidden="true" tabindex="-1"></a>   <span class="cf">if</span> ( (<span class="fu">sum</span>(<span class="fu">abs</span>(obs<span class="sc">-</span>lmc<span class="sc">$</span>coef))<span class="sc">&lt;</span>.<span class="dv">1</span>) <span class="sc">|</span> (count <span class="sc">&gt;</span> <span class="dv">3</span>))</span>
<span id="cb50-48"><a href="#cb50-48" aria-hidden="true" tabindex="-1"></a>     MADnotThereYet <span class="ot">&lt;-</span> <span class="cn">FALSE</span></span>
<span id="cb50-49"><a href="#cb50-49" aria-hidden="true" tabindex="-1"></a>       </span>
<span id="cb50-50"><a href="#cb50-50" aria-hidden="true" tabindex="-1"></a> }</span>
<span id="cb50-51"><a href="#cb50-51" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb50-52"><a href="#cb50-52" aria-hidden="true" tabindex="-1"></a>   <span class="fu">curve</span>(lmc<span class="sc">$</span>coef[<span class="dv">1</span>] <span class="sc">+</span> lmc<span class="sc">$</span>coef[<span class="dv">2</span>]<span class="sc">*</span>x, <span class="at">from=</span><span class="fu">min</span>(X[xdists.nbrhd]), <span class="at">to=</span><span class="fu">max</span>(X[xdists.nbrhd]), <span class="at">col=</span><span class="st">&quot;green&quot;</span>, <span class="at">add=</span><span class="cn">TRUE</span>)</span>
<span id="cb50-53"><a href="#cb50-53" aria-hidden="true" tabindex="-1"></a>   <span class="fu">points</span>(lmc<span class="sc">$</span>coef[<span class="dv">1</span>] <span class="sc">+</span> lmc<span class="sc">$</span>coef[<span class="dv">2</span>]<span class="sc">*</span>X[xh] <span class="sc">~</span> X[xh], <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;green&quot;</span>)</span>
<span id="cb50-54"><a href="#cb50-54" aria-hidden="true" tabindex="-1"></a>   </span>
<span id="cb50-55"><a href="#cb50-55" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb50-56"><a href="#cb50-56" aria-hidden="true" tabindex="-1"></a>  <span class="fu">readline</span>(<span class="at">prompt=</span><span class="fu">paste0</span>(<span class="st">&quot;</span><span class="sc">\n</span><span class="st">   Use final line to get fitted value for this point... Press [enter] to continue to next point...&quot;</span>))</span>
<span id="cb50-57"><a href="#cb50-57" aria-hidden="true" tabindex="-1"></a> </span>
<span id="cb50-58"><a href="#cb50-58" aria-hidden="true" tabindex="-1"></a> lfit[xh] <span class="ot">&lt;-</span> <span class="fu">predict</span>(lmc, <span class="fu">data.frame</span>(<span class="at">X=</span>X[xh]))</span>
<span id="cb50-59"><a href="#cb50-59" aria-hidden="true" tabindex="-1"></a> <span class="fu">lines</span>(lfit[<span class="dv">1</span><span class="sc">:</span>xh] <span class="sc">~</span> X[<span class="dv">1</span><span class="sc">:</span>xh], <span class="at">col=</span><span class="st">&quot;gray&quot;</span>)</span>
<span id="cb50-60"><a href="#cb50-60" aria-hidden="true" tabindex="-1"></a> </span>
<span id="cb50-61"><a href="#cb50-61" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb50-62"><a href="#cb50-62" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (xh <span class="sc">==</span> n){</span>
<span id="cb50-63"><a href="#cb50-63" aria-hidden="true" tabindex="-1"></a>     <span class="fu">readline</span>(<span class="at">prompt=</span><span class="fu">paste0</span>(<span class="st">&quot;</span><span class="sc">\n</span><span class="st">  Press [enter] to see actual Lowess curve...&quot;</span>))</span>
<span id="cb50-64"><a href="#cb50-64" aria-hidden="true" tabindex="-1"></a>    <span class="fu">lines</span>(<span class="fu">lowess</span>(X,Y, <span class="at">f=</span>f), <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span>
<span id="cb50-65"><a href="#cb50-65" aria-hidden="true" tabindex="-1"></a>    <span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">bty=</span><span class="st">&quot;n&quot;</span>, <span class="at">legend=</span><span class="st">&quot;Actual lowess Curve using lowess(...)&quot;</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>, <span class="at">lty=</span><span class="dv">1</span>)</span>
<span id="cb50-66"><a href="#cb50-66" aria-hidden="true" tabindex="-1"></a> }</span>
<span id="cb50-67"><a href="#cb50-67" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb50-68"><a href="#cb50-68" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb50-69"><a href="#cb50-69" aria-hidden="true" tabindex="-1"></a>}</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-50-1.png" width="672" /></p>
<ol style="list-style-type: decimal">
<li><p>Select a fraction of the data to use for the “neighborhood” of
points (shown in blue in the graph above). The <code>lowess</code>
function in R uses “f=2/3” and the <code>loess</code> function uses
“span=0.75” for this value, which selects the nearest two-thirds or 75%
of the data, respectively, depending on which function you use. For this
example, we set the fraction of points at 50%. Both functions can be set
to whatever you want.</p></li>
<li><p>Pick any point in the regression, eventually selecting all points
one at a time. The selected point becomes the “center” of a
“neighborhood” of points surrounding it. In this example, the center
point is in orange, and the neighboring points are in blue.</p></li>
<li><p>Use the points within the neighborhood to fit a regression line.
However, make the regression depend most on points closest to “center”
and least on points furthest from “center.” This is called a weighted
regression. Weights are decided according to what is called the tricubic
weight function, so that the weight <span
class="math inline">\(w\)</span> given to point <span
class="math inline">\(j\)</span> of the neighborhood of points is
defined by <span class="math display">\[
  w_j = \left(1- \left( \frac{|X_c - X_j|}{\max_k |X_c -
X_k|}\right)^3\right)^3
\]</span> where <span class="math inline">\(X_c\)</span> is the x-value
of the “center” dot and <span class="math inline">\(X_j\)</span> is the
x-value of any other dot in the neighborhood.</p></li>
<li><p>The fitted-value of <span
class="math inline">\(\hat{Y}_c\)</span> is obtained for the center
point <span class="math inline">\(X_c\)</span> of the current
regression. This point is used as the Lowess (or Loess) curve’s value at
that particular x-value. Well, almost. It’s a first guess at where this
value will end up, but there’s a little more to the algorithm before we
are done. Initial guesses for each of these fitted values are obtained
for each point in the regression.</p></li>
<li><p>Now each local regression for each neighborhood is re-run a few
times in such a way that the effect of outliers is minimized. The final
line for each neighborhood is obtained by the following steps.</p>
<ul>
<li>Compute all residuals for points in the neighborhood of the current
regression, denoted by <span class="math inline">\(r_i\)</span>.</li>
<li>Then compute the MAD, median absolute deviation, of the residuals
<span class="math inline">\(MAD = \text{median} (|r_1|, |r_2|,
\ldots)\)</span>.</li>
<li>Divide all residuals by 6 times the MAD: <span
class="math inline">\(u_i = r_i/(6\cdot MAD)\)</span> (If <span
class="math inline">\(|r_i| &gt; 6\cdot MAD\)</span> then set <span
class="math inline">\(u_i = 1\)</span>, so that the point receives a bisquare weight of zero in the next step.)</li>
<li>Compute what are called bisquare weights using the formula: <span
class="math inline">\(b_i = (1 - u_i^2)^2\)</span></li>
<li>Perform a regression using the weights <span
class="math inline">\(w_i = w_i b_i\)</span></li>
<li>Repeat the above process with the new weights <span
class="math inline">\(w_i\)</span> until the weights stop changing very
much.</li>
</ul></li>
<li><p>The final fitted values for each <span
class="math inline">\(X\)</span>-value in the regression are obtained
from the final regression line for each neighborhood. These fitted
values make up the Lowess (or loess) curve.</p></li>
</ol>
<p>Note that the default of the <code>loess</code> function in R is to
use quadratic regressions in each neighborhood instead of linear
regressions. This can be controlled with the <code>loess</code> option
of “degree=2” (quadratic fits) or “degree=1” (linear fits). In the
<code>lowess</code> function only a linear regression in each
neighborhood is allowed.</p>
</div>
<p><br /></p>
<hr />
</div>
</div>
</div>
</div>
<div id="section" class="section level2">
<h2></h2>
<div style="padding-left:125px;">
<p><strong>Examples:</strong> <a
href="./Analyses/Linear%20Regression/Examples/BodyWeightSLR.html">bodyweight</a>,
<a
href="./Analyses/Linear%20Regression/Examples/carsSLR.html">cars</a></p>
</div>
<hr />
</div>
<div id="multiple-linear-regression"
class="section level2 tabset tabset-fade tabset-pills">
<h2 class="tabset tabset-fade tabset-pills">Multiple Linear
Regression</h2>
<div style="float:left;width:125px;" align="center">
<p><img src="Images/QuantYMultX.png" width="108" /></p>
</div>
<p>Multiple regression allows for more than one explanatory variable to
be included in the modeling of the expected value of the quantitative
response variable <span class="math inline">\(Y_i\)</span>. There are
infinitely many possible multiple regression models to choose from. Here
are a few “basic” models that work as building blocks to more
complicated models.</p>
<div id="overview-1" class="section level3">
<h3>Overview</h3>
<div style="padding-left:125px;">
<p>Select a model to see interpretation details, an example, and R Code
help.</p>
<div class="tab">
<p><button class="tablinks" onclick="openTab(event, 'LearnMoresimpleLinearModel')">Simple</button>
<button class="tablinks" onclick="openTab(event, 'LearnMoreQuadraticModel')">Quadratic</button>
<button class="tablinks" onclick="openTab(event, 'LearnMoreCubicModel')">Cubic</button>
<button class="tablinks" onclick="openTab(event, 'LearnMoreTwoLinesModel')">Two-Lines</button>
<button class="tablinks" onclick="openTab(event, 'LearnMorethreeDModel')">3D</button>
<button class="tablinks" onclick="openTab(event, 'LearnMoreHDModel')">HD</button></p>
</div>
<div id="LearnMoresimpleLinearModel" class="tabcontent"
style="display:block;">
<p>
<table>
<tr>
<td>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-51-1.png" width="144" /></p>
</td>
<td style="text-align: center;padding-left:15px;">
<p><span class="math display">\[
Y_i = \overbrace{\underbrace{\beta_0 + \beta_1
X_i}_{E\{Y_i\}}}^\text{Simple Model} + \epsilon_i
\]</span></p>
</td>
</tr>
</table>
<p><br/></p>
<p>The Simple Linear Regression model uses a single x-variable once:
<span class="math inline">\(X_i\)</span>.</p>
<table>
<thead>
<tr class="header">
<th>Parameter</th>
<th>Effect</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><span class="math inline">\(\beta_0\)</span></td>
<td>Y-intercept of the Model</td>
</tr>
<tr class="even">
<td><span class="math inline">\(\beta_1\)</span></td>
<td>Slope of the line</td>
</tr>
</tbody>
</table>
</p>
</div>
<div id="LearnMoreQuadraticModel" class="tabcontent">
<p>
<table>
<tr>
<td>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-52-1.png" width="144" /></p>
</td>
<td style="text-align: center;padding-left:15px;">
<p><span class="math display">\[
Y_i = \overbrace{\underbrace{\beta_0 + \beta_1 X_i + \beta_2
X_i^2}_{E\{Y_i\}}}^\text{Quadratic Model} + \epsilon_i
\]</span></p>
</td>
</tr>
</table>
<p><br/></p>
<p>The Quadratic model uses the same <span
class="math inline">\(X\)</span>-variable twice, once with a <span
class="math inline">\(\beta_1 X_i\)</span> term and once with a <span
class="math inline">\(\beta_2 X_i^2\)</span> term. The <span
class="math inline">\(X_i^2\)</span> term is called the “quadratic”
term.</p>
<table>
<colgroup>
<col width="13%" />
<col width="86%" />
</colgroup>
<thead>
<tr class="header">
<th>Parameter</th>
<th>Effect</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><span class="math inline">\(\beta_0\)</span></td>
<td>Y-intercept of the Model.</td>
</tr>
<tr class="even">
<td><span class="math inline">\(\beta_1\)</span></td>
<td>Controls the x-position of the vertex of the parabola by <span
class="math inline">\(\frac{-\beta_1}{2\cdot\beta_2}\)</span>.</td>
</tr>
<tr class="odd">
<td><span class="math inline">\(\beta_2\)</span></td>
<td>Controls the concavity and “steepness” of the Model: negative values
face down, positive values face up; large values imply “steeper”
parabolas and low values imply “flatter” parabolas. Also involved in the
position of the vertex, see <span
class="math inline">\(\beta_1\)</span>’s explanation.</td>
</tr>
</tbody>
</table>
<p><strong>An Example</strong></p>
<p>Using the <code>airquality</code> data set, we run the following
“quadratic” regression. Pay careful attention to how the mathematical
model for <span class="math inline">\(Y_i = \ldots\)</span> is
translated to R-Code inside of <code>lm(...)</code>.</p>
<p><span class="math display">\[
  \underbrace{Y_i}_\text{Temp} \underbrace{=}_{\sim}
\overbrace{\beta_0}^{\text{y-int}} +
\overbrace{\beta_1}^{\stackrel{\text{slope}}{\text{term}}}
\underbrace{X_{i}}_\text{Month} \underbrace{+}_{+}
\overbrace{\beta_2}^{\stackrel{\text{quadratic}}{\text{term}}}  \underbrace{X_{i}^2}_\text{I(Month^2)}
+ \epsilon_i
\]</span></p>
<a href="javascript:showhide('quadraticregressionexamplecode')">
<div class="hoverchunk">
<p><span class="tooltipr"> lm.quad &lt;- <span class="tooltiprtext">A
name we made up for our “quadratic” regression.</span> </span><span
class="tooltipr"> lm( <span class="tooltiprtext">R function lm used to
perform linear regressions in R. The lm stands for “linear
model”.</span> </span><span class="tooltipr"> Temp <span
class="tooltiprtext">Y-variable, should be quantitative.</span>
</span><span class="tooltipr">  ~  <span class="tooltiprtext">The tilde
<code>~</code> is what lm(…) uses to state the regression equation <span
class="math inline">\(Y_i = ...\)</span>. Notice that the <code>~</code>
is not followed by <span class="math inline">\(\beta_0 +
\beta_1\)</span> like <span class="math inline">\(Y_i = ...\)</span>.
Instead, <span class="math inline">\(X_{i}\)</span> (Month in this case)
is the first term following <code>~</code>. This is because the <span
class="math inline">\(\beta\)</span>’s are going to be estimated by the
lm(…). These “Estimates” can be found using summary(lmObject) and
looking at the <strong>Estimates</strong> column in the output.</span>
</span><span class="tooltipr"> Month <span class="tooltiprtext"><span
class="math inline">\(X_{i}\)</span>, should be quantitative.</span>
</span><span class="tooltipr">  +  <span class="tooltiprtext">The plus
<code>+</code> is used between each term in the model. Note that only
the x-variables are included in the lm(…) from the <span
class="math inline">\(Y_i = ...\)</span> model. No beta’s are
included.</span> </span><span class="tooltipr"> I(Month^2) <span
class="tooltiprtext"><span class="math inline">\(X_{i}^2\)</span>, where
the function I(…) protects the squaring of Month from how lm(…) would
otherwise interpret that statement. The I(…) function must be used
anytime you raise an x-variable to a power in the lm(…)
statement.</span> </span><span class="tooltipr"> , data=airquality <span
class="tooltiprtext">This is the data set we are using for the
regression.</span> </span><span class="tooltipr"> )<br />
<span class="tooltiprtext">Closing parenthesis for the lm(…)
function.</span> </span><span class="tooltipr">     <br />
<span class="tooltiprtext">Press Enter to run the code.</span>
</span><span class="tooltipr" style="float:right;">  …  <span
class="tooltiprtext">Click to View Output.</span> </span></p>
</div>
</a>
<div id="quadraticregressionexamplecode" style="display:none;">
<p>Pay special attention to how the lm(…) code uses <span
class="math inline">\(Y_i \sim X_{i} + X_{i}^2\)</span> and drops all
<span class="math inline">\(\beta\)</span>’s and <span
class="math inline">\(\epsilon\)</span> from the model statement. This
is because the estimates for the <span
class="math inline">\(\beta\)</span>’s and <span
class="math inline">\(\epsilon\)</span> are given by the output of the
lm(…) function in the “Estimates” column of summary(…) and in
<code>lmObject$residuals</code>.</p>
</div>
<div class="sourceCode" id="cb51"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb51-1"><a href="#cb51-1" aria-hidden="true" tabindex="-1"></a>lm.quad <span class="ot">&lt;-</span> <span class="fu">lm</span>(Temp <span class="sc">~</span> Month <span class="sc">+</span> <span class="fu">I</span>(Month<span class="sc">^</span><span class="dv">2</span>), <span class="at">data=</span>airquality)</span>
<span id="cb51-2"><a href="#cb51-2" aria-hidden="true" tabindex="-1"></a><span class="fu">emphasize.strong.cols</span>(<span class="dv">1</span>)</span>
<span id="cb51-3"><a href="#cb51-3" aria-hidden="true" tabindex="-1"></a><span class="fu">pander</span>(<span class="fu">summary</span>(lm.quad)<span class="sc">$</span>coefficients, )</span></code></pre></div>
<table style="width:92%;">
<colgroup>
<col width="25%" />
<col width="18%" />
<col width="18%" />
<col width="13%" />
<col width="16%" />
</colgroup>
<thead>
<tr class="header">
<th align="center"> </th>
<th align="center">Estimate</th>
<th align="center">Std. Error</th>
<th align="center">t value</th>
<th align="center">Pr(&gt;|t|)</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center"><strong>(Intercept)</strong></td>
<td align="center"><strong>-95.73</strong></td>
<td align="center">15.24</td>
<td align="center">-6.281</td>
<td align="center">3.458e-09</td>
</tr>
<tr class="even">
<td align="center"><strong>Month</strong></td>
<td align="center"><strong>48.72</strong></td>
<td align="center">4.489</td>
<td align="center">10.85</td>
<td align="center">1.29e-20</td>
</tr>
<tr class="odd">
<td align="center"><strong>I(Month^2)</strong></td>
<td align="center"><strong>-3.283</strong></td>
<td align="center">0.3199</td>
<td align="center">-10.26</td>
<td align="center">4.737e-19</td>
</tr>
</tbody>
</table>
<p>The <strong>estimates</strong> shown in the summary output table
above approximate the <span class="math inline">\(\beta\)</span>’s in
the regression model:</p>
<ul>
<li><span class="math inline">\(\beta_0\)</span> is estimated by the
(Intercept) value of -95.73,</li>
<li><span class="math inline">\(\beta_1\)</span> is estimated by the
<code>Month</code> value of 48.72, and</li>
<li><span class="math inline">\(\beta_2\)</span> is estimated by the
<code>I(Month^2)</code> value of -3.283.</li>
</ul>
<p>Because the estimate of the <span
class="math inline">\(\beta_2\)</span> term is negative (-3.283), this
parabola will “open down” (concave). This tells us that average
temperatures will increase to a point, then decrease again. The vertex
of this parabola will be at <span class="math inline">\(-b_1/(2b_2) =
-(48.72)/(2\cdot (-3.283)) = 7.420043\)</span> months, which tells us
that the highest average temperature will occur around mid July (7.42
months to be exact). The y-intercept is -95.73, which would be awfully
cold if it were possible for the month to be “month zero.” Since this is
not possible, the y-intercept is not meaningful for this model.</p>
<p>Note that interpreting either <span
class="math inline">\(\beta_1\)</span> or <span
class="math inline">\(\beta_2\)</span> by themselves is quite difficult
because they both work together with <span
class="math inline">\(X_{i}\)</span>.</p>
<p><span class="math display">\[
\hat{Y}_i = \overbrace{-95.73}^\text{y-int} +
\overbrace{48.72}^{\stackrel{\text{slope}}{\text{term}}} X_{i} +
\overbrace{-3.283}^{\stackrel{\text{quadratic}}{\text{term}}} X_{i}^2
\]</span></p>
<p>The regression function is drawn as follows. Be sure to look at the
“Code” to understand how this graph was created using the ideas in the
equation above.</p>
<table>
<tr>
<td>
<p><strong>Using Base R</strong></p>
<div class="sourceCode" id="cb52"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb52-1"><a href="#cb52-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Temp <span class="sc">~</span> Month, <span class="at">data=</span>airquality, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">pch=</span><span class="dv">21</span>, <span class="at">bg=</span><span class="st">&quot;gray83&quot;</span>, <span class="at">main=</span><span class="st">&quot;Quadratic Model using airquality data set&quot;</span>, <span class="at">cex.main=</span><span class="dv">1</span>)</span>
<span id="cb52-2"><a href="#cb52-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb52-3"><a href="#cb52-3" aria-hidden="true" tabindex="-1"></a><span class="co">#get the &quot;Estimates&quot; automatically:</span></span>
<span id="cb52-4"><a href="#cb52-4" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(lm.quad)</span>
<span id="cb52-5"><a href="#cb52-5" aria-hidden="true" tabindex="-1"></a><span class="co"># Then b will have 3 numbers stored inside:</span></span>
<span id="cb52-6"><a href="#cb52-6" aria-hidden="true" tabindex="-1"></a><span class="co"># b[1] is the estimate of beta_0: -95.73</span></span>
<span id="cb52-7"><a href="#cb52-7" aria-hidden="true" tabindex="-1"></a><span class="co"># b[2] is the estimate of beta_1: 48.72</span></span>
<span id="cb52-8"><a href="#cb52-8" aria-hidden="true" tabindex="-1"></a><span class="co"># b[3] is the estimate of beta_2: -3.28</span></span>
<span id="cb52-9"><a href="#cb52-9" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x <span class="sc">+</span> b[<span class="dv">3</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>, <span class="at">add=</span><span class="cn">TRUE</span>)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-54-1.png" width="672" /></p>
</td>
<td>
<p><strong>Using ggplot2</strong></p>
<div class="sourceCode" id="cb53"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb53-1"><a href="#cb53-1" aria-hidden="true" tabindex="-1"></a><span class="co">#get the &quot;Estimates&quot; automatically:</span></span>
<span id="cb53-2"><a href="#cb53-2" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(lm.quad)</span>
<span id="cb53-3"><a href="#cb53-3" aria-hidden="true" tabindex="-1"></a><span class="co"># Then b will have 3 estimates:</span></span>
<span id="cb53-4"><a href="#cb53-4" aria-hidden="true" tabindex="-1"></a><span class="co"># b[1] is the estimate of beta_0: -95.73</span></span>
<span id="cb53-5"><a href="#cb53-5" aria-hidden="true" tabindex="-1"></a><span class="co"># b[2] is the estimate of beta_1: 48.72</span></span>
<span id="cb53-6"><a href="#cb53-6" aria-hidden="true" tabindex="-1"></a><span class="co"># b[3] is the estimate of beta_2: -3.283</span></span>
<span id="cb53-7"><a href="#cb53-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb53-8"><a href="#cb53-8" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(airquality, <span class="fu">aes</span>(<span class="at">y=</span>Temp, <span class="at">x=</span>Month)) <span class="sc">+</span></span>
<span id="cb53-9"><a href="#cb53-9" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_point</span>(<span class="at">pch=</span><span class="dv">21</span>, <span class="at">bg=</span><span class="st">&quot;gray83&quot;</span>, <span class="at">color=</span><span class="st">&quot;skyblue&quot;</span>) <span class="sc">+</span></span>
<span id="cb53-10"><a href="#cb53-10" aria-hidden="true" tabindex="-1"></a>  <span class="co">#geom_smooth(method=&quot;lm&quot;, se=F, formula = y ~ poly(x, 2)) + #easy way, but the more involved manual way using stat_function (see below) is more dynamic.</span></span>
<span id="cb53-11"><a href="#cb53-11" aria-hidden="true" tabindex="-1"></a>  <span class="fu">stat_function</span>(<span class="at">fun =</span> <span class="cf">function</span>(x) b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x <span class="sc">+</span> b[<span class="dv">3</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span>, <span class="at">color=</span><span class="st">&quot;skyblue&quot;</span>) <span class="sc">+</span></span>
<span id="cb53-12"><a href="#cb53-12" aria-hidden="true" tabindex="-1"></a>  <span class="fu">labs</span>(<span class="at">title=</span><span class="st">&quot;Quadratic Model using airquality data set&quot;</span>) </span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-55-1.png" width="672" /></p>
</td>
</tr>
</table>
</p>
</div>
<div id="LearnMoreCubicModel" class="tabcontent" style="display:none;">
<p>
<table>
<tr>
<td>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-56-1.png" width="144" /></p>
</td>
<td style="text-align: center;padding-left:15px;">
<p><span class="math display">\[
Y_i = \overbrace{\underbrace{\beta_0 + \beta_1 X_i + \beta_2 X_i^2 +
\beta_3 X_i^3}_{E\{Y_i\}}}^\text{Cubic Model} + \epsilon_i
\]</span></p>
</td>
</tr>
</table>
<p><br/></p>
<p>The Cubic model uses the same <span
class="math inline">\(X\)</span>-variable thrice, once with a <span
class="math inline">\(\beta_1 X_i\)</span> term, once with a <span
class="math inline">\(\beta_2 X_i^2\)</span> term, and once with a <span
class="math inline">\(\beta_3 X_i^3\)</span> term. The <span
class="math inline">\(X_i^3\)</span> term is called the “cubic”
term.</p>
<table>
<colgroup>
<col width="13%" />
<col width="86%" />
</colgroup>
<thead>
<tr class="header">
<th>Parameter</th>
<th>Effect</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><span class="math inline">\(\beta_0\)</span></td>
<td>Y-intercept of the Model.</td>
</tr>
<tr class="even">
<td><span class="math inline">\(\beta_1\)</span></td>
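<td>No clear interpretation, but could be called the “base slope
coefficient” and contributes to the position of the turning points (the
local maximum and minimum) of the cubic function.</td>
</tr>
<tr class="odd">
<td><span class="math inline">\(\beta_2\)</span></td>
<td>No clear interpretation, but it also contributes to the location of
the turning points, and together with <span class="math inline">\(\beta_3\)</span> it sets the inflection point at <span class="math inline">\(\frac{-\beta_2}{3\cdot\beta_3}\)</span>.</td>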
</tr>
<tr class="even">
<td><span class="math inline">\(\beta_3\)</span></td>
<td>This is the coefficient of the cubic term. No clear interpretation,
but it determines the concavity of the model by its sign.</td>
</tr>
</tbody>
</table>
<p><strong>An Example</strong></p>
<p>Using the <code>CO2</code> data set, we run the following “cubic”
regression.</p>
<p><span class="math display">\[
  \underbrace{Y_i}_\text{uptake} \underbrace{=}_{\sim}
\overbrace{\beta_0}^{\text{y-int}} +
\overbrace{\beta_1}^{\stackrel{\text{slope}}{\text{term}}}
\underbrace{X_{i}}_\text{conc} +
\overbrace{\beta_2}^{\stackrel{\text{quadratic}}{\text{term}}}  \underbrace{X_{i}^2}_\text{I(conc^2)}
+  \overbrace{\beta_3}^{\stackrel{\text{cubic}}{\text{term}}}  \underbrace{X_{i}^3}_\text{I(conc^3)}
+ \epsilon_i
\]</span></p>
<a href="javascript:showhide('cubicregressionexamplecode')">
<div class="hoverchunk">
<p><span class="tooltipr"> lm.cubic &lt;- <span class="tooltiprtext">A
name we made up for our “cubic” regression.</span> </span><span
class="tooltipr"> lm( <span class="tooltiprtext">R function lm used to
perform linear regressions in R. The lm stands for “linear
model”.</span> </span><span class="tooltipr"> uptake <span
class="tooltiprtext">Y-variable, should be quantitative.</span>
</span><span class="tooltipr">  ~  <span class="tooltiprtext">The tilde
<code>~</code> is what lm(…) uses to state the regression equation <span
class="math inline">\(Y_i = ...\)</span>. Notice that the <code>~</code>
is not followed by <span class="math inline">\(\beta_0 +
\beta_1\)</span> like <span class="math inline">\(Y_i = ...\)</span>.
Instead, <span class="math inline">\(X_i\)</span> is the first term
following <code>~</code>. This is because the <span
class="math inline">\(\beta\)</span>’s are going to be estimated by the
lm(…). These estimates can be found using summary(lmObject).</span>
</span><span class="tooltipr"> conc <span class="tooltiprtext"><span
class="math inline">\(X_{i}\)</span>, should be quantitative.</span>
</span><span class="tooltipr">  +  <span class="tooltiprtext">The plus
<code>+</code> is used between each term in the model. Note that only
the x-variables are included in the lm(…) from the <span
class="math inline">\(Y_i = ...\)</span> model. No beta’s are
included.</span> </span><span class="tooltipr"> I(conc^2) <span
class="tooltiprtext"><span class="math inline">\(X_{i}^2\)</span>, where
the function I(…) protects the squaring of conc from how lm(…) would
otherwise interpret that statement. The I(…) function must be used
anytime you raise an x-variable to a power in the lm(…)
statement.</span> </span><span class="tooltipr">  +  <span
class="tooltiprtext">The plus <code>+</code> is used between each term
in the model. Note that only the x-variables are included in the lm(…)
from the <span class="math inline">\(Y_i = ...\)</span> model. No beta’s
are included.</span> </span><span class="tooltipr"> I(conc^3) <span
class="tooltiprtext"><span class="math inline">\(X_{i}^3\)</span>, where
the function I(…) protects the cubing of conc from how lm(…) would
otherwise interpret that statement. The I(…) function must be used
anytime you raise an x-variable to a power in the lm(…)
statement.</span> </span><span class="tooltipr"> , data=CO2 <span
class="tooltiprtext">This is the data set we are using for the
regression.</span> </span><span class="tooltipr"> )<br />
<span class="tooltiprtext">Closing parenthesis for the lm(…)
function.</span> </span><span class="tooltipr">     <br />
<span class="tooltiprtext">Press Enter to run the code.</span>
</span><span class="tooltipr" style="float:right;">  …  <span
class="tooltiprtext">Click to View Output.</span> </span></p>
</div>
</a>
<div id="cubicregressionexamplecode" style="display:none;">
<p>Pay special attention to how the lm(…) code uses <span
class="math inline">\(Y_i \sim X_{i} + X_{i}^2 + X_{i}^3\)</span> and drops all
<span class="math inline">\(\beta\)</span>’s and <span
class="math inline">\(\epsilon\)</span> from the model statement. This
is because the estimates for the <span
class="math inline">\(\beta\)</span>’s and <span
class="math inline">\(\epsilon\)</span> are given by the output of the
lm(…) function in the “Estimates” column of summary(…) and in
<code>lmObject$residuals</code>.</p>
</div>
<div class="sourceCode" id="cb54"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb54-1"><a href="#cb54-1" aria-hidden="true" tabindex="-1"></a>lm.cubic <span class="ot">&lt;-</span> <span class="fu">lm</span>(uptake <span class="sc">~</span> conc <span class="sc">+</span> <span class="fu">I</span>(conc<span class="sc">^</span><span class="dv">2</span>) <span class="sc">+</span> <span class="fu">I</span>(conc<span class="sc">^</span><span class="dv">3</span>), <span class="at">data=</span>CO2)</span>
<span id="cb54-2"><a href="#cb54-2" aria-hidden="true" tabindex="-1"></a><span class="fu">pander</span>(<span class="fu">summary</span>(lm.cubic)<span class="sc">$</span>coefficients)</span></code></pre></div>
<table style="width:90%;">
<colgroup>
<col width="25%" />
<col width="18%" />
<col width="18%" />
<col width="13%" />
<col width="15%" />
</colgroup>
<thead>
<tr class="header">
<th align="center"> </th>
<th align="center">Estimate</th>
<th align="center">Std. Error</th>
<th align="center">t value</th>
<th align="center">Pr(&gt;|t|)</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center"><strong>(Intercept)</strong></td>
<td align="center">-1.483</td>
<td align="center">5.043</td>
<td align="center">-0.2941</td>
<td align="center">0.7694</td>
</tr>
<tr class="even">
<td align="center"><strong>conc</strong></td>
<td align="center">0.1814</td>
<td align="center">0.0416</td>
<td align="center">4.36</td>
<td align="center">3.83e-05</td>
</tr>
<tr class="odd">
<td align="center"><strong>I(conc^2)</strong></td>
<td align="center">-0.0003063</td>
<td align="center">9.067e-05</td>
<td align="center">-3.378</td>
<td align="center">0.00113</td>
</tr>
<tr class="even">
<td align="center"><strong>I(conc^3)</strong></td>
<td align="center">1.601e-07</td>
<td align="center">5.512e-08</td>
<td align="center">2.905</td>
<td align="center">0.004745</td>
</tr>
</tbody>
</table>
<p>The <strong>estimates</strong> shown above approximate the <span
class="math inline">\(\beta\)</span>’s in the regression model: <span
class="math inline">\(\beta_0\)</span> is estimated by the (Intercept)
value of -1.483, <span class="math inline">\(\beta_1\)</span> is
estimated by the <code>conc</code> value of 0.1814, <span
class="math inline">\(\beta_2\)</span> is estimated by the
<code>I(conc^2)</code> value of -0.0003063, and <span
class="math inline">\(\beta_3\)</span> is estimated by the
<code>I(conc^3)</code> value of 1.601e-07, which translates to
0.0000001601.</p>
<p>Because the estimate of the <span
class="math inline">\(\beta_3\)</span> term is positive, this cubic
model will “open up”. In other words, as the function moves from left to
right, it will go off to positive infinity (up). If the estimate had
been negative, then the function would head to negative infinity (down)
instead.</p>
<p><span class="math display">\[
\hat{Y}_i = \overbrace{-1.483}^\text{y-int} +
\overbrace{0.1814}^{\stackrel{\text{slope}}{\text{term}}} X_{i} +
\overbrace{-0.0003063}^{\stackrel{\text{quadratic}}{\text{term}}}
X_{i}^2 + \overbrace{\text{1.601e-07}}^{\stackrel{\text{cubic}}{\text{term}}}
X_{i}^3
\]</span></p>
<p>The regression function is drawn as follows. Be sure to look at the
“Code” to understand how this graph was created using the ideas in the
equation above.</p>
<table>
<tr>
<td>
<p><strong>Using Base R</strong></p>
<div class="sourceCode" id="cb55"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb55-1"><a href="#cb55-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(uptake <span class="sc">~</span> conc, <span class="at">data=</span>CO2, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">pch=</span><span class="dv">21</span>, <span class="at">bg=</span><span class="st">&quot;gray83&quot;</span>, <span class="at">main=</span><span class="st">&quot;Cubic Model using CO2 data set&quot;</span>, <span class="at">cex.main=</span><span class="dv">1</span>)</span>
<span id="cb55-2"><a href="#cb55-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb55-3"><a href="#cb55-3" aria-hidden="true" tabindex="-1"></a><span class="co">#get the &quot;Estimates&quot; automatically:</span></span>
<span id="cb55-4"><a href="#cb55-4" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(lm.cubic)</span>
<span id="cb55-5"><a href="#cb55-5" aria-hidden="true" tabindex="-1"></a><span class="co"># Then b will have 4 estimates:</span></span>
<span id="cb55-6"><a href="#cb55-6" aria-hidden="true" tabindex="-1"></a><span class="co"># b[1] is the estimate of beta_0: -1.483</span></span>
<span id="cb55-7"><a href="#cb55-7" aria-hidden="true" tabindex="-1"></a><span class="co"># b[2] is the estimate of beta_1: 0.1814</span></span>
<span id="cb55-8"><a href="#cb55-8" aria-hidden="true" tabindex="-1"></a><span class="co"># b[3] is the estimate of beta_2: -0.0003063</span></span>
<span id="cb55-9"><a href="#cb55-9" aria-hidden="true" tabindex="-1"></a><span class="co"># b[4] is the estimate of beta_3: 1.601e-07</span></span>
<span id="cb55-10"><a href="#cb55-10" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x <span class="sc">+</span> b[<span class="dv">3</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span> <span class="sc">+</span> b[<span class="dv">4</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">3</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>, <span class="at">add=</span><span class="cn">TRUE</span>)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-58-1.png" width="672" /></p>
</td>
<td>
<p><strong>Using ggplot2</strong></p>
<div class="sourceCode" id="cb56"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb56-1"><a href="#cb56-1" aria-hidden="true" tabindex="-1"></a><span class="co">#get the &quot;Estimates&quot; automatically:</span></span>
<span id="cb56-2"><a href="#cb56-2" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(lm.cubic)</span>
<span id="cb56-3"><a href="#cb56-3" aria-hidden="true" tabindex="-1"></a><span class="co"># Then b will have 4 estimates:</span></span>
<span id="cb56-4"><a href="#cb56-4" aria-hidden="true" tabindex="-1"></a><span class="co"># b[1] is the estimate of beta_0: -1.483</span></span>
<span id="cb56-5"><a href="#cb56-5" aria-hidden="true" tabindex="-1"></a><span class="co"># b[2] is the estimate of beta_1: 0.1814</span></span>
<span id="cb56-6"><a href="#cb56-6" aria-hidden="true" tabindex="-1"></a><span class="co"># b[3] is the estimate of beta_2: -0.0003063</span></span>
<span id="cb56-7"><a href="#cb56-7" aria-hidden="true" tabindex="-1"></a><span class="co"># b[4] is the estimate of beta_3: 1.601e-07</span></span>
<span id="cb56-8"><a href="#cb56-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb56-9"><a href="#cb56-9" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(CO2, <span class="fu">aes</span>(<span class="at">y=</span>uptake, <span class="at">x=</span>conc)) <span class="sc">+</span></span>
<span id="cb56-10"><a href="#cb56-10" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_point</span>(<span class="at">pch=</span><span class="dv">21</span>, <span class="at">bg=</span><span class="st">&quot;gray83&quot;</span>, <span class="at">color=</span><span class="st">&quot;skyblue&quot;</span>) <span class="sc">+</span></span>
<span id="cb56-11"><a href="#cb56-11" aria-hidden="true" tabindex="-1"></a>  <span class="co">#geom_smooth(method=&quot;lm&quot;, se=F, formula = y ~ poly(x, 3)) + #easy way, but the more involved manual way using stat_function (see below) is more dynamic.</span></span>
<span id="cb56-12"><a href="#cb56-12" aria-hidden="true" tabindex="-1"></a>  <span class="fu">stat_function</span>(<span class="at">fun =</span> <span class="cf">function</span>(x) b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x <span class="sc">+</span> b[<span class="dv">3</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span> <span class="sc">+</span> b[<span class="dv">4</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">3</span>, <span class="at">color=</span><span class="st">&quot;skyblue&quot;</span>) <span class="sc">+</span></span>
<span id="cb56-13"><a href="#cb56-13" aria-hidden="true" tabindex="-1"></a>  <span class="fu">labs</span>(<span class="at">title=</span><span class="st">&quot;Cubic Model using CO2 data set&quot;</span>) </span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-59-1.png" width="672" /></p>
</td>
</tr>
</table>
<p>It should be stated that the cubic function is not the best fit for
this data. However, it is a lot better than just a simple line, or a
quadratic model, as shown below.</p>
<div class="sourceCode" id="cb57"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb57-1"><a href="#cb57-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(uptake <span class="sc">~</span> conc, <span class="at">data=</span>CO2, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">pch=</span><span class="dv">21</span>, <span class="at">bg=</span><span class="st">&quot;gray83&quot;</span>, <span class="at">main=</span><span class="st">&quot;Cubic Model using CO2 data set&quot;</span>, <span class="at">cex.main=</span><span class="dv">1</span>)</span>
<span id="cb57-2"><a href="#cb57-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb57-3"><a href="#cb57-3" aria-hidden="true" tabindex="-1"></a><span class="co">#get the &quot;Estimates&quot; automatically:</span></span>
<span id="cb57-4"><a href="#cb57-4" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(lm.cubic)</span>
<span id="cb57-5"><a href="#cb57-5" aria-hidden="true" tabindex="-1"></a><span class="co"># Then b will have 4 estimates:</span></span>
<span id="cb57-6"><a href="#cb57-6" aria-hidden="true" tabindex="-1"></a><span class="co"># b[1] is the estimate of beta_0: -1.483</span></span>
<span id="cb57-7"><a href="#cb57-7" aria-hidden="true" tabindex="-1"></a><span class="co"># b[2] is the estimate of beta_1: 0.1814</span></span>
<span id="cb57-8"><a href="#cb57-8" aria-hidden="true" tabindex="-1"></a><span class="co"># b[3] is the estimate of beta_2: -0.0003063</span></span>
<span id="cb57-9"><a href="#cb57-9" aria-hidden="true" tabindex="-1"></a><span class="co"># b[4] is the estimate of beta_3: 1.601e-07</span></span>
<span id="cb57-10"><a href="#cb57-10" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x <span class="sc">+</span> b[<span class="dv">3</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span> <span class="sc">+</span> b[<span class="dv">4</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">3</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>, <span class="at">add=</span><span class="cn">TRUE</span>)</span>
<span id="cb57-11"><a href="#cb57-11" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(<span class="fu">lm</span>(uptake <span class="sc">~</span> conc <span class="sc">+</span> <span class="fu">I</span>(conc<span class="sc">^</span><span class="dv">2</span>), <span class="at">data=</span>CO2))</span>
<span id="cb57-12"><a href="#cb57-12" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x <span class="sc">+</span> b[<span class="dv">3</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>, <span class="at">add=</span><span class="cn">TRUE</span>)</span>
<span id="cb57-13"><a href="#cb57-13" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(<span class="fu">lm</span>(uptake <span class="sc">~</span> conc, <span class="at">data=</span>CO2))</span>
<span id="cb57-14"><a href="#cb57-14" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x, <span class="at">col=</span><span class="st">&quot;orange&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>, <span class="at">add=</span><span class="cn">TRUE</span>)</span></code></pre></div>
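<p><img src="LinearRegression_files/figure-html/unnamed-chunk-60-1.png" alt="Scatterplot of uptake versus conc from the CO2 data set comparing the fitted cubic curve (skyblue), quadratic curve (firebrick), and simple linear regression line (orange)" width="672" /></p>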
</p>
</div>
<div id="LearnMoreTwoLinesModel" class="tabcontent">
<p>
<table>
<tr>
<td>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-61-1.png" width="144" /></p>
</td>
<td style="text-align: center;padding-left:15px;">
<p><span class="math display">\[
Y_i = \overbrace{\underbrace{\beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} +
\beta_3 X_{1i} X_{2i}}_{E\{Y_i\}}}^\text{Two-lines Model} + \epsilon_i
\]</span></p>
<p><span class="math display">\[
X_{2i} = \left\{\begin{array}{ll} 1, &amp; \text{Group B} \\ 0, &amp;
\text{Group A} \end{array}\right.
\]</span></p>
</td>
</tr>
</table>
<p>The so-called “two-lines” model uses a quantitative <span
class="math inline">\(X_{1i}\)</span> variable and a 0,1 indicator
variable <span class="math inline">\(X_{2i}\)</span>. It is a basic
example of how a “dummy variable” or “indicator variable” can be used to
turn qualitative variables into quantitative terms. In this case, the
indicator variable <span class="math inline">\(X_{2i}\)</span>, which is
either 0 or 1, produces two separate lines: one line for Group A, and
one line for Group B.</p>
<table>
<colgroup>
<col width="13%" />
<col width="86%" />
</colgroup>
<thead>
<tr class="header">
<th>Parameter</th>
<th>Effect</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><span class="math inline">\(\beta_0\)</span></td>
<td>Y-intercept of the “base-line” (Group A) line of the model.</td>
</tr>
<tr class="even">
<td><span class="math inline">\(\beta_1\)</span></td>
<td>Controls the slope of the “base-line” of the model, the Group A
(<span class="math inline">\(X_{2i} = 0\)</span>) line.</td>
</tr>
<tr class="odd">
<td><span class="math inline">\(\beta_2\)</span></td>
<td>Controls the <strong>change in y-intercept</strong> for the second
line in the model as compared to the y-intercept of the “base-line”
line.</td>
</tr>
<tr class="even">
<td><span class="math inline">\(\beta_3\)</span></td>
<td>Called the “interaction” term. Controls the <strong>change in the
slope</strong> for the second line in the model as compared to the slope
of the “base-line” line.</td>
</tr>
</tbody>
</table>
<p><strong>An Example</strong></p>
<p>Using the <code>mtcars</code> data set, we run the following
“two-lines” regression. Note that <code>am</code> has only 0 or 1
values: <code>View(mtcars)</code>.</p>
<p><span class="math display">\[
  \underbrace{Y_i}_\text{mpg} \underbrace{=}_{\sim}
\overbrace{\beta_0}^{\stackrel{\text{y-int}}{\text{baseline}}} +
\overbrace{\beta_1}^{\stackrel{\text{slope}}{\text{baseline}}}
\underbrace{X_{1i}}_\text{qsec} +
\overbrace{\beta_2}^{\stackrel{\text{change
in}}{\text{y-int}}}  \underbrace{X_{2i}}_\text{am} +
\overbrace{\beta_3}^{\stackrel{\text{change in}}{\text{slope}}}
\underbrace{X_{1i}X_{2i}}_\text{qsec:am} + \epsilon_i
\]</span></p>
<a href="javascript:showhide('twolinesregressionexamplecode')">
<div class="hoverchunk">
<p><span class="tooltipr"> lm.2lines &lt;- <span class="tooltiprtext">A
name we made up for our “two-lines” regression.</span> </span><span
class="tooltipr"> lm( <span class="tooltiprtext">R function lm used to
perform linear regressions in R. The lm stands for “linear
model”.</span> </span><span class="tooltipr"> mpg <span
class="tooltiprtext">Y-variable, should be quantitative.</span>
</span><span class="tooltipr">  ~  <span class="tooltiprtext">The tilde
<code>~</code> is what lm(…) uses to state the regression equation <span
class="math inline">\(Y_i = ...\)</span>. Notice that the <code>~</code>
is not followed by <span class="math inline">\(\beta_0 +
\beta_1\)</span> like <span class="math inline">\(Y_i = ...\)</span>.
Instead, <span class="math inline">\(X_{1i}\)</span> is the first term
following <code>~</code>. This is because <span
class="math inline">\(\beta\)</span>’s are going to be estimated by the
lm(…). These estimates can be found using summary(lmObject).</span>
</span><span class="tooltipr"> qsec <span class="tooltiprtext"><span
class="math inline">\(X_{1i}\)</span>, should be quantitative.</span>
</span><span class="tooltipr">  +  <span class="tooltiprtext">The plus
<code>+</code> is used between each term in the model. Note that only
the x-variables are included in the lm(…) from the <span
class="math inline">\(Y_i = ...\)</span> model. No beta’s are
included.</span> </span><span class="tooltipr"> am <span
class="tooltiprtext"><span class="math inline">\(X_{2i}\)</span>, an
indicator or 0,1 variable. This term allows the y-intercept of the two
lines to differ.</span> </span><span class="tooltipr">  +  <span
class="tooltiprtext">The plus <code>+</code> is used between each term
in the model. Note that only the x-variables are included in the lm(…)
from the <span class="math inline">\(Y_i = ...\)</span> model. No beta’s
are included.</span> </span><span class="tooltipr"> qsec:am <span
class="tooltiprtext"><span class="math inline">\(X_{1i}X_{2i}\)</span>
the interaction term. This allows the slopes of the two lines to
differ.</span> </span><span class="tooltipr"> , data=mtcars <span
class="tooltiprtext">This is the data set we are using for the
regression.</span> </span><span class="tooltipr"> )<br />
<span class="tooltiprtext">Closing parenthesis for the lm(…)
function.</span> </span><span class="tooltipr">     <br />
<span class="tooltiprtext">Press Enter to run the code.</span>
</span><span class="tooltipr" style="float:right;">  …  <span
class="tooltiprtext">Click to View Output.</span> </span></p>
</div>
</a>
<div id="twolinesregressionexamplecode" style="display:none;">
<p>Pay special attention to how the lm(…) code uses <span
class="math inline">\(Y_i \sim X_{1i} + X_{2i} + X_{1i}X_{2i}\)</span>
and drops all <span class="math inline">\(\beta\)</span>’s and <span
class="math inline">\(\epsilon\)</span> from the model statement. This
is because the estimates for the <span
class="math inline">\(\beta\)</span>’s and <span
class="math inline">\(\epsilon\)</span> are given by the output of the
lm(…) function in the “Estimate” column of summary(…) and in
<code>lm.2lines$residuals</code>.</p>
</div>
<div class="sourceCode" id="cb58"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb58-1"><a href="#cb58-1" aria-hidden="true" tabindex="-1"></a>lm<span class="fl">.2</span>lines <span class="ot">&lt;-</span> <span class="fu">lm</span>(mpg <span class="sc">~</span> qsec <span class="sc">+</span> am <span class="sc">+</span> qsec<span class="sc">:</span>am, <span class="at">data=</span>mtcars)</span>
<span id="cb58-2"><a href="#cb58-2" aria-hidden="true" tabindex="-1"></a><span class="fu">pander</span>(<span class="fu">summary</span>(lm<span class="fl">.2</span>lines)<span class="sc">$</span>coefficients)</span></code></pre></div>
<table style="width:88%;">
<colgroup>
<col width="25%" />
<col width="15%" />
<col width="18%" />
<col width="13%" />
<col width="15%" />
</colgroup>
<thead>
<tr class="header">
<th align="center"> </th>
<th align="center">Estimate</th>
<th align="center">Std. Error</th>
<th align="center">t value</th>
<th align="center">Pr(&gt;|t|)</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center"><strong>(Intercept)</strong></td>
<td align="center">-9.01</td>
<td align="center">8.218</td>
<td align="center">-1.096</td>
<td align="center">0.2823</td>
</tr>
<tr class="even">
<td align="center"><strong>qsec</strong></td>
<td align="center">1.439</td>
<td align="center">0.45</td>
<td align="center">3.197</td>
<td align="center">0.003432</td>
</tr>
<tr class="odd">
<td align="center"><strong>am</strong></td>
<td align="center">-14.51</td>
<td align="center">12.48</td>
<td align="center">-1.163</td>
<td align="center">0.2548</td>
</tr>
<tr class="even">
<td align="center"><strong>qsec:am</strong></td>
<td align="center">1.321</td>
<td align="center">0.7017</td>
<td align="center">1.883</td>
<td align="center">0.07012</td>
</tr>
</tbody>
</table>
<p>The <strong>estimates</strong> shown above approximate the <span
class="math inline">\(\beta\)</span>’s in the regression model: <span
class="math inline">\(\beta_0\)</span> is estimated by the (Intercept) value of -9.01,
<span class="math inline">\(\beta_1\)</span> is estimated by the
<code>qsec</code> value of 1.439, <span
class="math inline">\(\beta_2\)</span> is estimated by the
<code>am</code> value of -14.51, and <span
class="math inline">\(\beta_3\)</span> is estimated by the
<code>qsec:am</code> value of 1.321.</p>
<p>This gives two separate equations of lines.</p>
<p><strong>Automatic Transmission (am==0, <span
class="math inline">\(X_{2i} = 0\)</span>) Line</strong></p>
<p><span class="math display">\[
\hat{Y}_i = \overbrace{-9.01}^{\stackrel{\text{y-int}}{\text{baseline}}}
+ \overbrace{1.439}^{\stackrel{\text{slope}}{\text{baseline}}} X_{1i}
\]</span></p>
<p><strong>Manual Transmission (am==1 , <span
class="math inline">\(X_{2i} = 1\)</span>) Line</strong></p>
<p><span class="math display">\[
\hat{Y}_i =
\underbrace{(\overbrace{-9.01}^{\stackrel{\text{y-int}}{\text{baseline}}}
+ \overbrace{-14.51}^{\stackrel{\text{change
in}}{\text{y-int}}})}_{\stackrel{\text{y-intercept}}{-23.52}} +
\underbrace{(\overbrace{1.439}^{\stackrel{\text{slope}}{\text{baseline}}}
+\overbrace{1.321}^{\stackrel{\text{change
in}}{\text{slope}}})}_{\stackrel{\text{slope}}{2.76}} X_{1i}
\]</span></p>
<p>These lines are drawn as follows. Be sure to look at the “Code” to
understand how this graph was created using the ideas in the two
equations above.</p>
<table>
<tr>
<td>
<p><strong>Using Base R</strong></p>
<div class="sourceCode" id="cb59"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb59-1"><a href="#cb59-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(mpg <span class="sc">~</span> qsec, <span class="at">data=</span>mtcars, <span class="at">col=</span><span class="fu">c</span>(<span class="st">&quot;skyblue&quot;</span>,<span class="st">&quot;orange&quot;</span>)[<span class="fu">as.factor</span>(am)], <span class="at">pch=</span><span class="dv">21</span>, <span class="at">bg=</span><span class="st">&quot;gray83&quot;</span>, <span class="at">main=</span><span class="st">&quot;Two-lines Model using mtcars data set&quot;</span>, <span class="at">cex.main=</span><span class="dv">1</span>)</span>
<span id="cb59-2"><a href="#cb59-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb59-3"><a href="#cb59-3" aria-hidden="true" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">legend=</span><span class="fu">c</span>(<span class="st">&quot;Baseline (am==0)&quot;</span>, <span class="st">&quot;Changed-line (am==1)&quot;</span>), <span class="at">bty=</span><span class="st">&quot;n&quot;</span>, <span class="at">lty=</span><span class="dv">1</span>, <span class="at">col=</span><span class="fu">c</span>(<span class="st">&quot;skyblue&quot;</span>,<span class="st">&quot;orange&quot;</span>), <span class="at">cex=</span><span class="fl">0.8</span>)</span>
<span id="cb59-4"><a href="#cb59-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb59-5"><a href="#cb59-5" aria-hidden="true" tabindex="-1"></a><span class="co">#get the &quot;Estimates&quot; automatically:</span></span>
<span id="cb59-6"><a href="#cb59-6" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(lm<span class="fl">.2</span>lines)</span>
<span id="cb59-7"><a href="#cb59-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Then b will have 4 estimates:</span></span>
<span id="cb59-8"><a href="#cb59-8" aria-hidden="true" tabindex="-1"></a><span class="co"># b[1] is the estimate of beta_0: -9.0099</span></span>
<span id="cb59-9"><a href="#cb59-9" aria-hidden="true" tabindex="-1"></a><span class="co"># b[2] is the estimate of beta_1:  1.4385</span></span>
<span id="cb59-10"><a href="#cb59-10" aria-hidden="true" tabindex="-1"></a><span class="co"># b[3] is the estimate of beta_2: -14.5107</span></span>
<span id="cb59-11"><a href="#cb59-11" aria-hidden="true" tabindex="-1"></a><span class="co"># b[4] is the estimate of beta_3: 1.3214</span></span>
<span id="cb59-12"><a href="#cb59-12" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>, <span class="at">add=</span><span class="cn">TRUE</span>)  <span class="co">#baseline (in blue)</span></span>
<span id="cb59-13"><a href="#cb59-13" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>((b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">3</span>]) <span class="sc">+</span> (b[<span class="dv">2</span>] <span class="sc">+</span> b[<span class="dv">4</span>])<span class="sc">*</span>x, <span class="at">col=</span><span class="st">&quot;orange&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>, <span class="at">add=</span><span class="cn">TRUE</span>) <span class="co">#changed line (in orange)</span></span></code></pre></div>
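<p><img src="LinearRegression_files/figure-html/unnamed-chunk-63-1.png" alt="Base R scatterplot of mpg versus qsec from the mtcars data set with the baseline (am==0) regression line in skyblue and the changed (am==1) regression line in orange" width="672" /></p>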
</td>
<td>
<p><strong>Using ggplot2</strong></p>
<div class="sourceCode" id="cb60"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb60-1"><a href="#cb60-1" aria-hidden="true" tabindex="-1"></a><span class="co">#get the &quot;Estimates&quot; automatically:</span></span>
<span id="cb60-2"><a href="#cb60-2" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(lm<span class="fl">.2</span>lines)</span>
<span id="cb60-3"><a href="#cb60-3" aria-hidden="true" tabindex="-1"></a><span class="co"># Then b will have 4 estimates:</span></span>
<span id="cb60-4"><a href="#cb60-4" aria-hidden="true" tabindex="-1"></a><span class="co"># b[1] is the estimate of beta_0: -9.0099</span></span>
<span id="cb60-5"><a href="#cb60-5" aria-hidden="true" tabindex="-1"></a><span class="co"># b[2] is the estimate of beta_1:  1.4385</span></span>
<span id="cb60-6"><a href="#cb60-6" aria-hidden="true" tabindex="-1"></a><span class="co"># b[3] is the estimate of beta_2: -14.5107</span></span>
<span id="cb60-7"><a href="#cb60-7" aria-hidden="true" tabindex="-1"></a><span class="co"># b[4] is the estimate of beta_3: 1.3214</span></span>
<span id="cb60-8"><a href="#cb60-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb60-9"><a href="#cb60-9" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(mtcars, <span class="fu">aes</span>(<span class="at">y=</span>mpg, <span class="at">x=</span>qsec, <span class="at">color=</span><span class="fu">factor</span>(am))) <span class="sc">+</span></span>
<span id="cb60-10"><a href="#cb60-10" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_point</span>(<span class="at">pch=</span><span class="dv">21</span>, <span class="at">bg=</span><span class="st">&quot;gray83&quot;</span>) <span class="sc">+</span></span>
<span id="cb60-11"><a href="#cb60-11" aria-hidden="true" tabindex="-1"></a>  <span class="co">#geom_smooth(method=&quot;lm&quot;, se=F) + #easy way, but only draws the full interaction model. The manual way using stat_function (see below) is more involved, but more dynamic.</span></span>
<span id="cb60-12"><a href="#cb60-12" aria-hidden="true" tabindex="-1"></a>  <span class="fu">stat_function</span>(<span class="at">fun =</span> <span class="cf">function</span>(x) b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x, <span class="at">color=</span><span class="st">&quot;skyblue&quot;</span>) <span class="sc">+</span> <span class="co">#am==0 line</span></span>
<span id="cb60-13"><a href="#cb60-13" aria-hidden="true" tabindex="-1"></a>  <span class="fu">stat_function</span>(<span class="at">fun =</span> <span class="cf">function</span>(x) (b[<span class="dv">1</span>]<span class="sc">+</span>b[<span class="dv">3</span>]) <span class="sc">+</span> (b[<span class="dv">2</span>]<span class="sc">+</span>b[<span class="dv">4</span>])<span class="sc">*</span>x,<span class="at">color=</span><span class="st">&quot;orange&quot;</span>) <span class="sc">+</span> <span class="co">#am==1 line </span></span>
<span id="cb60-14"><a href="#cb60-14" aria-hidden="true" tabindex="-1"></a>  <span class="fu">scale_color_manual</span>(<span class="at">name=</span><span class="st">&quot;Transmission (am)&quot;</span>, <span class="at">values=</span><span class="fu">c</span>(<span class="st">&quot;skyblue&quot;</span>,<span class="st">&quot;orange&quot;</span>)) <span class="sc">+</span></span>
<span id="cb60-15"><a href="#cb60-15" aria-hidden="true" tabindex="-1"></a>  <span class="fu">labs</span>(<span class="at">title=</span><span class="st">&quot;Two-lines Model using mtcars data set&quot;</span>) </span></code></pre></div>
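<p><img src="LinearRegression_files/figure-html/unnamed-chunk-64-1.png" alt="ggplot2 scatterplot of mpg versus qsec from the mtcars data set, colored by transmission (am), with the am==0 regression line in skyblue and the am==1 regression line in orange" width="672" /></p>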
</td>
</tr>
</table>
</p>
</div>
<div id="LearnMorethreeDModel" class="tabcontent">
<p>
<table>
<tr>
<td>
<p><img src="LinearRegression_files/figure-html/volcano-1.png" width="144" /></p>
</td>
<td style="text-align: center;padding-left:15px;">
<p><span class="math display">\[
Y_i = \overbrace{\underbrace{\beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} +
\beta_3 X_{1i}X_{2i}}_{E\{Y_i\}}}^\text{3D Model} + \epsilon_i
\]</span></p>
</td>
</tr>
</table>
<p>The so-called “3D” regression model uses two different quantitative
x-variables, an <span class="math inline">\(X_{1i}\)</span> and an <span
class="math inline">\(X_{2i}\)</span>. Unlike the two-lines model where
<span class="math inline">\(X_{2i}\)</span> could only be a 0 or a 1,
this <span class="math inline">\(X_{2i}\)</span> variable is
quantitative, and can take on any quantitative value.</p>
<table>
<colgroup>
<col width="12%" />
<col width="87%" />
</colgroup>
<thead>
<tr class="header">
<th>Parameter</th>
<th>Effect</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><span class="math inline">\(\beta_0\)</span></td>
<td>Y-intercept of the Model</td>
</tr>
<tr class="even">
<td><span class="math inline">\(\beta_1\)</span></td>
<td>Slope of the line in the <span class="math inline">\(X_1\)</span>
direction.</td>
</tr>
<tr class="odd">
<td><span class="math inline">\(\beta_2\)</span></td>
<td>Slope of the line in the <span class="math inline">\(X_2\)</span>
direction.</td>
</tr>
<tr class="even">
<td><span class="math inline">\(\beta_3\)</span></td>
<td>Interaction term that allows the model, which is a plane in
three-dimensional space, to “bend”. If this term is zero, then the
regression surface is just a flat plane.</td>
</tr>
</tbody>
</table>
<p><strong>An Example</strong></p>
<p>Here is what a 3D regression looks like when there is no interaction
term. The two x-variables of <code>Month</code> and <code>Temp</code>
are being used to predict the y-variable of <code>Ozone</code>.</p>
<p><span class="math display">\[
  \underbrace{Y_i}_\text{Ozone} \underbrace{=}_{\sim}
\overbrace{\beta_0}^{\text{y-int}} +
\overbrace{\beta_1}^{\stackrel{\text{slope in}}{\text{Temp direction}}}
\underbrace{X_{1i}}_\text{Temp} +
\overbrace{\beta_2}^{\stackrel{\text{slope in}}{\text{Month direction}}}
\underbrace{X_{2i}}_\text{Month} + \epsilon_i
\]</span></p>
<div class="sourceCode" id="cb61"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb61-1"><a href="#cb61-1" aria-hidden="true" tabindex="-1"></a>air_lm <span class="ot">&lt;-</span> <span class="fu">lm</span>(Ozone <span class="sc">~</span> Temp <span class="sc">+</span> Month, <span class="at">data=</span> airquality)</span>
<span id="cb61-2"><a href="#cb61-2" aria-hidden="true" tabindex="-1"></a><span class="fu">pander</span>(air_lm<span class="sc">$</span>coefficients)</span></code></pre></div>
<table style="width:43%;">
<colgroup>
<col width="19%" />
<col width="11%" />
<col width="12%" />
</colgroup>
<thead>
<tr class="header">
<th align="center">(Intercept)</th>
<th align="center">Temp</th>
<th align="center">Month</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center">-139.6</td>
<td align="center">2.659</td>
<td align="center">-3.522</td>
</tr>
</tbody>
</table>
<p>Notice how the slope, <span class="math inline">\(\beta_1\)</span>,
in the “Temp” direction is estimated to be 2.659 and the slope in the
“Month” direction, <span class="math inline">\(\beta_2\)</span>, is
estimated to be -3.522. Also, the y-intercept, <span
class="math inline">\(\beta_0\)</span>, is estimated to be -139.6.</p>
<div class="sourceCode" id="cb62"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb62-1"><a href="#cb62-1" aria-hidden="true" tabindex="-1"></a><span class="do">## Hint: library(car) has a scatterplot 3d function which is simple to use</span></span>
<span id="cb62-2"><a href="#cb62-2" aria-hidden="true" tabindex="-1"></a><span class="co">#  but the code should only be run in your console, not knit.</span></span>
<span id="cb62-3"><a href="#cb62-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb62-4"><a href="#cb62-4" aria-hidden="true" tabindex="-1"></a><span class="do">## library(car)</span></span>
<span id="cb62-5"><a href="#cb62-5" aria-hidden="true" tabindex="-1"></a><span class="do">## scatter3d(Y ~ X1 + X2, data=yourdata)</span></span>
<span id="cb62-6"><a href="#cb62-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb62-7"><a href="#cb62-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb62-8"><a href="#cb62-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb62-9"><a href="#cb62-9" aria-hidden="true" tabindex="-1"></a><span class="do">## To embed the 3d-scatterplot inside of your html document is harder.</span></span>
<span id="cb62-10"><a href="#cb62-10" aria-hidden="true" tabindex="-1"></a><span class="co">#library(plotly)</span></span>
<span id="cb62-11"><a href="#cb62-11" aria-hidden="true" tabindex="-1"></a><span class="co">#library(reshape2)</span></span>
<span id="cb62-12"><a href="#cb62-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb62-13"><a href="#cb62-13" aria-hidden="true" tabindex="-1"></a><span class="co">#Perform the multiple regression</span></span>
<span id="cb62-14"><a href="#cb62-14" aria-hidden="true" tabindex="-1"></a>air_lm <span class="ot">&lt;-</span> <span class="fu">lm</span>(Ozone <span class="sc">~</span> Temp <span class="sc">+</span> Month, <span class="at">data=</span> airquality)</span>
<span id="cb62-15"><a href="#cb62-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb62-16"><a href="#cb62-16" aria-hidden="true" tabindex="-1"></a><span class="co">#Graph Resolution (more important for more complex shapes)</span></span>
<span id="cb62-17"><a href="#cb62-17" aria-hidden="true" tabindex="-1"></a>graph_reso <span class="ot">&lt;-</span> <span class="fl">0.5</span></span>
<span id="cb62-18"><a href="#cb62-18" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb62-19"><a href="#cb62-19" aria-hidden="true" tabindex="-1"></a><span class="co">#Setup Axis</span></span>
<span id="cb62-20"><a href="#cb62-20" aria-hidden="true" tabindex="-1"></a>axis_x <span class="ot">&lt;-</span> <span class="fu">seq</span>(<span class="fu">min</span>(airquality<span class="sc">$</span>Temp), <span class="fu">max</span>(airquality<span class="sc">$</span>Temp), <span class="at">by =</span> graph_reso)</span>
<span id="cb62-21"><a href="#cb62-21" aria-hidden="true" tabindex="-1"></a>axis_y <span class="ot">&lt;-</span> <span class="fu">seq</span>(<span class="fu">min</span>(airquality<span class="sc">$</span>Month), <span class="fu">max</span>(airquality<span class="sc">$</span>Month), <span class="at">by =</span> graph_reso)</span>
<span id="cb62-22"><a href="#cb62-22" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb62-23"><a href="#cb62-23" aria-hidden="true" tabindex="-1"></a><span class="co">#Sample points</span></span>
<span id="cb62-24"><a href="#cb62-24" aria-hidden="true" tabindex="-1"></a>air_surface <span class="ot">&lt;-</span> <span class="fu">expand.grid</span>(<span class="at">Temp =</span> axis_x, <span class="at">Month =</span> axis_y, <span class="at">KEEP.OUT.ATTRS=</span>F)</span>
<span id="cb62-25"><a href="#cb62-25" aria-hidden="true" tabindex="-1"></a>air_surface<span class="sc">$</span>Z <span class="ot">&lt;-</span> <span class="fu">predict.lm</span>(air_lm, <span class="at">newdata =</span> air_surface)</span>
<span id="cb62-26"><a href="#cb62-26" aria-hidden="true" tabindex="-1"></a>air_surface <span class="ot">&lt;-</span> <span class="fu">acast</span>(air_surface, Month <span class="sc">~</span> Temp, <span class="at">value.var =</span> <span class="st">&quot;Z&quot;</span>) <span class="co">#y ~ x</span></span>
<span id="cb62-27"><a href="#cb62-27" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb62-28"><a href="#cb62-28" aria-hidden="true" tabindex="-1"></a><span class="co">#Create scatterplot</span></span>
<span id="cb62-29"><a href="#cb62-29" aria-hidden="true" tabindex="-1"></a><span class="fu">plot_ly</span>(airquality, </span>
<span id="cb62-30"><a href="#cb62-30" aria-hidden="true" tabindex="-1"></a>        <span class="at">x =</span> <span class="sc">~</span>Temp, </span>
<span id="cb62-31"><a href="#cb62-31" aria-hidden="true" tabindex="-1"></a>        <span class="at">y =</span> <span class="sc">~</span>Month, </span>
<span id="cb62-32"><a href="#cb62-32" aria-hidden="true" tabindex="-1"></a>        <span class="at">z =</span> <span class="sc">~</span>Ozone,</span>
<span id="cb62-33"><a href="#cb62-33" aria-hidden="true" tabindex="-1"></a>        <span class="at">text =</span> <span class="fu">rownames</span>(airquality), </span>
<span id="cb62-34"><a href="#cb62-34" aria-hidden="true" tabindex="-1"></a>        <span class="at">type =</span> <span class="st">&quot;scatter3d&quot;</span>, </span>
<span id="cb62-35"><a href="#cb62-35" aria-hidden="true" tabindex="-1"></a>        <span class="at">mode =</span> <span class="st">&quot;markers&quot;</span>) <span class="sc">%&gt;%</span></span>
<span id="cb62-36"><a href="#cb62-36" aria-hidden="true" tabindex="-1"></a>  <span class="fu">add_trace</span>(<span class="at">z =</span> air_surface,</span>
<span id="cb62-37"><a href="#cb62-37" aria-hidden="true" tabindex="-1"></a>            <span class="at">x =</span> axis_x,</span>
<span id="cb62-38"><a href="#cb62-38" aria-hidden="true" tabindex="-1"></a>            <span class="at">y =</span> axis_y,</span>
<span id="cb62-39"><a href="#cb62-39" aria-hidden="true" tabindex="-1"></a>            <span class="at">type =</span> <span class="st">&quot;surface&quot;</span>)</span></code></pre></div>
<div id="htmlwidget-68b773cb5ce1fdaa09b2" style="width:672px;height:480px;" class="plotly html-widget"></div>
<script type="application/json" data-for="htmlwidget-68b773cb5ce1fdaa09b2">{"x":{"visdat":{"6af02d7abb98":["function () ","plotlyVisDat"]},"cur_data":"6af02d7abb98","attrs":{"6af02d7abb98":{"x":{},"y":{},"z":{},"text":["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29","30","31","32","33","34","35","36","37","38","39","40","41","42","43","44","45","46","47","48","49","50","51","52","53","54","55","56","57","58","59","60","61","62","63","64","65","66","67","68","69","70","71","72","73","74","75","76","77","78","79","80","81","82","83","84","85","86","87","88","89","90","91","92","93","94","95","96","97","98","99","100","101","102","103","104","105","106","107","108","109","110","111","112","113","114","115","116","117","118","119","120","121","122","123","124","125","126","127","128","129","130","131","132","133","134","135","136","137","138","139","140","141","142","143","144","145","146","147","148","149","150","151","152","153"],"mode":"markers","alpha_stroke":1,"sizes":[10,100],"spans":[1,20],"type":"scatter3d"},"6af02d7abb98.1":{"x":[56,56.5,57,57.5,58,58.5,59,59.5,60,60.5,61,61.5,62,62.5,63,63.5,64,64.5,65,65.5,66,66.5,67,67.5,68,68.5,69,69.5,70,70.5,71,71.5,72,72.5,73,73.5,74,74.5,75,75.5,76,76.5,77,77.5,78,78.5,79,79.5,80,80.5,81,81.5,82,82.5,83,83.5,84,84.5,85,85.5,86,86.5,87,87.5,88,88.5,89,89.5,90,90.5,91,91.5,92,92.5,93,93.5,94,94.5,95,95.5,96,96.5,97],"y":[5,5.5,6,6.5,7,7.5,8,8.5,9],"z":[[-8.2930156346032753,-6.9632802100885236,-5.6335447855737719,-4.3038093610590202,-2.974073936544297,-1.6443385120295453,-0.3146030875147936,1.0151323369999581,2.3448677615147098,3.6746031860294615,5.0043386105442131,6.3340740350589648,7.6638094595737165,8.9935448840884682,10.32328030860322,11.653015733117972,12.982751157632723,14.312486582147475,15.642222006662227,16.971957431176978,18.30169285569173,19.631428280206482,20.961163704721233,22.290899129235985,23.620634553750737,24.950369978
265488,26.28010540278024,27.609840827294992,28.939576251809743,30.269311676324467,31.599047100839218,32.928782525353967,34.258517949868718,35.58825337438347,36.917988798898222,38.247724223412973,39.577459647927725,40.907195072442477,42.236930496957228,43.56666592147198,44.896401345986732,46.226136770501483,47.555872195016235,48.885607619530987,50.215343044045738,51.54507846856049,52.874813893075242,54.204549317589994,55.534284742104745,56.864020166619497,58.193755591134249,59.523491015649,60.853226440163752,62.182961864678504,63.512697289193255,64.842432713707979,66.17216813822273,67.501903562737482,68.831638987252234,70.161374411766985,71.491109836281737,72.820845260796489,74.15058068531124,75.480316109825992,76.810051534340744,78.139786958855495,79.469522383370247,80.799257807884999,82.12899323239975,83.458728656914502,84.788464081429254,86.118199505944006,87.447934930458757,88.777670354973509,90.107405779488261,91.437141204003012,92.766876628517764,94.096612053032516,95.426347477547267,96.756082902062019,98.085818326576742,99.415553751091494,100.74528917560625],[-10.053971968669664,-8.7242365441549126,-7.3945011196401609,-6.0647656951254092,-4.7350302706106859,-3.4052948460959342,-2.0755594215811826,-0.74582399706643088,0.58391142744832081,1.9136468519630725,3.2433822764778242,4.5731177009925759,5.9028531255073275,7.2325885500220792,8.5623239745368309,9.8920593990515826,11.221794823566334,12.551530248081086,13.881265672595838,15.211001097110589,16.540736521625341,17.870471946140093,19.200207370654844,20.529942795169596,21.859678219684348,23.189413644199099,24.519149068713851,25.848884493228603,27.178619917743355,28.508355342258078,29.838090766772829,31.167826191287581,32.497561615802333,33.827297040317085,35.157032464831836,36.486767889346588,37.81650331386134,39.146238738376091,40.475974162890843,41.805709587405595,43.135445011920346,44.465180436435098,45.79491586094985,47.124651285464601,48.454386709979353,49.784122134494105,51.113857559008856,52.44359298352360
8,53.77332840803836,55.103063832553111,56.432799257067863,57.762534681582615,59.092270106097367,60.422005530612118,61.75174095512687,63.081476379641593,64.411211804156352,65.740947228671104,67.070682653185855,68.400418077700607,69.730153502215359,71.05988892673011,72.389624351244862,73.719359775759614,75.049095200274365,76.378830624789117,77.708566049303869,79.03830147381862,80.368036898333372,81.697772322848124,83.027507747362876,84.357243171877627,85.686978596392379,87.016714020907131,88.346449445421882,89.676184869936634,91.005920294451386,92.335655718966137,93.665391143480889,94.995126567995641,96.324861992510364,97.654597417025116,98.984332841539867],[-11.814928302736053,-10.485192878221302,-9.1554574537065498,-7.8257220291917982,-6.4959866046770749,-5.1662511801623232,-3.8365157556475715,-2.5067803311328198,-1.1770449066180682,0.15269051789668353,1.4824259424114352,2.8121613669261869,4.1418967914409386,5.4716322159556903,6.801367640470442,8.1311030649851936,9.4608384894999453,10.790573914014697,12.120309338529449,13.4500447630442,14.779780187558952,16.109515612073704,17.439251036588455,18.768986461103207,20.098721885617959,21.42845731013271,22.758192734647462,24.087928159162214,25.417663583676966,26.747399008191689,28.07713443270644,29.406869857221192,30.736605281735944,32.066340706250699,33.396076130765451,34.725811555280202,36.055546979794954,37.385282404309706,38.715017828824458,40.044753253339209,41.374488677853961,42.704224102368713,44.033959526883464,45.363694951398216,46.693430375912968,48.023165800427719,49.352901224942471,50.682636649457223,52.012372073971974,53.342107498486726,54.671842923001478,56.001578347516229,57.331313772030981,58.661049196545733,59.990784621060484,61.320520045575208,62.650255470089959,63.979990894604711,65.309726319119463,66.639461743634214,67.969197168148966,69.298932592663718,70.62866801717847,71.958403441693221,73.288138866207973,74.617874290722725,75.947609715237476,77.277345139752228,78.60708056426698,79.936815988781731,81
.266551413296483,82.596286837811235,83.926022262325986,85.255757686840738,86.58549311135549,87.915228535870241,89.244963960384993,90.574699384899745,91.904434809414496,93.234170233929248,94.563905658443971,95.893641082958723,97.223376507473475],[-13.575884636802442,-12.24614921228769,-10.916413787772939,-9.5866783632581871,-8.2569429387434639,-6.9272075142287122,-5.5974720897139605,-4.2677366651992088,-2.9380012406844571,-1.6082658161697054,-0.27853039165495375,1.0512050328597979,2.3809404573745496,3.7106758818893013,5.040411306404053,6.3701467309188047,7.6998821554335564,9.029617579948308,10.35935300446306,11.689088428977811,13.018823853492563,14.348559278007315,15.678294702522066,17.008030127036818,18.33776555155157,19.667500976066322,20.997236400581073,22.326971825095825,23.656707249610577,24.9864426741253,26.316178098640052,27.645913523154803,28.975648947669555,30.305384372184307,31.635119796699058,32.96485522121381,34.294590645728562,35.624326070243313,36.954061494758065,38.283796919272817,39.613532343787568,40.94326776830232,42.273003192817072,43.602738617331823,44.932474041846575,46.262209466361327,47.591944890876078,48.92168031539083,50.251415739905582,51.581151164420334,52.910886588935085,54.240622013449837,55.570357437964589,56.90009286247934,58.229828286994092,59.559563711508815,60.889299136023567,62.219034560538319,63.54876998505307,64.878505409567822,66.208240834082574,67.537976258597325,68.867711683112077,70.197447107626829,71.52718253214158,72.856917956656332,74.186653381171084,75.516388805685835,76.846124230200587,78.175859654715339,79.505595079230091,80.835330503744842,82.165065928259594,83.494801352774346,84.824536777289097,86.154272201803849,87.484007626318601,88.813743050833352,90.143478475348104,91.473213899862856,92.802949324377579,94.132684748892331,95.462420173407082],[-15.336840970868831,-14.007105546354079,-12.677370121839328,-11.347634697324576,-10.017899272809853,-8.6881638482951011,-7.3584284237803494,-6.0286929992655978,-4.6989575747508
461,-3.3692221502360944,-2.0394867257213427,-0.70975130120659102,0.61998412330816066,1.9497195478229123,3.279454972337664,4.6091903968524157,5.9389258213671674,7.2686612458819191,8.5983966703966708,9.9281320949114225,11.257867519426174,12.587602943940926,13.917338368455678,15.247073792970429,16.576809217485181,17.906544641999933,19.236280066514684,20.566015491029436,21.895750915544188,23.225486340058911,24.555221764573663,25.884957189088414,27.214692613603166,28.544428038117918,29.874163462632669,31.203898887147421,32.533634311662169,33.863369736176921,35.193105160691672,36.522840585206424,37.852576009721176,39.182311434235928,40.512046858750679,41.841782283265431,43.171517707780183,44.501253132294934,45.830988556809686,47.160723981324438,48.490459405839189,49.820194830353941,51.149930254868693,52.479665679383444,53.809401103898196,55.139136528412948,56.468871952927699,57.798607377442423,59.128342801957174,60.458078226471926,61.787813650986678,63.117549075501429,64.447284500016181,65.777019924530933,67.106755349045685,68.436490773560436,69.766226198075188,71.09596162258994,72.425697047104691,73.755432471619443,75.085167896134195,76.414903320648946,77.744638745163698,79.07437416967845,80.404109594193201,81.733845018707953,83.063580443222705,84.393315867737456,85.723051292252208,87.05278671676696,88.382522141281711,89.712257565796463,91.041992990311186,92.371728414825938,93.70146383934069],[-17.09779730493522,-15.768061880420468,-14.438326455905717,-13.108591031390965,-11.778855606876242,-10.44912018236149,-9.1193847578467384,-7.7896493333319867,-6.459913908817235,-5.1301784843024834,-3.8004430597877317,-2.47070763527298,-1.1409722107582283,0.18876321375652338,1.5184986382712751,2.8482340627860268,4.1779694873007784,5.5077049118155301,6.8374403363302818,8.1671757608450335,9.4969111853597852,10.826646609874537,12.156382034389289,13.48611745890404,14.815852883418792,16.145588307933544,17.475323732448295,18.805059156963047,20.134794581477799,21.464530005992522,22.7942654
30507274,24.124000855022025,25.453736279536777,26.783471704051529,28.11320712856628,29.442942553081032,30.772677977595784,32.102413402110535,33.432148826625287,34.761884251140039,36.09161967565479,37.421355100169542,38.751090524684294,40.080825949199046,41.410561373713797,42.740296798228549,44.070032222743301,45.399767647258052,46.729503071772804,48.059238496287556,49.388973920802307,50.718709345317059,52.048444769831811,53.378180194346562,54.707915618861314,56.037651043376037,57.367386467890789,58.697121892405541,60.026857316920292,61.356592741435044,62.686328165949796,64.01606359046454,65.345799014979292,66.675534439494044,68.005269864008795,69.335005288523547,70.664740713038299,71.99447613755305,73.324211562067802,74.653946986582554,75.983682411097305,77.313417835612057,78.643153260126809,79.972888684641561,81.302624109156312,82.632359533671064,83.962094958185816,85.291830382700567,86.621565807215319,87.951301231730071,89.281036656244794,90.610772080759546,91.940507505274297],[-18.858753639001609,-17.529018214486857,-16.199282789972106,-14.869547365457354,-13.539811940942631,-12.210076516427879,-10.880341091913127,-9.5506056673983757,-8.220870242883624,-6.8911348183688723,-5.5613993938541206,-4.2316639693393689,-2.9019285448246173,-1.5721931203098656,-0.24245769579511389,1.0872777287196378,2.4170131532343895,3.7467485777491412,5.0764840022638928,6.4062194267786445,7.7359548512933962,9.0656902758081479,10.3954257003229,11.725161124837651,13.054896549352403,14.384631973867155,15.714367398381906,17.044102822896658,18.37383824741141,19.703573671926133,21.033309096440885,22.363044520955636,23.692779945470388,25.02251536998514,26.352250794499891,27.681986219014643,29.011721643529395,30.341457068044146,31.671192492558898,33.000927917073653,34.330663341588405,35.660398766103157,36.990134190617908,38.31986961513266,39.649605039647412,40.979340464162163,42.309075888676915,43.638811313191667,44.968546737706419,46.29828216222117,47.628017586735922,48.957753011250674,50.28748
8435765425,51.617223860280177,52.946959284794929,54.276694709309652,55.606430133824404,56.936165558339155,58.265900982853907,59.595636407368659,60.92537183188341,62.255107256398162,63.584842680912914,64.914578105427665,66.244313529942417,67.574048954457169,68.90378437897192,70.233519803486672,71.563255228001424,72.892990652516175,74.222726077030927,75.552461501545679,76.882196926060431,78.211932350575182,79.541667775089934,80.871403199604686,82.201138624119437,83.530874048634189,84.860609473148941,86.190344897663692,87.520080322178416,88.849815746693167,90.179551171207919],[-20.619709973067998,-19.289974548553246,-17.960239124038495,-16.630503699523743,-15.30076827500902,-13.971032850494268,-12.641297425979516,-11.311562001464765,-9.981826576950013,-8.6520911524352613,-7.3223557279205096,-5.9926203034057579,-4.6628848788910062,-3.3331494543762545,-2.0034140298615029,-0.67367860534675117,0.65605681916800052,1.9857922436827522,3.3155276681975039,4.6452630927122556,5.9749985172270073,7.3047339417417589,8.6344693662565106,9.9642047907712623,11.293940215286014,12.623675639800766,13.953411064315517,15.283146488830269,16.612881913345021,17.942617337859744,19.272352762374496,20.602088186889247,21.931823611403999,23.261559035918751,24.591294460433502,25.921029884948254,27.250765309463006,28.580500733977757,29.910236158492509,31.239971583007261,32.569707007522013,33.899442432036764,35.229177856551516,36.558913281066268,37.888648705581019,39.218384130095771,40.548119554610523,41.877854979125274,43.207590403640026,44.537325828154778,45.867061252669529,47.196796677184281,48.526532101699033,49.856267526213784,51.186002950728536,52.515738375243259,53.845473799758011,55.175209224272763,56.504944648787514,57.834680073302266,59.164415497817018,60.494150922331769,61.823886346846521,63.153621771361273,64.483357195876025,65.813092620390776,67.142828044905528,68.47256346942028,69.802298893935031,71.132034318449783,72.461769742964535,73.791505167479286,75.121240591994038,76.45097601650879
,77.780711441023541,79.110446865538293,80.440182290053045,81.769917714567796,83.099653139082548,84.4293885635973,85.759123988112023,87.088859412626775,88.418594837141526],[-22.380666307134387,-21.050930882619635,-19.721195458104884,-18.391460033590132,-17.061724609075409,-15.731989184560657,-14.402253760045905,-13.072518335531154,-11.742782911016402,-10.41304748650165,-9.0833120619868986,-7.7535766374721469,-6.4238412129573952,-5.0941057884426435,-3.7643703639278918,-2.4346349394131401,-1.1048995148983884,0.22483590961636324,1.5545713341311149,2.8843067586458666,4.2140421831606183,5.54377760767537,6.8735130321901217,8.2032484567048733,9.532983881219625,10.862719305734377,12.192454730249128,13.52219015476388,14.851925579278632,16.181661003793355,17.511396428308107,18.841131852822858,20.17086727733761,21.500602701852362,22.830338126367113,24.160073550881865,25.489808975396617,26.819544399911369,28.14927982442612,29.479015248940872,30.808750673455624,32.138486097970372,33.468221522485123,34.797956946999875,36.127692371514627,37.457427796029378,38.78716322054413,40.116898645058882,41.446634069573633,42.776369494088385,44.106104918603137,45.435840343117889,46.76557576763264,48.095311192147392,49.425046616662144,50.754782041176867,52.084517465691619,53.41425289020637,54.743988314721122,56.073723739235874,57.403459163750625,58.733194588265377,60.062930012780129,61.39266543729488,62.722400861809632,64.052136286324384,65.381871710839135,66.711607135353887,68.041342559868639,69.37107798438339,70.700813408898142,72.030548833412894,73.360284257927646,74.690019682442397,76.019755106957149,77.349490531471901,78.679225955986652,80.008961380501404,81.338696805016156,82.668432229530907,83.998167654045631,85.327903078560382,86.657638503075134]],"text":["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29","30","31","32","33","34","35","36","37","38","39","40","41","42","43","44","45","46","47","48","49"
,"50","51","52","53","54","55","56","57","58","59","60","61","62","63","64","65","66","67","68","69","70","71","72","73","74","75","76","77","78","79","80","81","82","83","84","85","86","87","88","89","90","91","92","93","94","95","96","97","98","99","100","101","102","103","104","105","106","107","108","109","110","111","112","113","114","115","116","117","118","119","120","121","122","123","124","125","126","127","128","129","130","131","132","133","134","135","136","137","138","139","140","141","142","143","144","145","146","147","148","149","150","151","152","153"],"mode":"markers","alpha_stroke":1,"sizes":[10,100],"spans":[1,20],"type":"surface","inherit":true}},"layout":{"margin":{"b":40,"l":60,"t":25,"r":10},"scene":{"xaxis":{"title":"Temp"},"yaxis":{"title":"Month"},"zaxis":{"title":"Ozone"}},"hovermode":"closest","showlegend":false,"legend":{"yanchor":"top","y":0.5}},"source":"A","config":{"modeBarButtonsToAdd":["hoverclosest","hovercompare"],"showSendToCloud":false},"data":[{"x":[67,72,74,62,66,65,59,61,74,69,66,68,58,64,66,57,68,62,59,73,61,61,67,81,79,76,82,90,87,82,77,72,65,73,76,84,85,81,83,83,88,92,92,89,73,81,80,81,82,84,87,85,74,86,85,82,86,88,86,83,81,81,81,82,86,85,87,89,90,90,86,82,80,77,79,76,78,78,77,72,79,81,86,97,94,96,94,91,92,93,93,87,84,80,78,75,73,81,76,77,71,71,78,67,76,68,82,64,71,81,69,63,70,75,76,68],"y":[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9],"z":[41,36,12,18,28,23,19,8,7,16,11,14,18,14,34,6,30,11,1,11,4,32,23,45,115,37,29,71,39,23,21,37,20,12,13,135,49,32,64,40,77,97,97,85,10,27,7,48,35,61,79,63,16,80,108,20,52,82,50,64,59,39,9,16,78,35,66,122,89,110,44,28,65,22,59,23,31,44,21,9,45,168,73,76,118,84,85,96,78,73,91,47,32,20,23,21,24,44,21,28,9,13,46,18,13,24,16,13,23,36,7,14,30,14,18,20],"text":["1","2","3","4","6","7","8","9","11","12","13","1
4","15","16","17","18","19","20","21","22","23","24","28","29","30","31","38","40","41","44","47","48","49","50","51","62","63","64","66","67","68","69","70","71","73","74","76","77","78","79","80","81","82","85","86","87","88","89","90","91","92","93","94","95","96","97","98","99","100","101","104","105","106","108","109","110","111","112","113","114","116","117","118","120","121","122","123","124","125","126","127","128","129","130","131","132","133","134","135","136","137","138","139","140","141","142","143","144","145","146","147","148","149","151","152","153"],"mode":"markers","type":"scatter3d","marker":{"color":"rgba(31,119,180,1)","line":{"color":"rgba(31,119,180,1)"}},"error_y":{"color":"rgba(31,119,180,1)"},"error_x":{"color":"rgba(31,119,180,1)"},"line":{"color":"rgba(31,119,180,1)"},"frame":null},{"colorbar":{"title":"Ozone","ticklen":2,"len":0.5,"lenmode":"fraction","y":1,"yanchor":"top"},"colorscale":[["0","rgba(68,1,84,1)"],["0.0416666666666667","rgba(70,19,97,1)"],["0.0833333333333333","rgba(72,32,111,1)"],["0.125","rgba(71,45,122,1)"],["0.166666666666667","rgba(68,58,128,1)"],["0.208333333333333","rgba(64,70,135,1)"],["0.25","rgba(60,82,138,1)"],["0.291666666666667","rgba(56,93,140,1)"],["0.333333333333333","rgba(49,104,142,1)"],["0.375","rgba(46,114,142,1)"],["0.416666666666667","rgba(42,123,142,1)"],["0.458333333333333","rgba(38,133,141,1)"],["0.5","rgba(37,144,140,1)"],["0.541666666666667","rgba(33,154,138,1)"],["0.583333333333333","rgba(39,164,133,1)"],["0.625","rgba(47,174,127,1)"],["0.666666666666667","rgba(53,183,121,1)"],["0.708333333333333","rgba(79,191,110,1)"],["0.75","rgba(98,199,98,1)"],["0.791666666666667","rgba(119,207,85,1)"],["0.833333333333333","rgba(147,214,70,1)"],["0.875","rgba(172,220,52,1)"],["0.916666666666667","rgba(199,225,42,1)"],["0.958333333333333","rgba(226,228,40,1)"],["1","rgba(253,231,37,1)"]],"showscale":true,"x":[56,56.5,57,57.5,58,58.5,59,59.5,60,60.5,61,61.5,62,62.5,63,63.5,64,64.5,65,65.5,66,66.5,67,67.5,68,68.5
,69,69.5,70,70.5,71,71.5,72,72.5,73,73.5,74,74.5,75,75.5,76,76.5,77,77.5,78,78.5,79,79.5,80,80.5,81,81.5,82,82.5,83,83.5,84,84.5,85,85.5,86,86.5,87,87.5,88,88.5,89,89.5,90,90.5,91,91.5,92,92.5,93,93.5,94,94.5,95,95.5,96,96.5,97],"y":[5,5.5,6,6.5,7,7.5,8,8.5,9],"z":[[-8.2930156346032753,-6.9632802100885236,-5.6335447855737719,-4.3038093610590202,-2.974073936544297,-1.6443385120295453,-0.3146030875147936,1.0151323369999581,2.3448677615147098,3.6746031860294615,5.0043386105442131,6.3340740350589648,7.6638094595737165,8.9935448840884682,10.32328030860322,11.653015733117972,12.982751157632723,14.312486582147475,15.642222006662227,16.971957431176978,18.30169285569173,19.631428280206482,20.961163704721233,22.290899129235985,23.620634553750737,24.950369978265488,26.28010540278024,27.609840827294992,28.939576251809743,30.269311676324467,31.599047100839218,32.928782525353967,34.258517949868718,35.58825337438347,36.917988798898222,38.247724223412973,39.577459647927725,40.907195072442477,42.236930496957228,43.56666592147198,44.896401345986732,46.226136770501483,47.555872195016235,48.885607619530987,50.215343044045738,51.54507846856049,52.874813893075242,54.204549317589994,55.534284742104745,56.864020166619497,58.193755591134249,59.523491015649,60.853226440163752,62.182961864678504,63.512697289193255,64.842432713707979,66.17216813822273,67.501903562737482,68.831638987252234,70.161374411766985,71.491109836281737,72.820845260796489,74.15058068531124,75.480316109825992,76.810051534340744,78.139786958855495,79.469522383370247,80.799257807884999,82.12899323239975,83.458728656914502,84.788464081429254,86.118199505944006,87.447934930458757,88.777670354973509,90.107405779488261,91.437141204003012,92.766876628517764,94.096612053032516,95.426347477547267,96.756082902062019,98.085818326576742,99.415553751091494,100.74528917560625],[-10.053971968669664,-8.7242365441549126,-7.3945011196401609,-6.0647656951254092,-4.7350302706106859,-3.4052948460959342,-2.0755594215811826,-0.74582399706643088
,0.58391142744832081,1.9136468519630725,3.2433822764778242,4.5731177009925759,5.9028531255073275,7.2325885500220792,8.5623239745368309,9.8920593990515826,11.221794823566334,12.551530248081086,13.881265672595838,15.211001097110589,16.540736521625341,17.870471946140093,19.200207370654844,20.529942795169596,21.859678219684348,23.189413644199099,24.519149068713851,25.848884493228603,27.178619917743355,28.508355342258078,29.838090766772829,31.167826191287581,32.497561615802333,33.827297040317085,35.157032464831836,36.486767889346588,37.81650331386134,39.146238738376091,40.475974162890843,41.805709587405595,43.135445011920346,44.465180436435098,45.79491586094985,47.124651285464601,48.454386709979353,49.784122134494105,51.113857559008856,52.443592983523608,53.77332840803836,55.103063832553111,56.432799257067863,57.762534681582615,59.092270106097367,60.422005530612118,61.75174095512687,63.081476379641593,64.411211804156352,65.740947228671104,67.070682653185855,68.400418077700607,69.730153502215359,71.05988892673011,72.389624351244862,73.719359775759614,75.049095200274365,76.378830624789117,77.708566049303869,79.03830147381862,80.368036898333372,81.697772322848124,83.027507747362876,84.357243171877627,85.686978596392379,87.016714020907131,88.346449445421882,89.676184869936634,91.005920294451386,92.335655718966137,93.665391143480889,94.995126567995641,96.324861992510364,97.654597417025116,98.984332841539867],[-11.814928302736053,-10.485192878221302,-9.1554574537065498,-7.8257220291917982,-6.4959866046770749,-5.1662511801623232,-3.8365157556475715,-2.5067803311328198,-1.1770449066180682,0.15269051789668353,1.4824259424114352,2.8121613669261869,4.1418967914409386,5.4716322159556903,6.801367640470442,8.1311030649851936,9.4608384894999453,10.790573914014697,12.120309338529449,13.4500447630442,14.779780187558952,16.109515612073704,17.439251036588455,18.768986461103207,20.098721885617959,21.42845731013271,22.758192734647462,24.087928159162214,25.417663583676966,26.747399008191689,2
8.07713443270644,29.406869857221192,30.736605281735944,32.066340706250699,33.396076130765451,34.725811555280202,36.055546979794954,37.385282404309706,38.715017828824458,40.044753253339209,41.374488677853961,42.704224102368713,44.033959526883464,45.363694951398216,46.693430375912968,48.023165800427719,49.352901224942471,50.682636649457223,52.012372073971974,53.342107498486726,54.671842923001478,56.001578347516229,57.331313772030981,58.661049196545733,59.990784621060484,61.320520045575208,62.650255470089959,63.979990894604711,65.309726319119463,66.639461743634214,67.969197168148966,69.298932592663718,70.62866801717847,71.958403441693221,73.288138866207973,74.617874290722725,75.947609715237476,77.277345139752228,78.60708056426698,79.936815988781731,81.266551413296483,82.596286837811235,83.926022262325986,85.255757686840738,86.58549311135549,87.915228535870241,89.244963960384993,90.574699384899745,91.904434809414496,93.234170233929248,94.563905658443971,95.893641082958723,97.223376507473475],[-13.575884636802442,-12.24614921228769,-10.916413787772939,-9.5866783632581871,-8.2569429387434639,-6.9272075142287122,-5.5974720897139605,-4.2677366651992088,-2.9380012406844571,-1.6082658161697054,-0.27853039165495375,1.0512050328597979,2.3809404573745496,3.7106758818893013,5.040411306404053,6.3701467309188047,7.6998821554335564,9.029617579948308,10.35935300446306,11.689088428977811,13.018823853492563,14.348559278007315,15.678294702522066,17.008030127036818,18.33776555155157,19.667500976066322,20.997236400581073,22.326971825095825,23.656707249610577,24.9864426741253,26.316178098640052,27.645913523154803,28.975648947669555,30.305384372184307,31.635119796699058,32.96485522121381,34.294590645728562,35.624326070243313,36.954061494758065,38.283796919272817,39.613532343787568,40.94326776830232,42.273003192817072,43.602738617331823,44.932474041846575,46.262209466361327,47.591944890876078,48.92168031539083,50.251415739905582,51.581151164420334,52.910886588935085,54.240622013449837,55.570
357437964589,56.90009286247934,58.229828286994092,59.559563711508815,60.889299136023567,62.219034560538319,63.54876998505307,64.878505409567822,66.208240834082574,67.537976258597325,68.867711683112077,70.197447107626829,71.52718253214158,72.856917956656332,74.186653381171084,75.516388805685835,76.846124230200587,78.175859654715339,79.505595079230091,80.835330503744842,82.165065928259594,83.494801352774346,84.824536777289097,86.154272201803849,87.484007626318601,88.813743050833352,90.143478475348104,91.473213899862856,92.802949324377579,94.132684748892331,95.462420173407082],[-15.336840970868831,-14.007105546354079,-12.677370121839328,-11.347634697324576,-10.017899272809853,-8.6881638482951011,-7.3584284237803494,-6.0286929992655978,-4.6989575747508461,-3.3692221502360944,-2.0394867257213427,-0.70975130120659102,0.61998412330816066,1.9497195478229123,3.279454972337664,4.6091903968524157,5.9389258213671674,7.2686612458819191,8.5983966703966708,9.9281320949114225,11.257867519426174,12.587602943940926,13.917338368455678,15.247073792970429,16.576809217485181,17.906544641999933,19.236280066514684,20.566015491029436,21.895750915544188,23.225486340058911,24.555221764573663,25.884957189088414,27.214692613603166,28.544428038117918,29.874163462632669,31.203898887147421,32.533634311662169,33.863369736176921,35.193105160691672,36.522840585206424,37.852576009721176,39.182311434235928,40.512046858750679,41.841782283265431,43.171517707780183,44.501253132294934,45.830988556809686,47.160723981324438,48.490459405839189,49.820194830353941,51.149930254868693,52.479665679383444,53.809401103898196,55.139136528412948,56.468871952927699,57.798607377442423,59.128342801957174,60.458078226471926,61.787813650986678,63.117549075501429,64.447284500016181,65.777019924530933,67.106755349045685,68.436490773560436,69.766226198075188,71.09596162258994,72.425697047104691,73.755432471619443,75.085167896134195,76.414903320648946,77.744638745163698,79.07437416967845,80.404109594193201,81.733845018707953,8
3.063580443222705,84.393315867737456,85.723051292252208,87.05278671676696,88.382522141281711,89.712257565796463,91.041992990311186,92.371728414825938,93.70146383934069],[-17.09779730493522,-15.768061880420468,-14.438326455905717,-13.108591031390965,-11.778855606876242,-10.44912018236149,-9.1193847578467384,-7.7896493333319867,-6.459913908817235,-5.1301784843024834,-3.8004430597877317,-2.47070763527298,-1.1409722107582283,0.18876321375652338,1.5184986382712751,2.8482340627860268,4.1779694873007784,5.5077049118155301,6.8374403363302818,8.1671757608450335,9.4969111853597852,10.826646609874537,12.156382034389289,13.48611745890404,14.815852883418792,16.145588307933544,17.475323732448295,18.805059156963047,20.134794581477799,21.464530005992522,22.794265430507274,24.124000855022025,25.453736279536777,26.783471704051529,28.11320712856628,29.442942553081032,30.772677977595784,32.102413402110535,33.432148826625287,34.761884251140039,36.09161967565479,37.421355100169542,38.751090524684294,40.080825949199046,41.410561373713797,42.740296798228549,44.070032222743301,45.399767647258052,46.729503071772804,48.059238496287556,49.388973920802307,50.718709345317059,52.048444769831811,53.378180194346562,54.707915618861314,56.037651043376037,57.367386467890789,58.697121892405541,60.026857316920292,61.356592741435044,62.686328165949796,64.01606359046454,65.345799014979292,66.675534439494044,68.005269864008795,69.335005288523547,70.664740713038299,71.99447613755305,73.324211562067802,74.653946986582554,75.983682411097305,77.313417835612057,78.643153260126809,79.972888684641561,81.302624109156312,82.632359533671064,83.962094958185816,85.291830382700567,86.621565807215319,87.951301231730071,89.281036656244794,90.610772080759546,91.940507505274297],[-18.858753639001609,-17.529018214486857,-16.199282789972106,-14.869547365457354,-13.539811940942631,-12.210076516427879,-10.880341091913127,-9.5506056673983757,-8.220870242883624,-6.8911348183688723,-5.5613993938541206,-4.2316639693393689,-2.90192
85448246173,-1.5721931203098656,-0.24245769579511389,1.0872777287196378,2.4170131532343895,3.7467485777491412,5.0764840022638928,6.4062194267786445,7.7359548512933962,9.0656902758081479,10.3954257003229,11.725161124837651,13.054896549352403,14.384631973867155,15.714367398381906,17.044102822896658,18.37383824741141,19.703573671926133,21.033309096440885,22.363044520955636,23.692779945470388,25.02251536998514,26.352250794499891,27.681986219014643,29.011721643529395,30.341457068044146,31.671192492558898,33.000927917073653,34.330663341588405,35.660398766103157,36.990134190617908,38.31986961513266,39.649605039647412,40.979340464162163,42.309075888676915,43.638811313191667,44.968546737706419,46.29828216222117,47.628017586735922,48.957753011250674,50.287488435765425,51.617223860280177,52.946959284794929,54.276694709309652,55.606430133824404,56.936165558339155,58.265900982853907,59.595636407368659,60.92537183188341,62.255107256398162,63.584842680912914,64.914578105427665,66.244313529942417,67.574048954457169,68.90378437897192,70.233519803486672,71.563255228001424,72.892990652516175,74.222726077030927,75.552461501545679,76.882196926060431,78.211932350575182,79.541667775089934,80.871403199604686,82.201138624119437,83.530874048634189,84.860609473148941,86.190344897663692,87.520080322178416,88.849815746693167,90.179551171207919],[-20.619709973067998,-19.289974548553246,-17.960239124038495,-16.630503699523743,-15.30076827500902,-13.971032850494268,-12.641297425979516,-11.311562001464765,-9.981826576950013,-8.6520911524352613,-7.3223557279205096,-5.9926203034057579,-4.6628848788910062,-3.3331494543762545,-2.0034140298615029,-0.67367860534675117,0.65605681916800052,1.9857922436827522,3.3155276681975039,4.6452630927122556,5.9749985172270073,7.3047339417417589,8.6344693662565106,9.9642047907712623,11.293940215286014,12.623675639800766,13.953411064315517,15.283146488830269,16.612881913345021,17.942617337859744,19.272352762374496,20.602088186889247,21.931823611403999,23.261559035918751
,24.591294460433502,25.921029884948254,27.250765309463006,28.580500733977757,29.910236158492509,31.239971583007261,32.569707007522013,33.899442432036764,35.229177856551516,36.558913281066268,37.888648705581019,39.218384130095771,40.548119554610523,41.877854979125274,43.207590403640026,44.537325828154778,45.867061252669529,47.196796677184281,48.526532101699033,49.856267526213784,51.186002950728536,52.515738375243259,53.845473799758011,55.175209224272763,56.504944648787514,57.834680073302266,59.164415497817018,60.494150922331769,61.823886346846521,63.153621771361273,64.483357195876025,65.813092620390776,67.142828044905528,68.47256346942028,69.802298893935031,71.132034318449783,72.461769742964535,73.791505167479286,75.121240591994038,76.45097601650879,77.780711441023541,79.110446865538293,80.440182290053045,81.769917714567796,83.099653139082548,84.4293885635973,85.759123988112023,87.088859412626775,88.418594837141526],[-22.380666307134387,-21.050930882619635,-19.721195458104884,-18.391460033590132,-17.061724609075409,-15.731989184560657,-14.402253760045905,-13.072518335531154,-11.742782911016402,-10.41304748650165,-9.0833120619868986,-7.7535766374721469,-6.4238412129573952,-5.0941057884426435,-3.7643703639278918,-2.4346349394131401,-1.1048995148983884,0.22483590961636324,1.5545713341311149,2.8843067586458666,4.2140421831606183,5.54377760767537,6.8735130321901217,8.2032484567048733,9.532983881219625,10.862719305734377,12.192454730249128,13.52219015476388,14.851925579278632,16.181661003793355,17.511396428308107,18.841131852822858,20.17086727733761,21.500602701852362,22.830338126367113,24.160073550881865,25.489808975396617,26.819544399911369,28.14927982442612,29.479015248940872,30.808750673455624,32.138486097970372,33.468221522485123,34.797956946999875,36.127692371514627,37.457427796029378,38.78716322054413,40.116898645058882,41.446634069573633,42.776369494088385,44.106104918603137,45.435840343117889,46.76557576763264,48.095311192147392,49.425046616662144,50.7547820411768
67,52.084517465691619,53.41425289020637,54.743988314721122,56.073723739235874,57.403459163750625,58.733194588265377,60.062930012780129,61.39266543729488,62.722400861809632,64.052136286324384,65.381871710839135,66.711607135353887,68.041342559868639,69.37107798438339,70.700813408898142,72.030548833412894,73.360284257927646,74.690019682442397,76.019755106957149,77.349490531471901,78.679225955986652,80.008961380501404,81.338696805016156,82.668432229530907,83.998167654045631,85.327903078560382,86.657638503075134]],"text":["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29","30","31","32","33","34","35","36","37","38","39","40","41","42","43","44","45","46","47","48","49","50","51","52","53","54","55","56","57","58","59","60","61","62","63","64","65","66","67","68","69","70","71","72","73","74","75","76","77","78","79","80","81","82","83","84","85","86","87","88","89","90","91","92","93","94","95","96","97","98","99","100","101","102","103","104","105","106","107","108","109","110","111","112","113","114","115","116","117","118","119","120","121","122","123","124","125","126","127","128","129","130","131","132","133","134","135","136","137","138","139","140","141","142","143","144","145","146","147","148","149","150","151","152","153"],"mode":"markers","type":"surface","frame":null}],"highlight":{"on":"plotly_click","persistent":false,"dynamic":false,"selectize":false,"opacityDim":0.20000000000000001,"selected":{"opacity":1},"debounce":0},"shinyEvents":["plotly_hover","plotly_click","plotly_selected","plotly_relayout","plotly_brushed","plotly_brushing","plotly_clickannotation","plotly_doubleclick","plotly_deselect","plotly_afterplot","plotly_sunburstclick"],"base_url":"https://plot.ly"},"evals":[],"jsHooks":[]}</script>
<p>Here is a second view of this same regression with what is called a
contour plot, contour map, or density plot.</p>
<div class="sourceCode" id="cb63"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb63-1"><a href="#cb63-1" aria-hidden="true" tabindex="-1"></a>mycolorpalette <span class="ot">&lt;-</span> <span class="fu">colorRampPalette</span>(<span class="fu">c</span>(<span class="st">&quot;skyblue2&quot;</span>, <span class="st">&quot;orange&quot;</span>))</span>
<span id="cb63-2"><a href="#cb63-2" aria-hidden="true" tabindex="-1"></a><span class="fu">filled.contour</span>(<span class="at">x=</span>axis_x, <span class="at">y=</span>axis_y, <span class="at">z=</span><span class="fu">matrix</span>(air_surface<span class="sc">$</span>Z, <span class="fu">length</span>(axis_x), <span class="fu">length</span>(axis_y)), <span class="at">col=</span><span class="fu">mycolorpalette</span>(<span class="dv">26</span>))</span></code></pre></div>
<p><strong>Including the Interaction Term</strong></p>
<p>Here is what a 3D regression looks like when the interaction term is
present. The two x-variables of <code>Month</code> and <code>Temp</code>
are being used to predict the y-variable of <code>Ozone</code>.</p>
<p><span class="math display">\[
  \underbrace{Y_i}_\text{Ozone} \underbrace{=}_{\sim}
\overbrace{\beta_0}^{\stackrel{\text{y-int}}{\text{baseline}}} +
\overbrace{\beta_1}^{\stackrel{\text{slope}}{\text{baseline}}}
\underbrace{X_{1i}}_\text{Temp} +
\overbrace{\beta_2}^{\stackrel{\text{change
in}}{\text{y-int}}}  \underbrace{X_{2i}}_\text{Month} +
\overbrace{\beta_3}^{\stackrel{\text{change in}}{\text{slope}}}
\underbrace{X_{1i}X_{2i}}_\text{Temp:Month} + \epsilon_i
\]</span></p>
<div class="sourceCode" id="cb64"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb64-1"><a href="#cb64-1" aria-hidden="true" tabindex="-1"></a>air_lm <span class="ot">&lt;-</span> <span class="fu">lm</span>(Ozone <span class="sc">~</span> Temp <span class="sc">+</span> Month <span class="sc">+</span> Temp<span class="sc">:</span>Month, <span class="at">data=</span> airquality)</span>
<span id="cb64-2"><a href="#cb64-2" aria-hidden="true" tabindex="-1"></a><span class="fu">pander</span>(air_lm<span class="sc">$</span>coefficients)</span></code></pre></div>
<table style="width:60%;">
<colgroup>
<col width="19%" />
<col width="9%" />
<col width="12%" />
<col width="18%" />
</colgroup>
<thead>
<tr class="header">
<th align="center">(Intercept)</th>
<th align="center">Temp</th>
<th align="center">Month</th>
<th align="center">Temp:Month</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center">-3.915</td>
<td align="center">0.77</td>
<td align="center">-23.01</td>
<td align="center">0.2678</td>
</tr>
</tbody>
</table>
<p>Notice how all coefficient estimates have changed. The y-intercept,
<span class="math inline">\(\beta_0\)</span>, is now estimated to be
<span class="math inline">\(-3.915\)</span>. The slope term, <span
class="math inline">\(\beta_1\)</span>, in the Temp-direction is
estimated as <span class="math inline">\(0.77\)</span>, while the slope
term, <span class="math inline">\(\beta_2\)</span>, in the
Month-direction is estimated to be <span
class="math inline">\(-23.01\)</span>. This change in estimated
coefficients is due to the presence of the interaction term’s
coefficient, <span class="math inline">\(\beta_3\)</span>, which is
estimated to be <span class="math inline">\(0.2678\)</span>. As you
should notice in the graphic below, the interaction model allows the “slopes”
in each direction to change, creating a “curved” regression surface
instead of a flat one.</p>
<div class="sourceCode" id="cb65"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb65-1"><a href="#cb65-1" aria-hidden="true" tabindex="-1"></a><span class="co">#Perform the multiple regression</span></span>
<span id="cb65-2"><a href="#cb65-2" aria-hidden="true" tabindex="-1"></a>air_lm <span class="ot">&lt;-</span> <span class="fu">lm</span>(Ozone <span class="sc">~</span> Temp <span class="sc">+</span> Month <span class="sc">+</span> Temp<span class="sc">:</span>Month, <span class="at">data=</span> airquality)</span>
<span id="cb65-3"><a href="#cb65-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb65-4"><a href="#cb65-4" aria-hidden="true" tabindex="-1"></a><span class="co">#Graph Resolution (more important for more complex shapes)</span></span>
<span id="cb65-5"><a href="#cb65-5" aria-hidden="true" tabindex="-1"></a>graph_reso <span class="ot">&lt;-</span> <span class="fl">0.5</span></span>
<span id="cb65-6"><a href="#cb65-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb65-7"><a href="#cb65-7" aria-hidden="true" tabindex="-1"></a><span class="co">#Setup Axis</span></span>
<span id="cb65-8"><a href="#cb65-8" aria-hidden="true" tabindex="-1"></a>axis_x <span class="ot">&lt;-</span> <span class="fu">seq</span>(<span class="fu">min</span>(airquality<span class="sc">$</span>Temp), <span class="fu">max</span>(airquality<span class="sc">$</span>Temp), <span class="at">by =</span> graph_reso)</span>
<span id="cb65-9"><a href="#cb65-9" aria-hidden="true" tabindex="-1"></a>axis_y <span class="ot">&lt;-</span> <span class="fu">seq</span>(<span class="fu">min</span>(airquality<span class="sc">$</span>Month), <span class="fu">max</span>(airquality<span class="sc">$</span>Month), <span class="at">by =</span> graph_reso)</span>
<span id="cb65-10"><a href="#cb65-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb65-11"><a href="#cb65-11" aria-hidden="true" tabindex="-1"></a><span class="co">#Sample points</span></span>
<span id="cb65-12"><a href="#cb65-12" aria-hidden="true" tabindex="-1"></a>air_surface <span class="ot">&lt;-</span> <span class="fu">expand.grid</span>(<span class="at">Temp =</span> axis_x, <span class="at">Month =</span> axis_y, <span class="at">KEEP.OUT.ATTRS=</span>F)</span>
<span id="cb65-13"><a href="#cb65-13" aria-hidden="true" tabindex="-1"></a>air_surface <span class="ot">&lt;-</span> air_surface <span class="sc">%&gt;%</span> <span class="fu">mutate</span>(<span class="at">Z=</span><span class="fu">predict.lm</span>(air_lm, <span class="at">newdata =</span> air_surface))</span>
<span id="cb65-14"><a href="#cb65-14" aria-hidden="true" tabindex="-1"></a>air_surface <span class="ot">&lt;-</span> <span class="fu">acast</span>(air_surface, Month <span class="sc">~</span> Temp, <span class="at">value.var =</span> <span class="st">&quot;Z&quot;</span>) <span class="co">#y ~ x</span></span>
<span id="cb65-15"><a href="#cb65-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb65-16"><a href="#cb65-16" aria-hidden="true" tabindex="-1"></a><span class="co">#Create scatterplot</span></span>
<span id="cb65-17"><a href="#cb65-17" aria-hidden="true" tabindex="-1"></a><span class="fu">plot_ly</span>(airquality, </span>
<span id="cb65-18"><a href="#cb65-18" aria-hidden="true" tabindex="-1"></a>        <span class="at">x =</span> <span class="sc">~</span>Temp, </span>
<span id="cb65-19"><a href="#cb65-19" aria-hidden="true" tabindex="-1"></a>        <span class="at">y =</span> <span class="sc">~</span>Month, </span>
<span id="cb65-20"><a href="#cb65-20" aria-hidden="true" tabindex="-1"></a>        <span class="at">z =</span> <span class="sc">~</span>Ozone,</span>
<span id="cb65-21"><a href="#cb65-21" aria-hidden="true" tabindex="-1"></a>        <span class="at">text =</span> <span class="fu">rownames</span>(airquality), </span>
<span id="cb65-22"><a href="#cb65-22" aria-hidden="true" tabindex="-1"></a>        <span class="at">type =</span> <span class="st">&quot;scatter3d&quot;</span>, </span>
<span id="cb65-23"><a href="#cb65-23" aria-hidden="true" tabindex="-1"></a>        <span class="at">mode =</span> <span class="st">&quot;markers&quot;</span>) <span class="sc">%&gt;%</span></span>
<span id="cb65-24"><a href="#cb65-24" aria-hidden="true" tabindex="-1"></a>  <span class="fu">add_trace</span>(<span class="at">z =</span> air_surface,</span>
<span id="cb65-25"><a href="#cb65-25" aria-hidden="true" tabindex="-1"></a>            <span class="at">x =</span> axis_x,</span>
<span id="cb65-26"><a href="#cb65-26" aria-hidden="true" tabindex="-1"></a>            <span class="at">y =</span> axis_y,</span>
<span id="cb65-27"><a href="#cb65-27" aria-hidden="true" tabindex="-1"></a>            <span class="at">type =</span> <span class="st">&quot;surface&quot;</span>)</span></code></pre></div>
<div id="htmlwidget-528e7f90ebdfcf34d11d" style="width:672px;height:480px;" class="plotly html-widget"></div>
<script type="application/json" data-for="htmlwidget-528e7f90ebdfcf34d11d">{"x":{"visdat":{"6af05e41d7d0":["function () ","plotlyVisDat"]},"cur_data":"6af05e41d7d0","attrs":{"6af05e41d7d0":{"x":{},"y":{},"z":{},"text":["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29","30","31","32","33","34","35","36","37","38","39","40","41","42","43","44","45","46","47","48","49","50","51","52","53","54","55","56","57","58","59","60","61","62","63","64","65","66","67","68","69","70","71","72","73","74","75","76","77","78","79","80","81","82","83","84","85","86","87","88","89","90","91","92","93","94","95","96","97","98","99","100","101","102","103","104","105","106","107","108","109","110","111","112","113","114","115","116","117","118","119","120","121","122","123","124","125","126","127","128","129","130","131","132","133","134","135","136","137","138","139","140","141","142","143","144","145","146","147","148","149","150","151","152","153"],"mode":"markers","alpha_stroke":1,"sizes":[10,100],"spans":[1,20],"type":"scatter3d"},"6af05e41d7d0.1":{"x":[56,56.5,57,57.5,58,58.5,59,59.5,60,60.5,61,61.5,62,62.5,63,63.5,64,64.5,65,65.5,66,66.5,67,67.5,68,68.5,69,69.5,70,70.5,71,71.5,72,72.5,73,73.5,74,74.5,75,75.5,76,76.5,77,77.5,78,78.5,79,79.5,80,80.5,81,81.5,82,82.5,83,83.5,84,84.5,85,85.5,86,86.5,87,87.5,88,88.5,89,89.5,90,90.5,91,91.5,92,92.5,93,93.5,94,94.5,95,95.5,96,96.5,97],"y":[5,5.5,6,6.5,7,7.5,8,8.5,9],"z":[[-0.83442089392806906,0.22012837686895637,1.274677647665996,2.3292269184630356,3.3837761892600753,4.4383254600571291,5.4928747308541546,6.5474240016511942,7.6019732724482481,8.6565225432452735,9.7110718140423273,10.765621084839353,11.820170355636407,12.874719626433446,13.929268897230472,14.983818168027511,16.038367438824565,17.092916709621605,18.147465980418644,19.202015251215684,20.256564522012724,21.311113792809763,22.365663063606789,23.420212334403857,24.474761605200882,25.52931087599
7922,26.583860146794962,27.638409417592001,28.692958688389055,29.74750795918608,30.80205722998312,31.856606500780167,32.911155771577207,33.965705042374239,35.020254313171279,36.074803583968325,37.129352854765365,38.183902125562398,39.238451396359451,40.293000667156491,41.347549937953524,42.402099208750563,43.45664847954761,44.51119775034465,45.565747021141682,46.620296291938722,47.674845562735769,48.729394833532808,49.783944104329848,50.83849337512688,51.893042645923934,52.947591916720967,54.002141187518006,55.056690458315053,56.111239729112093,57.165788999909125,58.220338270706165,59.274887541503219,60.329436812300244,61.383986083097284,62.438535353894324,63.493084624691377,64.547633895488417,65.602183166285442,66.656732437082482,67.711281707879536,68.765830978676576,69.820380249473601,70.874929520270641,71.929478791067694,72.984028061864734,74.038577332661774,75.093126603458813,76.147675874255853,77.202225145052893,78.256774415849932,79.311323686646972,80.365872957444012,81.420422228241051,82.474971499038091,83.529520769835116,84.584070040632156,85.638619311429224],[-4.8386227932523411,-3.7171209728019505,-2.5956191523515599,-1.4741173319011693,-0.35261551145077874,0.76888630899962607,1.8903881294500025,3.0118899499004073,4.1333917703507979,5.2548935908011742,6.376395411251579,7.4978972317019696,8.6193990521523602,9.7409008726027508,10.862402693053141,11.983904513503532,13.105406333953923,14.226908154404313,15.348409974854718,16.469911795305094,17.591413615755485,18.71291543620589,19.834417256656266,20.955919077106671,22.077420897557047,23.198922718007452,24.320424538457843,25.441926358908219,26.563428179358638,27.684929999809015,28.806431820259405,29.927933640709796,31.049435461160186,32.170937281610577,33.292439102060968,34.413940922511358,35.535442742961763,36.656944563412139,37.778446383862544,38.899948204312935,40.021450024763311,41.142951845213716,42.264453665664107,43.385955486114497,44.507457306564888,45.628959127015278,46.750460947465669,47.87196276791606
,48.99346458836645,50.114966408816855,51.236468229267231,52.357970049717622,53.479471870168027,54.600973690618403,55.722475511068808,56.843977331519184,57.965479151969589,59.08698097241998,60.208482792870356,61.329984613320761,62.451486433771151,63.572988254221542,64.694490074671933,65.815991895122323,66.937493715572728,68.058995536023104,69.180497356473509,70.301999176923871,71.423500997374276,72.545002817824681,73.666504638275057,74.788006458725462,75.909508279175853,77.031010099626229,78.152511920076634,79.274013740527039,80.395515560977401,81.517017381427806,82.638519201878211,83.760021022328587,84.881522842778978,86.003024663229354,87.124526483679759],[-8.842824692576599,-7.6543703224728574,-6.4659159523691159,-5.2774615822653743,-4.0890072121616328,-2.900552842057877,-1.7120984719541497,-0.52364410185039389,0.66481026825334766,1.853264638357075,3.041719008460845,4.2301733785645723,5.4186277486683139,6.6070821187720696,7.795536488875797,8.9839908589795385,10.17244522908328,11.360899599187036,12.549353969290777,13.737808339394505,14.926262709498261,16.114717079602002,17.303171449705729,18.491625819809499,19.680080189913227,20.868534560016968,22.056988930120724,23.245443300224451,24.433897670328207,25.622352040431949,26.81080641053569,27.999260780639432,29.187715150743173,30.376169520846915,31.564623890950656,32.753078261054398,33.941532631158154,35.129987001261881,36.318441371365623,37.506895741469378,38.695350111573106,39.883804481676862,41.072258851780603,42.260713221884345,43.449167591988086,44.637621962091828,45.826076332195569,47.014530702299311,48.202985072403052,49.391439442506794,50.57989381261055,51.768348182714291,52.956802552818033,54.14525692292176,55.333711293025516,56.522165663129243,57.710620033233013,58.899074403336755,60.087528773440482,61.275983143544224,62.464437513647965,63.652891883751707,64.841346253855477,66.029800623959204,67.218254994062946,68.406709364166687,69.595163734270429,70.783618104374156,71.972072474477898,73.160526844581668,74.
348981214685409,75.537435584789151,76.725889954892878,77.91434432499662,79.102798695100361,80.291253065204131,81.479707435307859,82.6681618054116,83.856616175515342,85.045070545619083,86.233524915722811,87.421979285826581,88.610433655930322],[-12.847026591900885,-11.591619672143793,-10.3362127523867,-9.0808058326296077,-7.8253989128725152,-6.5699919931154085,-5.3145850733583302,-4.0591781536012235,-2.803771233844131,-1.5483643140870527,-0.29295739432993173,0.96244952542714657,2.2178564451842533,3.4732633649413458,4.7286702846984241,5.9840772044555308,7.2394841242126233,8.4948910439697158,9.7502979637268226,11.005704883483901,12.261111803240993,13.5165187229981,14.771925642755178,16.027332562512285,17.282739482269378,18.53814640202647,19.793553321783563,21.048960241540655,22.304367161297762,23.55977408105484,24.815181000811947,26.070587920569039,27.325994840326118,28.581401760083224,29.836808679840317,31.092215599597395,32.347622519354502,33.603029439111594,34.858436358868701,36.113843278625779,37.369250198382872,38.624657118139993,39.880064037897057,41.135470957654164,42.39087787741127,43.646284797168335,44.901691716925441,46.157098636682548,47.412505556439626,48.667912476196733,49.923319395953826,51.178726315710904,52.434133235468011,53.689540155225103,54.944947074982196,56.200353994739288,57.455760914496395,58.711167834253473,59.966574754010566,61.221981673767672,62.477388593524751,63.732795513281857,64.988202433038964,66.243609352796028,67.499016272553135,68.754423192310242,70.009830112067348,71.265237031824412,72.520643951581519,73.776050871338626,75.031457791095704,76.286864710852811,77.542271630609903,78.797678550366982,80.053085470124088,81.308492389881195,82.563899309638259,83.819306229395366,85.074713149152473,86.330120068909551,87.585526988666643,88.84093390842375,90.096340828180828],[-16.851228491225143,-15.5288690218147,-14.206509552404256,-12.884150082993813,-11.561790613583369,-10.239431144172912,-8.9170716747624681,-7.5947122053520246,-6.2723527359415
812,-4.9499932665311377,-3.62763379712068,-2.3052743277102508,-0.98291485829979308,0.33944461111065038,1.6618040805210939,2.9841635499315373,4.3065230193419808,5.6288824887524385,6.9512419581628819,8.2736014275733112,9.5959608969837689,10.918320366394212,12.240679835804642,13.563039305215113,14.885398774625543,16.207758244035986,17.53011771344643,18.852477182856887,20.174836652267345,21.497196121677774,22.819555591088218,24.141915060498661,25.464274529909119,26.786633999319562,28.108993468730006,29.431352938140435,30.753712407550893,32.07607187696135,33.398431346371794,34.720790815782237,36.043150285192667,37.365509754603124,38.687869224013554,40.010228693424025,41.332588162834469,42.654947632244898,43.977307101655342,45.299666571065785,46.622026040476257,47.9443855098867,49.26674497929713,50.589104448707573,51.911463918118017,53.233823387528446,54.556182856938932,55.878542326349361,57.200901795759805,58.523261265170248,59.845620734580677,61.167980203991149,62.490339673401593,63.812699142812036,65.13505861222248,66.457418081632909,67.779777551043352,69.102137020453824,70.424496489864268,71.746855959274697,73.069215428685141,74.391574898095584,75.713934367506056,77.036293836916499,78.358653306326929,79.681012775737372,81.003372245147816,82.325731714558287,83.648091183968717,84.97045065337916,86.292810122789604,87.615169592200047,88.937529061610476,90.259888531020948,91.582248000431392],[-20.855430390549429,-19.466118371485635,-18.076806352421855,-16.687494333358046,-15.298182314294252,-13.908870295230443,-12.519558276166649,-11.130246257102868,-9.7409342380390598,-8.3516222189752654,-6.9623101999114567,-5.5729981808476623,-4.1836861617838821,-2.7943741427200735,-1.405062123656279,-0.015750104592484604,1.3735619144713098,2.7628739335351185,4.1521859525989271,5.5414979716626931,6.9308099907265017,8.3201220097903104,9.7094340288540764,11.098746047917899,12.488058066981694,13.877370086045502,15.266682105109282,16.655994124173077,18.0453061432369,19.434618162300666,20.823
930181364474,22.213242200428283,23.602554219492049,24.991866238555858,26.381178257619666,27.770490276683461,29.159802295747255,30.54911431481105,31.938426333874858,33.327738352938638,34.717050372002433,36.106362391066256,37.495674410130022,38.88498642919383,40.274298448257639,41.663610467321433,43.052922486385214,44.442234505449022,45.831546524512831,47.220858543576611,48.610170562640405,49.999482581704214,51.388794600767994,52.778106619831789,54.167418638895612,55.556730657959406,56.946042677023186,58.335354696086995,59.724666715150789,61.113978734214569,62.503290753278378,63.892602772342187,65.281914791405967,66.671226810469761,68.06053882953357,69.449850848597379,70.839162867661159,72.228474886724953,73.617786905788762,75.007098924852542,76.396410943916351,77.785722962980159,79.175034982043925,80.564347001107734,81.953659020171543,83.342971039235323,84.732283058299117,86.121595077362926,87.510907096426735,88.900219115490515,90.289531134554309,91.678843153618118,93.068155172681898],[-24.859632289873687,-23.403367721156542,-21.947103152439411,-20.490838583722237,-19.034574015005106,-17.578309446287946,-16.122044877570801,-14.66578030885367,-13.20951574013651,-11.75325117141935,-10.296986602702191,-8.8407220339850596,-7.3844574652679285,-5.9281928965507689,-4.4719283278336093,-3.0156637591164497,-1.5593991903993185,-0.10313462168218734,1.3531299470349722,2.8093945157521318,4.2656590844692914,5.7219236531864226,7.1781882219035538,8.6344527906207134,10.090717359337873,11.546981928055033,13.003246496772164,14.459511065489295,15.915775634206483,17.372040202923614,18.828304771640774,20.284569340357905,21.740833909075036,23.197098477792196,24.653363046509355,26.109627615226486,27.565892183943646,29.022156752660777,30.478421321377937,31.934685890095096,33.390950458812227,34.847215027529387,36.303479596246518,37.759744164963678,39.216008733680837,40.672273302397969,42.128537871115114,43.584802439832288,45.041067008549433,46.497331577266579,47.95359614598371,49.4098607147008
55,50.866125283418029,52.32238985213516,53.77865442085232,55.234918989569451,56.691183558286596,58.14744812700377,59.603712695720901,61.059977264438047,62.516241833155192,63.972506401872337,65.428770970589511,66.885035539306642,68.341300108023788,69.797564676740933,71.253829245458107,72.710093814175238,74.166358382892383,75.622622951609529,77.078887520326674,78.535152089043848,79.991416657760979,81.447681226478124,82.90394579519527,84.360210363912415,85.816474932629575,87.27273950134672,88.729004070063866,90.185268638781011,91.641533207498142,93.097797776215316,94.554062344932461],[-28.863834189197945,-27.340617070827449,-25.817399952456952,-24.294182834086456,-22.77096571571596,-21.247748597345435,-19.724531478974967,-18.201314360604471,-16.678097242233946,-15.15488012386345,-13.631663005492953,-12.108445887122457,-10.585228768751961,-9.0620116503814359,-7.5387945320109679,-6.0155774136404432,-4.4923602952699468,-2.9691431768994789,-1.4459260585289542,0.077291059841542165,1.6005081782120669,3.1237252965825348,4.6469424149530312,6.1701595333235559,7.6933766516940238,9.2165937700645486,10.739810888435045,12.263028006805541,13.786245125176038,15.309462243546534,16.832679361917059,18.355896480287555,19.879113598658023,21.402330717028519,22.925547835399044,24.448764953769512,25.971982072140037,27.495199190510533,29.018416308881029,30.541633427251526,32.064850545622022,33.588067663992547,35.111284782363043,36.634501900733511,38.157719019104036,39.680936137474532,41.204153255845029,42.727370374215525,44.250587492586021,45.773804610956546,47.297021729327014,48.820238847697539,50.343455966068035,51.866673084438503,53.389890202809028,54.913107321179524,56.436324439550049,57.959541557920517,59.482758676291013,61.005975794661509,62.529192913032034,64.05241003140253,65.575627149773027,67.098844268143523,68.622061386513991,70.145278504884516,71.668495623255041,73.191712741625537,74.714929859996005,76.238146978366501,77.761364096737026,79.284581215107551,80.807798333478019,82.331
015451848515,83.854232570219025,85.377449688589508,86.900666806960004,88.423883925330514,89.947101043701025,91.470318162071507,92.993535280442003,94.516752398812514,96.039969517182996],[-32.868036088522217,-31.277866420498384,-29.687696752474523,-28.097527084450689,-26.507357416426828,-24.917187748402966,-23.327018080379133,-21.736848412355272,-20.146678744331439,-18.556509076307577,-16.966339408283716,-15.376169740259883,-13.786000072236021,-12.19583040421216,-10.605660736188327,-9.0154910681644935,-7.425321400140632,-5.8351517321167421,-4.244982064092909,-2.6548123960690759,-1.0646427280452428,0.52552693997864708,2.1156966080024802,3.7058662760263417,5.2960359440501747,6.8862056120740363,8.4763752800978978,10.066544948121731,11.656714616145592,13.246884284169425,14.837053952193287,16.427223620217148,18.017393288240982,19.607562956264843,21.197732624288705,22.787902292312538,24.378071960336399,25.968241628360232,27.558411296384122,29.148580964407955,30.738750632431788,32.32892030045565,33.919089968479511,35.509259636503373,37.099429304527206,38.689598972551039,40.279768640574872,41.869938308598762,43.460107976622623,45.050277644646457,46.64044731267029,48.230616980694151,49.820786648718013,51.410956316741846,53.001125984765707,54.591295652789569,56.181465320813402,57.771634988837263,59.361804656861096,60.951974324884986,62.542143992908819,64.132313660932653,65.722483328956514,67.312652996980376,68.902822665004237,70.49299233302807,72.083162001051903,73.673331669075736,75.263501337099626,76.853671005123488,78.443840673147321,80.034010341171154,81.624180009195015,83.214349677218877,84.804519345242738,86.394689013266571,87.984858681290433,89.575028349314266,91.165198017338128,92.755367685361989,94.345537353385851,95.935707021409684,97.525876689433517]],"text":["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29","30","31","32","33","34","35","36","37","38","39","40","41","42","43","44",
"45","46","47","48","49","50","51","52","53","54","55","56","57","58","59","60","61","62","63","64","65","66","67","68","69","70","71","72","73","74","75","76","77","78","79","80","81","82","83","84","85","86","87","88","89","90","91","92","93","94","95","96","97","98","99","100","101","102","103","104","105","106","107","108","109","110","111","112","113","114","115","116","117","118","119","120","121","122","123","124","125","126","127","128","129","130","131","132","133","134","135","136","137","138","139","140","141","142","143","144","145","146","147","148","149","150","151","152","153"],"mode":"markers","alpha_stroke":1,"sizes":[10,100],"spans":[1,20],"type":"surface","inherit":true}},"layout":{"margin":{"b":40,"l":60,"t":25,"r":10},"scene":{"xaxis":{"title":"Temp"},"yaxis":{"title":"Month"},"zaxis":{"title":"Ozone"}},"hovermode":"closest","showlegend":false,"legend":{"yanchor":"top","y":0.5}},"source":"A","config":{"modeBarButtonsToAdd":["hoverclosest","hovercompare"],"showSendToCloud":false},"data":[{"x":[67,72,74,62,66,65,59,61,74,69,66,68,58,64,66,57,68,62,59,73,61,61,67,81,79,76,82,90,87,82,77,72,65,73,76,84,85,81,83,83,88,92,92,89,73,81,80,81,82,84,87,85,74,86,85,82,86,88,86,83,81,81,81,82,86,85,87,89,90,90,86,82,80,77,79,76,78,78,77,72,79,81,86,97,94,96,94,91,92,93,93,87,84,80,78,75,73,81,76,77,71,71,78,67,76,68,82,64,71,81,69,63,70,75,76,68],"y":[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9],"z":[41,36,12,18,28,23,19,8,7,16,11,14,18,14,34,6,30,11,1,11,4,32,23,45,115,37,29,71,39,23,21,37,20,12,13,135,49,32,64,40,77,97,97,85,10,27,7,48,35,61,79,63,16,80,108,20,52,82,50,64,59,39,9,16,78,35,66,122,89,110,44,28,65,22,59,23,31,44,21,9,45,168,73,76,118,84,85,96,78,73,91,47,32,20,23,21,24,44,21,28,9,13,46,18,13,24,16,13,23,36,7,14,30,14,18,20],"text":["1","2","3","4","6","7","
8","9","11","12","13","14","15","16","17","18","19","20","21","22","23","24","28","29","30","31","38","40","41","44","47","48","49","50","51","62","63","64","66","67","68","69","70","71","73","74","76","77","78","79","80","81","82","85","86","87","88","89","90","91","92","93","94","95","96","97","98","99","100","101","104","105","106","108","109","110","111","112","113","114","116","117","118","120","121","122","123","124","125","126","127","128","129","130","131","132","133","134","135","136","137","138","139","140","141","142","143","144","145","146","147","148","149","151","152","153"],"mode":"markers","type":"scatter3d","marker":{"color":"rgba(31,119,180,1)","line":{"color":"rgba(31,119,180,1)"}},"error_y":{"color":"rgba(31,119,180,1)"},"error_x":{"color":"rgba(31,119,180,1)"},"line":{"color":"rgba(31,119,180,1)"},"frame":null},{"colorbar":{"title":"Ozone","ticklen":2,"len":0.5,"lenmode":"fraction","y":1,"yanchor":"top"},"colorscale":[["0","rgba(68,1,84,1)"],["0.0416666666666667","rgba(70,19,97,1)"],["0.0833333333333333","rgba(72,32,111,1)"],["0.125","rgba(71,45,122,1)"],["0.166666666666667","rgba(68,58,128,1)"],["0.208333333333333","rgba(64,70,135,1)"],["0.25","rgba(60,82,138,1)"],["0.291666666666667","rgba(56,93,140,1)"],["0.333333333333333","rgba(49,104,142,1)"],["0.375","rgba(46,114,142,1)"],["0.416666666666667","rgba(42,123,142,1)"],["0.458333333333333","rgba(38,133,141,1)"],["0.5","rgba(37,144,140,1)"],["0.541666666666667","rgba(33,154,138,1)"],["0.583333333333333","rgba(39,164,133,1)"],["0.625","rgba(47,174,127,1)"],["0.666666666666667","rgba(53,183,121,1)"],["0.708333333333333","rgba(79,191,110,1)"],["0.75","rgba(98,199,98,1)"],["0.791666666666667","rgba(119,207,85,1)"],["0.833333333333333","rgba(147,214,70,1)"],["0.875","rgba(172,220,52,1)"],["0.916666666666667","rgba(199,225,42,1)"],["0.958333333333333","rgba(226,228,40,1)"],["1","rgba(253,231,37,1)"]],"showscale":true,"x":[56,56.5,57,57.5,58,58.5,59,59.5,60,60.5,61,61.5,62,62.5,63,63.5,64,64.5,65,65.5
,66,66.5,67,67.5,68,68.5,69,69.5,70,70.5,71,71.5,72,72.5,73,73.5,74,74.5,75,75.5,76,76.5,77,77.5,78,78.5,79,79.5,80,80.5,81,81.5,82,82.5,83,83.5,84,84.5,85,85.5,86,86.5,87,87.5,88,88.5,89,89.5,90,90.5,91,91.5,92,92.5,93,93.5,94,94.5,95,95.5,96,96.5,97],"y":[5,5.5,6,6.5,7,7.5,8,8.5,9],"z":[[-0.83442089392806906,0.22012837686895637,1.274677647665996,2.3292269184630356,3.3837761892600753,4.4383254600571291,5.4928747308541546,6.5474240016511942,7.6019732724482481,8.6565225432452735,9.7110718140423273,10.765621084839353,11.820170355636407,12.874719626433446,13.929268897230472,14.983818168027511,16.038367438824565,17.092916709621605,18.147465980418644,19.202015251215684,20.256564522012724,21.311113792809763,22.365663063606789,23.420212334403857,24.474761605200882,25.529310875997922,26.583860146794962,27.638409417592001,28.692958688389055,29.74750795918608,30.80205722998312,31.856606500780167,32.911155771577207,33.965705042374239,35.020254313171279,36.074803583968325,37.129352854765365,38.183902125562398,39.238451396359451,40.293000667156491,41.347549937953524,42.402099208750563,43.45664847954761,44.51119775034465,45.565747021141682,46.620296291938722,47.674845562735769,48.729394833532808,49.783944104329848,50.83849337512688,51.893042645923934,52.947591916720967,54.002141187518006,55.056690458315053,56.111239729112093,57.165788999909125,58.220338270706165,59.274887541503219,60.329436812300244,61.383986083097284,62.438535353894324,63.493084624691377,64.547633895488417,65.602183166285442,66.656732437082482,67.711281707879536,68.765830978676576,69.820380249473601,70.874929520270641,71.929478791067694,72.984028061864734,74.038577332661774,75.093126603458813,76.147675874255853,77.202225145052893,78.256774415849932,79.311323686646972,80.365872957444012,81.420422228241051,82.474971499038091,83.529520769835116,84.584070040632156,85.638619311429224],[-4.8386227932523411,-3.7171209728019505,-2.5956191523515599,-1.4741173319011693,-0.35261551145077874,0.76888630899962607,1.8903881294
500025,3.0118899499004073,4.1333917703507979,5.2548935908011742,6.376395411251579,7.4978972317019696,8.6193990521523602,9.7409008726027508,10.862402693053141,11.983904513503532,13.105406333953923,14.226908154404313,15.348409974854718,16.469911795305094,17.591413615755485,18.71291543620589,19.834417256656266,20.955919077106671,22.077420897557047,23.198922718007452,24.320424538457843,25.441926358908219,26.563428179358638,27.684929999809015,28.806431820259405,29.927933640709796,31.049435461160186,32.170937281610577,33.292439102060968,34.413940922511358,35.535442742961763,36.656944563412139,37.778446383862544,38.899948204312935,40.021450024763311,41.142951845213716,42.264453665664107,43.385955486114497,44.507457306564888,45.628959127015278,46.750460947465669,47.87196276791606,48.99346458836645,50.114966408816855,51.236468229267231,52.357970049717622,53.479471870168027,54.600973690618403,55.722475511068808,56.843977331519184,57.965479151969589,59.08698097241998,60.208482792870356,61.329984613320761,62.451486433771151,63.572988254221542,64.694490074671933,65.815991895122323,66.937493715572728,68.058995536023104,69.180497356473509,70.301999176923871,71.423500997374276,72.545002817824681,73.666504638275057,74.788006458725462,75.909508279175853,77.031010099626229,78.152511920076634,79.274013740527039,80.395515560977401,81.517017381427806,82.638519201878211,83.760021022328587,84.881522842778978,86.003024663229354,87.124526483679759],[-8.842824692576599,-7.6543703224728574,-6.4659159523691159,-5.2774615822653743,-4.0890072121616328,-2.900552842057877,-1.7120984719541497,-0.52364410185039389,0.66481026825334766,1.853264638357075,3.041719008460845,4.2301733785645723,5.4186277486683139,6.6070821187720696,7.795536488875797,8.9839908589795385,10.17244522908328,11.360899599187036,12.549353969290777,13.737808339394505,14.926262709498261,16.114717079602002,17.303171449705729,18.491625819809499,19.680080189913227,20.868534560016968,22.056988930120724,23.245443300224451,24.4338976703282
07,25.622352040431949,26.81080641053569,27.999260780639432,29.187715150743173,30.376169520846915,31.564623890950656,32.753078261054398,33.941532631158154,35.129987001261881,36.318441371365623,37.506895741469378,38.695350111573106,39.883804481676862,41.072258851780603,42.260713221884345,43.449167591988086,44.637621962091828,45.826076332195569,47.014530702299311,48.202985072403052,49.391439442506794,50.57989381261055,51.768348182714291,52.956802552818033,54.14525692292176,55.333711293025516,56.522165663129243,57.710620033233013,58.899074403336755,60.087528773440482,61.275983143544224,62.464437513647965,63.652891883751707,64.841346253855477,66.029800623959204,67.218254994062946,68.406709364166687,69.595163734270429,70.783618104374156,71.972072474477898,73.160526844581668,74.348981214685409,75.537435584789151,76.725889954892878,77.91434432499662,79.102798695100361,80.291253065204131,81.479707435307859,82.6681618054116,83.856616175515342,85.045070545619083,86.233524915722811,87.421979285826581,88.610433655930322],[-12.847026591900885,-11.591619672143793,-10.3362127523867,-9.0808058326296077,-7.8253989128725152,-6.5699919931154085,-5.3145850733583302,-4.0591781536012235,-2.803771233844131,-1.5483643140870527,-0.29295739432993173,0.96244952542714657,2.2178564451842533,3.4732633649413458,4.7286702846984241,5.9840772044555308,7.2394841242126233,8.4948910439697158,9.7502979637268226,11.005704883483901,12.261111803240993,13.5165187229981,14.771925642755178,16.027332562512285,17.282739482269378,18.53814640202647,19.793553321783563,21.048960241540655,22.304367161297762,23.55977408105484,24.815181000811947,26.070587920569039,27.325994840326118,28.581401760083224,29.836808679840317,31.092215599597395,32.347622519354502,33.603029439111594,34.858436358868701,36.113843278625779,37.369250198382872,38.624657118139993,39.880064037897057,41.135470957654164,42.39087787741127,43.646284797168335,44.901691716925441,46.157098636682548,47.412505556439626,48.667912476196733,49.923319395953826,5
1.178726315710904,52.434133235468011,53.689540155225103,54.944947074982196,56.200353994739288,57.455760914496395,58.711167834253473,59.966574754010566,61.221981673767672,62.477388593524751,63.732795513281857,64.988202433038964,66.243609352796028,67.499016272553135,68.754423192310242,70.009830112067348,71.265237031824412,72.520643951581519,73.776050871338626,75.031457791095704,76.286864710852811,77.542271630609903,78.797678550366982,80.053085470124088,81.308492389881195,82.563899309638259,83.819306229395366,85.074713149152473,86.330120068909551,87.585526988666643,88.84093390842375,90.096340828180828],[-16.851228491225143,-15.5288690218147,-14.206509552404256,-12.884150082993813,-11.561790613583369,-10.239431144172912,-8.9170716747624681,-7.5947122053520246,-6.2723527359415812,-4.9499932665311377,-3.62763379712068,-2.3052743277102508,-0.98291485829979308,0.33944461111065038,1.6618040805210939,2.9841635499315373,4.3065230193419808,5.6288824887524385,6.9512419581628819,8.2736014275733112,9.5959608969837689,10.918320366394212,12.240679835804642,13.563039305215113,14.885398774625543,16.207758244035986,17.53011771344643,18.852477182856887,20.174836652267345,21.497196121677774,22.819555591088218,24.141915060498661,25.464274529909119,26.786633999319562,28.108993468730006,29.431352938140435,30.753712407550893,32.07607187696135,33.398431346371794,34.720790815782237,36.043150285192667,37.365509754603124,38.687869224013554,40.010228693424025,41.332588162834469,42.654947632244898,43.977307101655342,45.299666571065785,46.622026040476257,47.9443855098867,49.26674497929713,50.589104448707573,51.911463918118017,53.233823387528446,54.556182856938932,55.878542326349361,57.200901795759805,58.523261265170248,59.845620734580677,61.167980203991149,62.490339673401593,63.812699142812036,65.13505861222248,66.457418081632909,67.779777551043352,69.102137020453824,70.424496489864268,71.746855959274697,73.069215428685141,74.391574898095584,75.713934367506056,77.036293836916499,78.358653306326929,
79.681012775737372,81.003372245147816,82.325731714558287,83.648091183968717,84.97045065337916,86.292810122789604,87.615169592200047,88.937529061610476,90.259888531020948,91.582248000431392],[-20.855430390549429,-19.466118371485635,-18.076806352421855,-16.687494333358046,-15.298182314294252,-13.908870295230443,-12.519558276166649,-11.130246257102868,-9.7409342380390598,-8.3516222189752654,-6.9623101999114567,-5.5729981808476623,-4.1836861617838821,-2.7943741427200735,-1.405062123656279,-0.015750104592484604,1.3735619144713098,2.7628739335351185,4.1521859525989271,5.5414979716626931,6.9308099907265017,8.3201220097903104,9.7094340288540764,11.098746047917899,12.488058066981694,13.877370086045502,15.266682105109282,16.655994124173077,18.0453061432369,19.434618162300666,20.823930181364474,22.213242200428283,23.602554219492049,24.991866238555858,26.381178257619666,27.770490276683461,29.159802295747255,30.54911431481105,31.938426333874858,33.327738352938638,34.717050372002433,36.106362391066256,37.495674410130022,38.88498642919383,40.274298448257639,41.663610467321433,43.052922486385214,44.442234505449022,45.831546524512831,47.220858543576611,48.610170562640405,49.999482581704214,51.388794600767994,52.778106619831789,54.167418638895612,55.556730657959406,56.946042677023186,58.335354696086995,59.724666715150789,61.113978734214569,62.503290753278378,63.892602772342187,65.281914791405967,66.671226810469761,68.06053882953357,69.449850848597379,70.839162867661159,72.228474886724953,73.617786905788762,75.007098924852542,76.396410943916351,77.785722962980159,79.175034982043925,80.564347001107734,81.953659020171543,83.342971039235323,84.732283058299117,86.121595077362926,87.510907096426735,88.900219115490515,90.289531134554309,91.678843153618118,93.068155172681898],[-24.859632289873687,-23.403367721156542,-21.947103152439411,-20.490838583722237,-19.034574015005106,-17.578309446287946,-16.122044877570801,-14.66578030885367,-13.20951574013651,-11.75325117141935,-10.296986602702191,-
8.8407220339850596,-7.3844574652679285,-5.9281928965507689,-4.4719283278336093,-3.0156637591164497,-1.5593991903993185,-0.10313462168218734,1.3531299470349722,2.8093945157521318,4.2656590844692914,5.7219236531864226,7.1781882219035538,8.6344527906207134,10.090717359337873,11.546981928055033,13.003246496772164,14.459511065489295,15.915775634206483,17.372040202923614,18.828304771640774,20.284569340357905,21.740833909075036,23.197098477792196,24.653363046509355,26.109627615226486,27.565892183943646,29.022156752660777,30.478421321377937,31.934685890095096,33.390950458812227,34.847215027529387,36.303479596246518,37.759744164963678,39.216008733680837,40.672273302397969,42.128537871115114,43.584802439832288,45.041067008549433,46.497331577266579,47.95359614598371,49.409860714700855,50.866125283418029,52.32238985213516,53.77865442085232,55.234918989569451,56.691183558286596,58.14744812700377,59.603712695720901,61.059977264438047,62.516241833155192,63.972506401872337,65.428770970589511,66.885035539306642,68.341300108023788,69.797564676740933,71.253829245458107,72.710093814175238,74.166358382892383,75.622622951609529,77.078887520326674,78.535152089043848,79.991416657760979,81.447681226478124,82.90394579519527,84.360210363912415,85.816474932629575,87.27273950134672,88.729004070063866,90.185268638781011,91.641533207498142,93.097797776215316,94.554062344932461],[-28.863834189197945,-27.340617070827449,-25.817399952456952,-24.294182834086456,-22.77096571571596,-21.247748597345435,-19.724531478974967,-18.201314360604471,-16.678097242233946,-15.15488012386345,-13.631663005492953,-12.108445887122457,-10.585228768751961,-9.0620116503814359,-7.5387945320109679,-6.0155774136404432,-4.4923602952699468,-2.9691431768994789,-1.4459260585289542,0.077291059841542165,1.6005081782120669,3.1237252965825348,4.6469424149530312,6.1701595333235559,7.6933766516940238,9.2165937700645486,10.739810888435045,12.263028006805541,13.786245125176038,15.309462243546534,16.832679361917059,18.355896480287555,19
.879113598658023,21.402330717028519,22.925547835399044,24.448764953769512,25.971982072140037,27.495199190510533,29.018416308881029,30.541633427251526,32.064850545622022,33.588067663992547,35.111284782363043,36.634501900733511,38.157719019104036,39.680936137474532,41.204153255845029,42.727370374215525,44.250587492586021,45.773804610956546,47.297021729327014,48.820238847697539,50.343455966068035,51.866673084438503,53.389890202809028,54.913107321179524,56.436324439550049,57.959541557920517,59.482758676291013,61.005975794661509,62.529192913032034,64.05241003140253,65.575627149773027,67.098844268143523,68.622061386513991,70.145278504884516,71.668495623255041,73.191712741625537,74.714929859996005,76.238146978366501,77.761364096737026,79.284581215107551,80.807798333478019,82.331015451848515,83.854232570219025,85.377449688589508,86.900666806960004,88.423883925330514,89.947101043701025,91.470318162071507,92.993535280442003,94.516752398812514,96.039969517182996],[-32.868036088522217,-31.277866420498384,-29.687696752474523,-28.097527084450689,-26.507357416426828,-24.917187748402966,-23.327018080379133,-21.736848412355272,-20.146678744331439,-18.556509076307577,-16.966339408283716,-15.376169740259883,-13.786000072236021,-12.19583040421216,-10.605660736188327,-9.0154910681644935,-7.425321400140632,-5.8351517321167421,-4.244982064092909,-2.6548123960690759,-1.0646427280452428,0.52552693997864708,2.1156966080024802,3.7058662760263417,5.2960359440501747,6.8862056120740363,8.4763752800978978,10.066544948121731,11.656714616145592,13.246884284169425,14.837053952193287,16.427223620217148,18.017393288240982,19.607562956264843,21.197732624288705,22.787902292312538,24.378071960336399,25.968241628360232,27.558411296384122,29.148580964407955,30.738750632431788,32.32892030045565,33.919089968479511,35.509259636503373,37.099429304527206,38.689598972551039,40.279768640574872,41.869938308598762,43.460107976622623,45.050277644646457,46.64044731267029,48.230616980694151,49.820786648718013,51.41095
6316741846,53.001125984765707,54.591295652789569,56.181465320813402,57.771634988837263,59.361804656861096,60.951974324884986,62.542143992908819,64.132313660932653,65.722483328956514,67.312652996980376,68.902822665004237,70.49299233302807,72.083162001051903,73.673331669075736,75.263501337099626,76.853671005123488,78.443840673147321,80.034010341171154,81.624180009195015,83.214349677218877,84.804519345242738,86.394689013266571,87.984858681290433,89.575028349314266,91.165198017338128,92.755367685361989,94.345537353385851,95.935707021409684,97.525876689433517]],"text":["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29","30","31","32","33","34","35","36","37","38","39","40","41","42","43","44","45","46","47","48","49","50","51","52","53","54","55","56","57","58","59","60","61","62","63","64","65","66","67","68","69","70","71","72","73","74","75","76","77","78","79","80","81","82","83","84","85","86","87","88","89","90","91","92","93","94","95","96","97","98","99","100","101","102","103","104","105","106","107","108","109","110","111","112","113","114","115","116","117","118","119","120","121","122","123","124","125","126","127","128","129","130","131","132","133","134","135","136","137","138","139","140","141","142","143","144","145","146","147","148","149","150","151","152","153"],"mode":"markers","type":"surface","frame":null}],"highlight":{"on":"plotly_click","persistent":false,"dynamic":false,"selectize":false,"opacityDim":0.20000000000000001,"selected":{"opacity":1},"debounce":0},"shinyEvents":["plotly_hover","plotly_click","plotly_selected","plotly_relayout","plotly_brushed","plotly_brushing","plotly_clickannotation","plotly_doubleclick","plotly_deselect","plotly_afterplot","plotly_sunburstclick"],"base_url":"https://plot.ly"},"evals":[],"jsHooks":[]}</script>
<p>And here is that same plot as a contour plot.</p>
<div class="sourceCode" id="cb66"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb66-1"><a href="#cb66-1" aria-hidden="true" tabindex="-1"></a>air_surface <span class="ot">&lt;-</span> <span class="fu">expand.grid</span>(<span class="at">Temp =</span> axis_x, <span class="at">Month =</span> axis_y, <span class="at">KEEP.OUT.ATTRS=</span>F)</span>
<span id="cb66-2"><a href="#cb66-2" aria-hidden="true" tabindex="-1"></a>air_surface<span class="sc">$</span>Z <span class="ot">&lt;-</span> <span class="fu">predict.lm</span>(air_lm, <span class="at">newdata =</span> air_surface)</span>
<span id="cb66-3"><a href="#cb66-3" aria-hidden="true" tabindex="-1"></a>mycolorpalette <span class="ot">&lt;-</span> <span class="fu">colorRampPalette</span>(<span class="fu">c</span>(<span class="st">&quot;skyblue2&quot;</span>, <span class="st">&quot;orange&quot;</span>))</span>
<span id="cb66-4"><a href="#cb66-4" aria-hidden="true" tabindex="-1"></a><span class="fu">filled.contour</span>(<span class="at">x=</span>axis_x, <span class="at">y=</span>axis_y, <span class="at">z=</span><span class="fu">matrix</span>(air_surface<span class="sc">$</span>Z, <span class="fu">length</span>(axis_x), <span class="fu">length</span>(axis_y)), <span class="at">col=</span><span class="fu">mycolorpalette</span>(<span class="dv">27</span>))</span></code></pre></div>
</p>
</div>
<div id="LearnMoreHDModel" class="tabcontent" style="display:none;">
<p>
<table>
<tr>
<td>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-71-1.png" width="144" /></p>
</td>
<td style="text-align: center;padding-left:15px;">
<p><span class="math display">\[
  Y_i = \overbrace{\underbrace{\beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i}
+ \ldots + \beta_{p-1}X_{p-1,i}}_{E\{Y_i\}}}^\text{&quot;High
Dimensional Models&quot;} + \epsilon_i
\]</span></p>
</td>
</tr>
</table>
<p>The so-called “HD”, or “High Dimensional”, regression model uses
three or more different quantitative x-variables, an <span
class="math inline">\(X_{1i}\)</span>, an <span
class="math inline">\(X_{2i}\)</span>, and at least an <span
class="math inline">\(X_{3i}\)</span>, but could use many, many other
variables as well. Unlike the 3D model where the final regression could
be shown as either a contour plot or a 3D-graphic, the high dimensional
model exists in 4 or more dimensions. Thus, it is impossible to graph
this model in its full form. Further, it isn’t really even possible to
“mentally connect” with this type of model as it exists beyond what our
3D minds can really comprehend.</p>
<table>
<colgroup>
<col width="12%" />
<col width="87%" />
</colgroup>
<thead>
<tr class="header">
<th>Parameter</th>
<th>Effect</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><span class="math inline">\(\beta_0\)</span></td>
<td>Y-intercept of the Model</td>
</tr>
<tr class="even">
<td><span class="math inline">\(\beta_1\)</span></td>
<td>Slope of the line in the <span class="math inline">\(X_1\)</span>
direction.</td>
</tr>
<tr class="odd">
<td><span class="math inline">\(\beta_2\)</span></td>
<td>Slope of the line in the <span class="math inline">\(X_2\)</span>
direction.</td>
</tr>
<tr class="even">
<td><span class="math inline">\(...\)</span></td>
<td>Slopes in other directions depending on how many other variables are
included in the model.</td>
</tr>
<tr class="odd">
<td><span class="math inline">\(\beta_{p-1}\)</span></td>
<td>Final term in the model where there are <span
class="math inline">\(p\)</span> total <span
class="math inline">\(\beta\)</span>’s. The reason for the <span
class="math inline">\(p-1\)</span> on the last term is because we
started with <span class="math inline">\(\beta_0\)</span> for the first
term, leaving <span class="math inline">\(\beta_{p-1}\)</span> as the
last term.</td>
</tr>
</tbody>
</table>
<p><strong>An Example</strong></p>
<p>Suppose we used three x-variables of <code>Wind</code>,
<code>Temp</code>, and <code>Solar.R</code> to predict the y-variable of
<code>Ozone</code>.</p>
<p><span class="math display">\[
  \underbrace{Y_i}_\text{Ozone} \underbrace{=}_{\sim}
\overbrace{\beta_0}^{\stackrel{\text{y-int}}{\text{baseline}}} +
\overbrace{\beta_1}^{\stackrel{\text{slope in}}{\text{Wind Direction}}}
\underbrace{X_{1i}}_\text{Wind} +
\overbrace{\beta_2}^{\stackrel{\text{slope in}}{\text{Temp
Direction}}}  \underbrace{X_{2i}}_\text{Temp} +
\overbrace{\beta_3}^{\stackrel{\text{slope in}}{\text{Solar.R
Direction}}}  \underbrace{X_{3i}}_\text{Solar.R} + \epsilon_i
\]</span></p>
<div class="sourceCode" id="cb67"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb67-1"><a href="#cb67-1" aria-hidden="true" tabindex="-1"></a>air_lm <span class="ot">&lt;-</span> <span class="fu">lm</span>(Ozone <span class="sc">~</span> Wind <span class="sc">+</span> Temp <span class="sc">+</span> Solar.R, <span class="at">data=</span> airquality)</span>
<span id="cb67-2"><a href="#cb67-2" aria-hidden="true" tabindex="-1"></a><span class="fu">pander</span>(air_lm<span class="sc">$</span>coefficients)</span></code></pre></div>
<table style="width:57%;">
<colgroup>
<col width="19%" />
<col width="12%" />
<col width="11%" />
<col width="13%" />
</colgroup>
<thead>
<tr class="header">
<th align="center">(Intercept)</th>
<th align="center">Wind</th>
<th align="center">Temp</th>
<th align="center">Solar.R</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center">-64.34</td>
<td align="center">-3.334</td>
<td align="center">1.652</td>
<td align="center">0.05982</td>
</tr>
</tbody>
</table>
<p>Notice how the slope, <span class="math inline">\(\beta_1\)</span>,
in the “Wind” direction is estimated to be -3.334. The slope in the
“Temp” direction, <span class="math inline">\(\beta_2\)</span>, is
estimated to be 1.652. The slope in the “Solar.R” direction, <span
class="math inline">\(\beta_3\)</span>, is estimated to be 0.05982.
Also, the y-intercept, <span class="math inline">\(\beta_0\)</span>, is
estimated to be -64.34.</p>
<p>Visualizing this model is not really possible in its full form.
However, we can draw the regression from three different angles or
vantage points. This is a limited view of the full regression model, but
at least provides some visual understanding. To do this, we draw <span
class="math inline">\(Y\)</span> against each <span
class="math inline">\(X\)</span>-variable in separate scatterplots, one
for each <span class="math inline">\(X\)</span>-variable used in our
model.</p>
<div class="sourceCode" id="cb68"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb68-1"><a href="#cb68-1" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(air_lm)</span>
<span id="cb68-2"><a href="#cb68-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb68-3"><a href="#cb68-3" aria-hidden="true" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mfrow=</span><span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">3</span>))</span>
<span id="cb68-4"><a href="#cb68-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb68-5"><a href="#cb68-5" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(Ozone <span class="sc">~</span> Wind, <span class="at">data=</span>airquality)</span>
<span id="cb68-6"><a href="#cb68-6" aria-hidden="true" tabindex="-1"></a>  <span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x <span class="sc">+</span> b[<span class="dv">3</span>]<span class="sc">*</span><span class="dv">79</span> <span class="sc">+</span> b[<span class="dv">4</span>]<span class="sc">*</span><span class="dv">205</span>, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>)</span>
<span id="cb68-7"><a href="#cb68-7" aria-hidden="true" tabindex="-1"></a>  <span class="co"># The x-variable of this plot is &quot;Wind&quot;</span></span>
<span id="cb68-8"><a href="#cb68-8" aria-hidden="true" tabindex="-1"></a>  <span class="co"># The values of Temp=79 and Solar.R=205 are fixed at some interesting value,</span></span>
<span id="cb68-9"><a href="#cb68-9" aria-hidden="true" tabindex="-1"></a>  <span class="co"># in this case, their respective medians.</span></span>
<span id="cb68-10"><a href="#cb68-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb68-11"><a href="#cb68-11" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(Ozone <span class="sc">~</span> Temp, <span class="at">data=</span>airquality)</span>
<span id="cb68-12"><a href="#cb68-12" aria-hidden="true" tabindex="-1"></a>  <span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span><span class="fl">9.7</span> <span class="sc">+</span> b[<span class="dv">3</span>]<span class="sc">*</span>x <span class="sc">+</span> b[<span class="dv">4</span>]<span class="sc">*</span><span class="dv">205</span>, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;orange&quot;</span>)</span>
<span id="cb68-13"><a href="#cb68-13" aria-hidden="true" tabindex="-1"></a>  <span class="co"># The x-variable of this plot is &quot;Temp&quot;</span></span>
<span id="cb68-14"><a href="#cb68-14" aria-hidden="true" tabindex="-1"></a>  <span class="co"># The values of Wind=9.7 and Solar.R=205 are fixed at some interesting value,</span></span>
<span id="cb68-15"><a href="#cb68-15" aria-hidden="true" tabindex="-1"></a>  <span class="co"># in this case, their respective medians.</span></span>
<span id="cb68-16"><a href="#cb68-16" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb68-17"><a href="#cb68-17" aria-hidden="true" tabindex="-1"></a>  <span class="fu">plot</span>(Ozone <span class="sc">~</span> Solar.R, <span class="at">data=</span>airquality)</span>
<span id="cb68-18"><a href="#cb68-18" aria-hidden="true" tabindex="-1"></a>  <span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span><span class="fl">9.7</span> <span class="sc">+</span> b[<span class="dv">3</span>]<span class="sc">*</span><span class="dv">79</span> <span class="sc">+</span> b[<span class="dv">4</span>]<span class="sc">*</span>x, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;firebrick&quot;</span>)</span></code></pre></div>
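<p><img src="LinearRegression_files/figure-html/unnamed-chunk-73-1.png" width="672" alt="Three side-by-side scatterplots of Ozone against Wind, Temp, and Solar.R, each overlaid with the fitted regression line while the other two variables are held at their medians" /></p>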
<div class="sourceCode" id="cb69"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb69-1"><a href="#cb69-1" aria-hidden="true" tabindex="-1"></a>  <span class="co"># The x-variable of this plot is &quot;Solar.R&quot;</span></span>
<span id="cb69-2"><a href="#cb69-2" aria-hidden="true" tabindex="-1"></a>  <span class="co"># The values of Wind = 9.7 and Temp=79 are fixed at some interesting value,</span></span>
<span id="cb69-3"><a href="#cb69-3" aria-hidden="true" tabindex="-1"></a>  <span class="co"># in this case, their respective medians.</span></span></code></pre></div>
</p>
</div>
<hr />
<p>The coefficient <span class="math inline">\(\beta_j\)</span> is
interpreted as the change in the expected value of <span
class="math inline">\(Y\)</span> for a unit increase in <span
class="math inline">\(X_{j}\)</span>, holding all other variables
constant, for <span class="math inline">\(j=1,\ldots,p-1\)</span>.
However, this interpretation breaks down when higher order terms (like
<span class="math inline">\(X^2\)</span>) or interaction terms (like
<span class="math inline">\(X1:X2\)</span>) are included in the
model.</p>
<p>See the <strong>Explanation</strong> tab for details about possible
hypotheses here.</p>
<hr />
</div>
</div>
<div id="r-instructions-1" class="section level3">
<h3>R Instructions</h3>
<div style="padding-left:125px;">
<p><strong>NOTE</strong>: These are general R Commands for <em>all</em>
types of multiple linear regressions. See the “Overview” section for
details about the R Commands for a specific multiple linear regression model.</p>
<p><strong>Console</strong> Help Command: <code>?lm()</code></p>
<p><strong>Finding Variables</strong></p>
<a href="javascript:showhide('PairsPlot')">
<div class="hoverchunk">
<p><span class="tooltipr"> pairs( <span class="tooltiprtext">A function
in R that creates all possible two-variable scatterplots from a data
set. It requires that all columns of the data set be either numeric or
factor classes. (Character classes will throw an error.)</span>
</span><span class="tooltipr"> cbind( <span class="tooltiprtext">This is
the “column (c) bind” function and it joins together things as
columns.</span> </span><span class="tooltipr"> Res =  <span
class="tooltiprtext">This is just any name you come up with, but Res is
a good abbreviation for Residuals.</span> </span><span class="tooltipr">
mylm$residuals,  <span class="tooltiprtext">This pulls out the residuals
from the current regression and adds them as a new column inside the
cbind data set.</span> </span><span class="tooltipr"> YourDataSet),
<span class="tooltiprtext">This puts the original data set along side
the residuals.</span> </span><span class="tooltipr">
 panel=panel.smooth,  <span class="tooltiprtext">This places a lowess
smoothing line on each scatterplot.</span> </span><span
class="tooltipr"> col =  <span class="tooltiprtext">specifies the colors
of the dots.</span> </span><span class="tooltipr">
as.factor(YourDataSet$Xvar) <span class="tooltiprtext">This causes the
coloring of the points in the plot to be colored according to the groups
found in Xvar. Using palette(c(“color1”,“color2”, and so on)) prior to
the plotting code allows you to specify the colors pairs will pick from
when choosing colors.</span> </span><span class="tooltipr"> ) <span
class="tooltiprtext">Closing parenthesis for the pairs function.</span>
</span></p>
</div>
<p></a></p>
<div id="PairsPlot" style="display:none;">
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-74-1.png" width="672" /></p>
</div>
<p><strong>Perform the Regression</strong></p>
<p>Everything is the same as in simple linear regression except that
more variables are allowed in the call to <code>lm()</code>.</p>
<a href="javascript:showhide('multiplelm')">
<div class="hoverchunk">
<p><span class="tooltipr"> mylm &lt;- lm( <span
class="tooltiprtext"><code>mylm</code> is some name you come up with to
store the results of the <code>lm()</code> test. Note that
<code>lm()</code> stands for “linear model.”</span> </span><span
class="tooltipr"> Y <span class="tooltiprtext"><code>Y</code> must be a
“numeric” vector of the quantitative response variable.</span>
</span><span class="tooltipr">  ~  <span class="tooltiprtext">Formula
operator in R.</span> </span><span class="tooltipr"> X1 + X2 <span
class="tooltiprtext"><code>X1</code> and <code>X2</code> are the
explanatory variables. These can either be quantitative or qualitative.
Note that R treats “numeric” variables as quantitative and “character”
or “factor” variables as qualitative. R will automatically recode
qualitative variables to become “numeric” variables using a 0,1
encoding. See the Explanation tab for details.</span> </span><span
class="tooltipr">  + X1:X2 <span class="tooltiprtext"><code>X1:X2</code>
is called the interaction term. See the Explanation tab for
details.</span> </span><span class="tooltipr">  + …, <span
class="tooltiprtext"><code>...</code> emphasizes that as many
explanatory variables as are desired can be included in the
model.</span> </span><span class="tooltipr">  data = YourDataSet) <span
class="tooltiprtext"><code>YourDataSet</code> is the name of your data
set.</span> </span><br/><span class="tooltipr"> summary( <span
class="tooltiprtext">The summary(…) function displays the results of an
lm(…) in R.</span> </span><span class="tooltipr"> mylm <span
class="tooltiprtext">The name of your lm that was performed
earlier.</span> </span><span class="tooltipr"> ) <span
class="tooltiprtext">Closing parenthesis for summary(…) function.</span>
</span></p>
</div>
<p></a></p>
<div id="multiplelm" style="display:none;">
<p>Example output from a regression. Hover each piece to learn more.</p>
<table class="rconsole">
<tr>
<td>
<span class="tooltiprout"> Call:<br/> lm(formula = mpg ~ hp + am +
hp:am, data = mtcars) <span class="tooltiprouttext">This is simply a
statement of your original lm(…) “call” that you made when performing
your regression. It allows you to verify that you ran what you thought
you ran in the lm(…).</span> </span>
</td>
</tr>
</table>
<p><br/></p>
<table class="rconsole">
<tr>
<td colspan="2">
<span class="tooltiprout"> Residuals: <span
class="tooltiprouttext">Residuals are the vertical difference between
each point and the line, <span class="math inline">\(Y_i -
\hat{Y}_i\)</span>. The residuals are supposed to be normally
distributed, so a quick glance at their five-number summary can give us
insight about any skew present in the residuals.</span> </span>
</td>
</tr>
<tr>
<td align="right">
<span class="tooltiprout"> min<br/>   -4.3818 <span
class="tooltiprouttext">“min” gives the value of the residual that is
furthest below the regression line. Ideally, the magnitude of this value
would be about equal to the magnitude of the largest positive residual
(the max) because the hope is that the residuals are normally
distributed around the line.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> 1Q<br/>   -2.2696 <span
class="tooltiprouttext">“1Q” gives the first quartile of the residuals,
which will always be negative, and ideally would be about equal in
magnitude to the third quartile.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> Median<br/>   0.1344 <span
class="tooltiprouttext">“Median” gives the median of the residuals,
which ideally would be about equal to zero. Note that because the
regression line is the least squares line, the mean of the residuals
will ALWAYS be zero, so it is never included in the output summary. This
particular median value of -0.0191 is a little smaller than zero than we
would hope for and suggests a right skew in the data because the mean
(0) is greater than the median (-0.0191) witnessing the residuals are
right skewed. This can also be seen in the maximum being much larger in
magnitude than the minimum.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> 3Q<br/>   1.7058 <span
class="tooltiprouttext">“3Q” gives the third quartile of the residuals,
which ideally would be about equal in magnitude to the first
quartile. In this case, it is pretty close, which helps us see that the
first quartile of residuals on either side of the line is behaving
fairly normally.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> Max</br>   5.8752 <span
class="tooltiprouttext">“Max” gives the maximum positive residual,
which ideally would be about equal in magnitude to the minimum
residual. In this case, it is much larger than the minimum, which helps
us see that the residuals are likely right skewed.</span> </span>
</td>
</tr>
</table>
<p><br/></p>
<table class="rconsole">
<tr>
<td colspan="2">
<span class="tooltiprout"> Coefficients: <span
class="tooltiprouttext">Notice that in your lm(…) you used only <span
class="math inline">\(Y\)</span> and <span
class="math inline">\(X\)</span>. You did not type out any coefficients,
i.e., the <span class="math inline">\(\beta_0\)</span> or <span
class="math inline">\(\beta_1\)</span> of the regression model. These
coefficients are estimated by the lm(…) function and displayed in this
part of the output along with standard errors, t-values, and
p-values.</span> </span>
</td>
</tr>
<tr>
<td align="left">
</td>
<td align="right">
<span class="tooltiprout">   Estimate <span class="tooltiprouttext">To
learn more about the “Estimates” of the “Coefficients” see the
“Explanation” tab, “Estimating the Model Parameters” section for
details.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   Std. Error <span class="tooltiprouttext">To
learn more about the “Standard Errors” of the “Coefficients” see the
“Explanation” tab, “Inference for the Model Parameters” section.</span>
</span>
</td>
<td align="right">
<span class="tooltiprout">   t value <span class="tooltiprouttext">To
learn more about the “t value” of the “Coefficients” see the
“Explanation” tab, “Inference for the Model Parameters” section.</span>
</span>
</td>
<td align="right">
<span class="tooltiprout">   Pr(&gt;|t|) <span
class="tooltiprouttext">The “Pr” stands for “Probability” and the “(&gt;
|t|)” stands for “more extreme than the observed t-value”. Thus, this is
the p-value for the hypothesis test of each coefficient being zero.<br/>
To learn more about the “p-value” of the “Coefficients” see the
“Explanation” tab, “Inference for the Model Parameters” section. </span>
</span>
</td>
</tr>
<tr>
<td align="left">
<span class="tooltiprout"> (Intercept) <span
class="tooltiprouttext">This always says “Intercept” for any lm(…) you
run in R. That is because R always assumes there is a y-intercept for
your regression function.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   26.6248479 <span
class="tooltiprouttext">This is the estimate of the y-intercept, <span
class="math inline">\(\beta_0\)</span>. It is called <span
class="math inline">\(b_0\)</span>. It is the average y-value when all
X-variables are zero.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   2.1829432 <span
class="tooltiprouttext">This is the standard error of <span
class="math inline">\(b_0\)</span>. It estimates how much <span
class="math inline">\(b_0\)</span> varies from sample to sample. The
closer to zero, the more reliable the estimate of the intercept.</span>
</span>
</td>
<td align="right">
<span class="tooltiprout"> 12.197 <span class="tooltiprouttext">This is
the test statistic t for the test of <span class="math inline">\(\beta_0
= 0\)</span>. It is calculated by dividing the “Estimate” of the
intercept (26.6248479) by its standard error (2.1829432). It gives the
“number of standard errors” away from zero that the “estimate” has
landed. In this case, the estimate of 26.6248479 is t=12.197 standard
errors away from zero, which is a fairly surprising distance as shown by
the p-value.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> 1.01e-12 <span class="tooltiprouttext">This
is the p-value of the test of the hypothesis that <span
class="math inline">\(\beta_0 = 0\)</span>. It measures the probability
of observing a t-value as extreme as the one observed. To compute it
yourself in R, use
<code>pt(-abs(your t-value), df of your regression)*2</code>.</span>
</span>
</td>
<td align="left">
<span class="tooltiprout"> *** <span class="tooltiprouttext">This is
called a “star”. Three stars means significant at the 0.001 level of <span
class="math inline">\(\alpha\)</span>.</span> </span>
</td>
</tr>
<tr>
<td align="left">
<span class="tooltiprout"> hp <span class="tooltiprouttext">This is
always the name of your first X-variable in your lm(Y ~ X1 + …).</span>
</span>
</td>
<td align="right">
<span class="tooltiprout">   -0.0591370 <span
class="tooltiprouttext">This is the estimate of <span
class="math inline">\(\beta_1\)</span> in the regression model. It is
called <span class="math inline">\(b_1\)</span>. Interpreting this value
depends on your choice of regression model.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   0.0129449 <span
class="tooltiprouttext">This is the standard error of <span
class="math inline">\(b_1\)</span>. It estimates how much <span
class="math inline">\(b_1\)</span> varies from sample to sample. The
closer to zero, the more precise the estimate.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> -4.568 <span class="tooltiprouttext">This is
the test statistic t for the test of <span class="math inline">\(\beta_1
= 0\)</span>. It is calculated by dividing the “Estimate” by its
standard error. It gives the “number of standard errors” away from zero
that the “estimate” has landed.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> 9.02e-05 <span class="tooltiprouttext">This
is the p-value of the test of the hypothesis that <span
class="math inline">\(\beta_1 = 0\)</span>. To compute it yourself in R,
use <code>pt(-abs(your t-value), df of your regression)*2</code></span>
</span>
</td>
<td align="left">
<span class="tooltiprout"> *** <span class="tooltiprouttext">This is
called a “star”. Three stars means significant at the 0.001 level of
<span class="math inline">\(\alpha\)</span>.</span> </span>
</td>
</tr>
<tr>
<td align="left">
<span class="tooltiprout"> am <span class="tooltiprouttext">This is the
second X-variable of your regression model in lm(Y ~ X1 + X2 +
…).</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   5.2176534 <span
class="tooltiprouttext">This is the estimated value for <span
class="math inline">\(\beta_2\)</span> and is called <span
class="math inline">\(b_2\)</span>.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   2.6650931 <span
class="tooltiprouttext">This is the standard error of <span
class="math inline">\(b_2\)</span>. It estimates how much <span
class="math inline">\(b_2\)</span> will vary from sample to
sample.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> 1.958 <span class="tooltiprouttext">Test
statistic (t) for the test of <span class="math inline">\(\beta_2 =
0\)</span>. It represents the number of standard errors that <span
class="math inline">\(b_2\)</span> is from 0.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> 0.0603 <span class="tooltiprouttext">The
p-value for the test of <span class="math inline">\(\beta_2 =
0\)</span>.</span> </span>
</td>
<td align="left">
<span class="tooltiprout"> . <span class="tooltiprouttext">The dot “.”
implies the result is significant at the 0.1 level.</span> </span>
</td>
</tr>
<tr>
<td align="left">
<span class="tooltiprout"> hp:am <span class="tooltiprouttext">This is
the interaction of <span class="math inline">\(X1\)</span> and <span
class="math inline">\(X2\)</span>. Not all regression models require an
interaction term, and they can include more than one interaction term.
This is just an example of what an interaction term would look
like.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   0.0004029 <span
class="tooltiprouttext">This is the estimate of the coefficient of the
interaction term.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">   0.0164602 <span
class="tooltiprouttext">Estimated standard error of the interaction
term.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> 0.024 <span class="tooltiprouttext">Test
statistic for the test that <span class="math inline">\(\beta_3 =
0\)</span>.</span> </span>
</td>
<td align="right">
<span class="tooltiprout"> 0.9806 <span class="tooltiprouttext">P-value
for the test that <span class="math inline">\(\beta_3 =
0\)</span>.</span> </span>
</td>
</tr>
</table>
<table class="rconsole">
<tr>
<td>
<span> --- </span>
</td>
</tr>
</table>
<table class="rconsole">
<tr>
<td>
<span class="tooltiprout"> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ’*’
0.05 ‘.’ 0.1 ‘ ’ 1 <span class="tooltiprouttext">These “codes” explain
what significance level the p-value is smaller than based on how many
“stars” * the p-value is labeled with in the Coefficients table
above.</span> </span>
</td>
</tr>
</table>
<p><br/></p>
<table class="rconsole">
<tr>
<td>
<span class="tooltiprout"> Residual standard error: <span
class="tooltiprouttext">This is the estimate of <span
class="math inline">\(\sigma\)</span> in the regression model <span
class="math inline">\(Y_i = \beta_0 + \beta_1 X_i + \epsilon_i\)</span>
where <span class="math inline">\(\epsilon_i \sim
N(0,\sigma^2)\)</span>. It is the square root of the MSE.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">  2.939 <span class="tooltiprouttext">For this
particular regression, the estimate of <span
class="math inline">\(\sigma\)</span> is 2.939. Squaring this number
gives you the MSE, which is the estimate of <span
class="math inline">\(\sigma^2\)</span>.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">  on 28 degrees of freedom <span
class="tooltiprouttext">This is <span class="math inline">\(n-p\)</span>
where <span class="math inline">\(n\)</span> is the sample size and
<span class="math inline">\(p\)</span> is the number of parameters in
the regression model. In this case, there is a sample size of 32 and four
parameters, <span class="math inline">\(\beta_0\)</span>, <span
class="math inline">\(\beta_1\)</span>, <span
class="math inline">\(\beta_2\)</span>, and <span
class="math inline">\(\beta_3\)</span>, so 32-4 = 28.</span> </span>
</td>
</tr>
</table>
<table class="rconsole">
<tr>
<td>
<span class="tooltiprout"> Multiple R-squared: <span
class="tooltiprouttext">This is <span
class="math inline">\(R^2\)</span>, the percentage of variation in <span
class="math inline">\(Y\)</span> that is explained by the regression
model. It is equal to the SSR/SSTO or, equivalently, 1 -
SSE/SSTO.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">  0.7852, <span class="tooltiprouttext">In
this particular regression, 78.52% of the variation in gas mileage
<code>mpg</code> is explained by the regression model using horsepower
<code>hp</code>, transmission type <code>am</code>, and their
interaction.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">  Adjusted R-squared: <span
class="tooltiprouttext">The adjusted R-squared will always be at least
slightly smaller than <span class="math inline">\(R^2\)</span>. The
closer to R-squared that it is, the better. When it differs dramatically
from <span class="math inline">\(R^2\)</span>, it is a sign that the
regression model is over-fitting the data.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">  0.7621 <span class="tooltiprouttext">In this
case, the value of 0.7621 is quite close to the original <span
class="math inline">\(R^2\)</span> value, so there is no fear of
over-fitting with this particular model. That is good.</span> </span>
</td>
</tr>
</table>
<table class="rconsole">
<tr>
<td>
<span class="tooltiprout"> F-statistic: <span
class="tooltiprouttext">The F-statistic is the test statistic for the
test of <span class="math inline">\(\beta_1 = \beta_2 = \beta_3 = \ldots
= 0\)</span>. In other words, it tests that ALL coefficients are zero
against the alternative that “at least one is not.”</span> </span>
</td>
<td align="right">
<span class="tooltiprout">  34.11 <span class="tooltiprouttext">This is
the value of the F-statistic that should be compared to an
F-distribution with 3 and 28 degrees of freedom.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">  on 3 and 28 DF, <span
class="tooltiprouttext">These two numbers give the two parameters
(degrees of freedom 1 and degrees of freedom 2) of the F-distribution.
Knowing these parameters and the value of the F-statistic allows the
computation of the p-value for the test that all regression coefficients
are zero.</span> </span>
</td>
<td align="right">
<span class="tooltiprout">  p-value: 1.73e-09 <span
class="tooltiprouttext">The p-value of the test that all regression
coefficients are zero. If this p-value is significant, then it can be
determined that “at least one” of the variables included in the
regression gives significant insight about the average y-value.</span>
</span>
</td>
</tr>
</table>
</div>
<p><br/></p>
<p><strong>Plotting the Regression Lines</strong></p>
<p>See each of the “Overview” sections for details on how to plot the
various types of multiple linear regression models.</p>
<p><br/></p>
<p><strong>Making Predictions</strong></p>
<a href="javascript:showhide('predict2')">
<div class="hoverchunk">
<p><span class="tooltipr"> predict( <span class="tooltiprtext">The R
function predict(…) allows you to use an lm(…) object to make
predictions for specified x-values.</span> </span><span
class="tooltipr"> mylm, <span class="tooltiprtext">This is the name of a
previously performed lm(…) that was saved into the name
<code>mylm &lt;- lm(...)</code>.</span> </span><span class="tooltipr">
newdata = data.frame( <span class="tooltiprtext">To specify the values
of <span class="math inline">\(x\)</span> that you want to use in the
prediction, you have to put those x-values into a data set, or more
specifically, a data.frame(…).</span> </span><span class="tooltipr"> <span
class="math inline">\(X_1\)</span>= <span class="tooltiprtext">The value
for <code>X=</code> should be whatever x-variable name was used in the
original regression. For example, if
<code>mylm &lt;- lm(mpg ~ hp + am + hp:am, data=mtcars)</code> was the
original regression, then this code would read <code>hp =</code> instead
of <code>X1 =</code>… Further, the value of <span
class="math inline">\(X_{1h}\)</span> should be some specific number,
like <code>hp=123</code> for example.</span> </span><span
class="tooltipr"> <span class="math inline">\(X_{1h}\)</span>, <span
class="tooltiprtext">The value of <span
class="math inline">\(X_{1h}\)</span> should be some specific number,
like <code>123</code>, as in <code>hp=123</code> for example.</span>
</span><span class="tooltipr"> <span class="math inline">\(X_2\)</span>=
<span class="tooltiprtext">This is the value of the second x-variable,
say <code>am</code>.</span> </span><span class="tooltipr"> <span
class="math inline">\(X_{2h}\)</span>) <span class="tooltiprtext">Since
the <code>am</code> column can only be a 1 or 0, we would try
<code>am=1</code> for example, or <code>am=0</code>.</span> </span><span
class="tooltipr"> ) <span class="tooltiprtext">Closing
parenthesis.</span> </span></p>
</div>
<p></a></p>
<div id="predict2" style="display:none;">
<p><code>mylm &lt;- lm(mpg ~ hp + am + hp:am, data = mtcars)</code></p>
<p><code>predict(mylm, data.frame(hp = 120, am = 1), data = mtcars, type = "response")</code></p>
<pre><code>##        1 
## 24.79441</code></pre>
<p>The value given is the “fitted-value” or “predicted-value” for the
specified x-values. In this case, a car with 120 horsepower
(<code>hp = 120</code>) and a manual transmission (<code>am = 1</code>)
is predicted to have a gas mileage of 24.79441 mpg.</p>
</div>
<a href="javascript:showhide('predict2Interval')">
<div class="hoverchunk">
<p><span class="tooltipr"> predict( <span class="tooltiprtext">The R
function predict(…) allows you to use an lm(…) object to make
predictions for specified x-values.</span> </span><span
class="tooltipr"> mylm, <span class="tooltiprtext">This is the name of a
previously performed lm(…) that was saved into the name
<code>mylm &lt;- lm(...)</code>.</span> </span><span class="tooltipr">
 newdata=data.frame( <span class="tooltiprtext">To specify the values of
<span class="math inline">\(x\)</span> that you want to use in the
prediction, you have to put those x-values into a data set, or more
specifically, a data.frame(…).</span> </span><span class="tooltipr"> X1=
<span class="tooltiprtext">The <code>X1=</code> should be replaced with
whatever x-variable name was used in the original regression. For
example, if <code>mylm &lt;- lm(dist ~ speed, data=cars)</code> was the
original regression, then this code would read <code>speed =</code>
instead of <code>X1=</code>… Further, the value of <span
class="math inline">\(X_{1h}\)</span> should be some specific number,
like <code>12</code> so that it reads <code>speed=12</code>, for
example.</span> </span><span class="tooltipr"> <span
class="math inline">\(X_{1h}\)</span>, <span class="tooltiprtext">The
value of <span class="math inline">\(X_{1h}\)</span> should be some
specific number, like <code>12</code>, as in <code>speed=12</code> for
example.</span> </span><span class="tooltipr"> X2= <span
class="tooltiprtext">If a regression of lm(Y ~ X1 + X2 + …) was
performed, then X2 is the name of the second x-variable used in the
regression.</span> </span><span class="tooltipr"> <span
class="math inline">\(X_{2h}\)</span>), <span class="tooltiprtext">A
number should be specified for <span
class="math inline">\(X_{2h}\)</span>, something that would be
meaningful for X2 to be equal to.</span> </span><span class="tooltipr">
interval = "prediction") <span class="tooltiprtext">This causes the
prediction to include the lower bound and upper bound of the prediction
interval for <span class="math inline">\(Y_i\)</span> for the given X1,
X2, and so on values that have been specified.</span> </span></p>
</div>
<p></a></p>
<div id="predict2Interval" style="display:none;">
<p><code>mylm &lt;- lm(mpg ~ hp + am + hp:am, data = mtcars)</code></p>
<p><code>predict(mylm, data.frame(hp = 120, am = 1), data = mtcars, interval = "prediction")</code></p>
<div class="sourceCode" id="cb71"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb71-1"><a href="#cb71-1" aria-hidden="true" tabindex="-1"></a>mylm <span class="ot">&lt;-</span> <span class="fu">lm</span>(mpg <span class="sc">~</span> hp <span class="sc">+</span> am <span class="sc">+</span> hp<span class="sc">:</span>am, <span class="at">data =</span> mtcars)</span>
<span id="cb71-2"><a href="#cb71-2" aria-hidden="true" tabindex="-1"></a><span class="fu">predict</span>(mylm, <span class="fu">data.frame</span>(<span class="at">hp =</span> <span class="dv">120</span>, <span class="at">am =</span> <span class="dv">1</span>), <span class="at">data =</span> mtcars, <span class="at">interval =</span> <span class="st">&quot;prediction&quot;</span>)</span></code></pre></div>
<pre><code>##        fit      lwr      upr
## 1 24.79441 18.49923 31.08959</code></pre>
<p>The “fit” is the predicted value. The “lwr” is the lower bound. The
“upr” is the upper bound.</p>
<p>In this case, a car with 120 horsepower and a manual transmission
(<code>am = 1</code>) is predicted to have a gas mileage of 24.79441 mpg.
However, we are wise enough to recognize that the gas mileage for
individual cars of this type will vary anywhere from 18.49923 mpg to
31.08959 mpg.</p>
</div>
<a href="javascript:showhide('predict2Confidence')">
<div class="hoverchunk">
<p><span class="tooltipr"> predict( <span class="tooltiprtext">The R
function predict(…) allows you to use an lm(…) object to make
predictions for specified x-values.</span> </span><span
class="tooltipr"> mylm, <span class="tooltiprtext">This is the name of a
previously performed lm(…) that was saved into the name
<code>mylm &lt;- lm(...)</code>.</span> </span><span class="tooltipr">
data.frame( <span class="tooltiprtext">To specify the values of <span
class="math inline">\(x\)</span> that you want to use in the prediction,
you have to put those x-values into a data set, or more specifically, a
data.frame(…).</span> </span><span class="tooltipr"> X1= <span
class="tooltiprtext">The <code>X1=</code> should be replaced with
whatever x-variable name was used in the original regression. For
example, if <code>mylm &lt;- lm(dist ~ speed, data=cars)</code> was the
original regression, then this code would read <code>speed =</code>
instead of <code>X1=</code>… Further, the value of <span
class="math inline">\(X_{1h}\)</span> should be some specific number,
like <code>12</code> so that it reads <code>speed=12</code>, for
example.</span> </span><span class="tooltipr"> <span
class="math inline">\(X_{1h}\)</span>, <span class="tooltiprtext">The
value of <span class="math inline">\(X_{1h}\)</span> should be some
specific number, like <code>12</code>, as in <code>speed=12</code> for
example.</span> </span><span class="tooltipr"> X2= <span
class="tooltiprtext">If a regression of lm(Y ~ X1 + X2 + …) was
performed, then X2 is the name of the second x-variable used in the
regression.</span> </span><span class="tooltipr"> <span
class="math inline">\(X_{2h}\)</span>), <span class="tooltiprtext">A
number should be specified for <span
class="math inline">\(X_{2h}\)</span>, something that would be
meaningful for X2 to be equal to.</span> </span><span class="tooltipr">
interval = "confidence") <span class="tooltiprtext">This causes the
prediction to include the lower and upper bound of a confidence interval
for <span class="math inline">\(E\{Y_i\}\)</span> for the given <span
class="math inline">\(X\)</span>-values.</span> </span></p>
</div>
<p></a></p>
<div id="predict2Confidence" style="display:none;">
<p><code>mylm &lt;- lm(mpg ~ hp + am + hp:am, data = mtcars)</code></p>
<p><code>predict(mylm, data.frame(hp = 120, am = 1), data = mtcars, interval = "confidence")</code></p>
<div class="sourceCode" id="cb73"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb73-1"><a href="#cb73-1" aria-hidden="true" tabindex="-1"></a>mylm <span class="ot">&lt;-</span> <span class="fu">lm</span>(mpg <span class="sc">~</span> hp <span class="sc">+</span> am <span class="sc">+</span> hp<span class="sc">:</span>am, <span class="at">data =</span> mtcars)</span>
<span id="cb73-2"><a href="#cb73-2" aria-hidden="true" tabindex="-1"></a><span class="fu">predict</span>(mylm, <span class="fu">data.frame</span>(<span class="at">hp =</span> <span class="dv">120</span>, <span class="at">am =</span> <span class="dv">1</span>), <span class="at">interval =</span> <span class="st">&quot;confidence&quot;</span>)</span></code></pre></div>
<pre><code>##        fit      lwr      upr
## 1 24.79441 23.10635 26.48247</code></pre>
<p>The “fit” is the predicted value. The “lwr” is the lower bound. The
“upr” is the upper bound.</p>
<p>In this case, cars with 120 horsepower and a manual transmission
(<code>am = 1</code>) are predicted to have an average gas mileage of
24.79441 mpg, where the average could be anywhere from 23.10635 mpg to
26.48247 mpg.</p>
</div>
<hr />
</div>
</div>
<div id="explanation-1" class="section level3">
<h3>Explanation</h3>
<div style="padding-left:125px;">
<div id="assessing-the-model-fit-expand" class="section level4">
<h4>Assessing the Model Fit
<a href="javascript:showhide('assessingFit2')" style="font-size:.6em;color:skyblue;">(Expand)</a></h4>
<p><span class="expand-caption"><span
class="math inline">\(R^2\)</span>, adjusted <span
class="math inline">\(R^2\)</span>, AIC, BIC…</span></p>
<div id="assessingFit2" style="display:none;">
<p>There are many measures of the quality of a regression model. One of
the most popular measurements is the <span
class="math inline">\(R^2\)</span> value (“R-squared”). The <span
class="math inline">\(R^2\)</span> value is a measure of the proportion
of variation of the <span class="math inline">\(Y\)</span>-variable that
is explained by the model. Specifically, <span class="math display">\[
  R^2 = \frac{\text{SSR}}{\text{SSTO}} =
1-\frac{\text{SSE}}{\text{SSTO}}
\]</span> The range of <span class="math inline">\(R^2\)</span> is
between 0 and 1. Values close to 1 imply a very good model. Values close
to 0 imply a very poor model.</p>
<p>One difficulty of <span class="math inline">\(R^2\)</span> in
multiple regression is that it will always get larger when more
variables are included in the regression model. Thus, in multiple linear
regression, it is best to make an adjustment to the <span
class="math inline">\(R^2\)</span> value to protect against this
difficulty. The value of the adjusted <span
class="math inline">\(R^2\)</span> is given by <span
class="math display">\[
  R^2_{adj} = 1 - \frac{(n-1)}{(n-p)}\frac{\text{SSE}}{\text{SSTO}}
\]</span> The interpretation of <span
class="math inline">\(R^2_{adj}\)</span> is essentially the same as the
interpretation of <span class="math inline">\(R^2\)</span>, with the
understanding that a correction has been made for the number of
parameters included in the model, <span
class="math inline">\((n-p)\)</span>.</p>
<p>Consider the models below. The value of <span
class="math inline">\(R^2\)</span> always gets higher as the model adds
more parameters. However, the value of <span
class="math inline">\(R^2_{adj}\)</span> sometimes goes down,
emphasizing the idea that the model is becoming more complex than needed
to capture the pattern in Y.</p>
<div class="sourceCode" id="cb75"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb75-1"><a href="#cb75-1" aria-hidden="true" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mfrow=</span><span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">5</span>), <span class="at">mai=</span><span class="fu">c</span>(<span class="dv">0</span>,.<span class="dv">1</span>,.<span class="dv">4</span>,.<span class="dv">1</span>))</span>
<span id="cb75-2"><a href="#cb75-2" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(dist <span class="sc">~</span> speed, <span class="at">data=</span>cars, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">cex=</span><span class="dv">2</span>, <span class="at">xlim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">27</span>), <span class="at">main=</span><span class="st">&quot;Simple Linear&quot;</span>)</span>
<span id="cb75-3"><a href="#cb75-3" aria-hidden="true" tabindex="-1"></a>lm1 <span class="ot">&lt;-</span> <span class="fu">lm</span>(dist <span class="sc">~</span> speed, <span class="at">data=</span>cars)</span>
<span id="cb75-4"><a href="#cb75-4" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(lm1)</span>
<span id="cb75-5"><a href="#cb75-5" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;orange&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb75-6"><a href="#cb75-6" aria-hidden="true" tabindex="-1"></a><span class="fu">text</span>(<span class="dv">1</span>,<span class="dv">110</span>,<span class="fu">bquote</span>(R<span class="sc">^</span><span class="dv">2</span> <span class="sc">==</span>  .(<span class="fu">round</span>(<span class="fu">summary</span>(lm1)<span class="sc">$</span>r.squared,<span class="dv">3</span>))),<span class="at">pos=</span><span class="dv">4</span>)</span>
<span id="cb75-7"><a href="#cb75-7" aria-hidden="true" tabindex="-1"></a><span class="fu">text</span>(<span class="dv">1</span>,<span class="dv">100</span>,<span class="fu">bquote</span>(R[adj]<span class="sc">^</span><span class="dv">2</span> <span class="sc">==</span> .(<span class="fu">round</span>(<span class="fu">summary</span>(lm1)<span class="sc">$</span>adj.r.squared,<span class="dv">3</span>))),<span class="at">pos=</span><span class="dv">4</span>)</span>
<span id="cb75-8"><a href="#cb75-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb75-9"><a href="#cb75-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb75-10"><a href="#cb75-10" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(dist <span class="sc">~</span> speed, <span class="at">data=</span>cars, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">cex=</span><span class="dv">2</span>, <span class="at">xlim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">27</span>), <span class="at">main=</span><span class="st">&quot;Quadratic&quot;</span>)</span>
<span id="cb75-11"><a href="#cb75-11" aria-hidden="true" tabindex="-1"></a>lm1 <span class="ot">&lt;-</span> <span class="fu">lm</span>(dist <span class="sc">~</span> speed <span class="sc">+</span> <span class="fu">I</span>(speed<span class="sc">^</span><span class="dv">2</span>), <span class="at">data=</span>cars)</span>
<span id="cb75-12"><a href="#cb75-12" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(lm1)</span>
<span id="cb75-13"><a href="#cb75-13" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x <span class="sc">+</span> b[<span class="dv">3</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span>, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;orange&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb75-14"><a href="#cb75-14" aria-hidden="true" tabindex="-1"></a><span class="fu">text</span>(<span class="dv">1</span>,<span class="dv">110</span>,<span class="fu">bquote</span>(R<span class="sc">^</span><span class="dv">2</span> <span class="sc">==</span>  .(<span class="fu">round</span>(<span class="fu">summary</span>(lm1)<span class="sc">$</span>r.squared,<span class="dv">3</span>))),<span class="at">pos=</span><span class="dv">4</span>)</span>
<span id="cb75-15"><a href="#cb75-15" aria-hidden="true" tabindex="-1"></a><span class="fu">text</span>(<span class="dv">1</span>,<span class="dv">100</span>,<span class="fu">bquote</span>(R[adj]<span class="sc">^</span><span class="dv">2</span> <span class="sc">==</span> .(<span class="fu">round</span>(<span class="fu">summary</span>(lm1)<span class="sc">$</span>adj.r.squared,<span class="dv">3</span>))),<span class="at">pos=</span><span class="dv">4</span>)</span>
<span id="cb75-16"><a href="#cb75-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb75-17"><a href="#cb75-17" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb75-18"><a href="#cb75-18" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(dist <span class="sc">~</span> speed, <span class="at">data=</span>cars, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">cex=</span><span class="dv">2</span>, <span class="at">xlim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">27</span>), <span class="at">main=</span><span class="st">&quot;Cubic&quot;</span>)</span>
<span id="cb75-19"><a href="#cb75-19" aria-hidden="true" tabindex="-1"></a>lm1 <span class="ot">&lt;-</span> <span class="fu">lm</span>(dist <span class="sc">~</span> speed <span class="sc">+</span> <span class="fu">I</span>(speed<span class="sc">^</span><span class="dv">2</span>) <span class="sc">+</span> <span class="fu">I</span>(speed<span class="sc">^</span><span class="dv">3</span>), <span class="at">data=</span>cars)</span>
<span id="cb75-20"><a href="#cb75-20" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(lm1)</span>
<span id="cb75-21"><a href="#cb75-21" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x <span class="sc">+</span> b[<span class="dv">3</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span> <span class="sc">+</span> b[<span class="dv">4</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">3</span>, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;orange&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb75-22"><a href="#cb75-22" aria-hidden="true" tabindex="-1"></a><span class="fu">text</span>(<span class="dv">1</span>,<span class="dv">110</span>,<span class="fu">bquote</span>(R<span class="sc">^</span><span class="dv">2</span> <span class="sc">==</span>  .(<span class="fu">round</span>(<span class="fu">summary</span>(lm1)<span class="sc">$</span>r.squared,<span class="dv">3</span>))),<span class="at">pos=</span><span class="dv">4</span>)</span>
<span id="cb75-23"><a href="#cb75-23" aria-hidden="true" tabindex="-1"></a><span class="fu">text</span>(<span class="dv">1</span>,<span class="dv">100</span>,<span class="fu">bquote</span>(R[adj]<span class="sc">^</span><span class="dv">2</span> <span class="sc">==</span> .(<span class="fu">round</span>(<span class="fu">summary</span>(lm1)<span class="sc">$</span>adj.r.squared,<span class="dv">3</span>))),<span class="at">pos=</span><span class="dv">4</span>)</span>
<span id="cb75-24"><a href="#cb75-24" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb75-25"><a href="#cb75-25" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb75-26"><a href="#cb75-26" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(dist <span class="sc">~</span> speed, <span class="at">data=</span>cars, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">cex=</span><span class="dv">2</span>, <span class="at">xlim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">27</span>), <span class="at">main=</span><span class="st">&quot;Quartic&quot;</span>)</span>
<span id="cb75-27"><a href="#cb75-27" aria-hidden="true" tabindex="-1"></a>lm1 <span class="ot">&lt;-</span> <span class="fu">lm</span>(dist <span class="sc">~</span> speed <span class="sc">+</span> <span class="fu">I</span>(speed<span class="sc">^</span><span class="dv">2</span>) <span class="sc">+</span> <span class="fu">I</span>(speed<span class="sc">^</span><span class="dv">3</span>) <span class="sc">+</span> <span class="fu">I</span>(speed<span class="sc">^</span><span class="dv">4</span>), <span class="at">data=</span>cars)</span>
<span id="cb75-28"><a href="#cb75-28" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(lm1)</span>
<span id="cb75-29"><a href="#cb75-29" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x <span class="sc">+</span> b[<span class="dv">3</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span> <span class="sc">+</span> b[<span class="dv">4</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">3</span> <span class="sc">+</span> b[<span class="dv">5</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">4</span>, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;orange&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb75-30"><a href="#cb75-30" aria-hidden="true" tabindex="-1"></a><span class="fu">text</span>(<span class="dv">1</span>,<span class="dv">110</span>,<span class="fu">bquote</span>(R<span class="sc">^</span><span class="dv">2</span> <span class="sc">==</span>  .(<span class="fu">round</span>(<span class="fu">summary</span>(lm1)<span class="sc">$</span>r.squared,<span class="dv">3</span>))),<span class="at">pos=</span><span class="dv">4</span>)</span>
<span id="cb75-31"><a href="#cb75-31" aria-hidden="true" tabindex="-1"></a><span class="fu">text</span>(<span class="dv">1</span>,<span class="dv">100</span>,<span class="fu">bquote</span>(R[adj]<span class="sc">^</span><span class="dv">2</span> <span class="sc">==</span> .(<span class="fu">round</span>(<span class="fu">summary</span>(lm1)<span class="sc">$</span>adj.r.squared,<span class="dv">3</span>))),<span class="at">pos=</span><span class="dv">4</span>)</span>
<span id="cb75-32"><a href="#cb75-32" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb75-33"><a href="#cb75-33" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb75-34"><a href="#cb75-34" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(dist <span class="sc">~</span> speed, <span class="at">data=</span>cars, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">cex=</span><span class="dv">2</span>, <span class="at">xlim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">27</span>), <span class="at">main=</span><span class="st">&quot;Quintic&quot;</span>)</span>
<span id="cb75-35"><a href="#cb75-35" aria-hidden="true" tabindex="-1"></a>lm1 <span class="ot">&lt;-</span> <span class="fu">lm</span>(dist <span class="sc">~</span> speed <span class="sc">+</span> <span class="fu">I</span>(speed<span class="sc">^</span><span class="dv">2</span>) <span class="sc">+</span> <span class="fu">I</span>(speed<span class="sc">^</span><span class="dv">3</span>) <span class="sc">+</span> <span class="fu">I</span>(speed<span class="sc">^</span><span class="dv">4</span>) <span class="sc">+</span> <span class="fu">I</span>(speed<span class="sc">^</span><span class="dv">5</span>), <span class="at">data=</span>cars)</span>
<span id="cb75-36"><a href="#cb75-36" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(lm1)</span>
<span id="cb75-37"><a href="#cb75-37" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x <span class="sc">+</span> b[<span class="dv">3</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span> <span class="sc">+</span> b[<span class="dv">4</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">3</span> <span class="sc">+</span> b[<span class="dv">5</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">4</span> <span class="sc">+</span> b[<span class="dv">6</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">5</span>, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="st">&quot;orange&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb75-38"><a href="#cb75-38" aria-hidden="true" tabindex="-1"></a><span class="fu">text</span>(<span class="dv">1</span>,<span class="dv">110</span>,<span class="fu">bquote</span>(R<span class="sc">^</span><span class="dv">2</span> <span class="sc">==</span>  .(<span class="fu">round</span>(<span class="fu">summary</span>(lm1)<span class="sc">$</span>r.squared,<span class="dv">3</span>))),<span class="at">pos=</span><span class="dv">4</span>)</span>
<span id="cb75-39"><a href="#cb75-39" aria-hidden="true" tabindex="-1"></a><span class="fu">text</span>(<span class="dv">1</span>,<span class="dv">100</span>,<span class="fu">bquote</span>(R[adj]<span class="sc">^</span><span class="dv">2</span> <span class="sc">==</span> .(<span class="fu">round</span>(<span class="fu">summary</span>(lm1)<span class="sc">$</span>adj.r.squared,<span class="dv">3</span>))),<span class="at">pos=</span><span class="dv">4</span>)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-78-1.png" width="672" /></p>
<p><br/></p>
<p>The “simplest” but “best” model of those shown above would be the
Quadratic. This is because it has the best <span
class="math inline">\(R^2_{adj}\)</span> (0.653) other than the far more
complicated Quartic model (0.655). But the <span
class="math inline">\(R^2_{adj}\)</span> for the Quadratic model is a
good improvement over the <span
class="math inline">\(R^2_{adj}\)</span> for the Simple Linear model,
with a value of 0.653 compared to 0.644, respectively. So moving to the
complexity of the Quadratic model is justified over the Simple Linear
Model. But there is not enough of an improvement in the <span
class="math inline">\(R^2_{adj}\)</span> to warrant moving to the
complexity of the Quartic Model. Further, the pattern in the Quadratic
seems to generalize better to data outside the range of the current data
than does the Quartic model.</p>
<p><span class="math display">\[
  \textit{Quadratic Model:}\quad Y_i = \beta_0 + \beta_1 X_i +
\beta_2 X_i^2 + \epsilon_i
\]</span></p>
<p><span class="math display">\[
  \textit{Quartic Model:}\quad Y_i = \beta_0 + \beta_1 X_i +
\beta_2 X_i^2 + \underbrace{\beta_3 X_i^3 + \beta_4 X_i^4}_\text{Cubic
and Quartic Terms} + \epsilon_i
\]</span></p>
<p><strong>AIC and BIC</strong></p>
<p>Two other measurements, or information criteria, are popular for use
in the model selection process. These are the Akaike Information
Criterion (AIC) and the Bayesian Information Criterion (BIC). These are
easily computed in R using <code>AIC(yourlm)</code> and
<code>BIC(yourlm)</code>.</p>
<p>The formula for each is given in different, but equivalent, ways
depending on which source you obtain the equation from. Perhaps the easiest
formulation to understand is that given by Kutner, Nachtsheim, and Neter
in their book <em>Applied Linear Regression Models</em> (4th edition,
page 360):</p>
<p><span class="math display">\[
\text{AIC:} \quad n \ln(SSE) - n \ln(n) + 2p
\]</span> where SSE is the usual <span
class="math inline">\(\sum_{i=1}^n (Y_i - \hat{Y}_i)^2\)</span> of the
current regression model under consideration, <span
class="math inline">\(n\)</span> is the sample size, and <span
class="math inline">\(p\)</span> is the number of parameters in the
current regression model.</p>
<p><span class="math display">\[
\text{BIC:} \quad n \ln(SSE) - n \ln(n) + p\ln(n)
\]</span></p>
<p>This shows how the BIC differs from the AIC only in the final term,
where AIC uses <span class="math inline">\(2p\)</span> and BIC uses
<span class="math inline">\(p\ln(n)\)</span>. Since <span
class="math inline">\(\ln(n) \geq 2\)</span> for <span
class="math inline">\(n\geq8\)</span>, the BIC enforces a larger
penalty than the AIC for extra model parameters (<span
class="math inline">\(p\)</span>) when the sample size is 8 or larger,
i.e., for most data sets.</p>
<p>The AIC was formulated by <a
href="https://www.ism.ac.jp/editsec/aism/pdf/023_2_0163.pdf">Hirotugu
Akaike in 1971</a>. (Here is a <a
href="http://www.garfield.library.upenn.edu/classics1981/A1981MS54100001.pdf">short
commentary</a> by Akaike about how he developed this information
criterion. Note that he named it “an information criterion (AIC)” when
he published the method and other people later began calling it the
“Akaike Information Criterion.”)</p>
</div>
<p><br/></p>
</div>
<div id="model-selection-expand" class="section level4">
<h4>Model Selection
<a href="javascript:showhide('modelselection')" style="font-size:.6em;color:skyblue;">(Expand)</a></h4>
<p><span class="expand-caption">pairs plots, added variable plots, and
pattern recognition…</span></p>
<div id="modelselection" style="display:none;">
<p>Model selection is an exploratory analysis tool that is useful for
proposing possible regression models for a given response variable <span
class="math inline">\(Y\)</span>. It should always be followed up by
confirmatory analysis that tests the theories proposed by the selected
model. However, when confirmatory studies are not possible, model
validation is a meaningful tool that can be used to attempt to confirm
the utility of a model.</p>
<div id="pairs-plots" class="section level5 tabset tabset-pills">
<h5 class="tabset tabset-pills">Pairs Plots</h5>
<p>A useful visualization tool for model selection is the “pairs plot.”
This plot shows all possible 2D scatterplots that can be created from a
given dataset.</p>
<p>Here is a pairs plot of the <code>mtcars</code> data set in R.</p>
<div id="basic-view" class="section level6">
<h6>Basic View</h6>
<div class="sourceCode" id="cb76"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb76-1"><a href="#cb76-1" aria-hidden="true" tabindex="-1"></a><span class="fu">pairs</span>(mtcars, <span class="at">panel=</span>panel.smooth)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-79-1.png" alt="Pairs plot of the mtcars data set with a smoothed curve in each panel" width="672" /></p>
</div>
<div id="more-detailed-view" class="section level6">
<h6>More Detailed View</h6>
<div class="sourceCode" id="cb77"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb77-1"><a href="#cb77-1" aria-hidden="true" tabindex="-1"></a><span class="fu">pairs</span>(mtcars, <span class="at">panel=</span>panel.smooth)</span></code></pre></div>
<p><img src="Images/pairsPlotGuidance.png" /></p>
</div>
</div>
<div id="section-1" class="section level5">
<h5></h5>
<p>Notice that…</p>
<ul>
<li>the y-axis of each plot is found by locating the variable name (like
“mpg”) that is found to the left or right of the current plot.</li>
<li>the x-axis of each plot is found by locating the variable name (like
“disp”) that is found above or below each plot.</li>
<li>the LOWESS curves have been added to each plot to visualize the type
of regression model that would best fit each plot.</li>
</ul>
</div>
<div id="selecting-a-model" class="section level5 tabset tabset-pills">
<h5 class="tabset tabset-pills">Selecting a Model</h5>
<p>Suppose now that we are trying to come up with a good regression
model for predicting the gas mileage of a car, <span
class="math inline">\(Y=\)</span><code>mpg</code>.</p>
<p><span class="math display">\[
  \underbrace{Y_i}_\text{mpg} = \underbrace{?}_\text{Our model} +
\  \epsilon_i \quad \text{where} \ \epsilon_i \sim N(0, \sigma^2)
\]</span> To find meaningful x-variables that could predict our chosen
y-variable of <code>mpg</code>, we look at all plots that have
<code>mpg</code> as the y-axis of the plot. This happens to be the first
row of the pairs plot.</p>
<p>When looking at the graph, we are looking for variables that show a
strong change in the average y-value (i.e., the LOWESS curve should show
a steep slope or a meaningful trend). While all variables in the
<code>mtcars</code> data set seem to have some relationship with
<code>mpg</code>, the strongest relationships appear to be with
<code>cyl</code>, <code>disp</code>, <code>hp</code>, <code>wt</code>,
<code>vs</code>, <code>am</code>, and <code>gear</code>.</p>
<div id="basic-view-1" class="section level6">
<h6>Basic View</h6>
<div class="sourceCode" id="cb78"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb78-1"><a href="#cb78-1" aria-hidden="true" tabindex="-1"></a><span class="fu">pairs</span>(mtcars, <span class="at">panel=</span>panel.smooth)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-81-1.png" alt="Pairs plot of the mtcars data set with a smoothed curve in each panel" width="672" /></p>
</div>
<div id="more-detailed-view-1" class="section level6">
<h6>More Detailed View</h6>
<div class="sourceCode" id="cb79"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb79-1"><a href="#cb79-1" aria-hidden="true" tabindex="-1"></a><span class="fu">pairs</span>(mtcars, <span class="at">panel=</span>panel.smooth)</span></code></pre></div>
<p><img src="Images/pairsPossibleX.png" /></p>
</div>
</div>
<div id="section-2" class="section level5">
<h5></h5>
<p>Also worth noting is that the relationships of <code>mpg</code> with
each of <code>disp</code>, <code>hp</code>, and <code>wt</code> are all
similar; they each look to be an exponential decay type of model. This
tells us that we had better check to see if <code>disp</code>,
<code>hp</code>, and <code>wt</code> are related to each other. If they
are, then we should only use one of them in the regression model as the
other two likely wouldn’t give any new information about
<code>mpg</code>.</p>
<p>Sure enough, the pairs plot shows that there is a fairly strong
relationship between <code>disp</code> and <code>hp</code>,
<code>hp</code> and <code>wt</code>, and <code>disp</code> and
<code>wt</code>.</p>
<p><img src="Images/pairsPossibleRelatedX.png" /></p>
<p>Now, with all of this in mind, we could start looking at a few
possible regression models. Let’s start with perhaps the simplest and
strongest trend we saw with <code>mpg</code> and any of the x-variables,
<code>wt</code>.</p>
<div class="sourceCode" id="cb80"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb80-1"><a href="#cb80-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(mpg <span class="sc">~</span> wt, <span class="at">data=</span>mtcars)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-83-1.png" alt="Scatterplot of mpg versus wt for the mtcars data set" width="672" /></p>
<div class="sourceCode" id="cb81"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb81-1"><a href="#cb81-1" aria-hidden="true" tabindex="-1"></a>lm.wt <span class="ot">&lt;-</span> <span class="fu">lm</span>(mpg <span class="sc">~</span> wt, <span class="at">data=</span>mtcars)</span>
<span id="cb81-2"><a href="#cb81-2" aria-hidden="true" tabindex="-1"></a><span class="fu">summary</span>(lm.wt) <span class="sc">%&gt;%</span> <span class="fu">pander</span>()</span></code></pre></div>
<table style="width:89%;">
<colgroup>
<col width="25%" />
<col width="15%" />
<col width="18%" />
<col width="13%" />
<col width="16%" />
</colgroup>
<thead>
<tr class="header">
<th align="center"> </th>
<th align="center">Estimate</th>
<th align="center">Std. Error</th>
<th align="center">t value</th>
<th align="center">Pr(&gt;|t|)</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center"><strong>(Intercept)</strong></td>
<td align="center">37.29</td>
<td align="center">1.878</td>
<td align="center">19.86</td>
<td align="center">8.242e-19</td>
</tr>
<tr class="even">
<td align="center"><strong>wt</strong></td>
<td align="center">-5.344</td>
<td align="center">0.5591</td>
<td align="center">-9.559</td>
<td align="center">1.294e-10</td>
</tr>
</tbody>
</table>
<table style="width:88%;">
<caption>Fitting linear model: mpg ~ wt</caption>
<colgroup>
<col width="20%" />
<col width="30%" />
<col width="12%" />
<col width="23%" />
</colgroup>
<thead>
<tr class="header">
<th align="center">Observations</th>
<th align="center">Residual Std. Error</th>
<th align="center"><span class="math inline">\(R^2\)</span></th>
<th align="center">Adjusted <span
class="math inline">\(R^2\)</span></th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center">32</td>
<td align="center">3.046</td>
<td align="center">0.7528</td>
<td align="center">0.7446</td>
</tr>
</tbody>
</table>
</div>
</div>
<p><br/></p>
</div>
<div id="model-validation-expand" class="section level4">
<h4>Model Validation
<a href="javascript:showhide('validation')" style="font-size:.6em;color:skyblue;">(Expand)</a></h4>
<p><span class="expand-caption">Verifying a model’s ability to
generalize to new data…</span></p>
<div id="validation" style="display:none;">
<p>The following graph shows three things: (1) a true regression model,
(2) a simple linear regression model that doesn’t quite capture the full
pattern in the data, and (3) a complicated model that seems to overly
fit the data as it fits better than even the true model.</p>
<div class="sourceCode" id="cb82"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb82-1"><a href="#cb82-1" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">123</span>) <span class="co">#gives us the same randomness </span></span>
<span id="cb82-2"><a href="#cb82-2" aria-hidden="true" tabindex="-1"></a>n <span class="ot">&lt;-</span> <span class="dv">20</span> <span class="co">#sample size</span></span>
<span id="cb82-3"><a href="#cb82-3" aria-hidden="true" tabindex="-1"></a>x <span class="ot">&lt;-</span> <span class="fu">runif</span>(n, <span class="sc">-</span><span class="fl">1.5</span>, <span class="fl">3.8</span>) <span class="co">#uniform X from -1.5 to 3.8</span></span>
<span id="cb82-4"><a href="#cb82-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Coefficients for the true model:</span></span>
<span id="cb82-5"><a href="#cb82-5" aria-hidden="true" tabindex="-1"></a>beta0 <span class="ot">&lt;-</span> <span class="dv">2</span></span>
<span id="cb82-6"><a href="#cb82-6" aria-hidden="true" tabindex="-1"></a>beta1 <span class="ot">&lt;-</span> <span class="sc">-</span><span class="fl">2.5</span></span>
<span id="cb82-7"><a href="#cb82-7" aria-hidden="true" tabindex="-1"></a>beta2 <span class="ot">&lt;-</span> <span class="dv">1</span></span>
<span id="cb82-8"><a href="#cb82-8" aria-hidden="true" tabindex="-1"></a>beta3 <span class="ot">&lt;-</span> <span class="dv">3</span></span>
<span id="cb82-9"><a href="#cb82-9" aria-hidden="true" tabindex="-1"></a>beta4 <span class="ot">&lt;-</span> <span class="sc">-</span><span class="fl">0.8</span></span>
<span id="cb82-10"><a href="#cb82-10" aria-hidden="true" tabindex="-1"></a><span class="co"># Get y-value using a true model</span></span>
<span id="cb82-11"><a href="#cb82-11" aria-hidden="true" tabindex="-1"></a>y <span class="ot">&lt;-</span> beta0 <span class="sc">+</span> beta1<span class="sc">*</span>x <span class="sc">+</span> beta2<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span> <span class="sc">+</span> beta3<span class="sc">*</span>x<span class="sc">^</span><span class="dv">3</span> <span class="sc">+</span> beta4<span class="sc">*</span>x<span class="sc">^</span><span class="dv">4</span> <span class="sc">+</span> <span class="fu">rnorm</span>(n, <span class="dv">0</span>, <span class="fl">0.5</span>) <span class="co">#normal errors</span></span>
<span id="cb82-12"><a href="#cb82-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb82-13"><a href="#cb82-13" aria-hidden="true" tabindex="-1"></a>thedata <span class="ot">&lt;-</span> <span class="fu">data.frame</span>(y, x)</span>
<span id="cb82-14"><a href="#cb82-14" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb82-15"><a href="#cb82-15" aria-hidden="true" tabindex="-1"></a><span class="co"># Plot it</span></span>
<span id="cb82-16"><a href="#cb82-16" aria-hidden="true" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mai=</span><span class="fu">c</span>(.<span class="dv">1</span>,.<span class="dv">5</span>,.<span class="dv">2</span>,.<span class="dv">1</span>))</span>
<span id="cb82-17"><a href="#cb82-17" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(y <span class="sc">~</span> x, <span class="at">data=</span>thedata, <span class="at">pch=</span><span class="dv">21</span>, <span class="at">col=</span><span class="st">&quot;lightgray&quot;</span>, <span class="at">bg=</span><span class="st">&quot;steelblue&quot;</span>, <span class="at">cex=</span><span class="fl">1.3</span>, <span class="at">ylim=</span><span class="fu">c</span>(<span class="sc">-</span><span class="dv">5</span>,<span class="dv">22</span>), <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">ylab=</span><span class="st">&quot;&quot;</span>, <span class="at">xlab=</span><span class="st">&quot;&quot;</span>)</span>
<span id="cb82-18"><a href="#cb82-18" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="at">side=</span><span class="dv">3</span>, <span class="at">text=</span><span class="st">&quot;Original Data (Training Data)&quot;</span>, <span class="at">cex=</span><span class="fl">0.7</span>, <span class="at">at=</span><span class="sc">-</span>.<span class="dv">8</span>, <span class="at">line=</span>.<span class="dv">1</span>)</span>
<span id="cb82-19"><a href="#cb82-19" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb82-20"><a href="#cb82-20" aria-hidden="true" tabindex="-1"></a><span class="co"># Draw true model</span></span>
<span id="cb82-21"><a href="#cb82-21" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(beta0 <span class="sc">+</span> beta1<span class="sc">*</span>x <span class="sc">+</span> beta2<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span> <span class="sc">+</span> beta3<span class="sc">*</span>x<span class="sc">^</span><span class="dv">3</span> <span class="sc">+</span> beta4<span class="sc">*</span>x<span class="sc">^</span><span class="dv">4</span>, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="fu">rgb</span>(<span class="fl">0.2745098</span>, <span class="fl">0.5098039</span>, <span class="fl">0.7058824</span>, .<span class="dv">5</span>), <span class="at">lwd=</span><span class="dv">4</span>)</span>
<span id="cb82-22"><a href="#cb82-22" aria-hidden="true" tabindex="-1"></a>lmt <span class="ot">&lt;-</span> <span class="fu">lm</span>(y <span class="sc">~</span> x <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">2</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">3</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">4</span>), <span class="at">data=</span>thedata) <span class="co">#for later</span></span>
<span id="cb82-23"><a href="#cb82-23" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb82-24"><a href="#cb82-24" aria-hidden="true" tabindex="-1"></a><span class="co"># Draw simple linear model</span></span>
<span id="cb82-25"><a href="#cb82-25" aria-hidden="true" tabindex="-1"></a>lms <span class="ot">&lt;-</span> <span class="fu">lm</span>(y <span class="sc">~</span> x, <span class="at">data=</span>thedata)</span>
<span id="cb82-26"><a href="#cb82-26" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(lms)</span>
<span id="cb82-27"><a href="#cb82-27" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="fu">rgb</span>(<span class="dv">1</span>,<span class="fl">0.6470588</span>,<span class="dv">0</span>, .<span class="dv">3</span>), <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb82-28"><a href="#cb82-28" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb82-29"><a href="#cb82-29" aria-hidden="true" tabindex="-1"></a><span class="co"># Draw overly complicated model</span></span>
<span id="cb82-30"><a href="#cb82-30" aria-hidden="true" tabindex="-1"></a>lmo <span class="ot">&lt;-</span> <span class="fu">lm</span>(y <span class="sc">~</span> x <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">2</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">3</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">4</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">5</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">6</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">7</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">8</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">9</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">10</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">11</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">12</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">13</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">14</span>), <span class="at">data=</span>thedata)</span>
<span id="cb82-31"><a href="#cb82-31" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(lmo)</span>
<span id="cb82-32"><a href="#cb82-32" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x <span class="sc">+</span> b[<span class="dv">3</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span> <span class="sc">+</span> b[<span class="dv">4</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">3</span> <span class="sc">+</span> b[<span class="dv">5</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">4</span> <span class="sc">+</span> b[<span class="dv">6</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">5</span> <span class="sc">+</span> b[<span class="dv">7</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">6</span> <span class="sc">+</span> b[<span class="dv">8</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">7</span> <span class="sc">+</span> b[<span class="dv">9</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">8</span> <span class="sc">+</span> b[<span class="dv">10</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">9</span> <span class="sc">+</span> b[<span class="dv">11</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">10</span> <span class="sc">+</span> b[<span class="dv">12</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">11</span> <span class="sc">+</span> b[<span class="dv">13</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">12</span> <span class="sc">+</span> b[<span class="dv">14</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">13</span> <span class="sc">+</span> b[<span class="dv">15</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">14</span>, <span class="at">add=</span><span class="cn">TRUE</span>, <span 
class="at">col=</span><span class="fu">rgb</span>(<span class="fl">0.6980392</span>, <span class="fl">0.133333</span>, <span class="fl">0.133333</span>, .<span class="dv">2</span>), <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb82-33"><a href="#cb82-33" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb82-34"><a href="#cb82-34" aria-hidden="true" tabindex="-1"></a><span class="co"># Add legend</span></span>
<span id="cb82-35"><a href="#cb82-35" aria-hidden="true" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">legend=</span><span class="fu">c</span>(<span class="st">&quot;True Model&quot;</span>, <span class="st">&quot;Simple Model&quot;</span>, <span class="st">&quot;Complicated Model&quot;</span>), <span class="at">lwd=</span><span class="fu">c</span>(<span class="dv">4</span>,<span class="dv">2</span>,<span class="dv">2</span>), <span class="at">col=</span><span class="fu">c</span>(<span class="fu">rgb</span>(<span class="fl">0.2745098</span>, <span class="fl">0.5098039</span>, <span class="fl">0.7058824</span>, .<span class="dv">5</span>), <span class="fu">rgb</span>(<span class="dv">1</span>,<span class="fl">0.6470588</span>,<span class="dv">0</span>, .<span class="dv">3</span>), <span class="fu">rgb</span>(<span class="fl">0.6980392</span>, <span class="fl">0.133333</span>, <span class="fl">0.133333</span>, .<span class="dv">2</span>)), <span class="at">bty=</span><span class="st">&#39;n&#39;</span>)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/problem-1.png" width="480" alt="Plot comparing the true model, simple model, and complicated model curves" /></p>
<div class="sourceCode" id="cb83"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb83-1"><a href="#cb83-1" aria-hidden="true" tabindex="-1"></a>my_output_table <span class="ot">&lt;-</span> <span class="fu">data.frame</span>(<span class="at">Model =</span> <span class="fu">c</span>(<span class="st">&quot;True&quot;</span>, <span class="st">&quot;Simple&quot;</span>, <span class="st">&quot;Complicated&quot;</span>), <span class="st">`</span><span class="at">R^2</span><span class="st">`</span> <span class="ot">=</span> <span class="fu">c</span>(<span class="fu">summary</span>(lmt)<span class="sc">$</span>r.squared, <span class="fu">summary</span>(lms)<span class="sc">$</span>r.squared, <span class="fu">summary</span>(lmo)<span class="sc">$</span>r.squared), <span class="st">`</span><span class="at">Adjusted R^2</span><span class="st">`</span> <span class="ot">=</span> <span class="fu">c</span>(<span class="fu">summary</span>(lmt)<span class="sc">$</span>adj.r.squared, <span class="fu">summary</span>(lms)<span class="sc">$</span>adj.r.squared, <span class="fu">summary</span>(lmo)<span class="sc">$</span>adj.r.squared))</span>
<span id="cb83-2"><a href="#cb83-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb83-3"><a href="#cb83-3" aria-hidden="true" tabindex="-1"></a><span class="fu">colnames</span>(my_output_table) <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">&quot;Model&quot;</span>, <span class="st">&quot;$R^2$&quot;</span>, <span class="st">&quot;Adjusted $R^2$&quot;</span>)</span>
<span id="cb83-4"><a href="#cb83-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb83-5"><a href="#cb83-5" aria-hidden="true" tabindex="-1"></a>knitr<span class="sc">::</span><span class="fu">kable</span>(my_output_table)</span></code></pre></div>
<table>
<thead>
<tr class="header">
<th align="left">Model</th>
<th align="right"><span class="math inline">\(R^2\)</span></th>
<th align="right">Adjusted <span class="math inline">\(R^2\)</span></th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="left">True</td>
<td align="right">0.9958725</td>
<td align="right">0.9947718</td>
</tr>
<tr class="even">
<td align="left">Simple</td>
<td align="right">0.8114836</td>
<td align="right">0.8010105</td>
</tr>
<tr class="odd">
<td align="left">Complicated</td>
<td align="right">0.9984527</td>
<td align="right">0.9941204</td>
</tr>
</tbody>
</table>
<p>Now, let’s remind ourselves why we use regression models in the first
place. The main goal is to capture the “essence” of the data. In other
words, the general pattern is what we are after. We want a model that
tells us how “all such” data is created, not just the specific data we
have sampled. So, the great test of a model is to see how well it works
on a new sample of data.</p>
<p>This is precisely <strong>model validation</strong>, the verification
that a model fit on one sample of data continues to perform well on a
new sample of data.</p>
<div class="sourceCode" id="cb84"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb84-1"><a href="#cb84-1" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">14551</span>) <span class="co">#get same random sample</span></span>
<span id="cb84-2"><a href="#cb84-2" aria-hidden="true" tabindex="-1"></a><span class="co"># Get a new sample of data from the true model</span></span>
<span id="cb84-3"><a href="#cb84-3" aria-hidden="true" tabindex="-1"></a>Xnew <span class="ot">&lt;-</span> <span class="fu">runif</span>(n, <span class="sc">-</span><span class="fl">1.4</span>, <span class="fl">3.7</span>) <span class="co">#uniform X from -1.4 to 3.7</span></span>
<span id="cb84-4"><a href="#cb84-4" aria-hidden="true" tabindex="-1"></a>Ynew <span class="ot">&lt;-</span> beta0 <span class="sc">+</span> beta1<span class="sc">*</span>Xnew <span class="sc">+</span> beta2<span class="sc">*</span>Xnew<span class="sc">^</span><span class="dv">2</span> <span class="sc">+</span> beta3<span class="sc">*</span>Xnew<span class="sc">^</span><span class="dv">3</span> <span class="sc">+</span> beta4<span class="sc">*</span>Xnew<span class="sc">^</span><span class="dv">4</span> <span class="sc">+</span> <span class="fu">rnorm</span>(n, <span class="dv">0</span>, <span class="fl">0.5</span>) <span class="co">#normal errors</span></span>
<span id="cb84-5"><a href="#cb84-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb84-6"><a href="#cb84-6" aria-hidden="true" tabindex="-1"></a>thedata2 <span class="ot">&lt;-</span> <span class="fu">data.frame</span>(<span class="at">y=</span>Ynew, <span class="at">x=</span>Xnew)</span>
<span id="cb84-7"><a href="#cb84-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb84-8"><a href="#cb84-8" aria-hidden="true" tabindex="-1"></a><span class="co"># Plot it</span></span>
<span id="cb84-9"><a href="#cb84-9" aria-hidden="true" tabindex="-1"></a><span class="fu">par</span>(<span class="at">mai=</span><span class="fu">c</span>(.<span class="dv">1</span>,.<span class="dv">5</span>,.<span class="dv">2</span>,.<span class="dv">1</span>))</span>
<span id="cb84-10"><a href="#cb84-10" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(y <span class="sc">~</span> x, <span class="at">data=</span>thedata, <span class="at">pch=</span><span class="dv">21</span>, <span class="at">col=</span><span class="fu">rgb</span>(.<span class="dv">827451</span>,.<span class="dv">827451</span>,.<span class="dv">827451</span>, .<span class="dv">1</span>), <span class="at">bg=</span><span class="fu">rgb</span>(.<span class="dv">2745098</span>,.<span class="dv">5098039</span>,.<span class="dv">7058824</span>, .<span class="dv">2</span>), <span class="at">cex=</span><span class="fl">1.3</span>, <span class="at">ylim=</span><span class="fu">c</span>(<span class="sc">-</span><span class="dv">5</span>,<span class="dv">22</span>), <span class="at">yaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">xaxt=</span><span class="st">&#39;n&#39;</span>, <span class="at">ylab=</span><span class="st">&quot;&quot;</span>, <span class="at">xlab=</span><span class="st">&quot;&quot;</span>)</span>
<span id="cb84-11"><a href="#cb84-11" aria-hidden="true" tabindex="-1"></a><span class="fu">mtext</span>(<span class="at">side=</span><span class="dv">3</span>, <span class="at">text=</span><span class="st">&quot;New Data (Testing Data)&quot;</span>, <span class="at">cex=</span><span class="fl">0.7</span>, <span class="at">at=</span><span class="sc">-</span>.<span class="dv">8</span>, <span class="at">line=</span>.<span class="dv">1</span>)</span>
<span id="cb84-12"><a href="#cb84-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb84-13"><a href="#cb84-13" aria-hidden="true" tabindex="-1"></a><span class="co"># Draw true model</span></span>
<span id="cb84-14"><a href="#cb84-14" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(beta0 <span class="sc">+</span> beta1<span class="sc">*</span>x <span class="sc">+</span> beta2<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span> <span class="sc">+</span> beta3<span class="sc">*</span>x<span class="sc">^</span><span class="dv">3</span> <span class="sc">+</span> beta4<span class="sc">*</span>x<span class="sc">^</span><span class="dv">4</span>, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="fu">rgb</span>(<span class="fl">0.2745098</span>, <span class="fl">0.5098039</span>, <span class="fl">0.7058824</span>, .<span class="dv">5</span>), <span class="at">lwd=</span><span class="dv">4</span>)</span>
<span id="cb84-15"><a href="#cb84-15" aria-hidden="true" tabindex="-1"></a>lmt <span class="ot">&lt;-</span> <span class="fu">lm</span>(y <span class="sc">~</span> x <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">2</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">3</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">4</span>), <span class="at">data=</span>thedata) <span class="co">#for later</span></span>
<span id="cb84-16"><a href="#cb84-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb84-17"><a href="#cb84-17" aria-hidden="true" tabindex="-1"></a><span class="co"># Draw simple linear model</span></span>
<span id="cb84-18"><a href="#cb84-18" aria-hidden="true" tabindex="-1"></a>lms <span class="ot">&lt;-</span> <span class="fu">lm</span>(y <span class="sc">~</span> x, <span class="at">data=</span>thedata)</span>
<span id="cb84-19"><a href="#cb84-19" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(lms)</span>
<span id="cb84-20"><a href="#cb84-20" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x, <span class="at">add=</span><span class="cn">TRUE</span>, <span class="at">col=</span><span class="fu">rgb</span>(<span class="dv">1</span>,<span class="fl">0.6470588</span>,<span class="dv">0</span>, .<span class="dv">3</span>), <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb84-21"><a href="#cb84-21" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb84-22"><a href="#cb84-22" aria-hidden="true" tabindex="-1"></a><span class="co"># Draw overly complicated model</span></span>
<span id="cb84-23"><a href="#cb84-23" aria-hidden="true" tabindex="-1"></a>lmc <span class="ot">&lt;-</span> <span class="fu">lm</span>(y <span class="sc">~</span> x <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">2</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">3</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">4</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">5</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">6</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">7</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">8</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">9</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">10</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">11</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">12</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">13</span>) <span class="sc">+</span> <span class="fu">I</span>(x<span class="sc">^</span><span class="dv">14</span>), <span class="at">data=</span>thedata)</span>
<span id="cb84-24"><a href="#cb84-24" aria-hidden="true" tabindex="-1"></a>b <span class="ot">&lt;-</span> <span class="fu">coef</span>(lmc)</span>
<span id="cb84-25"><a href="#cb84-25" aria-hidden="true" tabindex="-1"></a><span class="fu">curve</span>(b[<span class="dv">1</span>] <span class="sc">+</span> b[<span class="dv">2</span>]<span class="sc">*</span>x <span class="sc">+</span> b[<span class="dv">3</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">2</span> <span class="sc">+</span> b[<span class="dv">4</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">3</span> <span class="sc">+</span> b[<span class="dv">5</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">4</span> <span class="sc">+</span> b[<span class="dv">6</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">5</span> <span class="sc">+</span> b[<span class="dv">7</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">6</span> <span class="sc">+</span> b[<span class="dv">8</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">7</span> <span class="sc">+</span> b[<span class="dv">9</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">8</span> <span class="sc">+</span> b[<span class="dv">10</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">9</span> <span class="sc">+</span> b[<span class="dv">11</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">10</span> <span class="sc">+</span> b[<span class="dv">12</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">11</span> <span class="sc">+</span> b[<span class="dv">13</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">12</span> <span class="sc">+</span> b[<span class="dv">14</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">13</span> <span class="sc">+</span> b[<span class="dv">15</span>]<span class="sc">*</span>x<span class="sc">^</span><span class="dv">14</span>, <span class="at">add=</span><span class="cn">TRUE</span>, <span 
class="at">col=</span><span class="fu">rgb</span>(<span class="fl">0.6980392</span>, <span class="fl">0.133333</span>, <span class="fl">0.133333</span>, .<span class="dv">2</span>), <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb84-26"><a href="#cb84-26" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb84-27"><a href="#cb84-27" aria-hidden="true" tabindex="-1"></a><span class="co"># Add new data to plot</span></span>
<span id="cb84-28"><a href="#cb84-28" aria-hidden="true" tabindex="-1"></a><span class="fu">points</span>(y <span class="sc">~</span> x, <span class="at">data=</span>thedata2, <span class="at">pch=</span><span class="dv">21</span>, <span class="at">col=</span><span class="fu">rgb</span>(.<span class="dv">827451</span>,.<span class="dv">827451</span>,.<span class="dv">827451</span>, .<span class="dv">5</span>), <span class="at">bg=</span><span class="st">&quot;orange&quot;</span>, <span class="at">cex=</span><span class="fl">1.3</span>)</span>
<span id="cb84-29"><a href="#cb84-29" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb84-30"><a href="#cb84-30" aria-hidden="true" tabindex="-1"></a><span class="co"># Add legend</span></span>
<span id="cb84-31"><a href="#cb84-31" aria-hidden="true" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">legend=</span><span class="fu">c</span>(<span class="st">&quot;True Model&quot;</span>, <span class="st">&quot;Simple Model&quot;</span>, <span class="st">&quot;Complicated Model&quot;</span>), <span class="at">lwd=</span><span class="fu">c</span>(<span class="dv">4</span>,<span class="dv">2</span>,<span class="dv">2</span>), <span class="at">col=</span><span class="fu">c</span>(<span class="fu">rgb</span>(<span class="fl">0.2745098</span>, <span class="fl">0.5098039</span>, <span class="fl">0.7058824</span>, .<span class="dv">5</span>), <span class="fu">rgb</span>(<span class="dv">1</span>,<span class="fl">0.6470588</span>,<span class="dv">0</span>, .<span class="dv">3</span>), <span class="fu">rgb</span>(<span class="fl">0.6980392</span>, <span class="fl">0.133333</span>, <span class="fl">0.133333</span>, .<span class="dv">2</span>)), <span class="at">bty=</span><span class="st">&#39;n&#39;</span>)</span>
<span id="cb84-32"><a href="#cb84-32" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb84-33"><a href="#cb84-33" aria-hidden="true" tabindex="-1"></a><span class="co"># Add dot legend</span></span>
<span id="cb84-34"><a href="#cb84-34" aria-hidden="true" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">&quot;bottomright&quot;</span>, <span class="at">legend=</span><span class="fu">c</span>(<span class="st">&quot;Original Sample&quot;</span>, <span class="st">&quot;New Sample&quot;</span>), <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="fu">c</span>(<span class="fu">rgb</span>(.<span class="dv">2745098</span>,.<span class="dv">5098039</span>,.<span class="dv">7058824</span>, .<span class="dv">2</span>),<span class="st">&quot;orange&quot;</span>), <span class="at">bty=</span><span class="st">&#39;n&#39;</span>)</span></code></pre></div>
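<p><img src="LinearRegression_files/figure-html/unnamed-chunk-84-1.png" width="480" alt="Scatterplot of the original sample and the new sample with the true, simple, and complicated model curves overlaid" /></p>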
<div class="sourceCode" id="cb85"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb85-1"><a href="#cb85-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Compute R-squared for each validation</span></span>
<span id="cb85-2"><a href="#cb85-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb85-3"><a href="#cb85-3" aria-hidden="true" tabindex="-1"></a>  <span class="co"># Get y-hat for each model on new data.</span></span>
<span id="cb85-4"><a href="#cb85-4" aria-hidden="true" tabindex="-1"></a>  yht <span class="ot">&lt;-</span> <span class="fu">predict</span>(lmt, <span class="at">newdata=</span>thedata2)</span>
<span id="cb85-5"><a href="#cb85-5" aria-hidden="true" tabindex="-1"></a>  yhs <span class="ot">&lt;-</span> <span class="fu">predict</span>(lms, <span class="at">newdata=</span>thedata2)</span>
<span id="cb85-6"><a href="#cb85-6" aria-hidden="true" tabindex="-1"></a>  yhc <span class="ot">&lt;-</span> <span class="fu">predict</span>(lmc, <span class="at">newdata=</span>thedata2)</span>
<span id="cb85-7"><a href="#cb85-7" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb85-8"><a href="#cb85-8" aria-hidden="true" tabindex="-1"></a>  <span class="co"># Compute y-bar</span></span>
<span id="cb85-9"><a href="#cb85-9" aria-hidden="true" tabindex="-1"></a>  ybar <span class="ot">&lt;-</span> <span class="fu">mean</span>(thedata2<span class="sc">$</span>y) <span class="co">#Yi is given by Ynew from the new sample of data</span></span>
<span id="cb85-10"><a href="#cb85-10" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb85-11"><a href="#cb85-11" aria-hidden="true" tabindex="-1"></a>  <span class="co"># Compute SSTO</span></span>
<span id="cb85-12"><a href="#cb85-12" aria-hidden="true" tabindex="-1"></a>  SSTO <span class="ot">&lt;-</span> <span class="fu">sum</span>( (thedata2<span class="sc">$</span>y <span class="sc">-</span> ybar)<span class="sc">^</span><span class="dv">2</span> )</span>
<span id="cb85-13"><a href="#cb85-13" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb85-14"><a href="#cb85-14" aria-hidden="true" tabindex="-1"></a>  <span class="co"># Compute SSE for each model using y - yhat</span></span>
<span id="cb85-15"><a href="#cb85-15" aria-hidden="true" tabindex="-1"></a>  SSEt <span class="ot">&lt;-</span> <span class="fu">sum</span>( (thedata2<span class="sc">$</span>y <span class="sc">-</span> yht)<span class="sc">^</span><span class="dv">2</span> )</span>
<span id="cb85-16"><a href="#cb85-16" aria-hidden="true" tabindex="-1"></a>  SSEs <span class="ot">&lt;-</span> <span class="fu">sum</span>( (thedata2<span class="sc">$</span>y <span class="sc">-</span> yhs)<span class="sc">^</span><span class="dv">2</span> )</span>
<span id="cb85-17"><a href="#cb85-17" aria-hidden="true" tabindex="-1"></a>  SSEc <span class="ot">&lt;-</span> <span class="fu">sum</span>( (thedata2<span class="sc">$</span>y <span class="sc">-</span> yhc)<span class="sc">^</span><span class="dv">2</span> )</span>
<span id="cb85-18"><a href="#cb85-18" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb85-19"><a href="#cb85-19" aria-hidden="true" tabindex="-1"></a>  <span class="co"># Compute R-squared for each</span></span>
<span id="cb85-20"><a href="#cb85-20" aria-hidden="true" tabindex="-1"></a>  rst <span class="ot">&lt;-</span> <span class="dv">1</span> <span class="sc">-</span> SSEt<span class="sc">/</span>SSTO</span>
<span id="cb85-21"><a href="#cb85-21" aria-hidden="true" tabindex="-1"></a>  rss <span class="ot">&lt;-</span> <span class="dv">1</span> <span class="sc">-</span> SSEs<span class="sc">/</span>SSTO</span>
<span id="cb85-22"><a href="#cb85-22" aria-hidden="true" tabindex="-1"></a>  rsc <span class="ot">&lt;-</span> <span class="dv">1</span> <span class="sc">-</span> SSEc<span class="sc">/</span>SSTO</span>
<span id="cb85-23"><a href="#cb85-23" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb85-24"><a href="#cb85-24" aria-hidden="true" tabindex="-1"></a>  <span class="co"># Compute adjusted R-squared for each</span></span>
<span id="cb85-25"><a href="#cb85-25" aria-hidden="true" tabindex="-1"></a>  n <span class="ot">&lt;-</span> <span class="fu">length</span>(thedata2<span class="sc">$</span>y) <span class="co">#sample size</span></span>
<span id="cb85-26"><a href="#cb85-26" aria-hidden="true" tabindex="-1"></a>  pt <span class="ot">&lt;-</span> <span class="fu">length</span>(<span class="fu">coef</span>(lmt)) <span class="co">#num. parameters in model</span></span>
<span id="cb85-27"><a href="#cb85-27" aria-hidden="true" tabindex="-1"></a>  ps <span class="ot">&lt;-</span> <span class="fu">length</span>(<span class="fu">coef</span>(lms)) <span class="co">#num. parameters in model</span></span>
<span id="cb85-28"><a href="#cb85-28" aria-hidden="true" tabindex="-1"></a>  pc <span class="ot">&lt;-</span> <span class="fu">length</span>(<span class="fu">coef</span>(lmc)) <span class="co">#num. parameters in model</span></span>
<span id="cb85-29"><a href="#cb85-29" aria-hidden="true" tabindex="-1"></a>  rsta <span class="ot">&lt;-</span> <span class="dv">1</span> <span class="sc">-</span> (n<span class="dv">-1</span>)<span class="sc">/</span>(n<span class="sc">-</span>pt)<span class="sc">*</span>SSEt<span class="sc">/</span>SSTO</span>
<span id="cb85-30"><a href="#cb85-30" aria-hidden="true" tabindex="-1"></a>  rssa <span class="ot">&lt;-</span> <span class="dv">1</span> <span class="sc">-</span> (n<span class="dv">-1</span>)<span class="sc">/</span>(n<span class="sc">-</span>ps)<span class="sc">*</span>SSEs<span class="sc">/</span>SSTO</span>
<span id="cb85-31"><a href="#cb85-31" aria-hidden="true" tabindex="-1"></a>  rsca <span class="ot">&lt;-</span> <span class="dv">1</span> <span class="sc">-</span> (n<span class="dv">-1</span>)<span class="sc">/</span>(n<span class="sc">-</span>pc)<span class="sc">*</span>SSEc<span class="sc">/</span>SSTO</span>
<span id="cb85-32"><a href="#cb85-32" aria-hidden="true" tabindex="-1"></a>  </span>
<span id="cb85-33"><a href="#cb85-33" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb85-34"><a href="#cb85-34" aria-hidden="true" tabindex="-1"></a>my_output_table2 <span class="ot">&lt;-</span> <span class="fu">data.frame</span>(<span class="at">Model =</span> <span class="fu">c</span>(<span class="st">&quot;True&quot;</span>, <span class="st">&quot;Simple&quot;</span>, <span class="st">&quot;Complicated&quot;</span>), <span class="st">`</span><span class="at">Original R2</span><span class="st">`</span> <span class="ot">=</span> <span class="fu">c</span>(<span class="fu">summary</span>(lmt)<span class="sc">$</span>r.squared, <span class="fu">summary</span>(lms)<span class="sc">$</span>r.squared, <span class="fu">summary</span>(lmo)<span class="sc">$</span>r.squared), <span class="st">`</span><span class="at">Orig. Adj. R-squared</span><span class="st">`</span> <span class="ot">=</span> <span class="fu">c</span>(<span class="fu">summary</span>(lmt)<span class="sc">$</span>adj.r.squared, <span class="fu">summary</span>(lms)<span class="sc">$</span>adj.r.squared, <span class="fu">summary</span>(lmo)<span class="sc">$</span>adj.r.squared), <span class="st">`</span><span class="at">Validation R-squared</span><span class="st">`</span> <span class="ot">=</span> <span class="fu">c</span>(rst, rss, rsc), <span class="st">`</span><span class="at">Validation Adj. R^2</span><span class="st">`</span> <span class="ot">=</span> <span class="fu">c</span>(rsta, rssa, rsca))</span>
<span id="cb85-35"><a href="#cb85-35" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb85-36"><a href="#cb85-36" aria-hidden="true" tabindex="-1"></a><span class="fu">colnames</span>(my_output_table2) <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="st">&quot;Model&quot;</span>, <span class="st">&quot;Original $R^2$&quot;</span>, <span class="st">&quot;Original Adj. $R^2$&quot;</span>, <span class="st">&quot;Validation $R^2$&quot;</span>, <span class="st">&quot;Validation Adj. $R^2$&quot;</span>)</span>
<span id="cb85-37"><a href="#cb85-37" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb85-38"><a href="#cb85-38" aria-hidden="true" tabindex="-1"></a>knitr<span class="sc">::</span><span class="fu">kable</span>(my_output_table2, <span class="at">escape=</span><span class="cn">TRUE</span>, <span class="at">digits=</span><span class="dv">4</span>)</span></code></pre></div>
<table>
<colgroup>
<col width="13%" />
<col width="17%" />
<col width="23%" />
<col width="19%" />
<col width="25%" />
</colgroup>
<thead>
<tr class="header">
<th align="left">Model</th>
<th align="right">Original <span class="math inline">\(R^2\)</span></th>
<th align="right">Original Adj. <span
class="math inline">\(R^2\)</span></th>
<th align="right">Validation <span
class="math inline">\(R^2\)</span></th>
<th align="right">Validation Adj. <span
class="math inline">\(R^2\)</span></th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="left">True</td>
<td align="right">0.9959</td>
<td align="right">0.9948</td>
<td align="right">0.9928</td>
<td align="right">0.9908</td>
</tr>
<tr class="even">
<td align="left">Simple</td>
<td align="right">0.8115</td>
<td align="right">0.8010</td>
<td align="right">0.8002</td>
<td align="right">0.7891</td>
</tr>
<tr class="odd">
<td align="left">Complicated</td>
<td align="right">0.9985</td>
<td align="right">0.9941</td>
<td align="right">0.8686</td>
<td align="right">0.5008</td>
</tr>
</tbody>
</table>
<p>Notice how the <span class="math inline">\(R^2\)</span> for the
complicated model dropped fairly dramatically from its original value of
0.9985 to 0.8686, and the adjusted <span
class="math inline">\(R^2\)</span> dropped from 0.9941 to 0.5008! On the
other hand, the <span class="math inline">\(R^2\)</span> and adjusted
<span class="math inline">\(R^2\)</span> values for the True and Simple
models were relatively unchanged. This is clear evidence that the
“complicated model” is overfitting the original data. It does not
capture the “essence” of the data, so it is not a generalizable model.
It does not fit new data very well, even though it fit the original
sample of data quite well. This is what we mean by
<strong>overfitting</strong> a model to a particular sample of data.</p>
</div>
<p><br/></p>
</div>
<div id="interpretation-expand" class="section level4">
<h4>Interpretation
<a href="javascript:showhide('interpretationMultiple')" style="font-size:.6em;color:skyblue;">(Expand)</a></h4>
<p><span class="expand-caption"><span
class="math inline">\(\beta_j\)</span> is the change in the average
y-value…</span></p>
<div id="interpretationMultiple" style="display:none;">
<p>The only change to interpretation from the simple linear regression
model is that each coefficient, <span
class="math inline">\(\beta_j\)</span> for <span
class="math inline">\(j=1,\ldots,p\)</span>, represents the change in
<span class="math inline">\(E\{Y\}\)</span> for a unit change in
<span class="math inline">\(X_j\)</span>, <em>holding all other
variables constant.</em></p>
</div>
<p><br /></p>
</div>
<div id="added-variable-plots-expand" class="section level4">
<h4>Added Variable Plots
<a href="javascript:showhide('addedVariablePlots')" style="font-size:.6em;color:skyblue;">(Expand)</a></h4>
<p><span class="expand-caption">When to add another <span
class="math inline">\(X\)</span>-variable to the model…</span></p>
<div id="addedVariablePlots" style="display:none;">
<p>The assumptions of multiple linear regression are nearly identical to
those of simple linear regression, with the addition of one new assumption.</p>
<ol style="list-style-type: decimal">
<li>The regression relation between <span
class="math inline">\(Y\)</span> and <span
class="math inline">\(X\)</span> is linear.</li>
<li>The error terms are normally distributed with <span
class="math inline">\(E\{\epsilon_i\}=0\)</span>.</li>
<li>The variance of the error terms is constant over all <span
class="math inline">\(X\)</span> values.</li>
<li>The <span class="math inline">\(X\)</span> values can be considered
fixed and measured without error.</li>
<li>The error terms are independent.</li>
<li>All important variables are included in the model.</li>
</ol>
<p><br /></p>
<div id="check" class="section level4">
<h4>Checking the Assumptions</h4>
<p>The process of checking assumptions is the same for multiple linear
regression as it is for simple linear regression, with the addition of
one more tool, the added variable plot. Added variable plots can be used
to determine if a new variable should be included in the model.</p>
<table width="90%">
<tr>
<td width="15%">
<img src="LinearRegression_files/figure-html/unnamed-chunk-85-1.png" width="144" />
</td>
<td width="75%">
<p>Let <span class="math inline">\(X_{new}\)</span> be a new explanatory
variable that could be added to the current multiple regression model.
Plotting the residuals from the current linear regression against <span
class="math inline">\(X_{new}\)</span> allows us to determine if <span
class="math inline">\(X_{new}\)</span> has any information to add to the
current model. If there is a trend in the plot, then <span
class="math inline">\(X_{new}\)</span> should be added to the model. If
there is no trend in the plot, then <span
class="math inline">\(X_{new}\)</span> should be left out.</p>
<p>|
<a href="javascript:showhide('addedvariableplots')" style="font-size:.8em;color:steelblue;">Show
Examples</a> |</p>
</td>
</tr>
</table>
<div id="addedvariableplots" style="display:none;">
<p><a href="javascript:showhide('addedvariableplotsread')" style="font-size:.8em;color:skyblue;">(Read
more…)</a></p>
<div id="addedvariableplotsread" style="display:none;">
<p>An added variable plot checks to see if a new variable has any
information to add to the current multiple regression model.</p>
<p>The plot is made by taking the residuals from the current multiple
regression model (<span class="math inline">\(y\)</span>-axis) and
plotting them against the new explanatory variable (<span
class="math inline">\(x\)</span>-axis).</p>
<ul>
<li><p>If there is a trend in the added variable plot, then the new
explanatory variable contains extra information that is not already
contained in the current multiple regression. The new variable should be
included in the model.</p></li>
<li><p>If there is no trend in the added variable plot, then the
information provided by the new explanatory variable is already
contained in the current multiple regression model. The new variable
should continue to be left out of the model.</p></li>
</ul>
<p>The left column of plots below shows scenarios where the new
explanatory variable should be included in the model. The right column
of plots shows scenarios where the new explanatory variable should not be
included in the model.</p>
</div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-86-1.png" width="672" /><img src="LinearRegression_files/figure-html/unnamed-chunk-86-2.png" width="672" /></p>
</div>
</div>
</div>
<p><br /></p>
</div>
<div id="outlier-analysis-expand" class="section level4">
<h4>Outlier Analysis
<a href="javascript:showhide('outlierAnalysis')" style="font-size:.6em;color:skyblue;">(Expand)</a></h4>
<p><span class="expand-caption">Cook’s Distances and Leverage
Values…</span></p>
<div id="outlierAnalysis" style="display:none;">
<p>The presence of outlying points in a regression can bias the
regression estimates substantially. In simple linear regressions, the
outliers are usually quite visible in a residuals vs. fitted-values plot.
However, in higher dimensional regression models, it can become very
difficult to locate points that are negatively affecting the regression.
Here are two measurements that are helpful in identifying points that
are negatively impacting an estimated regression model.</p>
<div id="cooks-distances" class="section level5">
<h5>Cook’s Distances</h5>
<p>The idea behind Cook’s Distance is to measure the impact each
individual point has on the regression estimates <span
class="math inline">\(b_i\)</span> for each <span
class="math inline">\(\beta_i\)</span>. As found in the original article
<a
href="http://www.stat.ucla.edu/~nchristo/statistics100C/1268249.pdf">“Detection
of Influential Observation in Linear Regression” (Dennis Cook, 1977)</a>
the formula Cook developed for measuring this effect is given by (when
adapted to fit the notation of this book)</p>
<p><span class="math display">\[
  D_i = \frac{\sum_{j=1}^n (\widehat{Y}_{j} -
\widehat{Y}_{j(i)})^2}{p\cdot MSE}
\]</span></p>
<p>where <span class="math inline">\(p\)</span> is the number of
parameters in the regression model, <span
class="math inline">\(MSE\)</span> is the estimate of <span
class="math inline">\(\sigma^2\)</span> (the mean squared error), and
<span class="math inline">\(\widehat{Y}_{j(i)}\)</span> represents the
fitted value for point <span class="math inline">\(j\)</span> when the <span
class="math inline">\(i\)</span>th point was removed from the
regression.</p>
<p>To understand this formula, let’s focus first on the numerator: <span
class="math inline">\(\sum_{j=1}^n (\widehat{Y}_j -
\widehat{Y}_{j(i)})^2\)</span>. Here, we are comparing the fitted value from
the original regression for point <span
class="math inline">\(j\)</span>, <span
class="math inline">\(\widehat{Y}_j\)</span>, to the fitted value for
that same point when point <span class="math inline">\(i\)</span> is
removed from the regression. Because the observed <span
class="math inline">\(Y_j\)</span> does not change, this difference has
the same magnitude as the change in the residual for point <span
class="math inline">\(j\)</span>. See the image below for a visual
explanation.</p>
<div class="sourceCode" id="cb86"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb86-1"><a href="#cb86-1" aria-hidden="true" tabindex="-1"></a>X <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="dv">2</span>,<span class="dv">3</span>,<span class="dv">5</span>,<span class="dv">6</span>,<span class="dv">8</span>,<span class="dv">13</span>)</span>
<span id="cb86-2"><a href="#cb86-2" aria-hidden="true" tabindex="-1"></a>Y <span class="ot">&lt;-</span> <span class="fu">c</span>(<span class="dv">3</span>,<span class="dv">5</span>,<span class="dv">7</span>,<span class="dv">9</span>,<span class="dv">8</span>,<span class="dv">12</span>)</span>
<span id="cb86-3"><a href="#cb86-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb86-4"><a href="#cb86-4" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(Y <span class="sc">~</span> X, <span class="at">pch=</span><span class="dv">16</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">ylim=</span><span class="fu">c</span>(<span class="dv">0</span>,<span class="dv">14</span>))</span>
<span id="cb86-5"><a href="#cb86-5" aria-hidden="true" tabindex="-1"></a><span class="fu">points</span>(X[<span class="dv">4</span>],Y[<span class="dv">4</span>], <span class="at">pch=</span><span class="dv">16</span>, <span class="at">cex=</span><span class="fl">1.1</span>, <span class="at">col=</span><span class="st">&quot;orange&quot;</span>)</span>
<span id="cb86-6"><a href="#cb86-6" aria-hidden="true" tabindex="-1"></a>lm1 <span class="ot">&lt;-</span> <span class="fu">lm</span>(Y <span class="sc">~</span> X)</span>
<span id="cb86-7"><a href="#cb86-7" aria-hidden="true" tabindex="-1"></a>lm2 <span class="ot">&lt;-</span> <span class="fu">lm</span>(Y <span class="sc">~</span> X, <span class="at">w=</span><span class="fu">c</span>(<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">0</span>,<span class="dv">1</span>,<span class="dv">1</span>))</span>
<span id="cb86-8"><a href="#cb86-8" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(lm1, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb86-9"><a href="#cb86-9" aria-hidden="true" tabindex="-1"></a><span class="fu">abline</span>(lm2, <span class="at">col=</span><span class="st">&quot;orange&quot;</span>, <span class="at">lwd=</span><span class="dv">2</span>)</span>
<span id="cb86-10"><a href="#cb86-10" aria-hidden="true" tabindex="-1"></a><span class="fu">legend</span>(<span class="st">&quot;topleft&quot;</span>, <span class="at">legend=</span><span class="fu">c</span>(<span class="st">&quot;All Points Included&quot;</span>, <span class="st">&quot;Orange Point Removed&quot;</span>), <span class="at">lty=</span><span class="dv">1</span>, <span class="at">col=</span><span class="fu">c</span>(<span class="st">&quot;skyblue&quot;</span>,<span class="st">&quot;orange&quot;</span>), <span class="at">bty=</span><span class="st">&quot;n&quot;</span>)</span>
<span id="cb86-11"><a href="#cb86-11" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb86-12"><a href="#cb86-12" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> (i <span class="cf">in</span> <span class="dv">1</span><span class="sc">:</span><span class="dv">6</span>){</span>
<span id="cb86-13"><a href="#cb86-13" aria-hidden="true" tabindex="-1"></a>  <span class="fu">lines</span>(<span class="fu">c</span>(X[i]<span class="sc">+</span>.<span class="dv">03</span>,X[i]<span class="sc">+</span>.<span class="dv">03</span>), <span class="fu">c</span>(Y[i], lm1<span class="sc">$</span>fit[i]), <span class="at">lty=</span><span class="dv">1</span>, <span class="at">col=</span><span class="st">&quot;skyblue&quot;</span>)</span>
<span id="cb86-14"><a href="#cb86-14" aria-hidden="true" tabindex="-1"></a>  <span class="fu">lines</span>(<span class="fu">c</span>(X[i]<span class="sc">-</span>.<span class="dv">03</span>,X[i]<span class="sc">-</span>.<span class="dv">03</span>), <span class="fu">c</span>(Y[i], lm2<span class="sc">$</span>fit[i]), <span class="at">lty=</span><span class="dv">1</span>, <span class="at">col=</span><span class="st">&quot;orange&quot;</span>)</span>
<span id="cb86-15"><a href="#cb86-15" aria-hidden="true" tabindex="-1"></a>}</span></code></pre></div>
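<p><img src="LinearRegression_files/figure-html/unnamed-chunk-87-1.png" alt="Scatterplot of Y versus X with two fitted regression lines, one using all six points (sky blue) and one with the orange fourth point removed (orange), and vertical segments from each point to each line." width="672" /></p>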
<div class="sourceCode" id="cb87"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb87-1"><a href="#cb87-1" aria-hidden="true" tabindex="-1"></a><span class="fu">pander</span>(<span class="fu">round</span>(<span class="fu">rbind</span>(<span class="st">`</span><span class="at">Original Residuals</span><span class="st">`</span> <span class="ot">=</span> lm1<span class="sc">$</span>residuals, <span class="st">`</span><span class="at">Orange Point Removed</span><span class="st">`</span> <span class="ot">=</span> lm2<span class="sc">$</span>residuals, <span class="at">Difference =</span> lm1<span class="sc">$</span>residuals <span class="sc">-</span> lm2<span class="sc">$</span>residuals),<span class="dv">2</span>))</span></code></pre></div>
<table>
<colgroup>
<col width="36%" />
<col width="10%" />
<col width="10%" />
<col width="10%" />
<col width="10%" />
<col width="10%" />
<col width="10%" />
</colgroup>
<thead>
<tr class="header">
<th align="center"> </th>
<th align="center">1</th>
<th align="center">2</th>
<th align="center">3</th>
<th align="center">4</th>
<th align="center">5</th>
<th align="center">6</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center"><strong>Original Residuals</strong></td>
<td align="center">-1.23</td>
<td align="center">0.02</td>
<td align="center">0.53</td>
<td align="center">1.79</td>
<td align="center">-0.7</td>
<td align="center">-0.42</td>
</tr>
<tr class="even">
<td align="center"><strong>Orange Point Removed</strong></td>
<td align="center">-0.86</td>
<td align="center">0.4</td>
<td align="center">0.9</td>
<td align="center">2.15</td>
<td align="center">-0.35</td>
<td align="center">-0.09</td>
</tr>
<tr class="odd">
<td align="center"><strong>Difference</strong></td>
<td align="center">-0.38</td>
<td align="center">-0.37</td>
<td align="center">-0.36</td>
<td align="center">-0.36</td>
<td align="center">-0.35</td>
<td align="center">-0.33</td>
</tr>
</tbody>
</table>
<p>Summing the squares of the “differences” in the residuals from the
original regression and the one where point <span
class="math inline">\(i\)</span> (the orange dot) has been removed gives
<span class="math inline">\(0.77186\)</span>. Then, noting that the MSE
for the original regression was <span
class="math inline">\(1.418605\)</span>, and that <span
class="math inline">\(p=2\)</span> because there were two parameters, we
find the Cook’s Distance for Point #4 comes out to be</p>
<p><span class="math display">\[
  D_4 = \frac{\sum_{j=1}^n (\widehat{Y}_{j} -
\widehat{Y}_{j(4)})^2}{p\cdot MSE} \approx \frac{0.77186}{2\cdot
1.418605} \approx 0.272
\]</span></p>
<p>Similar calculations show the Cook’s Distances for each point to
be</p>
<div class="sourceCode" id="cb88"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb88-1"><a href="#cb88-1" aria-hidden="true" tabindex="-1"></a><span class="fu">pander</span>(<span class="fu">round</span>(<span class="fu">cooks.distance</span>(lm1),<span class="dv">3</span>), <span class="at">caption=</span><span class="st">&quot;Cook&#39;s Distances for each Point 1, ..., 6&quot;</span>)</span></code></pre></div>
<table style="width:61%;">
<colgroup>
<col width="11%" />
<col width="5%" />
<col width="11%" />
<col width="11%" />
<col width="11%" />
<col width="11%" />
</colgroup>
<thead>
<tr class="header">
<th align="center">1</th>
<th align="center">2</th>
<th align="center">3</th>
<th align="center">4</th>
<th align="center">5</th>
<th align="center">6</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center">0.551</td>
<td align="center">0</td>
<td align="center">0.028</td>
<td align="center">0.272</td>
<td align="center">0.057</td>
<td align="center">0.807</td>
</tr>
</tbody>
</table>
<p>In R, it is simple to calculate Cook’s Distances using the code
<code>cooks.distance(lmObject)</code>. Also, a graph of Cook’s Distances
can be obtained using <code>plot(lmObject, which=4)</code> as shown
here:</p>
<div class="sourceCode" id="cb89"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb89-1"><a href="#cb89-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(lm1, <span class="at">which=</span><span class="dv">4</span>)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-89-1.png" alt="Plot of the Cook’s Distance for each of the six observations in the regression lm1." width="672" /></p>
</div>
<div id="leverage-values" class="section level5">
<h5>Leverage Values</h5>
<p>The leverage value of a point is a measurement that lives between 0
and 1 where values close to 1 imply the point has a lot of “leverage”
and is “pulling” the regression toward itself. A value near 0 implies
the point is just “one of many” and that it is not unduly influencing
the regression line.</p>
<p>It is difficult to understand leverage values mathematically unless
we look at regression from a linear algebra (matrix) perspective.</p>
<p>To do this, first recall the simple linear regression model</p>
<p><span class="math display">\[
  Y_i = \beta_0 + \beta_1 X_i + \epsilon_i
\]</span></p>
<p>This could be expanded to explicitly list out each value of <span
class="math inline">\(i\)</span> in the model using vector notation:</p>
<p><span class="math display">\[
  \left[ \begin{array}{c} Y_1 \\ Y_2 \\ \vdots \\ Y_n\end{array}\right]
= \beta_0 \left[ \begin{array}{c} 1  \\
  1  \\
  \vdots \\ 1  \end{array}\right] + \beta_1 \left[\begin{array}{c} X_1
\\ X_2 \\ \vdots \\ X_n  \end{array}\right] + \left[\begin{array}{c}
\epsilon_1 \\ \epsilon_2 \\ \vdots \\ \epsilon_n \end{array}\right]
\]</span></p>
<p>We could then rewrite this in matrix notation using</p>
<p><span class="math display">\[
  \left[ \begin{array}{c} Y_1 \\ Y_2 \\ \vdots \\ Y_n\end{array}\right]
=  \left[ \begin{array}{cc} 1 &amp; X_1\\
  1  &amp; X_2 \\
  \vdots &amp; \vdots \\
  1 &amp; X_n \end{array}\right] \left[\begin{array}{c} \beta_0 \\
\beta_1 \end{array}\right] + \left[\begin{array}{c} \epsilon_1 \\
\epsilon_2 \\ \vdots \\ \epsilon_n \end{array}\right]
\]</span></p>
<p>Or, more concisely as</p>
<p><span class="math display">\[
  \vec{Y} = \mathbf{X}\vec{\beta} + \vec{\epsilon}
\]</span></p>
<p>The goal of regression is to choose values for <span
class="math inline">\(\beta_0\)</span> and <span
class="math inline">\(\beta_1\)</span> that “minimize” the sum of the
squared errors. Mathematically this would be written as</p>
<p><span class="math display">\[
  \sum_{i=1}^n \epsilon_i ^2
\]</span></p>
<p>If you are familiar with vectors then you would see that this could
be written with the notation</p>
<p><span class="math display">\[
  \vec{\epsilon}^t \vec{\epsilon} = \sum_{i=1}^n \epsilon_i ^2
\]</span></p>
<p>And since we can also write</p>
<p><span class="math display">\[
  \vec{\epsilon} = \vec{Y} - \mathbf{X}\vec{\beta}
\]</span></p>
<p>then we have</p>
<p><span class="math display">\[
\sum_{i=1}^n \epsilon_i^2 = \vec{\epsilon}^t \vec{\epsilon} = (\vec{Y} -
\mathbf{X}\vec{\beta})^t (\vec{Y} - \mathbf{X}\vec{\beta})
\]</span></p>
<p>To choose the values of <span
class="math inline">\(\vec{\beta}\)</span> that minimize the above
equation, we will take the derivative with respect to <span
class="math inline">\(\vec{\beta}\)</span> which turns out to give</p>
<p><span class="math display">\[
\frac{d}{d\vec{\beta}}(\vec{Y} - \mathbf{X}\vec{\beta})^t (\vec{Y} -
\mathbf{X}\vec{\beta}) = -2\mathbf{X}^t(\vec{Y} - \mathbf{X}\vec{\beta})
\]</span></p>
<p>Setting the derivative equal to the zero vector <span
class="math inline">\(\vec{0}\)</span> and solving, we obtain</p>
<p><span class="math display">\[
-2\mathbf{X}^t(\vec{Y} - \mathbf{X}\vec{\beta}) = \vec{0} \\
-2\mathbf{X}^t\vec{Y} = -2\mathbf{X}^t\mathbf{X}\vec{\beta} \\
\mathbf{X}^t\vec{Y} = \mathbf{X}^t\mathbf{X}\vec{\beta}
\]</span></p>
<p>Since <span class="math inline">\(\mathbf{X}^t\mathbf{X}\)</span> is
a square matrix, it is invertible whenever the columns of <span
class="math inline">\(\mathbf{X}\)</span> are linearly independent. This
allows us to solve for <span
class="math inline">\(\vec{\beta}\)</span> by</p>
<p><span class="math display">\[
(\mathbf{X}^t\mathbf{X})^{-1}\mathbf{X}^t\vec{Y} = \vec{\beta}
\]</span></p>
<p>However, at this point instead of pretending we have found the true
<span class="math inline">\(\beta\)</span>’s, we change the equation
to</p>
<p><span class="math display">\[
\vec{b} = (\mathbf{X}^t\mathbf{X})^{-1}\mathbf{X}^t\vec{Y}
\]</span></p>
<p>Then, if we use the equation for <span
class="math inline">\(\hat{Y}_i\)</span> in vector notation, we get</p>
<p><span class="math display">\[
  \hat{\vec{Y}} = \mathbf{X}\vec{b}
\]</span></p>
<p>and substituting for <span class="math inline">\(\vec{b}\)</span>
gives</p>
<p><span class="math display">\[
  \hat{\vec{Y}} =
\mathbf{X}(\mathbf{X}^t\mathbf{X})^{-1}\mathbf{X}^t\vec{Y}
\]</span></p>
<p>This shows the <span class="math inline">\(\hat{Y}\)</span> values
are a matrix transformation of the <span
class="math inline">\(Y\)</span> values, often called a projection of
<span class="math inline">\(Y\)</span> onto the <span
class="math inline">\(\hat{Y}\)</span> surface. But now we have arrived
at the thing we wanted to look at in order to talk about leverage, the
“hat matrix” <span class="math inline">\(\mathbf{H}\)</span>:</p>
<p><span class="math display">\[
  \mathbf{H} = \mathbf{X}(\mathbf{X}^t\mathbf{X})^{-1}\mathbf{X}^t
\]</span></p>
<p>This allows us to write</p>
<p><span class="math display">\[
\hat{\vec{Y}} = \mathbf{H}\vec{Y}
\]</span></p>
<p>The diagonal elements of <span
class="math inline">\(\mathbf{H}\)</span> are the “leverage values” and
are notated as the <span class="math inline">\(h_{ii}\)</span> values.
Essentially, each of these values explains how much <span
class="math inline">\(\hat{Y}_i\)</span> is being pulled towards its own
observed value <span class="math inline">\(Y_i\)</span>, where values of <span
class="math inline">\(h_{ii}\)</span> close to 1 represent a “lot of
pull,” and values close to 0 represent “little pull.”</p>
<p>In R these values are obtained by the <code>hatvalues(...)</code>
function:</p>
<div class="sourceCode" id="cb90"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb90-1"><a href="#cb90-1" aria-hidden="true" tabindex="-1"></a><span class="fu">hatvalues</span>(lm1) <span class="sc">%&gt;%</span> <span class="fu">pander</span>()</span></code></pre></div>
<table style="width:74%;">
<colgroup>
<col width="12%" />
<col width="12%" />
<col width="12%" />
<col width="11%" />
<col width="12%" />
<col width="12%" />
</colgroup>
<thead>
<tr class="header">
<th align="center">1</th>
<th align="center">2</th>
<th align="center">3</th>
<th align="center">4</th>
<th align="center">5</th>
<th align="center">6</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center">0.3869</td>
<td align="center">0.2939</td>
<td align="center">0.1839</td>
<td align="center">0.167</td>
<td align="center">0.2093</td>
<td align="center">0.759</td>
</tr>
</tbody>
</table>
<p>Or, graphically depicted by <code>plot(lmObject, which=5)</code></p>
<div class="sourceCode" id="cb91"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb91-1"><a href="#cb91-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(lm1, <span class="at">which=</span><span class="dv">5</span>)</span></code></pre></div>
<p><img src="LinearRegression_files/figure-html/unnamed-chunk-91-1.png" alt="Residuals versus leverage diagnostic plot for the regression lm1." width="672" /></p>
<p>Points with “lots of leverage” and a large “Cook’s Distance” are
points that should be investigated for accuracy and possibly removed (or
downweighted) in the regression.</p>
</div>
</div>
<p><br /></p>
</div>
<div id="inference-for-the-model-parameters-expand-1"
class="section level4">
<h4>Inference for the Model Parameters
<a href="javascript:showhide('inferenceMultiple')" style="font-size:.6em;color:skyblue;">(Expand)</a></h4>
<p><span class="expand-caption">t Tests and F tests in multiple
regression…</span></p>
<div id="inferenceMultiple" style="display:none;">
<p>Inference in the multiple regression model can be for any of the
model coefficients, <span class="math inline">\(\beta_0\)</span>, <span
class="math inline">\(\beta_1\)</span>, <span
class="math inline">\(\ldots\)</span>, <span
class="math inline">\(\beta_p\)</span> or for several coefficients
simultaneously.</p>
<p><br /></p>
<div id="t-tests" class="section level5">
<h5>t Tests</h5>
<p>The most typical tests for multiple regression are t Tests for a
single coefficient. The hypotheses for these t Tests are written as
<span class="math display">\[
  H_0: \beta_j = 0
\]</span> <span class="math display">\[
  H_a: \beta_j \neq 0
\]</span> Note that these hypotheses assume that all other variables
(and coefficients) are already in the model. The significance of the
single variable is thus assessed after accounting for the effect of all
other variables. If a t Test of a single coefficient is significant,
then that variable should remain in the model. If the t Test for a
single coefficient is not significant, then the other variables in the
model provide the same information that the variable being tested
provides. Removing it from the model may be appropriate. However,
whenever a single variable is removed from the model the other variables
can change in their significance.</p>
<p><br /></p>
</div>
<div id="f-tests" class="section level5">
<h5>F Tests</h5>
<p>Another approach to testing hypotheses about coefficients is to use
an F Test. The F Test allows a single test for any group of hypotheses
simultaneously.</p>
<p>The most commonly used F Test is the one given by the hypotheses
<span class="math display">\[
  H_0: \beta_1 = \cdots = \beta_p = 0
\]</span> <span class="math display">\[
  H_a: \beta_j \neq 0 \ \text{for at least one}\ j \in \{1,\ldots,p\}
\]</span> However, any subset of coefficients could be tested in a
similar way using a customized F Test. The details of how to do this are
somewhat involved and are beyond the scope of this class.</p>
</div>
</div>
<p><br /> <br /></p>
<hr />
</div>
</div>
</div>
</div>
<div id="section-3" class="section level2">
<h2></h2>
<div style="padding-left:125px;">
<p><strong>Examples:</strong> <a
href="./Analyses/Linear%20Regression/Examples/CivicVsCorollaMLR.html">Civic
Vs Corolla</a> <a
href="./Analyses/Linear%20Regression/Examples/cadillacsMLR.html">cadillacs</a></p>
</div>
<hr />
<footer>
</footer>
</div>


</div>

<script>

// Give pandoc tables the bootstrap "table" and "table-condensed" classes.
// A table is matched when it is the direct parent of a <tbody> that is the
// direct parent of at least one <tr class="odd">.
function bootstrapStylePandocTables() {
  var oddRows = $('tr.odd');
  var tableBodies = oddRows.parent('tbody');
  var pandocTables = tableBodies.parent('table');
  pandocTables.addClass('table table-condensed');
}
// Apply the bootstrap table classes once the document is ready.
$(document).ready(bootstrapStylePandocTables);


</script>

<!-- tabsets -->

<script>
// Once the document is ready, build the tabsets, passing "TOC" through to
// window.buildTabsets (presumably the id of the table-of-contents element;
// confirm against tabsets.js).
$(document).ready(function initTabsets() {
  var tocId = "TOC";
  window.buildTabsets(tocId);
});

// Once the document is ready, make each item of a dropdown-style tabset
// toggle the 'nav-tabs-open' class on its parent list when clicked.
$(document).ready(function () {
  var dropdownItems = $('.tabset-dropdown > .nav-tabs > li');
  dropdownItems.on('click', function toggleTabList() {
    var tabList = $(this).parent();
    tabList.toggleClass('nav-tabs-open');
  });
});
</script>

<!-- code folding -->
<script>
// Once the document is ready, initialise code folding.
$(document).ready(function () {
  // Template-generated comparison; as written here it evaluates to false.
  // NOTE(review): presumably controls whether code starts out shown;
  // confirm against codefolding.js.
  var foldingFlag = "hide" === "show";
  window.initializeCodeFolding(foldingFlag);
});
</script>


<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  // Create a <script> element pointing at the hosted MathJax build and
  // append it to the document's <head> so it is loaded at runtime.
  (function loadMathJax() {
    var mathJaxTag = document.createElement("script");
    var headElement = document.getElementsByTagName("head")[0];
    mathJaxTag.type = "text/javascript";
    mathJaxTag.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    headElement.appendChild(mathJaxTag);
  }());
</script>

</body>
</html>