diff --git a/.nojekyll b/.nojekyll index a502733..fc8afbc 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -eb45d49f \ No newline at end of file +da494773 \ No newline at end of file diff --git a/content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html b/content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html index 586500d..86367af 100644 --- a/content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html +++ b/content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html @@ -580,7 +580,7 @@

Table of contents

-

13  Exercise: Regression

+

13  Exercise: Regression

@@ -595,6 +595,10 @@

+
import warnings
+warnings.filterwarnings('ignore')
+

Now it’s your turn to prepare a linear regression model.

13.1 Scikit Learn

@@ -608,21 +612,24 @@

13.3 Reading Data

-
import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import seaborn as sns
+
import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
-
wine = pd.read_excel('data/raw/winequality-red_v2.xlsx', engine = 'openpyxl')
+
wine = pd.read_excel('data/raw/winequality-red_v2.xlsx', engine = 'openpyxl')
+
+#You might need to use encoding, then the code will look like:
+# wine = pd.read_excel('data/raw/winequality-red_v2.xlsx', engine = 'openpyxl', encoding='UTF-8')

13.4 Data exploration

Let’s check the data, their distributions and central tendencies.

-
print('shape:', wine.shape)
-wine.head()
+
print('shape:', wine.shape)
+wine.head()
shape: (1599, 12)
@@ -735,16 +742,16 @@

Use the lmplot() function from Seaborn to explore linear relationships. Input data must be in a Pandas DataFrame. To plot, we provide the predictor and response variable names along with the dataset.
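A minimal sketch of such a call is below; 'alcohol' and 'quality' are only assumed example columns from the wine dataframe, so swap in the variables you actually want to explore.

# hypothetical example columns - replace with your chosen predictor and response
sns.lmplot(data = wine, x = 'alcohol', y = 'quality')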

Did you find outliers or missing data? You can use the function np.unique to find the unique elements of an array.

-
?np.unique
+
?np.unique
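For example, a quick check might look like the sketch below ('quality' is an assumed example column); isnull() is another direct way to count missing values.

# unique values can reveal odd codings; isnull().sum() counts missing values per column
print(np.unique(wine['quality'].values))
print(wine.isnull().sum())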

Do you need to remove any cases?

-
 
+
 

Did you need to standardise the data?

If you standardised the data, try to plot it again.
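If you decide standardisation is needed, one possible sketch uses scikit-learn's StandardScaler to z-transform every column (whether this is appropriate is part of the exercise):

from sklearn.preprocessing import StandardScaler

# z-transform each column: subtract the mean and divide by the standard deviation
scaler = StandardScaler()
wine_std = pd.DataFrame(scaler.fit_transform(wine), columns = wine.columns)
wine_std.head()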

-
 
+
 

@@ -757,47 +764,47 @@

-
import scipy.stats
-scipy.stats.pearsonr(wine.???.values, wine.???.values)
+
import scipy.stats
+scipy.stats.pearsonr(wine.???.values, wine.???.values)
SyntaxError: invalid syntax (987973612.py, line 2)
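A completed call might look like the sketch below, where 'alcohol' and 'quality' are assumed example columns standing in for the ??? placeholders:

import scipy.stats
# returns the correlation coefficient and its p-value
scipy.stats.pearsonr(wine.alcohol.values, wine.quality.values)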

Using Scikit-learn, build a simple linear regression (OLS) model.

-
from sklearn.linear_model import LinearRegression
-
-est = LinearRegression(fit_intercept = True)
-
-x = wine[['???']]
-y = wine[['???']]
-
-est.fit(x, y)
-
-print("Coefficients:", est.coef_)
-print ("Intercept:", est.intercept_)
+
from sklearn.linear_model import LinearRegression
+
+est = LinearRegression(fit_intercept = True)
+
+x = wine[['???']]
+y = wine[['???']]
+
+est.fit(x, y)
+
+print("Coefficients:", est.coef_)
+print ("Intercept:", est.intercept_)
KeyError: "None of [Index(['???'], dtype='object')] are in the [columns]"

What is the model’s mean squared error (\(MSE\)) and the coefficient of determination (\(R^2\))?

-
from sklearn import metrics
-
-# Analysis for all months together.
-x = wdi[['???']]
-y = wdi[['???']]
-model = LinearRegression()
-model.fit(x, y)
-y_hat = model.predict(x)
-plt.plot(x, y,'o', alpha = 0.5)
-plt.plot(x, y_hat, 'r', alpha = 0.5)
-plt.xlabel('?')
-plt.ylabel('?')
-print ("MSE:", metrics.mean_squared_error(y_hat, y))
-print ("R^2:", metrics.r2_score(y_hat, y))
-print ("var:", y.var())
-plt.savefig("?.png", dpi = 300, bbox_inches = 'tight')
+
from sklearn import metrics
+
+# Analysis for all months together.
+x = wdi[['???']]
+y = wdi[['???']]
+model = LinearRegression()
+model.fit(x, y)
+y_hat = model.predict(x)
+plt.plot(x, y,'o', alpha = 0.5)
+plt.plot(x, y_hat, 'r', alpha = 0.5)
+plt.xlabel('?')
+plt.ylabel('?')
+print ("MSE:", metrics.mean_squared_error(y_hat, y))
+print ("R^2:", metrics.r2_score(y_hat, y))
+print ("var:", y.var())
+plt.savefig("?.png", dpi = 300, bbox_inches = 'tight')
NameError: name 'wdi' is not defined
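The cell above refers to a dataframe called wdi, which is presumably meant to be wine, hence the NameError. A completed version might look like the sketch below, with 'alcohol' and 'quality' again used as assumed example columns:

# hypothetical completion of the exercise cell - wdi replaced with wine
x = wine[['alcohol']]
y = wine[['quality']]
model = LinearRegression()
model.fit(x, y)
y_hat = model.predict(x)
plt.plot(x, y, 'o', alpha = 0.5)
plt.plot(x, y_hat, 'r', alpha = 0.5)
plt.xlabel('alcohol')
plt.ylabel('quality')
print("MSE:", metrics.mean_squared_error(y, y_hat))
print("R^2:", metrics.r2_score(y, y_hat))  # true values first, predictions second
print("var:", y.var())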
diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris.html b/content/labs/Lab_4/IM939_Lab_4_1_Iris.html index b8cde03..380675c 100644 --- a/content/labs/Lab_4/IM939_Lab_4_1_Iris.html +++ b/content/labs/Lab_4/IM939_Lab_4_1_Iris.html @@ -1651,13 +1651,13 @@

k_means.labels_
-
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-       1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,
-       0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0,
-       0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2], dtype=int32)
+
array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+       2, 2, 2, 2, 2, 2, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
+       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
+       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0], dtype=int32)

Each row has been assigned a label.

@@ -1689,7 +1689,7 @@

1 @@ -1697,7 +1697,7 @@

2 @@ -1705,7 +1705,7 @@

3 @@ -1713,7 +1713,7 @@

4 @@ -1721,7 +1721,7 @@

... @@ -1737,7 +1737,7 @@

146 @@ -1745,7 +1745,7 @@

147 @@ -1753,7 +1753,7 @@

148 @@ -1761,7 +1761,7 @@

149 @@ -1769,7 +1769,7 @@

k_means.cluster_centers_
-
array([[0.70726496, 0.4508547 , 0.79704476, 0.82478632],
-       [0.19611111, 0.595     , 0.07830508, 0.06083333],
-       [0.44125683, 0.30737705, 0.57571548, 0.54918033]])
+
array([[0.44125683, 0.30737705, 0.57571548, 0.54918033],
+       [0.70726496, 0.4508547 , 0.79704476, 0.82478632],
+       [0.19611111, 0.595     , 0.07830508, 0.06083333]])

It is tricky to plot these using seaborn but we can use a normal matplotlib scatter plot.

@@ -1851,7 +1851,7 @@

alpha = 1, color = 'black' )
-
<matplotlib.collections.PathCollection at 0x14fe387d0>
+
<matplotlib.collections.PathCollection at 0x162e90390>

@@ -1914,9 +1914,9 @@

0.625000 0.067797 0.041667 +2 1 -1 -4 +3 1 @@ -1924,9 +1924,9 @@

0.416667 0.067797 0.041667 +2 1 1 -0 2 @@ -1934,9 +1934,9 @@

0.500000 0.050847 0.041667 +2 1 1 -0 3 @@ -1944,9 +1944,9 @@

0.458333 0.084746 0.041667 +2 1 1 -0 4 @@ -1954,9 +1954,9 @@

0.666667 0.067797 0.041667 +2 1 -1 -4 +3 ... @@ -1974,9 +1974,9 @@

0.416667 0.711864 0.916667 +1 0 -0 -2 +4 146 @@ -1984,9 +1984,9 @@

0.208333 0.677966 0.750000 -2 0 -3 +0 +2 147 @@ -1994,9 +1994,9 @@

0.416667 0.711864 0.791667 +1 0 -0 -3 +2 148 @@ -2004,9 +2004,9 @@

0.583333 0.745763 0.916667 +1 0 -0 -2 +4 149 @@ -2014,9 +2014,9 @@

0.416667 0.694915 0.708333 -2 0 -3 +0 +2 @@ -2041,7 +2041,7 @@

k_means_5.inertia_
-
4.58977540011789
+
4.580948640117293

It looks like our k = 5 model captures the data well. Inertia, according to the sklearn documentation, is the Sum of squared distances of samples to their closest cluster center.
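As a sanity check, inertia can be computed by hand; the sketch below assumes iris_df holds the same normalised feature values the model was fitted on.

import numpy as np

# sum of squared distances from each point to its assigned cluster centre
X = iris_df.iloc[:, 0:4].values
centres = k_means_5.cluster_centers_
labels = k_means_5.labels_
print(np.sum((X - centres[labels]) ** 2))  # should be close to k_means_5.inertia_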

@@ -2315,9 +2315,9 @@

< 0.625000 0.067797 0.041667 +2 1 -1 -4 +3 -0.630703 0.107578 @@ -2327,9 +2327,9 @@

< 0.416667 0.067797 0.041667 +2 1 1 -0 -0.622905 -0.104260 @@ -2339,9 +2339,9 @@

< 0.500000 0.050847 0.041667 +2 1 1 -0 -0.669520 -0.051417 @@ -2351,9 +2351,9 @@

< 0.458333 0.084746 0.041667 +2 1 1 -0 -0.654153 -0.102885 @@ -2363,9 +2363,9 @@

< 0.666667 0.067797 0.041667 +2 1 -1 -4 +3 -0.648788 0.133488 @@ -2387,9 +2387,9 @@

< 0.416667 0.711864 0.916667 +1 0 -0 -2 +4 0.551462 0.059841 @@ -2399,9 +2399,9 @@

< 0.208333 0.677966 0.750000 -2 0 -3 +0 +2 0.407146 -0.171821 @@ -2411,9 +2411,9 @@

< 0.416667 0.711864 0.791667 +1 0 -0 -3 +2 0.447143 0.037560 @@ -2423,9 +2423,9 @@

< 0.583333 0.745763 0.916667 +1 0 -0 -2 +4 0.488208 0.149678 @@ -2435,9 +2435,9 @@

< 0.416667 0.694915 0.708333 -2 0 -3 +0 +2 0.312066 -0.031130 @@ -2497,9 +2497,9 @@

< 0.625000 0.067797 0.041667 +2 1 -1 -4 +3 -0.630703 0.107578 @@ -2509,9 +2509,9 @@

< 0.416667 0.067797 0.041667 +2 1 1 -0 -0.622905 -0.104260 @@ -2521,9 +2521,9 @@

< 0.500000 0.050847 0.041667 +2 1 1 -0 -0.669520 -0.051417 @@ -2533,9 +2533,9 @@

< 0.458333 0.084746 0.041667 +2 1 1 -0 -0.654153 -0.102885 @@ -2545,9 +2545,9 @@

< 0.666667 0.067797 0.041667 +2 1 -1 -4 +3 -0.648788 0.133488 @@ -2569,9 +2569,9 @@

< 0.416667 0.711864 0.916667 +1 0 -0 -2 +4 0.551462 0.059841 @@ -2581,9 +2581,9 @@

< 0.208333 0.677966 0.750000 -2 0 -3 +0 +2 0.407146 -0.171821 @@ -2593,9 +2593,9 @@

< 0.416667 0.711864 0.791667 +1 0 -0 -3 +2 0.447143 0.037560 @@ -2605,9 +2605,9 @@

< 0.583333 0.745763 0.916667 +1 0 -0 -2 +4 0.488208 0.149678 @@ -2617,9 +2617,9 @@

< 0.416667 0.694915 0.708333 -2 0 -3 +0 +2 0.312066 -0.031130 @@ -2675,12 +2675,12 @@

0.625000 0.067797 0.041667 +2 1 -1 -4 +3 -0.630703 0.107578 -1 +0 1 @@ -2688,12 +2688,12 @@

0.416667 0.067797 0.041667 +2 1 1 -0 -0.622905 -0.104260 -1 +0 2 @@ -2701,12 +2701,12 @@

0.500000 0.050847 0.041667 +2 1 1 -0 -0.669520 -0.051417 -1 +0 3 @@ -2714,12 +2714,12 @@

0.458333 0.084746 0.041667 +2 1 1 -0 -0.654153 -0.102885 -1 +0 4 @@ -2727,12 +2727,12 @@

0.666667 0.067797 0.041667 +2 1 -1 -4 +3 -0.648788 0.133488 -1 +0 ... @@ -2753,12 +2753,12 @@

0.416667 0.711864 0.916667 +1 0 -0 -2 +4 0.551462 0.059841 -0 +1 146 @@ -2766,9 +2766,9 @@

0.208333 0.677966 0.750000 -2 0 -3 +0 +2 0.407146 -0.171821 2 @@ -2779,12 +2779,12 @@

0.416667 0.711864 0.791667 +1 0 -0 -3 +2 0.447143 0.037560 -0 +1 148 @@ -2792,12 +2792,12 @@

0.583333 0.745763 0.916667 +1 0 -0 -2 +4 0.488208 0.149678 -0 +1 149 @@ -2805,9 +2805,9 @@

0.416667 0.694915 0.708333 -2 0 -3 +0 +2 0.312066 -0.031130 2 @@ -2853,7 +2853,7 @@

plt.xticks(ks) plt.show()
-

+

Three seems ok. We clearly want no more than three.

@@ -2878,10 +2878,10 @@

df.isna().sum()
-
sepal length (cm)    32
-sepal width (cm)     34
-petal length (cm)    37
-petal width (cm)     29
+
sepal length (cm)    29
+sepal width (cm)     21
+petal length (cm)    32
+petal width (cm)     21
 dtype: int64
@@ -2905,14 +2905,14 @@

0 5.1 -NaN -NaN +3.5 +1.4 0.2 1 4.9 -NaN +3.0 NaN 0.2 @@ -2927,15 +2927,15 @@

3 4.6 3.1 -1.5 NaN +0.2 4 -5.0 NaN +3.6 1.4 -NaN +0.2 ... @@ -2947,28 +2947,28 @@

145 6.7 -3.0 +NaN 5.2 2.3 146 -NaN +6.3 2.5 -5.0 +NaN 1.9 147 6.5 -NaN -NaN +3.0 +5.2 2.0 148 -6.2 NaN +3.4 5.4 2.3 @@ -2976,8 +2976,8 @@

149 5.9 3.0 -5.1 -1.8 +NaN +NaN @@ -3014,14 +3014,14 @@

0 5.1 -0.0 -0.0 +3.5 +1.4 0.2 1 4.9 -0.0 +3.0 0.0 0.2 @@ -3036,15 +3036,15 @@

3 4.6 3.1 -1.5 0.0 +0.2 4 -5.0 0.0 +3.6 1.4 -0.0 +0.2 ... @@ -3056,28 +3056,28 @@

145 6.7 -3.0 +0.0 5.2 2.3 146 -0.0 +6.3 2.5 -5.0 +0.0 1.9 147 6.5 -0.0 -0.0 +3.0 +5.2 2.0 148 -6.2 0.0 +3.4 5.4 2.3 @@ -3085,8 +3085,8 @@

149 5.9 3.0 -5.1 -1.8 +0.0 +0.0 @@ -3123,20 +3123,20 @@

<Axes: xlabel='c1', ylabel='c2'>
-

+

df_1_pca.explained_variance_
-
array([6.71803744, 4.89376791])
+
array([6.24279356, 4.84811544])
df_1_pca.components_
-
array([[-0.91235845,  0.02968512, -0.38161438, -0.14522853],
-       [-0.39939351,  0.05086373,  0.90393389,  0.14422629]])
+
array([[-0.86129917,  0.04084996, -0.48641492, -0.14105157],
+       [-0.50682662, -0.04550418,  0.84286268,  0.175039  ]])
@@ -3167,37 +3167,37 @@

0 5.100000 -3.00431 -3.90885 +3.500000 +1.400000 0.200000 1 4.900000 -3.00431 -3.90885 +3.000000 +3.877119 0.200000 2 -5.866102 -3.20000 -1.30000 +5.839669 +3.200000 +1.300000 0.200000 3 4.600000 -3.10000 -1.50000 -1.210744 +3.100000 +3.877119 +0.200000 4 -5.000000 -3.00431 -1.40000 -1.210744 +5.839669 +3.600000 +1.400000 +0.200000 ... @@ -3209,37 +3209,37 @@

145 6.700000 -3.00000 -5.20000 +3.054264 +5.200000 2.300000 146 -5.866102 -2.50000 -5.00000 +6.300000 +2.500000 +3.877119 1.900000 147 6.500000 -3.00431 -3.90885 +3.000000 +5.200000 2.000000 148 -6.200000 -3.00431 -5.40000 +5.839669 +3.400000 +5.400000 2.300000 149 5.900000 -3.00000 -5.10000 -1.800000 +3.000000 +3.877119 +1.205426 @@ -3280,14 +3280,14 @@

df_2_pca.explained_variance_
-
array([2.68417915, 0.33506061])
+
array([3.01818399, 0.26633671])
df_2_pca.components_
-
array([[ 0.33775908, -0.04345744,  0.87824143,  0.33574133],
-       [ 0.82803166,  0.20108365, -0.42517727,  0.30521014]])
+
array([[ 0.31417904, -0.06487468,  0.88369345,  0.34083528],
+       [ 0.89110506,  0.17000084, -0.37665661,  0.18751344]])
diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-20-output-1.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-20-output-1.png index 134a3b4..77f52f6 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-20-output-1.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-20-output-1.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-21-output-2.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-21-output-2.png index 709c0ba..6b0d389 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-21-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-21-output-2.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-25-output-2.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-25-output-2.png index 3bc769f..313727e 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-25-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-25-output-2.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-29-output-1.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-29-output-1.png index a5fe247..9c9b7ae 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-29-output-1.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-29-output-1.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-47-output-2.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-47-output-2.png index 3932a6d..3b5f16f 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-47-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-47-output-2.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-52-output-2.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-52-output-2.png index 50599b1..a00bf08 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-52-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-52-output-2.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-53-output-1.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-53-output-1.png index 02e1ffe..e1c18d2 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-53-output-1.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-53-output-1.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-59-output-1.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-59-output-1.png index 4ed7452..348399d 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-59-output-1.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-59-output-1.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-60-output-2.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-60-output-2.png index a4f6bc3..a397ab7 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-60-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-60-output-2.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-65-output-1.png 
b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-65-output-1.png index b1c2cf2..8a89879 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-65-output-1.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-65-output-1.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-66-output-2.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-66-output-2.png index d496c2b..ed10bb5 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-66-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-66-output-2.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-15-output-2.png b/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-15-output-2.png index 4a6e529..2505d8a 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-15-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-15-output-2.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-16-output-1.png b/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-16-output-1.png index 878f660..af515ab 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-16-output-1.png and b/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-16-output-1.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-18-output-2.png b/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-18-output-2.png index c3d7f56..322439b 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-18-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-18-output-2.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-19-output-2.png b/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-19-output-2.png index f18e6d6..ed64a82 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-19-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-19-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1.html b/content/labs/Lab_5/IM939_Lab_5_1.html index 52b0099..ca70f26 100644 --- a/content/labs/Lab_5/IM939_Lab_5_1.html +++ b/content/labs/Lab_5/IM939_Lab_5_1.html @@ -115,11 +115,17 @@ "search-label": "Search" } } + + + - + + + + @@ -561,9 +567,12 @@

Table of contents

  • 21.1 Data Wrangling
  • 21.2 Cluster analysis
  • +
  • 21.3 Clusters and Ground Truth +
  • @@ -588,122 +597,512 @@

    21  Cortez et al. (2009) that we used in the past (Chapter 13) and you may be familiar with by now (but if you don’t, tou can find more information about it here: https://doi.org/10.24432/C56S3T).

    +

We are going to use the Wine Quality Dataset from Cortez et al. (2009) that you may be familiar with by now (but if you don’t, you can find more information about it here: https://doi.org/10.24432/C56S3T).

    21.1 Data Wrangling

    -

    As usual, we will start by looking at our data, and making transformations, if needed.

    -
    +
    import pandas as pd
     
    -df = pd.read_csv('data/wine.csv')
    -
    -df.head()
    +df = pd.read_csv('data/wine.csv')
    -
    -
    -
    - +

    Look at our data.

    +
    +
    df.head()
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Class labelAlcoholMalic acidAshAlcalinity of ashMagnesiumTotal phenolsFlavanoidsNonflavanoid phenolsProanthocyaninsColor intensityHueOD280/OD315 of diluted winesProline
    0114.231.712.4315.61272.803.060.282.295.641.043.921065
    1113.201.782.1411.21002.652.760.261.284.381.053.401050
    2113.162.362.6718.61012.803.240.302.815.681.033.171185
    3114.371.952.5016.81133.853.490.242.187.800.863.451480
    4113.242.592.8721.01182.802.690.391.824.321.042.93735
    +
    -
    -Tip
    -
    -

    There is a column called Class label that gives us the ground truth. The wines come from three different cultivars. Knowing the actual grouping helps us to identify how well our methods can capture this ground truth.

    -
    -
    -

    Following the data wrangling process that was summarised in Chapter 20, we should first get a sense of our data.

    -
    -
    df.describe()
    -
    -

As you can see, no variable has any missing data, but the scales of our features vary (e.g., Magnesium is in the 100s whereas Hue is in the low single digits).

    -

    Let’s visually inspect how features are distributed using a violin plot:

    -
    -
    import seaborn as sns
    -
    -df_long = df.melt(id_vars='Class label')
    -
    -sns.violinplot(data = df_long, x = 'variable', y = 'value')
    -
    -

Regrettably, this is not very useful right now, due to the different scales that we detected previously. In this case, it makes sense to normalise our data.

    -
    -
    from sklearn.preprocessing import MinMaxScaler
    -
    -# create a scaler object
    -scaler = MinMaxScaler()
    -
    -# fit and transform the data
    -df_norm = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)
    -
    -df_long = df_norm.melt(id_vars='Class label')
    -df_long
    -
    -
    -
    #create seaborn violin plot
    -my_plot = sns.violinplot(data = df_long, x = 'variable', y = 'value')
    -
    -#rotate x-axis labels
    -my_plot.set_xticklabels(my_plot.get_xticklabels(), rotation=90)
    +

    There is a column called Class label that gives us the ground truth. The wines come from three different cultivars. Knowing the actual grouping helps us to identify how well our methods can capture this ground truth.

    +

    Following our process above, we should first get a sense of our data.

    +
    +
    df.describe()
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Class labelAlcoholMalic acidAshAlcalinity of ashMagnesiumTotal phenolsFlavanoidsNonflavanoid phenolsProanthocyaninsColor intensityHueOD280/OD315 of diluted winesProline
    count178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000
    mean1.93820213.0006182.3363482.36651719.49494499.7415732.2951122.0292700.3618541.5908995.0580900.9574492.611685746.893258
    std0.7750350.8118271.1171460.2743443.33956414.2824840.6258510.9988590.1244530.5723592.3182860.2285720.709990314.907474
    min1.00000011.0300000.7400001.36000010.60000070.0000000.9800000.3400000.1300000.4100001.2800000.4800001.270000278.000000
    25%1.00000012.3625001.6025002.21000017.20000088.0000001.7425001.2050000.2700001.2500003.2200000.7825001.937500500.500000
    50%2.00000013.0500001.8650002.36000019.50000098.0000002.3550002.1350000.3400001.5550004.6900000.9650002.780000673.500000
    75%3.00000013.6775003.0825002.55750021.500000107.0000002.8000002.8750000.4375001.9500006.2000001.1200003.170000985.000000
    max3.00000014.8300005.8000003.23000030.000000162.0000003.8800005.0800000.6600003.58000013.0000001.7100004.0000001680.000000
    + +
    +
    +
    +

    No missing data. The scales of our features vary (e.g., Magnesium is in the 100s whereas Hue is in the low single digits).

    +

    How about our feature distributions?

    +
    +
    df_long = df.melt(id_vars='Class label')
    +
    +
    +
    import seaborn as sns
    +
    +sns.violinplot(data = df_long, x = 'variable', y = 'value')
    +
    +
    <Axes: xlabel='variable', ylabel='value'>
    +
    +
    +

    +
    +
    +

It makes sense to normalise our data.

    +
    +
    from sklearn.preprocessing import MinMaxScaler
    +
    +# create a scaler object
    +scaler = MinMaxScaler()
    +
    +# fit and transform the data
    +df_norm = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)
    +
    +df_long = df_norm.melt(id_vars='Class label')
    +df_long
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Class labelvariablevalue
    00.0Alcohol0.842105
    10.0Alcohol0.571053
    20.0Alcohol0.560526
    30.0Alcohol0.878947
    40.0Alcohol0.581579
    ............
    23091.0Proline0.329529
    23101.0Proline0.336662
    23111.0Proline0.397290
    23121.0Proline0.400856
    23131.0Proline0.201141
    + +

    2314 rows × 3 columns

    +
    +
    +
    +
    +
    #create seaborn violin plot
    +my_plot = sns.violinplot(data = df_long, x = 'variable', y = 'value')
    +
    +#rotate x-axis labels
    +my_plot.set_xticklabels(my_plot.get_xticklabels(), rotation=90)
    +
    +
    [Text(0, 0, 'Alcohol'),
    + Text(1, 0, 'Malic acid'),
    + Text(2, 0, 'Ash'),
    + Text(3, 0, 'Alcalinity of ash'),
    + Text(4, 0, 'Magnesium'),
    + Text(5, 0, 'Total phenols'),
    + Text(6, 0, 'Flavanoids'),
    + Text(7, 0, 'Nonflavanoid phenols'),
    + Text(8, 0, 'Proanthocyanins'),
    + Text(9, 0, 'Color intensity'),
    + Text(10, 0, 'Hue'),
    + Text(11, 0, 'OD280/OD315 of diluted wines'),
    + Text(12, 0, 'Proline ')]
    +
    +
    +

    +

    Are there any patterns?

    How about a pairplot?

    -
    -
    sns.pairplot(data = df_norm.iloc[:,1:])
    +
    +
    sns.pairplot(data = df_norm.iloc[:,1:])
    +
    +

    +

Hmm, a few interesting correlations. Some of our variables are skewed. We could apply some PCA here to look at fewer dimensions, or even log-transform some of the skewed variables.
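As a rough sketch of the log-transform idea (the chosen columns here are just illustrative examples, not a recommendation):

import numpy as np

# log1p handles zero values safely; applied only to a couple of skewed features
skewed_cols = ['Malic acid', 'Color intensity']
df_log = df_norm.copy()
df_log[skewed_cols] = np.log1p(df_log[skewed_cols])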

    21.2 Cluster analysis

For now we will just run a kmeans clustering and then check our results against the ground truth.

    -
    -

    21.2.1 Number of clusters

    +
    +

    21.2.1 Determining the number of clusters

Let’s decide how many clusters we need.

    -
    -
    from sklearn.cluster import KMeans
    -
    -ks = range(1, 10)
    -inertias = []
    -for k in ks:
    -    # Create a KMeans instance with k clusters: model
    -    model = KMeans(n_clusters=k, n_init = 10)
    -    
    -    # Fit model to samples
    -    model.fit(df.iloc[:,1:])
    -    
    -    # Append the inertia to the list of inertias
    -    inertias.append(model.inertia_)
    -
    -import matplotlib.pyplot as plt
    -
    -plt.plot(ks, inertias, '-o', color='black')
    -plt.xlabel('number of clusters, k')
    -plt.ylabel('inertia')
    -plt.xticks(ks)
    -plt.show()
    +
    +
    from sklearn.cluster import KMeans
    +
    +ks = range(1, 10)
    +inertias = []
    +for k in ks:
    +    # Create a KMeans instance with k clusters: model
    +    model = KMeans(n_clusters=k, n_init = 10)
    +    
    +    # Fit model to samples
    +    model.fit(df.iloc[:,1:])
    +    
    +    # Append the inertia to the list of inertias
    +    inertias.append(model.inertia_)
    +
    +import matplotlib.pyplot as plt
    +
    +plt.plot(ks, inertias, '-o', color='black')
    +plt.xlabel('number of clusters, k')
    +plt.ylabel('inertia')
    +plt.xticks(ks)
    +plt.show()
    +
    +

    +

    What happens if we use the normalised data instead?

    -
    -
    from sklearn.cluster import KMeans
    -
    -ks = range(1, 10)
    -inertias = []
    -for k in ks:
    -    # Create a KMeans instance with k clusters: model
    -    model = KMeans(n_clusters=k, n_init = 10)
    -    
    -    # Fit model to samples
    -    model.fit(df_norm.iloc[:,1:])
    -    
    -    # Append the inertia to the list of inertias
    -    inertias.append(model.inertia_)
    -
    -import matplotlib.pyplot as plt
    -
    -plt.plot(ks, inertias, '-o', color='black')
    -plt.xlabel('number of clusters, k')
    -plt.ylabel('inertia')
    -plt.xticks(ks)
    -plt.show()
    +
    +
    from sklearn.cluster import KMeans
    +
    +ks = range(1, 10)
    +inertias = []
    +for k in ks:
    +    # Create a KMeans instance with k clusters: model
    +    model = KMeans(n_clusters=k, n_init = 10)
    +    
    +    # Fit model to samples
    +    model.fit(df_norm.iloc[:,1:])
    +    
    +    # Append the inertia to the list of inertias
    +    inertias.append(model.inertia_)
    +
    +import matplotlib.pyplot as plt
    +
    +plt.plot(ks, inertias, '-o', color='black')
    +plt.xlabel('number of clusters, k')
    +plt.ylabel('inertia')
    +plt.xticks(ks)
    +plt.show()
    +
    +

    +
    @@ -719,39 +1118,324 @@

Three clusters seems about right (and matches our number of original labels).

    -
    -
    df['Class label'].value_counts()
    +
    +
    df['Class label'].value_counts()
    +
    +
    Class label
    +2    71
    +1    59
    +3    48
    +Name: count, dtype: int64
    +
    -
    -

    21.2.2 Calculate 3 clusters

    -

Now, we are going to calculate three clusters and store each observation’s cluster label in a new column of the original dataframe:

    -
    -
    # Create a KMeans instance with k clusters: model
    -k_means = KMeans(n_clusters=3)
    -
    -# Fit model to samples
    -df_k_means = k_means.fit(df.iloc[:,1:])
    -
    -# Create a new variable with the fited cluster label.
    -df['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:].values), index = df.index)
    -df
    +
    +

    21.2.2 Computing the clusters

    +
    +
    # Create a KMeans instance with k clusters: model
    +k_means = KMeans(n_clusters=3)
    +
    +# Fit model to samples
    +df_k_means = k_means.fit(df.iloc[:,1:])
    +
    +df['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:].values), index = df.index)
    +df
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Class labelAlcoholMalic acidAshAlcalinity of ashMagnesiumTotal phenolsFlavanoidsNonflavanoid phenolsProanthocyaninsColor intensityHueOD280/OD315 of diluted winesProlineThree clusters
    0114.231.712.4315.61272.803.060.282.295.641.043.9210651
    1113.201.782.1411.21002.652.760.261.284.381.053.4010501
    2113.162.362.6718.61012.803.240.302.815.681.033.1711851
    3114.371.952.5016.81133.853.490.242.187.800.863.4514801
    4113.242.592.8721.01182.802.690.391.824.321.042.937352
    ................................................
    173313.715.652.4520.5951.680.610.521.067.700.641.747402
    174313.403.912.4823.01021.800.750.431.417.300.701.567502
    175313.274.282.2620.01201.590.690.431.3510.200.591.568352
    176313.172.592.3720.01201.650.680.531.469.300.601.628402
    177314.134.102.7424.5962.050.760.561.359.200.611.605600
    + +

    178 rows × 15 columns

    +
    +
    -
    -

    21.2.3 Ground Truth Validation

    -

    Do our cluster labels match our ground truth? Did our cluster model capture reality?

    -
    -
    ct = pd.crosstab(df['Three clusters'], df['Class label'])
    -ct
    +
    +
    +

    21.3 Clusters and Ground Truth

    +

    Now that we have created three clusters, we may ask ourselves: Do our cluster labels match our ground truth? Did our cluster model capture reality?

    +
    +
    ct = pd.crosstab(df['Three clusters'], df['Class label'])
    +ct
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Class label123
    Three clusters
    005019
    14610
    2132029
    + +
    +

    It might be easier to see as a stacked plot (see this post).

    -
    -
    import matplotlib.pyplot as plt
    -import numpy as np
    -
    -ct.plot.bar(stacked=True)
    -plt.legend(title='Class label')
    +
    +
    import matplotlib.pyplot as plt
    +import numpy as np
    +
    +ct.plot.bar(stacked=True)
    +plt.legend(title='Class label')
    +
    +
    <matplotlib.legend.Legend at 0x1798f3e50>
    +
    +
    +

    +

    How has the kmeans model done compared to our ground truth?
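One way to quantify this agreement, not covered in the lab itself, is the adjusted Rand index, which compares two labellings while ignoring how the cluster numbers are permuted; a minimal sketch:

from sklearn.metrics import adjusted_rand_score

# 1.0 means perfect agreement, values near 0 mean roughly random assignment
print(adjusted_rand_score(df['Class label'], df['Three clusters']))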

    @@ -767,32 +1451,251 @@

    -
    df.iloc[:,1:14]
    +
    +

    21.3.1 Principal Components Analysis

    +

A way to overcome this ambiguity and evaluate the results is to look at visualisations of the results and compare. But this raises the question of what type of visualisation to use for looking at the clusters. An immediate option is to use scatterplots. However, it is not clear which axes to use for the clusters. A common method at this stage is to make use of PCA to get a 2D plane onto which we can project the data points and visualise them.

    +
    +
    df.iloc[:,1:14]
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    AlcoholMalic acidAshAlcalinity of ashMagnesiumTotal phenolsFlavanoidsNonflavanoid phenolsProanthocyaninsColor intensityHueOD280/OD315 of diluted winesProline
    014.231.712.4315.61272.803.060.282.295.641.043.921065
    113.201.782.1411.21002.652.760.261.284.381.053.401050
    213.162.362.6718.61012.803.240.302.815.681.033.171185
    314.371.952.5016.81133.853.490.242.187.800.863.451480
    413.242.592.8721.01182.802.690.391.824.321.042.93735
    ..........................................
    17313.715.652.4520.5951.680.610.521.067.700.641.74740
    17413.403.912.4823.01021.800.750.431.417.300.701.56750
    17513.274.282.2620.01201.590.690.431.3510.200.591.56835
    17613.172.592.3720.01201.650.680.531.469.300.601.62840
    17714.134.102.7424.5962.050.760.561.359.200.611.60560
    + +

    178 rows × 13 columns

    +
    +
    -
    -
    from sklearn.decomposition import PCA
    -
    -n_components = 2
    -
    -pca = PCA(n_components=n_components)
    -df_pca = pca.fit(df.iloc[:,1:14])
    -df_pca_vals = df_pca.transform(df.iloc[:,1:14])
    +
    +
    from sklearn.decomposition import PCA
    +
    +n_components = 2
    +
    +pca = PCA(n_components=n_components)
    +df_pca = pca.fit(df.iloc[:,1:14])
    +df_pca_vals = df_pca.transform(df.iloc[:,1:14])

    Grab our projections and plot along with our cluster names.

    -
    -
    df['c1'] = [item[0] for item in df_pca_vals]
    -df['c2'] = [item[1] for item in df_pca_vals]
    -
    -ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Class label')
    -ax.set_title('Known labels visualised over PCs')
    +
    +
    df['c1'] = [item[0] for item in df_pca_vals]
    +df['c2'] = [item[1] for item in df_pca_vals]
    +
    +ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Class label')
    +ax.set_title('Known labels visualised over PCs')
    +
    +
    Text(0.5, 1.0, 'Known labels visualised over PCs')
    +
    +
    +

    +

In the figure above, we coloured the points based on the actual labels. Comparing this with the algorithm’s results, we observe several misclassifications. So one may choose to use an alternative algorithm or devise a better distance metric.

    -
    -
    ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')
    -ax.set_title('Results of the algorithm visualised over PCs')
    +
    +
    ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')
    +ax.set_title('Results of the algorithm visualised over PCs')
    +
    +
    Text(0.5, 1.0, 'Results of the algorithm visualised over PCs')
    +
    +
    +

    +

This shows the parallelism between the clustering algorithm and PCA. By looking at the PCA loadings, we can find out what the x-axis means and try to interpret the clusters (we leave this as an additional exercise for those interested).
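A minimal sketch for that exercise, assuming df_pca and df from above, is to put the loadings into a labelled dataframe so each component can be read against the original features:

# rows are the 13 wine features, columns are the two principal components
loadings = pd.DataFrame(df_pca.components_.T, columns = ['c1', 'c2'], index = df.columns[1:14])
loadings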

How might you interpret the above plots? Did the kmeans model identify the ground truth?

    @@ -803,32 +1706,44 @@

    -
    # Create a KMeans instance with k clusters: model
    -k_means = KMeans(n_clusters=3, init='random', n_init = 10)
    -
    -# Fit model to samples
    -df_k_means = k_means.fit(df.iloc[:,1:14])
    -
    -df['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:14].values), index = df.index)
    -
    -ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')
    -ax.set_title('Results of the algorithm visualised over PCs')
    +
    +
    # Create a KMeans instance with k clusters: model
    +k_means = KMeans(n_clusters=3, init='random', n_init = 10)
    +
    +# Fit model to samples
    +df_k_means = k_means.fit(df.iloc[:,1:14])
    +
    +df['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:14].values), index = df.index)
    +
    +ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')
    +ax.set_title('Results of the algorithm visualised over PCs')
    +
    +
    Text(0.5, 1.0, 'Results of the algorithm visualised over PCs')
    +
    +
    +

    +

    How about with only 80% of the data?

    -
    -
    df_sample = df.sample(frac=0.8, replace=False)
    -
    -# Create a KMeans instance with k clusters: model
    -k_means = KMeans(n_clusters=3, init='random', n_init = 10)
    -
    -# Fit model to samples
    -df_k_means = k_means.fit(df_sample.iloc[:,1:14])
    -
    -df_sample['Three clusters'] = pd.Series(df_k_means.predict(df_sample.iloc[:,1:14].values), index = df_sample.index)
    -
    -ax = sns.scatterplot(data = df_sample, x = 'c1', y = 'c2', hue = 'Three clusters')
    -ax.set_title('Results of the algorithm visualised over PCs')
    +
    +
    df_sample = df.sample(frac=0.8, replace=False)
    +
    +# Create a KMeans instance with k clusters: model
    +k_means = KMeans(n_clusters=3, init='random', n_init = 10)
    +
    +# Fit model to samples
    +df_k_means = k_means.fit(df_sample.iloc[:,1:14])
    +
    +df_sample['Three clusters'] = pd.Series(df_k_means.predict(df_sample.iloc[:,1:14].values), index = df_sample.index)
    +
    +ax = sns.scatterplot(data = df_sample, x = 'c1', y = 'c2', hue = 'Three clusters')
    +ax.set_title('Results of the algorithm visualised over PCs')
    +
    +
    Text(0.5, 1.0, 'Results of the algorithm visualised over PCs')
    +
    +
    +

    +

We may want to automate the process of resampling the data or rerunning the model, then perhaps plotting the different inertia values or creating different plots.
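A rough sketch of what that automation could look like (refit on repeated 80% samples and collect the inertia values to gauge stability):

from sklearn.cluster import KMeans

inertias = []
for i in range(10):
    # draw a fresh 80% sample and refit the model
    sample = df.sample(frac = 0.8, replace = False)
    km = KMeans(n_clusters = 3, init = 'random', n_init = 10)
    km.fit(sample.iloc[:, 1:14])
    inertias.append(km.inertia_)
print(inertias)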

Do you think our clustering algorithm is stable and provides similar results even when some data is removed or the initial values are random?

    diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-10-output-1.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-10-output-1.png index c686cad..0e88bbd 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-10-output-1.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-10-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-11-output-1.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-11-output-1.png index 33a1e9a..0b09c4f 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-11-output-1.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-11-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-12-output-1.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-12-output-1.png index dc44232..7d26839 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-12-output-1.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-12-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-19-output-2.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-19-output-2.png index f6f3c4d..f9c04c5 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-19-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-19-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-20-output-2.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-20-output-2.png index 4769a92..f6f3c4d 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-20-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-20-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-21-output-2.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-21-output-2.png index b326702..d880af7 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-21-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-21-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-22-output-2.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-22-output-2.png index f33922d..5fecad0 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-22-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-22-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-7-output-2.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-7-output-2.png index dd4eb86..c08f8b5 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-7-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-7-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_2.html b/content/labs/Lab_5/IM939_Lab_5_2.html index ce7016e..25803e1 100644 --- a/content/labs/Lab_5/IM939_Lab_5_2.html +++ b/content/labs/Lab_5/IM939_Lab_5_2.html @@ -103,7 +103,7 @@ - + @@ -565,11 +565,11 @@

    22  here.

    We are going to examine the data, fit and then cross-validate a regression model.

    -
    +
    import pandas as pd
     df = pd.read_csv('data/censusCrimeClean.csv')
     df.head()
    -
    +
    @@ -732,10 +732,10 @@

    22  +
    df_reg = df[['communityname', 'medIncome', 'ViolentCrimesPerPop']]
     df_reg
    -
    +
    @@ -823,28 +823,42 @@

    22  here).

    -
    +
    import seaborn as sns
     sns.jointplot(data = df[['medIncome', 'ViolentCrimesPerPop']], 
                   x = 'ViolentCrimesPerPop', 
                   y = 'medIncome', kind='reg',
                   marker = '.')
    -

    +

We may want to z-transform or log-transform these scores as they are heavily skewed.

    -
    +
    import numpy as np
     
     # some values are 0 so 0.1 is added to prevent log giving us infinity
     # there may be a better way to do this!
     df_reg.loc[:, 'ViolentCrimesPerPop_log'] = np.log(df_reg['ViolentCrimesPerPop'] + 0.1)
     df_reg.loc[:,'medIncome_log'] = np.log(df_reg['medIncome'] + 0.1)
    +
    +
    /var/folders/7v/zl9mv52s3ls94kntlt_l9ryh0000gq/T/ipykernel_13528/3488182522.py:5: SettingWithCopyWarning: 
    +A value is trying to be set on a copy of a slice from a DataFrame.
    +Try using .loc[row_indexer,col_indexer] = value instead
    +
    +See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
    +  df_reg.loc[:, 'ViolentCrimesPerPop_log'] = np.log(df_reg['ViolentCrimesPerPop'] + 0.1)
    +/var/folders/7v/zl9mv52s3ls94kntlt_l9ryh0000gq/T/ipykernel_13528/3488182522.py:6: SettingWithCopyWarning: 
    +A value is trying to be set on a copy of a slice from a DataFrame.
    +Try using .loc[row_indexer,col_indexer] = value instead
    +
    +See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
    +  df_reg.loc[:,'medIncome_log'] = np.log(df_reg['medIncome'] + 0.1)
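One way to avoid the SettingWithCopyWarning above, sketched here rather than prescribed, is to take an explicit copy when subsetting so the new columns are set on an independent dataframe rather than a view:

# .copy() gives an independent dataframe, so assigning new columns raises no warning
df_reg = df[['communityname', 'medIncome', 'ViolentCrimesPerPop']].copy()
df_reg['ViolentCrimesPerPop_log'] = np.log(df_reg['ViolentCrimesPerPop'] + 0.1)
df_reg['medIncome_log'] = np.log(df_reg['medIncome'] + 0.1)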
    -
    -
    df_reg
    -
    +
    +
    +
    df_reg
    +
    @@ -955,39 +969,39 @@

    22  -
    import seaborn as sns
    -sns.jointplot(data = df_reg[['medIncome_log', 'ViolentCrimesPerPop_log']], 
    -              x = 'ViolentCrimesPerPop_log', 
    -              y = 'medIncome_log', kind='reg',
    -              marker = '.')
    +
    +
    import seaborn as sns
    +sns.jointplot(data = df_reg[['medIncome_log', 'ViolentCrimesPerPop_log']], 
    +              x = 'ViolentCrimesPerPop_log', 
    +              y = 'medIncome_log', kind='reg',
    +              marker = '.')
    -

    +

    Is log transforming our variables the right thing to do here?

    Fit our regression to the log transformed data.

    -
    -
    import matplotlib.pyplot as plt
    -from sklearn.linear_model import LinearRegression
    -from sklearn import metrics
    -
    -x = df_reg[['ViolentCrimesPerPop_log']]
    -y = df_reg[['medIncome_log']]
    -
    -model = LinearRegression()
    -model.fit(x, y)
    -
    -y_hat = model.predict(x)
    -plt.plot(x, y,'o', alpha = 0.5)
    -plt.plot(x, y_hat, 'r', alpha = 0.5)
    -
    -plt.xlabel('Violent Crimes Per Population')
    -plt.ylabel('Median Income')
    -
    -print ("MSE:", metrics.mean_squared_error(y_hat, y))
    -print ("R^2:", metrics.r2_score(y, y_hat))
    -print ("var:", y.var())
    +
    +
    import matplotlib.pyplot as plt
    +from sklearn.linear_model import LinearRegression
    +from sklearn import metrics
    +
    +x = df_reg[['ViolentCrimesPerPop_log']]
    +y = df_reg[['medIncome_log']]
    +
    +model = LinearRegression()
    +model.fit(x, y)
    +
    +y_hat = model.predict(x)
    +plt.plot(x, y,'o', alpha = 0.5)
    +plt.plot(x, y_hat, 'r', alpha = 0.5)
    +
    +plt.xlabel('Violent Crimes Per Population')
    +plt.ylabel('Median Income')
    +
    +print ("MSE:", metrics.mean_squared_error(y_hat, y))
    +print ("R^2:", metrics.r2_score(y, y_hat))
    +print ("var:", y.var())
    MSE: 0.1531885348757034
     R^2: 0.22763497704356928
    @@ -995,27 +1009,27 @@ 

    22  -

    +

    Has our log transformation distorted the pattern in the data?

    -
    -
    x = df_reg[['ViolentCrimesPerPop']]
    -y = df_reg[['medIncome']]
    -
    -model = LinearRegression()
    -model.fit(x, y)
    -
    -y_hat = model.predict(x)
    -plt.plot(x, y,'o', alpha = 0.5)
    -plt.plot(x, y_hat, 'r', alpha = 0.5)
    -
    -plt.xlabel('Violent Crimes Per Population')
    -plt.ylabel('Median Income')
    -
    -print ("MSE:", metrics.mean_squared_error(y_hat, y))
    -print ("R^2:", metrics.r2_score(y, y_hat))
    -print ("var:", y.var())
    +
    +
    x = df_reg[['ViolentCrimesPerPop']]
    +y = df_reg[['medIncome']]
    +
    +model = LinearRegression()
    +model.fit(x, y)
    +
    +y_hat = model.predict(x)
    +plt.plot(x, y,'o', alpha = 0.5)
    +plt.plot(x, y_hat, 'r', alpha = 0.5)
    +
    +plt.xlabel('Violent Crimes Per Population')
    +plt.ylabel('Median Income')
    +
    +print ("MSE:", metrics.mean_squared_error(y_hat, y))
    +print ("R^2:", metrics.r2_score(y, y_hat))
    +print ("var:", y.var())
    MSE: 0.03592636778157073
     R^2: 0.17996313165549482
    @@ -1023,42 +1037,42 @@ 

    22  -

    +

    What is the relationship between violent crime and median income? Why might this be?

Assuming the log data is fine, have we overfit the model? Remember that a good model (one which accurately models the relationship between violent crimes per population and median income) needs to be robust when faced with new data.

K-fold cross validation splits the data into train and test subsets. We can then fit the regression to the training set and see how well it does on the test set.

    +
    +
    from sklearn.model_selection import KFold
    +
    +X = df_reg[['ViolentCrimesPerPop']]
    +y = df_reg[['medIncome']]
    +
    +# get four splits, Each split contains a 
    +# test series and a train series.
    +kf = KFold(n_splits=4)
    +
    -
    from sklearn.model_selection import KFold
    -
    -X = df_reg[['ViolentCrimesPerPop']]
    -y = df_reg[['medIncome']]
    -
    -# get four splits, Each split contains a 
    -# test series and a train series.
    -kf = KFold(n_splits=4)
    +
    # lists to store our statistics
    +r_vals = []
    +MSEs = []
    +medIncome_coef = []
    +
    +for train_index, test_index in kf.split(X):
    +    # fit our model and extract statistics
    +    model = LinearRegression()
    +    model.fit(X.iloc[train_index], y.iloc[train_index])
    +    y_hat = model.predict(X.iloc[test_index])
    +    
    +    MSEs.append(metrics.mean_squared_error(y.iloc[test_index], y_hat))
    +    medIncome_coef.append(model.coef_[0][0])
    +    r_vals.append(metrics.r2_score(y.iloc[test_index], y_hat))
    -
    # lists to store our statistics
    -r_vals = []
    -MSEs = []
    -medIncome_coef = []
    -
    -for train_index, test_index in kf.split(X):
    -    # fit our model and extract statistics
    -    model = LinearRegression()
    -    model.fit(X.iloc[train_index], y.iloc[train_index])
    -    y_hat = model.predict(X.iloc[test_index])
    -    
    -    MSEs.append(metrics.mean_squared_error(y.iloc[test_index], y_hat))
    -    medIncome_coef.append(model.coef_[0][0])
    -    r_vals.append(metrics.r2_score(y.iloc[test_index], y_hat))
    -
    -
    -
    data = {'MSE' : MSEs, 'medIncome coefficient' : medIncome_coef, 'r squared' : r_vals}
    -pd.DataFrame(data)
    -
    +
    data = {'MSE' : MSEs, 'medIncome coefficient' : medIncome_coef, 'r squared' : r_vals}
    +pd.DataFrame(data)
    +
    @@ -1104,15 +1118,15 @@

    22  here).

    -
    -
    from sklearn.model_selection import cross_val_score
    -x = df_reg[['ViolentCrimesPerPop']]
    -y = df_reg[['medIncome']]
    -
    -model = LinearRegression()
    -model.fit(x, y)
    -
    -print(cross_val_score(model, x, y, cv=4))
    +
    +
    from sklearn.model_selection import cross_val_score
    +x = df_reg[['ViolentCrimesPerPop']]
    +y = df_reg[['medIncome']]
    +
    +model = LinearRegression()
    +model.fit(x, y)
    +
    +print(cross_val_score(model, x, y, cv=4))
    [0.13047946 0.16281953 0.20013867 0.18240261]
    diff --git a/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-4-output-1.png b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-4-output-1.png new file mode 100644 index 0000000..34793f3 Binary files /dev/null and b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-4-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-5-output-1.png b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-5-output-1.png index a770677..01322ba 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-5-output-1.png and b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-5-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-7-output-1.png b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-7-output-1.png new file mode 100644 index 0000000..a326fa3 Binary files /dev/null and b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-7-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-8-output-1.png b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-8-output-1.png index 86d02e7..f099855 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-8-output-1.png and b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-8-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-9-output-2.png b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-9-output-2.png index d4c20a4..96d96c5 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-9-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-9-output-2.png differ diff --git a/search.json b/search.json index 27493df..1291918 100644 --- a/search.json +++ b/search.json @@ -501,7 +501,7 @@ "href": "content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html#reading-data", "title": "13  Exercise: Regression", "section": "13.3 Reading Data", - "text": "13.3 Reading Data\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\n\n\nwine = pd.read_excel('data/raw/winequality-red_v2.xlsx', engine = 'openpyxl')" + "text": "13.3 Reading Data\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\n\n\nwine = pd.read_excel('data/raw/winequality-red_v2.xlsx', engine = 'openpyxl')\n\n#You might need to use encoding, then the code will look like:\n# wine = pd.read_excel('data/raw/winequality-red_v2.xlsx', engine = 'openpyxl', encoding='UTF-8')" }, { "objectID": "content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html#data-exploration", @@ -606,14 +606,14 @@ "href": "content/labs/Lab_4/IM939_Lab_4_1_Iris.html#do-this-yourself-check-if-we-need-to-do-any-normalisation-for-this-case", "title": "16  Lab: Dimension Reduction", "section": "16.4 Do-this-yourself: Check if we need to do any normalisation for this case?", - "text": "16.4 Do-this-yourself: Check if we need to do any normalisation for this case?\nWe have already looked at how the data looks, what are the descriptive statistics look like, see if we need to do anything more?\n\nk_means = KMeans(n_clusters = 3, init = 'random', n_init = 10)\n\nFit our kmeans model to the data\n\nk_means.fit(iris)\n\nKMeans(init='random', n_clusters=3, n_init=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. 
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.KMeansKMeans(init='random', n_clusters=3, n_init=10)\n\n\nThe algorithm has assigned the a label to each row.\n\nk_means.labels_\n\narray([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,\n 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0,\n 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2], dtype=int32)\n\n\nEach row has been assigned a label.\nTo tidy things up we should put everything into a dataframe.\n\niris_df['Three clusters'] = pd.Series(k_means.predict(iris_df.values), index = iris_df.index)\n\n\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n1\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n1\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n1\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n1\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n1\n\n\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n0\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n2\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n0\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n0\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n2\n\n\n\n\n150 rows × 5 columns\n\n\n\n\nsns.pairplot(iris_df, hue = 'Three clusters')\n\n\n\n\nThat seems quite nice. We can also do individual plots if preferred.\n\nsns.scatterplot(data = iris_df, x = 'sepal length (cm)', y = 'petal width (cm)', hue = 'Three clusters')\n\n<Axes: xlabel='sepal length (cm)', ylabel='petal width (cm)'>\n\n\n\n\n\nK-means works by clustering the data around central points (often called centroids, means or cluster centers). 
We can extract the cluster centres from the kmeans object.\n\nk_means.cluster_centers_\n\narray([[0.70726496, 0.4508547 , 0.79704476, 0.82478632],\n [0.19611111, 0.595 , 0.07830508, 0.06083333],\n [0.44125683, 0.30737705, 0.57571548, 0.54918033]])\n\n\nIt is tricky to plot these using seaborn but we can use a normal maplotlib scatter plot.\nLet us grab the groups.\n\ngroup1 = iris_df[iris_df['Three clusters'] == 0]\ngroup2 = iris_df[iris_df['Three clusters'] == 1]\ngroup3 = iris_df[iris_df['Three clusters'] == 2]\n\nGrab the centroids\n\nimport pandas as pd\n\ncentres = k_means.cluster_centers_\n\ndata = {'x': [centres[0][0], centres[1][0], centres[2][0]],\n 'y': [centres[0][3], centres[1][3], centres[2][3]]}\n\ndf = pd.DataFrame (data, columns = ['x', 'y'])\n\nCreate the plot\n\nimport matplotlib.pyplot as plt\n\n# Plot each group individually\nplt.scatter(\n x = group1['sepal length (cm)'], \n y = group1['petal width (cm)'], \n alpha = 0.1, color = 'blue'\n)\n\nplt.scatter(\n x = group2['sepal length (cm)'], \n y = group2['petal width (cm)'], \n alpha = 0.1, color = 'orange'\n)\n\nplt.scatter(\n x = group3['sepal length (cm)'], \n y = group3['petal width (cm)'], \n alpha = 0.1, color = 'red'\n)\n\n# Plot cluster centres\nplt.scatter(\n x = df['x'], \n y = df['y'], \n alpha = 1, color = 'black'\n)\n\n<matplotlib.collections.PathCollection at 0x14fe387d0>\n\n\n\n\n\n\n16.4.1 Number of clusters\nWhat happens if we change the number of clusters?\nTwo groups\n\nk_means_2 = KMeans(n_clusters = 2, init = 'random', n_init = 10)\nk_means_2.fit(iris)\niris_df['Two clusters'] = pd.Series(k_means_2.predict(iris_df.iloc[:,0:4].values), index = iris_df.index)\n\nNote that I have added a new column to the iris dataframe called ‘cluster 2 means’ and pass only our origonal 4 columns to the predict function (hence me using .iloc[:,0:4]).\nHow do our groupings look now (without plotting the cluster column)?\n\nsns.pairplot(iris_df.loc[:, iris_df.columns != 'Three clusters'], hue = 'Two clusters')\n\n\n\n\nHmm, does the data have more than two groups in it?\nPerhaps we should try 5 clusters instead.\n\nk_means_5 = KMeans(n_clusters = 5, init = 'random', n_init = 10)\nk_means_5.fit(iris)\niris_df['Five clusters'] = pd.Series(k_means_5.predict(iris_df.iloc[:,0:4].values), index = iris_df.index)\n\nPlot without the columns called ‘cluster’ and ‘Two cluster’\n\nsns.pairplot(iris_df.loc[:, (iris_df.columns != 'Three clusters') & (iris_df.columns != 'Two clusters')], hue = 'Five clusters')\n\n\n\n\n\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\nTwo clusters\nFive clusters\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n1\n1\n4\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n1\n1\n0\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n1\n1\n0\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n1\n1\n0\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n1\n1\n4\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n0\n0\n2\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n2\n0\n3\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n0\n0\n3\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n0\n0\n2\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n2\n0\n3\n\n\n\n\n150 rows × 7 columns\n\n\n\nWhich did best?\n\nk_means.inertia_\n\n6.982216473785234\n\n\n\nk_means_2.inertia_\n\n12.127790750538193\n\n\n\nk_means_5.inertia_\n\n4.58977540011789\n\n\nIt looks like our k = 5 model captures the data well. 
Intertia, looking at the sklearn documentation as the Sum of squared distances of samples to their closest cluster center..\nIf you want to dive further into this then Real Python’s practical guide to K-Means Clustering is quite good." + "text": "16.4 Do-this-yourself: Check if we need to do any normalisation for this case?\nWe have already looked at how the data looks, what are the descriptive statistics look like, see if we need to do anything more?\n\nk_means = KMeans(n_clusters = 3, init = 'random', n_init = 10)\n\nFit our kmeans model to the data\n\nk_means.fit(iris)\n\nKMeans(init='random', n_clusters=3, n_init=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.KMeansKMeans(init='random', n_clusters=3, n_init=10)\n\n\nThe algorithm has assigned the a label to each row.\n\nk_means.labels_\n\narray([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n 2, 2, 2, 2, 2, 2, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,\n 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,\n 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0], dtype=int32)\n\n\nEach row has been assigned a label.\nTo tidy things up we should put everything into a dataframe.\n\niris_df['Three clusters'] = pd.Series(k_means.predict(iris_df.values), index = iris_df.index)\n\n\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n2\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n2\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n2\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n2\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n2\n\n\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n1\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n0\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n1\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n1\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n0\n\n\n\n\n150 rows × 5 columns\n\n\n\n\nsns.pairplot(iris_df, hue = 'Three clusters')\n\n\n\n\nThat seems quite nice. We can also do individual plots if preferred.\n\nsns.scatterplot(data = iris_df, x = 'sepal length (cm)', y = 'petal width (cm)', hue = 'Three clusters')\n\n<Axes: xlabel='sepal length (cm)', ylabel='petal width (cm)'>\n\n\n\n\n\nK-means works by clustering the data around central points (often called centroids, means or cluster centers). 
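As a quick illustration (a sketch, assuming the iris_df and k_means objects from above), each centre is simply the mean of the rows assigned to that cluster, so we can reproduce the centres by hand:

```python
# Sketch: a cluster centre is just the mean of the rows assigned to that cluster.
import numpy as np

manual_centres = np.array([
    iris_df.iloc[:, 0:4][k_means.labels_ == label].mean().values
    for label in np.unique(k_means.labels_)
])
print(manual_centres)   # should match k_means.cluster_centers_ up to small numerical differences
```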
We can extract the cluster centres from the kmeans object.\n\nk_means.cluster_centers_\n\narray([[0.44125683, 0.30737705, 0.57571548, 0.54918033],\n [0.70726496, 0.4508547 , 0.79704476, 0.82478632],\n [0.19611111, 0.595 , 0.07830508, 0.06083333]])\n\n\nIt is tricky to plot these using seaborn but we can use a normal maplotlib scatter plot.\nLet us grab the groups.\n\ngroup1 = iris_df[iris_df['Three clusters'] == 0]\ngroup2 = iris_df[iris_df['Three clusters'] == 1]\ngroup3 = iris_df[iris_df['Three clusters'] == 2]\n\nGrab the centroids\n\nimport pandas as pd\n\ncentres = k_means.cluster_centers_\n\ndata = {'x': [centres[0][0], centres[1][0], centres[2][0]],\n 'y': [centres[0][3], centres[1][3], centres[2][3]]}\n\ndf = pd.DataFrame (data, columns = ['x', 'y'])\n\nCreate the plot\n\nimport matplotlib.pyplot as plt\n\n# Plot each group individually\nplt.scatter(\n x = group1['sepal length (cm)'], \n y = group1['petal width (cm)'], \n alpha = 0.1, color = 'blue'\n)\n\nplt.scatter(\n x = group2['sepal length (cm)'], \n y = group2['petal width (cm)'], \n alpha = 0.1, color = 'orange'\n)\n\nplt.scatter(\n x = group3['sepal length (cm)'], \n y = group3['petal width (cm)'], \n alpha = 0.1, color = 'red'\n)\n\n# Plot cluster centres\nplt.scatter(\n x = df['x'], \n y = df['y'], \n alpha = 1, color = 'black'\n)\n\n<matplotlib.collections.PathCollection at 0x162e90390>\n\n\n\n\n\n\n16.4.1 Number of clusters\nWhat happens if we change the number of clusters?\nTwo groups\n\nk_means_2 = KMeans(n_clusters = 2, init = 'random', n_init = 10)\nk_means_2.fit(iris)\niris_df['Two clusters'] = pd.Series(k_means_2.predict(iris_df.iloc[:,0:4].values), index = iris_df.index)\n\nNote that I have added a new column to the iris dataframe called ‘cluster 2 means’ and pass only our origonal 4 columns to the predict function (hence me using .iloc[:,0:4]).\nHow do our groupings look now (without plotting the cluster column)?\n\nsns.pairplot(iris_df.loc[:, iris_df.columns != 'Three clusters'], hue = 'Two clusters')\n\n\n\n\nHmm, does the data have more than two groups in it?\nPerhaps we should try 5 clusters instead.\n\nk_means_5 = KMeans(n_clusters = 5, init = 'random', n_init = 10)\nk_means_5.fit(iris)\niris_df['Five clusters'] = pd.Series(k_means_5.predict(iris_df.iloc[:,0:4].values), index = iris_df.index)\n\nPlot without the columns called ‘cluster’ and ‘Two cluster’\n\nsns.pairplot(iris_df.loc[:, (iris_df.columns != 'Three clusters') & (iris_df.columns != 'Two clusters')], hue = 'Five clusters')\n\n\n\n\n\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\nTwo clusters\nFive clusters\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n2\n1\n3\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n2\n1\n1\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n2\n1\n1\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n2\n1\n1\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n2\n1\n3\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n1\n0\n4\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n0\n0\n2\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n1\n0\n2\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n1\n0\n4\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n0\n0\n2\n\n\n\n\n150 rows × 7 columns\n\n\n\nWhich did best?\n\nk_means.inertia_\n\n6.982216473785234\n\n\n\nk_means_2.inertia_\n\n12.127790750538193\n\n\n\nk_means_5.inertia_\n\n4.580948640117293\n\n\nIt looks like our k = 5 model captures the data well. 
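The three .inertia_ values we just compared can also be reproduced by hand, which makes the definition below concrete. A small sketch, assuming the fitted k_means_5 model and the normalised features in iris_df:

```python
# Sketch: inertia is the sum of squared distances from each point to its assigned (closest) centre.
import numpy as np

X = iris_df.iloc[:, 0:4].values
assigned_centres = k_means_5.cluster_centers_[k_means_5.labels_]   # centre assigned to each row
print(((X - assigned_centres) ** 2).sum())   # should be very close to k_means_5.inertia_
```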
Intertia, looking at the sklearn documentation as the Sum of squared distances of samples to their closest cluster center..\nIf you want to dive further into this then Real Python’s practical guide to K-Means Clustering is quite good." }, { "objectID": "content/labs/Lab_4/IM939_Lab_4_1_Iris.html#principal-component-analysis-pca", "href": "content/labs/Lab_4/IM939_Lab_4_1_Iris.html#principal-component-analysis-pca", "title": "16  Lab: Dimension Reduction", "section": "16.5 Principal Component Analysis (PCA)", - "text": "16.5 Principal Component Analysis (PCA)\nPCA reduces the dimension of our data. The method derives point in an n dimentional space from our data which are uncorrelated.\nTo carry out a PCA on our Iris dataset where there are only two dimensions.\n\nfrom sklearn.decomposition import PCA\n\nn_components = 2\n\npca = PCA(n_components=n_components)\niris_pca = pca.fit(iris_df.iloc[:,0:4])\n\nWe can look at the components.\n\niris_pca.components_\n\narray([[ 0.42494212, -0.15074824, 0.61626702, 0.64568888],\n [ 0.42320271, 0.90396711, -0.06038308, -0.00983925]])\n\n\nThese components are intersting. You may want to look at a PennState article on interpreting PCA components.\nOur second column, ‘sepal width (cm)’ is positively correlated with our second principle component whereas the first column ‘sepal length (cm)’ is postively correlated with both.\nYou may want to consider:\n\nDo we need more than two components?\nIs it useful to keep sepal length (cm) in the dataset?\n\nWe can also examine the explained variance of the each principle component.\n\niris_pca.explained_variance_\n\narray([0.23245325, 0.0324682 ])\n\n\nA nice worked example showing the link between the explained variance and the component is here.\nOur first principle component explains a lot more of the variance of data then the second.\nAnother way to explore these indicators is to look at the explained_variance_ratio_ values. These present a similar information but provide them as percentage values so they are easier to interpret. You can also create a plot and see how these percentages add up. In this case, the first two components add up to 0.96. Which means the first two features are able to represent around 96% of the variation in the data, not bad. These values are not always this high.\nA high value that is close to 100% means that the PCA is able to represent much of the variance and they will be good representations of the data without losing a lot of that variance in the underlying features. 
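To see where those percentages come from (a small check that is an addition to the lab, assuming iris_pca and iris_df from above): each ratio is simply a component's explained variance divided by the total variance of the four original features.

```python
# Sketch: explained_variance_ratio_ = explained_variance_ / total variance of the data.
total_variance = iris_df.iloc[:, 0:4].var().sum()    # pandas .var() uses ddof=1, matching sklearn
print(iris_pca.explained_variance_ / total_variance)
print(iris_pca.explained_variance_ratio_)            # the two printouts should agree
```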
This of course is based on an assumption that variance is a good proxy about how informative a feature is.\n\niris_pca.explained_variance_ratio_\n\narray([0.84136038, 0.11751808])\n\n\n\nplt.plot(np.cumsum(pca.explained_variance_ratio_))\nplt.xlabel('number of components')\nplt.ylabel('cumulative explained variance');\n\n\n\n\n\n16.5.1 Dimension reduction\nFor our purposes, we are interested in using PCA for reducing the number of dimension in our data whilst preseving the maximal data variance.\nWe can extract the projected components from the model.\n\niris_pca_vals = pca.fit_transform(iris_df.iloc[:,0:4])\n\nThe numpy arrays contains the projected values.\n\ntype(iris_pca_vals)\n\nnumpy.ndarray\n\n\n\niris_pca_vals\n\narray([[-6.30702931e-01, 1.07577910e-01],\n [-6.22904943e-01, -1.04259833e-01],\n [-6.69520395e-01, -5.14170597e-02],\n [-6.54152759e-01, -1.02884871e-01],\n [-6.48788056e-01, 1.33487576e-01],\n [-5.35272778e-01, 2.89615724e-01],\n [-6.56537790e-01, 1.07244911e-02],\n [-6.25780499e-01, 5.71335411e-02],\n [-6.75643504e-01, -2.00703283e-01],\n [-6.45644619e-01, -6.72080097e-02],\n [-5.97408238e-01, 2.17151953e-01],\n [-6.38943190e-01, 3.25988375e-02],\n [-6.61612593e-01, -1.15605495e-01],\n [-7.51967943e-01, -1.71313322e-01],\n [-6.00371589e-01, 3.80240692e-01],\n [-5.52157227e-01, 5.15255982e-01],\n [-5.77053593e-01, 2.93709492e-01],\n [-6.03799228e-01, 1.07167941e-01],\n [-5.20483461e-01, 2.87627289e-01],\n [-6.12197555e-01, 2.19140388e-01],\n [-5.57674300e-01, 1.02109180e-01],\n [-5.79012675e-01, 1.81065123e-01],\n [-7.37784662e-01, 9.05588211e-02],\n [-5.06093857e-01, 2.79470846e-02],\n [-6.07607579e-01, 2.95285112e-02],\n [-5.90210587e-01, -9.45510863e-02],\n [-5.61527888e-01, 5.52901611e-02],\n [-6.08453780e-01, 1.18310099e-01],\n [-6.12617807e-01, 8.16682448e-02],\n [-6.38184784e-01, -5.44873860e-02],\n [-6.20099660e-01, -8.03970516e-02],\n [-5.24757301e-01, 1.03336126e-01],\n [-6.73044544e-01, 3.44711846e-01],\n [-6.27455379e-01, 4.18257508e-01],\n [-6.18740916e-01, -6.76179787e-02],\n [-6.44553756e-01, -1.51267253e-02],\n [-5.93932344e-01, 1.55623876e-01],\n [-6.87495707e-01, 1.22141914e-01],\n [-6.92369885e-01, -1.62014545e-01],\n [-6.13976551e-01, 6.88891719e-02],\n [-6.26048380e-01, 9.64357527e-02],\n [-6.09693996e-01, -4.14325957e-01],\n [-7.04932239e-01, -8.66839521e-02],\n [-5.14001659e-01, 9.21355196e-02],\n [-5.43513037e-01, 2.14636651e-01],\n [-6.07805187e-01, -1.16425433e-01],\n [-6.28656055e-01, 2.18526915e-01],\n [-6.70879139e-01, -6.41961326e-02],\n [-6.09212186e-01, 2.05396323e-01],\n [-6.29944525e-01, 2.04916869e-02],\n [ 2.79951766e-01, 1.79245790e-01],\n [ 2.15141376e-01, 1.10348921e-01],\n [ 3.22223106e-01, 1.27368010e-01],\n [ 5.94030131e-02, -3.28502275e-01],\n [ 2.62515235e-01, -2.95800761e-02],\n [ 1.03831043e-01, -1.21781742e-01],\n [ 2.44850362e-01, 1.33801733e-01],\n [-1.71529386e-01, -3.52976762e-01],\n [ 2.14230599e-01, 2.06607890e-02],\n [ 1.53249619e-02, -2.12494509e-01],\n [-1.13710323e-01, -4.93929201e-01],\n [ 1.37348380e-01, -2.06894998e-02],\n [ 4.39928190e-02, -3.06159511e-01],\n [ 1.92559767e-01, -3.95507760e-02],\n [-8.26091518e-03, -8.66610981e-02],\n [ 2.19485489e-01, 1.09383928e-01],\n [ 1.33272148e-01, -5.90267184e-02],\n [-5.75757060e-04, -1.42367733e-01],\n [ 2.54345249e-01, -2.89815304e-01],\n [-5.60800300e-03, -2.39572672e-01],\n [ 2.68168358e-01, 4.72705335e-02],\n [ 9.88208151e-02, -6.96420088e-02],\n [ 2.89086481e-01, -1.69157553e-01],\n [ 1.45033538e-01, -7.63961345e-02],\n [ 1.59287093e-01, 2.19853643e-04],\n [ 
2.13962718e-01, 5.99630005e-02],\n [ 2.91913782e-01, 4.04990109e-03],\n [ 3.69148997e-01, 6.43480720e-02],\n [ 1.86769115e-01, -4.96694916e-02],\n [-6.87697501e-02, -1.85648007e-01],\n [-2.15759776e-02, -2.87970157e-01],\n [-5.89248844e-02, -2.86536746e-01],\n [ 3.23412419e-02, -1.41140786e-01],\n [ 2.88906394e-01, -1.31550706e-01],\n [ 1.09664252e-01, -8.25379800e-02],\n [ 1.82266934e-01, 1.38247021e-01],\n [ 2.77724803e-01, 1.05903632e-01],\n [ 1.95615410e-01, -2.38550997e-01],\n [ 3.76839264e-02, -5.41130122e-02],\n [ 4.68406593e-02, -2.53171683e-01],\n [ 5.54365941e-02, -2.19190186e-01],\n [ 1.75833387e-01, -8.62037590e-04],\n [ 4.90676225e-02, -1.79829525e-01],\n [-1.53444261e-01, -3.78886428e-01],\n [ 6.69726607e-02, -1.68132343e-01],\n [ 3.30293747e-02, -4.29708545e-02],\n [ 6.62142547e-02, -8.10461198e-02],\n [ 1.35679197e-01, -2.32914079e-02],\n [-1.58634575e-01, -2.89139847e-01],\n [ 6.20502279e-02, -1.17687974e-01],\n [ 6.22771338e-01, 1.16807265e-01],\n [ 3.46009609e-01, -1.56291874e-01],\n [ 6.17986434e-01, 1.00519741e-01],\n [ 4.17789309e-01, -2.68903690e-02],\n [ 5.63621248e-01, 3.05994289e-02],\n [ 7.50122599e-01, 1.52133800e-01],\n [ 1.35857804e-01, -3.30462554e-01],\n [ 6.08945212e-01, 8.35018443e-02],\n [ 5.11020215e-01, -1.32575915e-01],\n [ 7.20608541e-01, 3.34580389e-01],\n [ 4.24135062e-01, 1.13914054e-01],\n [ 4.37723702e-01, -8.78049736e-02],\n [ 5.40793776e-01, 6.93466165e-02],\n [ 3.63226514e-01, -2.42764625e-01],\n [ 4.74246948e-01, -1.20676423e-01],\n [ 5.13932631e-01, 9.88816323e-02],\n [ 4.24670824e-01, 3.53096310e-02],\n [ 7.49026039e-01, 4.63778390e-01],\n [ 8.72194272e-01, 9.33798117e-03],\n [ 2.82963372e-01, -3.18443776e-01],\n [ 6.14733184e-01, 1.53566018e-01],\n [ 3.22133832e-01, -1.40500924e-01],\n [ 7.58030401e-01, 8.79453649e-02],\n [ 3.57235237e-01, -9.50568671e-02],\n [ 5.31036706e-01, 1.68539991e-01],\n [ 5.46962123e-01, 1.87812429e-01],\n [ 3.28704908e-01, -6.81237595e-02],\n [ 3.14783811e-01, -5.57223965e-03],\n [ 5.16585543e-01, -5.40299414e-02],\n [ 4.84826663e-01, 1.15348658e-01],\n [ 6.33043632e-01, 5.92290940e-02],\n [ 6.87490917e-01, 4.91179916e-01],\n [ 5.43489246e-01, -5.44399104e-02],\n [ 2.91133358e-01, -5.82085481e-02],\n [ 3.05410131e-01, -1.61757644e-01],\n [ 7.63507935e-01, 1.68186703e-01],\n [ 5.47805644e-01, 1.58976299e-01],\n [ 4.06585699e-01, 6.12192966e-02],\n [ 2.92534659e-01, -1.63044284e-02],\n [ 5.35871344e-01, 1.19790986e-01],\n [ 6.13864965e-01, 9.30029331e-02],\n [ 5.58343139e-01, 1.22041374e-01],\n [ 3.46009609e-01, -1.56291874e-01],\n [ 6.23819644e-01, 1.39763503e-01],\n [ 6.38651518e-01, 1.66900115e-01],\n [ 5.51461624e-01, 5.98413741e-02],\n [ 4.07146497e-01, -1.71820871e-01],\n [ 4.47142619e-01, 3.75600193e-02],\n [ 4.88207585e-01, 1.49677521e-01],\n [ 3.12066323e-01, -3.11303854e-02]])\n\n\nEach row corresponds to a row in our data.\n\niris_pca_vals.shape\n\n(150, 2)\n\n\n\niris_df.shape\n\n(150, 7)\n\n\nWe can add the component to our dataset. I prefer to keep everything in one table and it is not at all required. 
You can just assign the values whichever variables you prefer.\n\niris_df['c1'] = [item[0] for item in iris_pca_vals]\niris_df['c2'] = [item[1] for item in iris_pca_vals]\n\n\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\nTwo clusters\nFive clusters\nc1\nc2\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n1\n1\n4\n-0.630703\n0.107578\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n1\n1\n0\n-0.622905\n-0.104260\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n1\n1\n0\n-0.669520\n-0.051417\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n1\n1\n0\n-0.654153\n-0.102885\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n1\n1\n4\n-0.648788\n0.133488\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n0\n0\n2\n0.551462\n0.059841\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n2\n0\n3\n0.407146\n-0.171821\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n0\n0\n3\n0.447143\n0.037560\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n0\n0\n2\n0.488208\n0.149678\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n2\n0\n3\n0.312066\n-0.031130\n\n\n\n\n150 rows × 9 columns\n\n\n\nPlotting out our data on our new two component space.\n\nsns.scatterplot(data = iris_df, x = 'c1', y = 'c2')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\nWe have reduced our three dimensions to two.\nWe can also colour by our clusters. What does this show us and is it useful?\n\nsns.scatterplot(data = iris_df, x = 'c1', y = 'c2', hue = 'Three clusters')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\n\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\nTwo clusters\nFive clusters\nc1\nc2\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n1\n1\n4\n-0.630703\n0.107578\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n1\n1\n0\n-0.622905\n-0.104260\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n1\n1\n0\n-0.669520\n-0.051417\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n1\n1\n0\n-0.654153\n-0.102885\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n1\n1\n4\n-0.648788\n0.133488\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n0\n0\n2\n0.551462\n0.059841\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n2\n0\n3\n0.407146\n-0.171821\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n0\n0\n3\n0.447143\n0.037560\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n0\n0\n2\n0.488208\n0.149678\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n2\n0\n3\n0.312066\n-0.031130\n\n\n\n\n150 rows × 9 columns\n\n\n\n\n\n16.5.2 PCA to Clusters\nWe have reduced our 4D dataset to 2D whilst keeping the data variance. 
Reducing the data to fewer dimensions can help with the ‘curse of dimensionality’, reduce the change of overfitting a machine learning model (see here) and reduce the computational complexity of a model fit.\nPutting our new dimensions into a kMeans model\n\nk_means_pca = KMeans(n_clusters = 3, init = 'random', n_init = 10)\niris_pca_kmeans = k_means_pca.fit(iris_df.iloc[:,-2:])\n\n\ntype(iris_df.iloc[:,-2:].values)\n\nnumpy.ndarray\n\n\n\niris_df['PCA 3 clusters'] = pd.Series(k_means_pca.predict(iris_df.iloc[:,-2:].values), index = iris_df.index)\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\nTwo clusters\nFive clusters\nc1\nc2\nPCA 3 clusters\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n1\n1\n4\n-0.630703\n0.107578\n1\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n1\n1\n0\n-0.622905\n-0.104260\n1\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n1\n1\n0\n-0.669520\n-0.051417\n1\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n1\n1\n0\n-0.654153\n-0.102885\n1\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n1\n1\n4\n-0.648788\n0.133488\n1\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n0\n0\n2\n0.551462\n0.059841\n0\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n2\n0\n3\n0.407146\n-0.171821\n2\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n0\n0\n3\n0.447143\n0.037560\n0\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n0\n0\n2\n0.488208\n0.149678\n0\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n2\n0\n3\n0.312066\n-0.031130\n2\n\n\n\n\n150 rows × 10 columns\n\n\n\nAs we only have two dimensions we can easily plot this on a single scatterplot.\n\n# a different seaborn theme\n# see https://python-graph-gallery.com/104-seaborn-themes/\nsns.set_style(\"darkgrid\")\nsns.scatterplot(data = iris_df, x = 'c1', y = 'c2', hue = 'PCA 3 clusters')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\nI suspect having two clusters would work better. We should try a few different models.\nCopying the code from here we can fit multiple numbers of clusters.\n\nks = range(1, 10)\ninertias = [] # Create an empty list (will be populated later)\nfor k in ks:\n # Create a KMeans instance with k clusters: model\n model = KMeans(n_clusters=k, n_init = 10)\n \n # Fit model to samples\n model.fit(iris_df.iloc[:,-2:])\n \n # Append the inertia to the list of inertias\n inertias.append(model.inertia_)\n \nplt.plot(ks, inertias, '-o', color='black')\nplt.xlabel('number of clusters, k')\nplt.ylabel('inertia')\nplt.xticks(ks)\nplt.show()\n\n\n\n\nThree seems ok. We clearly want no more than three.\nThese types of plots show an point about model complexity. More free parameters in the model (here the number of clusters) will improve how well the model captures the data, often with reducing returns. However, a model which overfits the data will not be able to fit new data well - referred to overfitting. 
Randomish internet blogs introduce the topic pretty well, see here, and also wikipedia, see here.\n\n\n16.5.3 Missing values\nFinally, how we deal with missing values can impact the results of PCA and kMeans clustering.\nLets us load in the iris dataset again and randomly remove 10% of the data (see code from here).\n\nimport numpy as np\n\nx = load_iris()\n\n\niris_df = pd.DataFrame(x.data, columns = x.feature_names)\n\nmask = np.random.choice([True, False], size = iris_df.shape, p = [0.2, 0.8])\nmask[mask.all(1),-1] = 0\n\ndf = iris_df.mask(mask)\n\ndf.isna().sum()\n\nsepal length (cm) 32\nsepal width (cm) 34\npetal length (cm) 37\npetal width (cm) 29\ndtype: int64\n\n\n\ndf\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\n\n\n\n\n0\n5.1\nNaN\nNaN\n0.2\n\n\n1\n4.9\nNaN\nNaN\n0.2\n\n\n2\nNaN\n3.2\n1.3\n0.2\n\n\n3\n4.6\n3.1\n1.5\nNaN\n\n\n4\n5.0\nNaN\n1.4\nNaN\n\n\n...\n...\n...\n...\n...\n\n\n145\n6.7\n3.0\n5.2\n2.3\n\n\n146\nNaN\n2.5\n5.0\n1.9\n\n\n147\n6.5\nNaN\nNaN\n2.0\n\n\n148\n6.2\nNaN\n5.4\n2.3\n\n\n149\n5.9\n3.0\n5.1\n1.8\n\n\n\n\n150 rows × 4 columns\n\n\n\nAbout 20% of the data is randomly an NaN.\n\n16.5.3.1 Zeroing\nWe can 0 them and fit our models.\n\ndf_1 = df.copy()\ndf_1 = df_1.fillna(0)\n\n\ndf_1\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\n\n\n\n\n0\n5.1\n0.0\n0.0\n0.2\n\n\n1\n4.9\n0.0\n0.0\n0.2\n\n\n2\n0.0\n3.2\n1.3\n0.2\n\n\n3\n4.6\n3.1\n1.5\n0.0\n\n\n4\n5.0\n0.0\n1.4\n0.0\n\n\n...\n...\n...\n...\n...\n\n\n145\n6.7\n3.0\n5.2\n2.3\n\n\n146\n0.0\n2.5\n5.0\n1.9\n\n\n147\n6.5\n0.0\n0.0\n2.0\n\n\n148\n6.2\n0.0\n5.4\n2.3\n\n\n149\n5.9\n3.0\n5.1\n1.8\n\n\n\n\n150 rows × 4 columns\n\n\n\n\nk_means_zero = KMeans(n_clusters = 4, init = 'random', n_init = 10)\nk_means_zero.fit(df_1)\ndf_1['Four clusters'] = pd.Series(k_means_zero.predict(df_1.iloc[:,0:4].values), index = df_1.index)\nsns.pairplot(df_1, hue = 'Four clusters')\n\n\n\n\nWhat impact has zeroing the values had on our results?\nNow, onto PCA.\n\n# PCA analysis\nn_components = 2\n\npca = PCA(n_components=n_components)\ndf_1_pca = pca.fit(df_1.iloc[:,0:4])\n\n# Extract projected values\ndf_1_pca_vals = df_1_pca.transform(df_1.iloc[:,0:4])\ndf_1['c1'] = [item[0] for item in df_1_pca_vals]\ndf_1['c2'] = [item[1] for item in df_1_pca_vals]\n\nsns.scatterplot(data = df_1, x = 'c1', y = 'c2')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\n\ndf_1_pca.explained_variance_\n\narray([6.71803744, 4.89376791])\n\n\n\ndf_1_pca.components_\n\narray([[-0.91235845, 0.02968512, -0.38161438, -0.14522853],\n [-0.39939351, 0.05086373, 0.90393389, 0.14422629]])\n\n\n\n\n16.5.3.2 Replacing with the average\n\ndf_2 = df.copy()\nfor i in range(4):\n df_2.iloc[:,i] = df_2.iloc[:,i].fillna(df_2.iloc[:,i].mean())\n\n\ndf_2\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\n\n\n\n\n0\n5.100000\n3.00431\n3.90885\n0.200000\n\n\n1\n4.900000\n3.00431\n3.90885\n0.200000\n\n\n2\n5.866102\n3.20000\n1.30000\n0.200000\n\n\n3\n4.600000\n3.10000\n1.50000\n1.210744\n\n\n4\n5.000000\n3.00431\n1.40000\n1.210744\n\n\n...\n...\n...\n...\n...\n\n\n145\n6.700000\n3.00000\n5.20000\n2.300000\n\n\n146\n5.866102\n2.50000\n5.00000\n1.900000\n\n\n147\n6.500000\n3.00431\n3.90885\n2.000000\n\n\n148\n6.200000\n3.00431\n5.40000\n2.300000\n\n\n149\n5.900000\n3.00000\n5.10000\n1.800000\n\n\n\n\n150 rows × 4 columns\n\n\n\n\nk_means_zero = KMeans(n_clusters = 4, init = 'random', n_init = 10)\nk_means_zero.fit(df_2)\ndf_2['Four clusters'] = 
pd.Series(k_means_zero.predict(df_2.iloc[:,0:4].values), index = df_2.index)\nsns.pairplot(df_2, hue = 'Four clusters')\n\n\n\n\n\n# PCA analysis\nn_components = 2\n\npca = PCA(n_components=n_components)\ndf_2_pca = pca.fit(df_2.iloc[:,0:4])\n\n# Extract projected values\ndf_2_pca_vals = df_2_pca.transform(df_2.iloc[:,0:4])\ndf_2['c1'] = [item[0] for item in df_2_pca_vals]\ndf_2['c2'] = [item[1] for item in df_2_pca_vals]\n\nsns.scatterplot(data = df_2, x = 'c1', y = 'c2')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\n\ndf_2_pca.explained_variance_\n\narray([2.68417915, 0.33506061])\n\n\n\ndf_2_pca.components_\n\narray([[ 0.33775908, -0.04345744, 0.87824143, 0.33574133],\n [ 0.82803166, 0.20108365, -0.42517727, 0.30521014]])" + "text": "16.5 Principal Component Analysis (PCA)\nPCA reduces the dimension of our data. The method derives point in an n dimentional space from our data which are uncorrelated.\nTo carry out a PCA on our Iris dataset where there are only two dimensions.\n\nfrom sklearn.decomposition import PCA\n\nn_components = 2\n\npca = PCA(n_components=n_components)\niris_pca = pca.fit(iris_df.iloc[:,0:4])\n\nWe can look at the components.\n\niris_pca.components_\n\narray([[ 0.42494212, -0.15074824, 0.61626702, 0.64568888],\n [ 0.42320271, 0.90396711, -0.06038308, -0.00983925]])\n\n\nThese components are intersting. You may want to look at a PennState article on interpreting PCA components.\nOur second column, ‘sepal width (cm)’ is positively correlated with our second principle component whereas the first column ‘sepal length (cm)’ is postively correlated with both.\nYou may want to consider:\n\nDo we need more than two components?\nIs it useful to keep sepal length (cm) in the dataset?\n\nWe can also examine the explained variance of the each principle component.\n\niris_pca.explained_variance_\n\narray([0.23245325, 0.0324682 ])\n\n\nA nice worked example showing the link between the explained variance and the component is here.\nOur first principle component explains a lot more of the variance of data then the second.\nAnother way to explore these indicators is to look at the explained_variance_ratio_ values. These present a similar information but provide them as percentage values so they are easier to interpret. You can also create a plot and see how these percentages add up. In this case, the first two components add up to 0.96. Which means the first two features are able to represent around 96% of the variation in the data, not bad. These values are not always this high.\nA high value that is close to 100% means that the PCA is able to represent much of the variance and they will be good representations of the data without losing a lot of that variance in the underlying features. 
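When interpreting which original features drive each component, it can also help to label the loadings we printed earlier. A minimal sketch (an addition to the lab), assuming iris_pca and the feature columns of iris_df:

```python
# Sketch: label the loadings so each row is a principal component and each column a feature.
import pandas as pd

loadings = pd.DataFrame(iris_pca.components_,
                        columns = iris_df.columns[0:4],
                        index = ['PC1', 'PC2'])
print(loadings)
```

Reading across a row shows which features contribute most to that component.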
This of course is based on an assumption that variance is a good proxy about how informative a feature is.\n\niris_pca.explained_variance_ratio_\n\narray([0.84136038, 0.11751808])\n\n\n\nplt.plot(np.cumsum(pca.explained_variance_ratio_))\nplt.xlabel('number of components')\nplt.ylabel('cumulative explained variance');\n\n\n\n\n\n16.5.1 Dimension reduction\nFor our purposes, we are interested in using PCA for reducing the number of dimension in our data whilst preseving the maximal data variance.\nWe can extract the projected components from the model.\n\niris_pca_vals = pca.fit_transform(iris_df.iloc[:,0:4])\n\nThe numpy arrays contains the projected values.\n\ntype(iris_pca_vals)\n\nnumpy.ndarray\n\n\n\niris_pca_vals\n\narray([[-6.30702931e-01, 1.07577910e-01],\n [-6.22904943e-01, -1.04259833e-01],\n [-6.69520395e-01, -5.14170597e-02],\n [-6.54152759e-01, -1.02884871e-01],\n [-6.48788056e-01, 1.33487576e-01],\n [-5.35272778e-01, 2.89615724e-01],\n [-6.56537790e-01, 1.07244911e-02],\n [-6.25780499e-01, 5.71335411e-02],\n [-6.75643504e-01, -2.00703283e-01],\n [-6.45644619e-01, -6.72080097e-02],\n [-5.97408238e-01, 2.17151953e-01],\n [-6.38943190e-01, 3.25988375e-02],\n [-6.61612593e-01, -1.15605495e-01],\n [-7.51967943e-01, -1.71313322e-01],\n [-6.00371589e-01, 3.80240692e-01],\n [-5.52157227e-01, 5.15255982e-01],\n [-5.77053593e-01, 2.93709492e-01],\n [-6.03799228e-01, 1.07167941e-01],\n [-5.20483461e-01, 2.87627289e-01],\n [-6.12197555e-01, 2.19140388e-01],\n [-5.57674300e-01, 1.02109180e-01],\n [-5.79012675e-01, 1.81065123e-01],\n [-7.37784662e-01, 9.05588211e-02],\n [-5.06093857e-01, 2.79470846e-02],\n [-6.07607579e-01, 2.95285112e-02],\n [-5.90210587e-01, -9.45510863e-02],\n [-5.61527888e-01, 5.52901611e-02],\n [-6.08453780e-01, 1.18310099e-01],\n [-6.12617807e-01, 8.16682448e-02],\n [-6.38184784e-01, -5.44873860e-02],\n [-6.20099660e-01, -8.03970516e-02],\n [-5.24757301e-01, 1.03336126e-01],\n [-6.73044544e-01, 3.44711846e-01],\n [-6.27455379e-01, 4.18257508e-01],\n [-6.18740916e-01, -6.76179787e-02],\n [-6.44553756e-01, -1.51267253e-02],\n [-5.93932344e-01, 1.55623876e-01],\n [-6.87495707e-01, 1.22141914e-01],\n [-6.92369885e-01, -1.62014545e-01],\n [-6.13976551e-01, 6.88891719e-02],\n [-6.26048380e-01, 9.64357527e-02],\n [-6.09693996e-01, -4.14325957e-01],\n [-7.04932239e-01, -8.66839521e-02],\n [-5.14001659e-01, 9.21355196e-02],\n [-5.43513037e-01, 2.14636651e-01],\n [-6.07805187e-01, -1.16425433e-01],\n [-6.28656055e-01, 2.18526915e-01],\n [-6.70879139e-01, -6.41961326e-02],\n [-6.09212186e-01, 2.05396323e-01],\n [-6.29944525e-01, 2.04916869e-02],\n [ 2.79951766e-01, 1.79245790e-01],\n [ 2.15141376e-01, 1.10348921e-01],\n [ 3.22223106e-01, 1.27368010e-01],\n [ 5.94030131e-02, -3.28502275e-01],\n [ 2.62515235e-01, -2.95800761e-02],\n [ 1.03831043e-01, -1.21781742e-01],\n [ 2.44850362e-01, 1.33801733e-01],\n [-1.71529386e-01, -3.52976762e-01],\n [ 2.14230599e-01, 2.06607890e-02],\n [ 1.53249619e-02, -2.12494509e-01],\n [-1.13710323e-01, -4.93929201e-01],\n [ 1.37348380e-01, -2.06894998e-02],\n [ 4.39928190e-02, -3.06159511e-01],\n [ 1.92559767e-01, -3.95507760e-02],\n [-8.26091518e-03, -8.66610981e-02],\n [ 2.19485489e-01, 1.09383928e-01],\n [ 1.33272148e-01, -5.90267184e-02],\n [-5.75757060e-04, -1.42367733e-01],\n [ 2.54345249e-01, -2.89815304e-01],\n [-5.60800300e-03, -2.39572672e-01],\n [ 2.68168358e-01, 4.72705335e-02],\n [ 9.88208151e-02, -6.96420088e-02],\n [ 2.89086481e-01, -1.69157553e-01],\n [ 1.45033538e-01, -7.63961345e-02],\n [ 1.59287093e-01, 2.19853643e-04],\n [ 
2.13962718e-01, 5.99630005e-02],\n [ 2.91913782e-01, 4.04990109e-03],\n [ 3.69148997e-01, 6.43480720e-02],\n [ 1.86769115e-01, -4.96694916e-02],\n [-6.87697501e-02, -1.85648007e-01],\n [-2.15759776e-02, -2.87970157e-01],\n [-5.89248844e-02, -2.86536746e-01],\n [ 3.23412419e-02, -1.41140786e-01],\n [ 2.88906394e-01, -1.31550706e-01],\n [ 1.09664252e-01, -8.25379800e-02],\n [ 1.82266934e-01, 1.38247021e-01],\n [ 2.77724803e-01, 1.05903632e-01],\n [ 1.95615410e-01, -2.38550997e-01],\n [ 3.76839264e-02, -5.41130122e-02],\n [ 4.68406593e-02, -2.53171683e-01],\n [ 5.54365941e-02, -2.19190186e-01],\n [ 1.75833387e-01, -8.62037590e-04],\n [ 4.90676225e-02, -1.79829525e-01],\n [-1.53444261e-01, -3.78886428e-01],\n [ 6.69726607e-02, -1.68132343e-01],\n [ 3.30293747e-02, -4.29708545e-02],\n [ 6.62142547e-02, -8.10461198e-02],\n [ 1.35679197e-01, -2.32914079e-02],\n [-1.58634575e-01, -2.89139847e-01],\n [ 6.20502279e-02, -1.17687974e-01],\n [ 6.22771338e-01, 1.16807265e-01],\n [ 3.46009609e-01, -1.56291874e-01],\n [ 6.17986434e-01, 1.00519741e-01],\n [ 4.17789309e-01, -2.68903690e-02],\n [ 5.63621248e-01, 3.05994289e-02],\n [ 7.50122599e-01, 1.52133800e-01],\n [ 1.35857804e-01, -3.30462554e-01],\n [ 6.08945212e-01, 8.35018443e-02],\n [ 5.11020215e-01, -1.32575915e-01],\n [ 7.20608541e-01, 3.34580389e-01],\n [ 4.24135062e-01, 1.13914054e-01],\n [ 4.37723702e-01, -8.78049736e-02],\n [ 5.40793776e-01, 6.93466165e-02],\n [ 3.63226514e-01, -2.42764625e-01],\n [ 4.74246948e-01, -1.20676423e-01],\n [ 5.13932631e-01, 9.88816323e-02],\n [ 4.24670824e-01, 3.53096310e-02],\n [ 7.49026039e-01, 4.63778390e-01],\n [ 8.72194272e-01, 9.33798117e-03],\n [ 2.82963372e-01, -3.18443776e-01],\n [ 6.14733184e-01, 1.53566018e-01],\n [ 3.22133832e-01, -1.40500924e-01],\n [ 7.58030401e-01, 8.79453649e-02],\n [ 3.57235237e-01, -9.50568671e-02],\n [ 5.31036706e-01, 1.68539991e-01],\n [ 5.46962123e-01, 1.87812429e-01],\n [ 3.28704908e-01, -6.81237595e-02],\n [ 3.14783811e-01, -5.57223965e-03],\n [ 5.16585543e-01, -5.40299414e-02],\n [ 4.84826663e-01, 1.15348658e-01],\n [ 6.33043632e-01, 5.92290940e-02],\n [ 6.87490917e-01, 4.91179916e-01],\n [ 5.43489246e-01, -5.44399104e-02],\n [ 2.91133358e-01, -5.82085481e-02],\n [ 3.05410131e-01, -1.61757644e-01],\n [ 7.63507935e-01, 1.68186703e-01],\n [ 5.47805644e-01, 1.58976299e-01],\n [ 4.06585699e-01, 6.12192966e-02],\n [ 2.92534659e-01, -1.63044284e-02],\n [ 5.35871344e-01, 1.19790986e-01],\n [ 6.13864965e-01, 9.30029331e-02],\n [ 5.58343139e-01, 1.22041374e-01],\n [ 3.46009609e-01, -1.56291874e-01],\n [ 6.23819644e-01, 1.39763503e-01],\n [ 6.38651518e-01, 1.66900115e-01],\n [ 5.51461624e-01, 5.98413741e-02],\n [ 4.07146497e-01, -1.71820871e-01],\n [ 4.47142619e-01, 3.75600193e-02],\n [ 4.88207585e-01, 1.49677521e-01],\n [ 3.12066323e-01, -3.11303854e-02]])\n\n\nEach row corresponds to a row in our data.\n\niris_pca_vals.shape\n\n(150, 2)\n\n\n\niris_df.shape\n\n(150, 7)\n\n\nWe can add the component to our dataset. I prefer to keep everything in one table and it is not at all required. 
You can just assign the values whichever variables you prefer.\n\niris_df['c1'] = [item[0] for item in iris_pca_vals]\niris_df['c2'] = [item[1] for item in iris_pca_vals]\n\n\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\nTwo clusters\nFive clusters\nc1\nc2\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n2\n1\n3\n-0.630703\n0.107578\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n2\n1\n1\n-0.622905\n-0.104260\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n2\n1\n1\n-0.669520\n-0.051417\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n2\n1\n1\n-0.654153\n-0.102885\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n2\n1\n3\n-0.648788\n0.133488\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n1\n0\n4\n0.551462\n0.059841\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n0\n0\n2\n0.407146\n-0.171821\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n1\n0\n2\n0.447143\n0.037560\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n1\n0\n4\n0.488208\n0.149678\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n0\n0\n2\n0.312066\n-0.031130\n\n\n\n\n150 rows × 9 columns\n\n\n\nPlotting out our data on our new two component space.\n\nsns.scatterplot(data = iris_df, x = 'c1', y = 'c2')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\nWe have reduced our three dimensions to two.\nWe can also colour by our clusters. What does this show us and is it useful?\n\nsns.scatterplot(data = iris_df, x = 'c1', y = 'c2', hue = 'Three clusters')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\n\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\nTwo clusters\nFive clusters\nc1\nc2\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n2\n1\n3\n-0.630703\n0.107578\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n2\n1\n1\n-0.622905\n-0.104260\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n2\n1\n1\n-0.669520\n-0.051417\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n2\n1\n1\n-0.654153\n-0.102885\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n2\n1\n3\n-0.648788\n0.133488\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n1\n0\n4\n0.551462\n0.059841\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n0\n0\n2\n0.407146\n-0.171821\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n1\n0\n2\n0.447143\n0.037560\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n1\n0\n4\n0.488208\n0.149678\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n0\n0\n2\n0.312066\n-0.031130\n\n\n\n\n150 rows × 9 columns\n\n\n\n\n\n16.5.2 PCA to Clusters\nWe have reduced our 4D dataset to 2D whilst keeping the data variance. 
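One way to check how much structure the two components really keep (a sketch, not part of the original lab, assuming pca and iris_pca_vals from above) is to project back into the original four dimensions and look at the reconstruction error:

```python
# Sketch: map the 2D projection back to 4D and measure the mean squared reconstruction error.
import numpy as np

reconstructed = pca.inverse_transform(iris_pca_vals)    # shape (150, 4)
original = iris_df.iloc[:, 0:4].values
print(np.mean((original - reconstructed) ** 2))         # a small value means little variance was lost
```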
Reducing the data to fewer dimensions can help with the ‘curse of dimensionality’, reduce the change of overfitting a machine learning model (see here) and reduce the computational complexity of a model fit.\nPutting our new dimensions into a kMeans model\n\nk_means_pca = KMeans(n_clusters = 3, init = 'random', n_init = 10)\niris_pca_kmeans = k_means_pca.fit(iris_df.iloc[:,-2:])\n\n\ntype(iris_df.iloc[:,-2:].values)\n\nnumpy.ndarray\n\n\n\niris_df['PCA 3 clusters'] = pd.Series(k_means_pca.predict(iris_df.iloc[:,-2:].values), index = iris_df.index)\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\nTwo clusters\nFive clusters\nc1\nc2\nPCA 3 clusters\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n2\n1\n3\n-0.630703\n0.107578\n0\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n2\n1\n1\n-0.622905\n-0.104260\n0\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n2\n1\n1\n-0.669520\n-0.051417\n0\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n2\n1\n1\n-0.654153\n-0.102885\n0\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n2\n1\n3\n-0.648788\n0.133488\n0\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n1\n0\n4\n0.551462\n0.059841\n1\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n0\n0\n2\n0.407146\n-0.171821\n2\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n1\n0\n2\n0.447143\n0.037560\n1\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n1\n0\n4\n0.488208\n0.149678\n1\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n0\n0\n2\n0.312066\n-0.031130\n2\n\n\n\n\n150 rows × 10 columns\n\n\n\nAs we only have two dimensions we can easily plot this on a single scatterplot.\n\n# a different seaborn theme\n# see https://python-graph-gallery.com/104-seaborn-themes/\nsns.set_style(\"darkgrid\")\nsns.scatterplot(data = iris_df, x = 'c1', y = 'c2', hue = 'PCA 3 clusters')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\nI suspect having two clusters would work better. We should try a few different models.\nCopying the code from here we can fit multiple numbers of clusters.\n\nks = range(1, 10)\ninertias = [] # Create an empty list (will be populated later)\nfor k in ks:\n # Create a KMeans instance with k clusters: model\n model = KMeans(n_clusters=k, n_init = 10)\n \n # Fit model to samples\n model.fit(iris_df.iloc[:,-2:])\n \n # Append the inertia to the list of inertias\n inertias.append(model.inertia_)\n \nplt.plot(ks, inertias, '-o', color='black')\nplt.xlabel('number of clusters, k')\nplt.ylabel('inertia')\nplt.xticks(ks)\nplt.show()\n\n\n\n\nThree seems ok. We clearly want no more than three.\nThese types of plots show an point about model complexity. More free parameters in the model (here the number of clusters) will improve how well the model captures the data, often with reducing returns. However, a model which overfits the data will not be able to fit new data well - referred to overfitting. 
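One rough way to see this with k-means (an extra sketch, not part of the lab, assuming the c1 and c2 columns created above) is to fit on one part of the data and score the held-out part; a model with many clusters tends to look better on the rows it was fitted to than on unseen rows:

```python
# Sketch: compare per-point fit quality on fitted vs held-out data for different k.
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

train, test = train_test_split(iris_df[['c1', 'c2']], test_size = 0.3, random_state = 0)

for k in [2, 3, 5, 9]:
    model = KMeans(n_clusters = k, n_init = 10).fit(train)
    # .score() returns the negative inertia on the given data; divide by n for a fair comparison
    print(k, -model.score(train) / len(train), -model.score(test) / len(test))
```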
Randomish internet blogs introduce the topic pretty well, see here, and also wikipedia, see here.\n\n\n16.5.3 Missing values\nFinally, how we deal with missing values can impact the results of PCA and kMeans clustering.\nLets us load in the iris dataset again and randomly remove 10% of the data (see code from here).\n\nimport numpy as np\n\nx = load_iris()\n\n\niris_df = pd.DataFrame(x.data, columns = x.feature_names)\n\nmask = np.random.choice([True, False], size = iris_df.shape, p = [0.2, 0.8])\nmask[mask.all(1),-1] = 0\n\ndf = iris_df.mask(mask)\n\ndf.isna().sum()\n\nsepal length (cm) 29\nsepal width (cm) 21\npetal length (cm) 32\npetal width (cm) 21\ndtype: int64\n\n\n\ndf\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\n\n\n\n\n0\n5.1\n3.5\n1.4\n0.2\n\n\n1\n4.9\n3.0\nNaN\n0.2\n\n\n2\nNaN\n3.2\n1.3\n0.2\n\n\n3\n4.6\n3.1\nNaN\n0.2\n\n\n4\nNaN\n3.6\n1.4\n0.2\n\n\n...\n...\n...\n...\n...\n\n\n145\n6.7\nNaN\n5.2\n2.3\n\n\n146\n6.3\n2.5\nNaN\n1.9\n\n\n147\n6.5\n3.0\n5.2\n2.0\n\n\n148\nNaN\n3.4\n5.4\n2.3\n\n\n149\n5.9\n3.0\nNaN\nNaN\n\n\n\n\n150 rows × 4 columns\n\n\n\nAbout 20% of the data is randomly an NaN.\n\n16.5.3.1 Zeroing\nWe can 0 them and fit our models.\n\ndf_1 = df.copy()\ndf_1 = df_1.fillna(0)\n\n\ndf_1\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\n\n\n\n\n0\n5.1\n3.5\n1.4\n0.2\n\n\n1\n4.9\n3.0\n0.0\n0.2\n\n\n2\n0.0\n3.2\n1.3\n0.2\n\n\n3\n4.6\n3.1\n0.0\n0.2\n\n\n4\n0.0\n3.6\n1.4\n0.2\n\n\n...\n...\n...\n...\n...\n\n\n145\n6.7\n0.0\n5.2\n2.3\n\n\n146\n6.3\n2.5\n0.0\n1.9\n\n\n147\n6.5\n3.0\n5.2\n2.0\n\n\n148\n0.0\n3.4\n5.4\n2.3\n\n\n149\n5.9\n3.0\n0.0\n0.0\n\n\n\n\n150 rows × 4 columns\n\n\n\n\nk_means_zero = KMeans(n_clusters = 4, init = 'random', n_init = 10)\nk_means_zero.fit(df_1)\ndf_1['Four clusters'] = pd.Series(k_means_zero.predict(df_1.iloc[:,0:4].values), index = df_1.index)\nsns.pairplot(df_1, hue = 'Four clusters')\n\n\n\n\nWhat impact has zeroing the values had on our results?\nNow, onto PCA.\n\n# PCA analysis\nn_components = 2\n\npca = PCA(n_components=n_components)\ndf_1_pca = pca.fit(df_1.iloc[:,0:4])\n\n# Extract projected values\ndf_1_pca_vals = df_1_pca.transform(df_1.iloc[:,0:4])\ndf_1['c1'] = [item[0] for item in df_1_pca_vals]\ndf_1['c2'] = [item[1] for item in df_1_pca_vals]\n\nsns.scatterplot(data = df_1, x = 'c1', y = 'c2')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\n\ndf_1_pca.explained_variance_\n\narray([6.24279356, 4.84811544])\n\n\n\ndf_1_pca.components_\n\narray([[-0.86129917, 0.04084996, -0.48641492, -0.14105157],\n [-0.50682662, -0.04550418, 0.84286268, 0.175039 ]])\n\n\n\n\n16.5.3.2 Replacing with the average\n\ndf_2 = df.copy()\nfor i in range(4):\n df_2.iloc[:,i] = df_2.iloc[:,i].fillna(df_2.iloc[:,i].mean())\n\n\ndf_2\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\n\n\n\n\n0\n5.100000\n3.500000\n1.400000\n0.200000\n\n\n1\n4.900000\n3.000000\n3.877119\n0.200000\n\n\n2\n5.839669\n3.200000\n1.300000\n0.200000\n\n\n3\n4.600000\n3.100000\n3.877119\n0.200000\n\n\n4\n5.839669\n3.600000\n1.400000\n0.200000\n\n\n...\n...\n...\n...\n...\n\n\n145\n6.700000\n3.054264\n5.200000\n2.300000\n\n\n146\n6.300000\n2.500000\n3.877119\n1.900000\n\n\n147\n6.500000\n3.000000\n5.200000\n2.000000\n\n\n148\n5.839669\n3.400000\n5.400000\n2.300000\n\n\n149\n5.900000\n3.000000\n3.877119\n1.205426\n\n\n\n\n150 rows × 4 columns\n\n\n\n\nk_means_zero = KMeans(n_clusters = 4, init = 'random', n_init = 10)\nk_means_zero.fit(df_2)\ndf_2['Four 
clusters'] = pd.Series(k_means_zero.predict(df_2.iloc[:,0:4].values), index = df_2.index)\nsns.pairplot(df_2, hue = 'Four clusters')\n\n\n\n\n\n# PCA analysis\nn_components = 2\n\npca = PCA(n_components=n_components)\ndf_2_pca = pca.fit(df_2.iloc[:,0:4])\n\n# Extract projected values\ndf_2_pca_vals = df_2_pca.transform(df_2.iloc[:,0:4])\ndf_2['c1'] = [item[0] for item in df_2_pca_vals]\ndf_2['c2'] = [item[1] for item in df_2_pca_vals]\n\nsns.scatterplot(data = df_2, x = 'c1', y = 'c2')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\n\ndf_2_pca.explained_variance_\n\narray([3.01818399, 0.26633671])\n\n\n\ndf_2_pca.components_\n\narray([[ 0.31417904, -0.06487468, 0.88369345, 0.34083528],\n [ 0.89110506, 0.17000084, -0.37665661, 0.18751344]])" }, { "objectID": "content/labs/Lab_4/IM939_Lab_4_1_Iris.html#useful-resources", @@ -718,21 +718,28 @@ "href": "content/labs/Lab_5/IM939_Lab_5_1.html#data-wrangling", "title": "21  Lab: Clustering and Ground Truth", "section": "21.1 Data Wrangling", - "text": "21.1 Data Wrangling\nAs usual, we will start by looking at our data, and making transformations, if needed.\n\nimport pandas as pd\n\ndf = pd.read_csv('data/wine.csv')\n\ndf.head()\n\n\n\n\n\n\n\nTip\n\n\n\nThere is a column called Class label that gives us the ground truth. The wines come from three different cultivars. Knowing the actual grouping helps us to identify how well our methods can capture this ground truth.\n\n\nFollowing the data wrangling process that was summarised in Chapter 20, we should first get a sense of our data.\n\ndf.describe()\n\nAs you can see no variable has any missing data, but the scales of our features vary (e.g., Magnesium is in the 100s whereas Hue is in the low single digits).\nLet’s visually inspect how features are distributed using a violin plot:\n\nimport seaborn as sns\n\ndf_long = df.melt(id_vars='Class label')\n\nsns.violinplot(data = df_long, x = 'variable', y = 'value')\n\nRegretfully, this is not very useful right now, due to the different scales that we detected previously. In this case, it makes sense to normalise our data.\n\nfrom sklearn.preprocessing import MinMaxScaler\n\n# create a scaler object\nscaler = MinMaxScaler()\n\n# fit and transform the data\ndf_norm = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)\n\ndf_long = df_norm.melt(id_vars='Class label')\ndf_long\n\n\n#create seaborn violin plot\nmy_plot = sns.violinplot(data = df_long, x = 'variable', y = 'value')\n\n#rotate x-axis labels\nmy_plot.set_xticklabels(my_plot.get_xticklabels(), rotation=90)\n\nAre there any patterns?\nHow about a pairplot?\n\nsns.pairplot(data = df_norm.iloc[:,1:])\n\nHmm, a few interesting correlations. Some of our variables are skewed. We could apply some PCA here to look at fewer dimension or even log transform some of the skewed variables." 
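Following up on that last remark about log transforms, here is a rough sketch of what one could look like (an addition, assuming the df_norm dataframe created above; np.log1p is used because the normalised values include zeros, and 'Malic acid' and 'Color intensity' are just two plausibly skewed columns to try):

```python
# Sketch: log-transform two of the visibly skewed columns and re-draw the violin plot.
import numpy as np
import seaborn as sns

df_log = df_norm.copy()
for col in ['Malic acid', 'Color intensity']:   # hypothetical choice of skewed columns
    df_log[col] = np.log1p(df_log[col])

my_plot = sns.violinplot(data = df_log.melt(id_vars = 'Class label'), x = 'variable', y = 'value')
my_plot.set_xticklabels(my_plot.get_xticklabels(), rotation = 90)
```

Comparing this violin plot with the earlier one shows whether the transform has reduced the skew.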
+ "text": "21.1 Data Wrangling\n\nimport pandas as pd\n\ndf = pd.read_csv('data/wine.csv')\n\nLook at our data.\n\ndf.head()\n\n\n\n\n\n\n\n\nClass label\nAlcohol\nMalic acid\nAsh\nAlcalinity of ash\nMagnesium\nTotal phenols\nFlavanoids\nNonflavanoid phenols\nProanthocyanins\nColor intensity\nHue\nOD280/OD315 of diluted wines\nProline\n\n\n\n\n0\n1\n14.23\n1.71\n2.43\n15.6\n127\n2.80\n3.06\n0.28\n2.29\n5.64\n1.04\n3.92\n1065\n\n\n1\n1\n13.20\n1.78\n2.14\n11.2\n100\n2.65\n2.76\n0.26\n1.28\n4.38\n1.05\n3.40\n1050\n\n\n2\n1\n13.16\n2.36\n2.67\n18.6\n101\n2.80\n3.24\n0.30\n2.81\n5.68\n1.03\n3.17\n1185\n\n\n3\n1\n14.37\n1.95\n2.50\n16.8\n113\n3.85\n3.49\n0.24\n2.18\n7.80\n0.86\n3.45\n1480\n\n\n4\n1\n13.24\n2.59\n2.87\n21.0\n118\n2.80\n2.69\n0.39\n1.82\n4.32\n1.04\n2.93\n735\n\n\n\n\n\n\n\nThere is a column called Class label that gives us the ground truth. The wines come from three different cultivars. Knowing the actual grouping helps us to identify how well our methods can capture this ground truth.\nFollowing our process above, we should first get a sense of our data.\n\ndf.describe()\n\n\n\n\n\n\n\n\nClass label\nAlcohol\nMalic acid\nAsh\nAlcalinity of ash\nMagnesium\nTotal phenols\nFlavanoids\nNonflavanoid phenols\nProanthocyanins\nColor intensity\nHue\nOD280/OD315 of diluted wines\nProline\n\n\n\n\ncount\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n\n\nmean\n1.938202\n13.000618\n2.336348\n2.366517\n19.494944\n99.741573\n2.295112\n2.029270\n0.361854\n1.590899\n5.058090\n0.957449\n2.611685\n746.893258\n\n\nstd\n0.775035\n0.811827\n1.117146\n0.274344\n3.339564\n14.282484\n0.625851\n0.998859\n0.124453\n0.572359\n2.318286\n0.228572\n0.709990\n314.907474\n\n\nmin\n1.000000\n11.030000\n0.740000\n1.360000\n10.600000\n70.000000\n0.980000\n0.340000\n0.130000\n0.410000\n1.280000\n0.480000\n1.270000\n278.000000\n\n\n25%\n1.000000\n12.362500\n1.602500\n2.210000\n17.200000\n88.000000\n1.742500\n1.205000\n0.270000\n1.250000\n3.220000\n0.782500\n1.937500\n500.500000\n\n\n50%\n2.000000\n13.050000\n1.865000\n2.360000\n19.500000\n98.000000\n2.355000\n2.135000\n0.340000\n1.555000\n4.690000\n0.965000\n2.780000\n673.500000\n\n\n75%\n3.000000\n13.677500\n3.082500\n2.557500\n21.500000\n107.000000\n2.800000\n2.875000\n0.437500\n1.950000\n6.200000\n1.120000\n3.170000\n985.000000\n\n\nmax\n3.000000\n14.830000\n5.800000\n3.230000\n30.000000\n162.000000\n3.880000\n5.080000\n0.660000\n3.580000\n13.000000\n1.710000\n4.000000\n1680.000000\n\n\n\n\n\n\n\nNo missing data. 
The scales of our features vary (e.g., Magnesium is in the 100s whereas Hue is in the low single digits).\nHow about our feature distributions?\n\ndf_long = df.melt(id_vars='Class label')\n\n\nimport seaborn as sns\n\nsns.violinplot(data = df_long, x = 'variable', y = 'value')\n\n<Axes: xlabel='variable', ylabel='value'>\n\n\n\n\n\nMakes sense to normalise our data.\n\nfrom sklearn.preprocessing import MinMaxScaler\n\n# create a scaler object\nscaler = MinMaxScaler()\n\n# fit and transform the data\ndf_norm = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)\n\ndf_long = df_norm.melt(id_vars='Class label')\ndf_long\n\n\n\n\n\n\n\n\nClass label\nvariable\nvalue\n\n\n\n\n0\n0.0\nAlcohol\n0.842105\n\n\n1\n0.0\nAlcohol\n0.571053\n\n\n2\n0.0\nAlcohol\n0.560526\n\n\n3\n0.0\nAlcohol\n0.878947\n\n\n4\n0.0\nAlcohol\n0.581579\n\n\n...\n...\n...\n...\n\n\n2309\n1.0\nProline\n0.329529\n\n\n2310\n1.0\nProline\n0.336662\n\n\n2311\n1.0\nProline\n0.397290\n\n\n2312\n1.0\nProline\n0.400856\n\n\n2313\n1.0\nProline\n0.201141\n\n\n\n\n2314 rows × 3 columns\n\n\n\n\n#create seaborn violin plot\nmy_plot = sns.violinplot(data = df_long, x = 'variable', y = 'value')\n\n#rotate x-axis labels\nmy_plot.set_xticklabels(my_plot.get_xticklabels(), rotation=90)\n\n[Text(0, 0, 'Alcohol'),\n Text(1, 0, 'Malic acid'),\n Text(2, 0, 'Ash'),\n Text(3, 0, 'Alcalinity of ash'),\n Text(4, 0, 'Magnesium'),\n Text(5, 0, 'Total phenols'),\n Text(6, 0, 'Flavanoids'),\n Text(7, 0, 'Nonflavanoid phenols'),\n Text(8, 0, 'Proanthocyanins'),\n Text(9, 0, 'Color intensity'),\n Text(10, 0, 'Hue'),\n Text(11, 0, 'OD280/OD315 of diluted wines'),\n Text(12, 0, 'Proline ')]\n\n\n\n\n\nAre there any patterns?\nHow about a pairplot?\n\nsns.pairplot(data = df_norm.iloc[:,1:])\n\n\n\n\nHmm, a few interesting correlations. Some of our variables are skewed. We could apply some PCA here to look at fewer dimension or even log transform some of the skewed variables." }, { "objectID": "content/labs/Lab_5/IM939_Lab_5_1.html#cluster-analysis", "href": "content/labs/Lab_5/IM939_Lab_5_1.html#cluster-analysis", "title": "21  Lab: Clustering and Ground Truth", "section": "21.2 Cluster analysis", - "text": "21.2 Cluster analysis\nFor now we will just run a kmeans cluster and then check our results against the ground truth.\n\n21.2.1 Number of clusters\nLets decide how many clusters we need.\n\nfrom sklearn.cluster import KMeans\n\nks = range(1, 10)\ninertias = []\nfor k in ks:\n # Create a KMeans instance with k clusters: model\n model = KMeans(n_clusters=k, n_init = 10)\n \n # Fit model to samples\n model.fit(df.iloc[:,1:])\n \n # Append the inertia to the list of inertias\n inertias.append(model.inertia_)\n\nimport matplotlib.pyplot as plt\n\nplt.plot(ks, inertias, '-o', color='black')\nplt.xlabel('number of clusters, k')\nplt.ylabel('inertia')\nplt.xticks(ks)\nplt.show()\n\nWhat happens if we use the normalised data instead?\n\nfrom sklearn.cluster import KMeans\n\nks = range(1, 10)\ninertias = []\nfor k in ks:\n # Create a KMeans instance with k clusters: model\n model = KMeans(n_clusters=k, n_init = 10)\n \n # Fit model to samples\n model.fit(df_norm.iloc[:,1:])\n \n # Append the inertia to the list of inertias\n inertias.append(model.inertia_)\n\nimport matplotlib.pyplot as plt\n\nplt.plot(ks, inertias, '-o', color='black')\nplt.xlabel('number of clusters, k')\nplt.ylabel('inertia')\nplt.xticks(ks)\nplt.show()\n\n\n\n\n\n\n\nPause for thought\n\n\n\nBoth of the graphs are the same. 
Is that what you would expect?\n\n\nThree clusters seems about right (and matches our number of original labels).\n\ndf['Class label'].value_counts()\n\n\n\n21.2.2 Calculate 3 clusters\nNow, we are going to calculate three clusters and store each observation’s cluster labels into a variable within the original dataframe:\n\n# Create a KMeans instance with k clusters: model\nk_means = KMeans(n_clusters=3)\n\n# Fit model to samples\ndf_k_means = k_means.fit(df.iloc[:,1:])\n\n# Create a new variable with the fitted cluster label.\ndf['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:].values), index = df.index)\ndf\n\n\n\n21.2.3 Ground Truth Validation\nDo our cluster labels match our ground truth? Did our cluster model capture reality?\n\nct = pd.crosstab(df['Three clusters'], df['Class label'])\nct\n\nIt might be easier to see as a stacked plot (see this post).\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nct.plot.bar(stacked=True)\nplt.legend(title='Class label')\n\nHow has the kmeans model done compared to our ground truth?\n\n\n\n\n\n\nImportant\n\n\n\nWe need to be really careful here. We notice that it is not easily possible to compare the known class labels to clustering labels. The reason is that the clustering algorithm labels are just arbitrary and not assigned to any deterministic criteria. Each time you run the algorithm, you might get a different id for the labels. The reason is that the label itself doesn’t actually mean anything, what is important is the list of items that are in the same cluster and their relations.\n\n\nA way to overcome this ambiguity and evaluate the results is to look at visualisations of the results and compare. But this brings in the question of what type of visualisation to use for looking at the clusters.\nAn immediate alternative is to use scatterplots. However, it is not clear which axis to use for clustering. A common method to apply at this stage is to make use of PCA to get a 2D plane where we can project the data points and visualise them over this projection.\n\ndf.iloc[:,1:14]\n\n\nfrom sklearn.decomposition import PCA\n\nn_components = 2\n\npca = PCA(n_components=n_components)\ndf_pca = pca.fit(df.iloc[:,1:14])\ndf_pca_vals = df_pca.transform(df.iloc[:,1:14])\n\nGrab our projections and plot along with our cluster names.\n\ndf['c1'] = [item[0] for item in df_pca_vals]\ndf['c2'] = [item[1] for item in df_pca_vals]\n\nax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Class label')\nax.set_title('Known labels visualised over PCs')\n\nIn the figure above, we colored the points based on the actual labels, and we observe that there have been several misclassifications (i.e., in the algorithm’s results). So one may choose to use an alternative algorithm or devise a better distance metric.\n\nax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')\nax.set_title('Results of the algorithm visualised over PCs')\n\nThis shows the parallelism between the clustering algorithm and PCA. By looking at the PCA loadings, we can find out what the x-axis means and try to interpret the clusters (We leave this as an additional exercise for those interested).\nHow might you interpret the above plots? Did the kmeans model identify the ground truth?\nHow robust is our clustering? 
It may be that the kmeans algorithm becamse stuck or that a few outliers have biased the clustering.\nTwo ways to check are:\n\nRunning the model multiple times with different initial values.\nRemoving some data and running the modelling multiple times.\n\nRun the below cell a few times. What do you see?\n\n# Create a KMeans instance with k clusters: model\nk_means = KMeans(n_clusters=3, init='random', n_init = 10)\n\n# Fit model to samples\ndf_k_means = k_means.fit(df.iloc[:,1:14])\n\ndf['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:14].values), index = df.index)\n\nax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')\nax.set_title('Results of the algorithm visualised over PCs')\n\nHow about with only 80% of the data?\n\ndf_sample = df.sample(frac=0.8, replace=False)\n\n# Create a KMeans instance with k clusters: model\nk_means = KMeans(n_clusters=3, init='random', n_init = 10)\n\n# Fit model to samples\ndf_k_means = k_means.fit(df_sample.iloc[:,1:14])\n\ndf_sample['Three clusters'] = pd.Series(df_k_means.predict(df_sample.iloc[:,1:14].values), index = df_sample.index)\n\nax = sns.scatterplot(data = df_sample, x = 'c1', y = 'c2', hue = 'Three clusters')\nax.set_title('Results of the algorithm visualised over PCs')\n\nWe may want to automate the process of resampling the data or rerunning the model then perhaps plotting the different inertia values or creating different plots.\nDo you think our clustering algorithm is stable and provide similiar results even when some data is removed or the initial values are random?\nIf so, then is our algorithm capturing the ground truth?\n\n\n\n\nCortez, Paulo, A Cerdeira, F Almeida, T Matos, and J. Reis. 2009. “Wine Quality.” UCI Machine Learning Repository. https://doi.org/10.24432/C56S3T." + "text": "21.2 Cluster analysis\nFor now we will just run a kmeans cluster and then check our results against the ground truth.\n\n21.2.1 Determining the number of clusters\nLets decide how many clusters we need.\n\nfrom sklearn.cluster import KMeans\n\nks = range(1, 10)\ninertias = []\nfor k in ks:\n # Create a KMeans instance with k clusters: model\n model = KMeans(n_clusters=k, n_init = 10)\n \n # Fit model to samples\n model.fit(df.iloc[:,1:])\n \n # Append the inertia to the list of inertias\n inertias.append(model.inertia_)\n\nimport matplotlib.pyplot as plt\n\nplt.plot(ks, inertias, '-o', color='black')\nplt.xlabel('number of clusters, k')\nplt.ylabel('inertia')\nplt.xticks(ks)\nplt.show()\n\n\n\n\nWhat happens if we use the normalised data instead?\n\nfrom sklearn.cluster import KMeans\n\nks = range(1, 10)\ninertias = []\nfor k in ks:\n # Create a KMeans instance with k clusters: model\n model = KMeans(n_clusters=k, n_init = 10)\n \n # Fit model to samples\n model.fit(df_norm.iloc[:,1:])\n \n # Append the inertia to the list of inertias\n inertias.append(model.inertia_)\n\nimport matplotlib.pyplot as plt\n\nplt.plot(ks, inertias, '-o', color='black')\nplt.xlabel('number of clusters, k')\nplt.ylabel('inertia')\nplt.xticks(ks)\nplt.show()\n\n\n\n\n\n\n\n\n\n\nPause for thought\n\n\n\nBoth of the graphs are the same. 
Is that what you would expect?\n\n\nThree clusters seems about right (and matches our number of origonal labels).\n\ndf['Class label'].value_counts()\n\nClass label\n2 71\n1 59\n3 48\nName: count, dtype: int64\n\n\n\n\n21.2.2 Computing the clusters\n\n# Create a KMeans instance with k clusters: model\nk_means = KMeans(n_clusters=3)\n\n# Fit model to samples\ndf_k_means = k_means.fit(df.iloc[:,1:])\n\ndf['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:].values), index = df.index)\ndf\n\n\n\n\n\n\n\n\nClass label\nAlcohol\nMalic acid\nAsh\nAlcalinity of ash\nMagnesium\nTotal phenols\nFlavanoids\nNonflavanoid phenols\nProanthocyanins\nColor intensity\nHue\nOD280/OD315 of diluted wines\nProline\nThree clusters\n\n\n\n\n0\n1\n14.23\n1.71\n2.43\n15.6\n127\n2.80\n3.06\n0.28\n2.29\n5.64\n1.04\n3.92\n1065\n1\n\n\n1\n1\n13.20\n1.78\n2.14\n11.2\n100\n2.65\n2.76\n0.26\n1.28\n4.38\n1.05\n3.40\n1050\n1\n\n\n2\n1\n13.16\n2.36\n2.67\n18.6\n101\n2.80\n3.24\n0.30\n2.81\n5.68\n1.03\n3.17\n1185\n1\n\n\n3\n1\n14.37\n1.95\n2.50\n16.8\n113\n3.85\n3.49\n0.24\n2.18\n7.80\n0.86\n3.45\n1480\n1\n\n\n4\n1\n13.24\n2.59\n2.87\n21.0\n118\n2.80\n2.69\n0.39\n1.82\n4.32\n1.04\n2.93\n735\n2\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n173\n3\n13.71\n5.65\n2.45\n20.5\n95\n1.68\n0.61\n0.52\n1.06\n7.70\n0.64\n1.74\n740\n2\n\n\n174\n3\n13.40\n3.91\n2.48\n23.0\n102\n1.80\n0.75\n0.43\n1.41\n7.30\n0.70\n1.56\n750\n2\n\n\n175\n3\n13.27\n4.28\n2.26\n20.0\n120\n1.59\n0.69\n0.43\n1.35\n10.20\n0.59\n1.56\n835\n2\n\n\n176\n3\n13.17\n2.59\n2.37\n20.0\n120\n1.65\n0.68\n0.53\n1.46\n9.30\n0.60\n1.62\n840\n2\n\n\n177\n3\n14.13\n4.10\n2.74\n24.5\n96\n2.05\n0.76\n0.56\n1.35\n9.20\n0.61\n1.60\n560\n0\n\n\n\n\n178 rows × 15 columns" + }, + { + "objectID": "content/labs/Lab_5/IM939_Lab_5_1.html#clusters-and-ground-truth", + "href": "content/labs/Lab_5/IM939_Lab_5_1.html#clusters-and-ground-truth", + "title": "21  Lab: Clustering and Ground Truth", + "section": "21.3 Clusters and Ground Truth", + "text": "21.3 Clusters and Ground Truth\nNow that we have created three clusters, we may ask ourselves: Do our cluster labels match our ground truth? Did our cluster model capture reality?\n\nct = pd.crosstab(df['Three clusters'], df['Class label'])\nct\n\n\n\n\n\n\n\nClass label\n1\n2\n3\n\n\nThree clusters\n\n\n\n\n\n\n\n0\n0\n50\n19\n\n\n1\n46\n1\n0\n\n\n2\n13\n20\n29\n\n\n\n\n\n\n\nIt might be easier to see as a stacked plot (see this post).\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nct.plot.bar(stacked=True)\nplt.legend(title='Class label')\n\n<matplotlib.legend.Legend at 0x1798f3e50>\n\n\n\n\n\nHow has the kmeans model done compared to our ground truth?\n\n\n\n\n\n\nImportant\n\n\n\nWe need to be really careful here. We notice that it is not easily possible to compare the known class labels to clustering labels. The reason is that the clustering algorithm labels are just arbitrary and not assigned to any deterministic criteria. Each time you run the algorithm, you might get a different id for the labels. The reason is that the label itself doesn’t actually mean anything, what is important is the list of items that are in the same cluster and their relations.\n\n\n\n21.3.1 Principal Components Analysis\nA way to come over this ambiguity and evaluate the results is to look at a visualisations of the results and compare. But this brings in the question of what type of visualisation to use for looking at the clusters. An immediate alternative is to use scatterplots. 
However, it is not clear which axis to use for clustering. A common method to apply at this stage is to make use of PCA to get a 2D plane where we can project the data points and visualise them over this projection.\n\ndf.iloc[:,1:14]\n\n\n\n\n\n\n\n\nAlcohol\nMalic acid\nAsh\nAlcalinity of ash\nMagnesium\nTotal phenols\nFlavanoids\nNonflavanoid phenols\nProanthocyanins\nColor intensity\nHue\nOD280/OD315 of diluted wines\nProline\n\n\n\n\n0\n14.23\n1.71\n2.43\n15.6\n127\n2.80\n3.06\n0.28\n2.29\n5.64\n1.04\n3.92\n1065\n\n\n1\n13.20\n1.78\n2.14\n11.2\n100\n2.65\n2.76\n0.26\n1.28\n4.38\n1.05\n3.40\n1050\n\n\n2\n13.16\n2.36\n2.67\n18.6\n101\n2.80\n3.24\n0.30\n2.81\n5.68\n1.03\n3.17\n1185\n\n\n3\n14.37\n1.95\n2.50\n16.8\n113\n3.85\n3.49\n0.24\n2.18\n7.80\n0.86\n3.45\n1480\n\n\n4\n13.24\n2.59\n2.87\n21.0\n118\n2.80\n2.69\n0.39\n1.82\n4.32\n1.04\n2.93\n735\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n173\n13.71\n5.65\n2.45\n20.5\n95\n1.68\n0.61\n0.52\n1.06\n7.70\n0.64\n1.74\n740\n\n\n174\n13.40\n3.91\n2.48\n23.0\n102\n1.80\n0.75\n0.43\n1.41\n7.30\n0.70\n1.56\n750\n\n\n175\n13.27\n4.28\n2.26\n20.0\n120\n1.59\n0.69\n0.43\n1.35\n10.20\n0.59\n1.56\n835\n\n\n176\n13.17\n2.59\n2.37\n20.0\n120\n1.65\n0.68\n0.53\n1.46\n9.30\n0.60\n1.62\n840\n\n\n177\n14.13\n4.10\n2.74\n24.5\n96\n2.05\n0.76\n0.56\n1.35\n9.20\n0.61\n1.60\n560\n\n\n\n\n178 rows × 13 columns\n\n\n\n\nfrom sklearn.decomposition import PCA\n\nn_components = 2\n\npca = PCA(n_components=n_components)\ndf_pca = pca.fit(df.iloc[:,1:14])\ndf_pca_vals = df_pca.transform(df.iloc[:,1:14])\n\nGrab our projections and plot along with our cluster names.\n\ndf['c1'] = [item[0] for item in df_pca_vals]\ndf['c2'] = [item[1] for item in df_pca_vals]\n\nax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Class label')\nax.set_title('Known labels visualised over PCs')\n\nText(0.5, 1.0, 'Known labels visualised over PCs')\n\n\n\n\nIn the figure above, we colored the points based on the actual labels, and we observe that there have been several misclassifications (i.e., in the algorithm’s results). So one may choose to use an alternative algorithm or devise a better distance metric.\n\nax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')\nax.set_title('Results of the algorithm visualised over PCs')\n\nText(0.5, 1.0, 'Results of the algorithm visualised over PCs')\n\n\n\n\nThis shows the parallelism between the clustering algorithm and PCA. By looking at the PCA loadings, we can find out what the x-axis means and try to interpret the clusters (we leave this as an additional exercise for those interested).\nHow might you interpret the above plots? Did the kmeans model identify the ground truth?\nHow robust is our clustering? It may be that the kmeans algorithm became stuck or that a few outliers have biased the clustering.\nTwo ways to check are:\n\nRunning the model multiple times with different initial values.\nRemoving some data and rerunning the model multiple times.\n\nRun the cell below a few times. 
What do you see?\n\n# Create a KMeans instance with k clusters: model\nk_means = KMeans(n_clusters=3, init='random', n_init = 10)\n\n# Fit model to samples\ndf_k_means = k_means.fit(df.iloc[:,1:14])\n\ndf['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:14].values), index = df.index)\n\nax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')\nax.set_title('Results of the algorithm visualised over PCs')\n\nText(0.5, 1.0, 'Results of the algorithm visualised over PCs')\n\n\n\n\n\nHow about with only 80% of the data?\n\ndf_sample = df.sample(frac=0.8, replace=False)\n\n# Create a KMeans instance with k clusters: model\nk_means = KMeans(n_clusters=3, init='random', n_init = 10)\n\n# Fit model to samples\ndf_k_means = k_means.fit(df_sample.iloc[:,1:14])\n\ndf_sample['Three clusters'] = pd.Series(df_k_means.predict(df_sample.iloc[:,1:14].values), index = df_sample.index)\n\nax = sns.scatterplot(data = df_sample, x = 'c1', y = 'c2', hue = 'Three clusters')\nax.set_title('Results of the algorithm visualised over PCs')\n\nText(0.5, 1.0, 'Results of the algorithm visualised over PCs')\n\n\n\n\n\nWe may want to automate the process of resampling the data or rerunning the model then perhaps plotting the different inertia values or creating different plots.\nDo you think our clustering algorithm is stable and provide similiar results even when some data is removed or the initial values are random?\nIf so, then is our algorithm capturing the ground truth?\n\n\n\n\nCortez, Paulo, A Cerdeira, F Almeida, T Matos, and J. Reis. 2009. “Wine Quality.” UCI Machine Learning Repository. https://doi.org/10.24432/C56S3T." }, { "objectID": "content/labs/Lab_5/IM939_Lab_5_2.html", "href": "content/labs/Lab_5/IM939_Lab_5_2.html", "title": "22  Lab: Cross validation", "section": "", - "text": "Details of the crime dataset are here.\nWe are going to examine the data, fit and then cross-validate a regression model.\n\nimport pandas as pd\ndf = pd.read_csv('data/censusCrimeClean.csv')\ndf.head()\n\n\n\n\n\n\n\n\ncommunityname\nfold\npopulation\nhouseholdsize\nracepctblack\nracePctWhite\nracePctAsian\nracePctHisp\nagePct12t21\nagePct12t29\n...\nNumStreet\nPctForeignBorn\nPctBornSameState\nPctSameHouse85\nPctSameCity85\nPctSameState85\nLandArea\nPopDens\nPctUsePubTrans\nViolentCrimesPerPop\n\n\n\n\n0\nLakewoodcity\n1\n0.19\n0.33\n0.02\n0.90\n0.12\n0.17\n0.34\n0.47\n...\n0.0\n0.12\n0.42\n0.50\n0.51\n0.64\n0.12\n0.26\n0.20\n0.20\n\n\n1\nTukwilacity\n1\n0.00\n0.16\n0.12\n0.74\n0.45\n0.07\n0.26\n0.59\n...\n0.0\n0.21\n0.50\n0.34\n0.60\n0.52\n0.02\n0.12\n0.45\n0.67\n\n\n2\nAberdeentown\n1\n0.00\n0.42\n0.49\n0.56\n0.17\n0.04\n0.39\n0.47\n...\n0.0\n0.14\n0.49\n0.54\n0.67\n0.56\n0.01\n0.21\n0.02\n0.43\n\n\n3\nWillingborotownship\n1\n0.04\n0.77\n1.00\n0.08\n0.12\n0.10\n0.51\n0.50\n...\n0.0\n0.19\n0.30\n0.73\n0.64\n0.65\n0.02\n0.39\n0.28\n0.12\n\n\n4\nBethlehemtownship\n1\n0.01\n0.55\n0.02\n0.95\n0.09\n0.05\n0.38\n0.38\n...\n0.0\n0.11\n0.72\n0.64\n0.61\n0.53\n0.04\n0.09\n0.02\n0.03\n\n\n\n\n5 rows × 102 columns\n\n\n\nOne hundred features. Too many for us to visualise at once.\nInstead, we can pick out particular variables and carry out a linear regression. To make our work simple we will look at ViolentCrimesPerPop as our dependent variable and medIncome as our indpendent variable.\nWe may wonder if there is more violent crime in low income areas.\nLet us create a new dataframe containing our regression variables. 
We do not have to do this I find it makes our work clearer.\n\ndf_reg = df[['communityname', 'medIncome', 'ViolentCrimesPerPop']]\ndf_reg\n\n\n\n\n\n\n\n\ncommunityname\nmedIncome\nViolentCrimesPerPop\n\n\n\n\n0\nLakewoodcity\n0.37\n0.20\n\n\n1\nTukwilacity\n0.31\n0.67\n\n\n2\nAberdeentown\n0.30\n0.43\n\n\n3\nWillingborotownship\n0.58\n0.12\n\n\n4\nBethlehemtownship\n0.50\n0.03\n\n\n...\n...\n...\n...\n\n\n1989\nTempleTerracecity\n0.42\n0.09\n\n\n1990\nSeasidecity\n0.28\n0.45\n\n\n1991\nWaterburytown\n0.31\n0.23\n\n\n1992\nWalthamcity\n0.44\n0.19\n\n\n1993\nOntariocity\n0.40\n0.48\n\n\n\n\n1994 rows × 3 columns\n\n\n\nPlot our data (a nice page on plotting regressions with seaborn is here).\n\nimport seaborn as sns\nsns.jointplot(data = df[['medIncome', 'ViolentCrimesPerPop']], \n x = 'ViolentCrimesPerPop', \n y = 'medIncome', kind='reg',\n marker = '.')\n\n\n\n\nWe may want to z-transform or log these scores as they are heavily skewed.\n\nimport numpy as np\n\n# some values are 0 so 0.1 is added to prevent log giving us infinity\n# there may be a better way to do this!\ndf_reg.loc[:, 'ViolentCrimesPerPop_log'] = np.log(df_reg['ViolentCrimesPerPop'] + 0.1)\ndf_reg.loc[:,'medIncome_log'] = np.log(df_reg['medIncome'] + 0.1)\n\n\ndf_reg\n\n\n\n\n\n\n\n\ncommunityname\nmedIncome\nViolentCrimesPerPop\nViolentCrimesPerPop_log\nmedIncome_log\n\n\n\n\n0\nLakewoodcity\n0.37\n0.20\n-1.203973\n-0.755023\n\n\n1\nTukwilacity\n0.31\n0.67\n-0.261365\n-0.891598\n\n\n2\nAberdeentown\n0.30\n0.43\n-0.634878\n-0.916291\n\n\n3\nWillingborotownship\n0.58\n0.12\n-1.514128\n-0.385662\n\n\n4\nBethlehemtownship\n0.50\n0.03\n-2.040221\n-0.510826\n\n\n...\n...\n...\n...\n...\n...\n\n\n1989\nTempleTerracecity\n0.42\n0.09\n-1.660731\n-0.653926\n\n\n1990\nSeasidecity\n0.28\n0.45\n-0.597837\n-0.967584\n\n\n1991\nWaterburytown\n0.31\n0.23\n-1.108663\n-0.891598\n\n\n1992\nWalthamcity\n0.44\n0.19\n-1.237874\n-0.616186\n\n\n1993\nOntariocity\n0.40\n0.48\n-0.544727\n-0.693147\n\n\n\n\n1994 rows × 5 columns\n\n\n\n\nimport seaborn as sns\nsns.jointplot(data = df_reg[['medIncome_log', 'ViolentCrimesPerPop_log']], \n x = 'ViolentCrimesPerPop_log', \n y = 'medIncome_log', kind='reg',\n marker = '.')\n\n\n\n\nIs log transforming our variables the right thing to do here?\nFit our regression to the log transformed data.\n\nimport matplotlib.pyplot as plt\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn import metrics\n\nx = df_reg[['ViolentCrimesPerPop_log']]\ny = df_reg[['medIncome_log']]\n\nmodel = LinearRegression()\nmodel.fit(x, y)\n\ny_hat = model.predict(x)\nplt.plot(x, y,'o', alpha = 0.5)\nplt.plot(x, y_hat, 'r', alpha = 0.5)\n\nplt.xlabel('Violent Crimes Per Population')\nplt.ylabel('Median Income')\n\nprint (\"MSE:\", metrics.mean_squared_error(y_hat, y))\nprint (\"R^2:\", metrics.r2_score(y, y_hat))\nprint (\"var:\", y.var())\n\nMSE: 0.1531885348757034\nR^2: 0.22763497704356928\nvar: medIncome_log 0.198436\ndtype: float64\n\n\n\n\n\nHas our log transformation distorted the pattern in the data?\n\nx = df_reg[['ViolentCrimesPerPop']]\ny = df_reg[['medIncome']]\n\nmodel = LinearRegression()\nmodel.fit(x, y)\n\ny_hat = model.predict(x)\nplt.plot(x, y,'o', alpha = 0.5)\nplt.plot(x, y_hat, 'r', alpha = 0.5)\n\nplt.xlabel('Violent Crimes Per Population')\nplt.ylabel('Median Income')\n\nprint (\"MSE:\", metrics.mean_squared_error(y_hat, y))\nprint (\"R^2:\", metrics.r2_score(y, y_hat))\nprint (\"var:\", y.var())\n\nMSE: 0.03592636778157073\nR^2: 0.17996313165549482\nvar: medIncome 0.043833\ndtype: 
float64\n\n\n\n\n\nWhat is the relationship between violent crime and median income? Why might this be?\nAssuming the log data is fine, have we overfit the model? Remember that a good model (which accurately models the relationship between violent crimes per population) need to be robust when faced with new data.\nKfold cross validation splits data into train and test subsets. We can then fit the regression to the training set and see how well it does for the test set.\n\nfrom sklearn.model_selection import KFold\n\nX = df_reg[['ViolentCrimesPerPop']]\ny = df_reg[['medIncome']]\n\n# get four splits, Each split contains a \n# test series and a train series.\nkf = KFold(n_splits=4)\n\n\n# lists to store our statistics\nr_vals = []\nMSEs = []\nmedIncome_coef = []\n\nfor train_index, test_index in kf.split(X):\n # fit our model and extract statistics\n model = LinearRegression()\n model.fit(X.iloc[train_index], y.iloc[train_index])\n y_hat = model.predict(X.iloc[test_index])\n \n MSEs.append(metrics.mean_squared_error(y.iloc[test_index], y_hat))\n medIncome_coef.append(model.coef_[0][0])\n r_vals.append(metrics.r2_score(y.iloc[test_index], y_hat))\n\n\ndata = {'MSE' : MSEs, 'medIncome coefficient' : medIncome_coef, 'r squared' : r_vals}\npd.DataFrame(data)\n\n\n\n\n\n\n\n\nMSE\nmedIncome coefficient\nr squared\n\n\n\n\n0\n0.035727\n-0.403609\n0.130479\n\n\n1\n0.035904\n-0.389344\n0.162820\n\n\n2\n0.040777\n-0.353379\n0.200139\n\n\n3\n0.032255\n-0.378883\n0.182403\n\n\n\n\n\n\n\nDoes our model produce similiar coefficients with subsets of the data?\nWe can do this using an inbuild sklearn function (see here).\n\nfrom sklearn.model_selection import cross_val_score\nx = df_reg[['ViolentCrimesPerPop']]\ny = df_reg[['medIncome']]\n\nmodel = LinearRegression()\nmodel.fit(x, y)\n\nprint(cross_val_score(model, x, y, cv=4))\n\n[0.13047946 0.16281953 0.20013867 0.18240261]\n\n\nWhat do these values tell us about our model and data?\nYou might want to carry out multiple regression with more than one predictor variable, or reduce the number of dimensions, or perhaps address different questions using a clustering algorithm instead with all or a subset of features." + "text": "Details of the crime dataset are here.\nWe are going to examine the data, fit and then cross-validate a regression model.\n\nimport pandas as pd\ndf = pd.read_csv('data/censusCrimeClean.csv')\ndf.head()\n\n\n\n\n\n\n\n\ncommunityname\nfold\npopulation\nhouseholdsize\nracepctblack\nracePctWhite\nracePctAsian\nracePctHisp\nagePct12t21\nagePct12t29\n...\nNumStreet\nPctForeignBorn\nPctBornSameState\nPctSameHouse85\nPctSameCity85\nPctSameState85\nLandArea\nPopDens\nPctUsePubTrans\nViolentCrimesPerPop\n\n\n\n\n0\nLakewoodcity\n1\n0.19\n0.33\n0.02\n0.90\n0.12\n0.17\n0.34\n0.47\n...\n0.0\n0.12\n0.42\n0.50\n0.51\n0.64\n0.12\n0.26\n0.20\n0.20\n\n\n1\nTukwilacity\n1\n0.00\n0.16\n0.12\n0.74\n0.45\n0.07\n0.26\n0.59\n...\n0.0\n0.21\n0.50\n0.34\n0.60\n0.52\n0.02\n0.12\n0.45\n0.67\n\n\n2\nAberdeentown\n1\n0.00\n0.42\n0.49\n0.56\n0.17\n0.04\n0.39\n0.47\n...\n0.0\n0.14\n0.49\n0.54\n0.67\n0.56\n0.01\n0.21\n0.02\n0.43\n\n\n3\nWillingborotownship\n1\n0.04\n0.77\n1.00\n0.08\n0.12\n0.10\n0.51\n0.50\n...\n0.0\n0.19\n0.30\n0.73\n0.64\n0.65\n0.02\n0.39\n0.28\n0.12\n\n\n4\nBethlehemtownship\n1\n0.01\n0.55\n0.02\n0.95\n0.09\n0.05\n0.38\n0.38\n...\n0.0\n0.11\n0.72\n0.64\n0.61\n0.53\n0.04\n0.09\n0.02\n0.03\n\n\n\n\n5 rows × 102 columns\n\n\n\nOne hundred features. 
Too many for us to visualise at once.\nInstead, we can pick out particular variables and carry out a linear regression. To make our work simple we will look at ViolentCrimesPerPop as our dependent variable and medIncome as our indpendent variable.\nWe may wonder if there is more violent crime in low income areas.\nLet us create a new dataframe containing our regression variables. We do not have to do this I find it makes our work clearer.\n\ndf_reg = df[['communityname', 'medIncome', 'ViolentCrimesPerPop']]\ndf_reg\n\n\n\n\n\n\n\n\ncommunityname\nmedIncome\nViolentCrimesPerPop\n\n\n\n\n0\nLakewoodcity\n0.37\n0.20\n\n\n1\nTukwilacity\n0.31\n0.67\n\n\n2\nAberdeentown\n0.30\n0.43\n\n\n3\nWillingborotownship\n0.58\n0.12\n\n\n4\nBethlehemtownship\n0.50\n0.03\n\n\n...\n...\n...\n...\n\n\n1989\nTempleTerracecity\n0.42\n0.09\n\n\n1990\nSeasidecity\n0.28\n0.45\n\n\n1991\nWaterburytown\n0.31\n0.23\n\n\n1992\nWalthamcity\n0.44\n0.19\n\n\n1993\nOntariocity\n0.40\n0.48\n\n\n\n\n1994 rows × 3 columns\n\n\n\nPlot our data (a nice page on plotting regressions with seaborn is here).\n\nimport seaborn as sns\nsns.jointplot(data = df[['medIncome', 'ViolentCrimesPerPop']], \n x = 'ViolentCrimesPerPop', \n y = 'medIncome', kind='reg',\n marker = '.')\n\n\n\n\nWe may want to z-transform or log these scores as they are heavily skewed.\n\nimport numpy as np\n\n# some values are 0 so 0.1 is added to prevent log giving us infinity\n# there may be a better way to do this!\ndf_reg.loc[:, 'ViolentCrimesPerPop_log'] = np.log(df_reg['ViolentCrimesPerPop'] + 0.1)\ndf_reg.loc[:,'medIncome_log'] = np.log(df_reg['medIncome'] + 0.1)\n\n/var/folders/7v/zl9mv52s3ls94kntlt_l9ryh0000gq/T/ipykernel_13528/3488182522.py:5: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n df_reg.loc[:, 'ViolentCrimesPerPop_log'] = np.log(df_reg['ViolentCrimesPerPop'] + 0.1)\n/var/folders/7v/zl9mv52s3ls94kntlt_l9ryh0000gq/T/ipykernel_13528/3488182522.py:6: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n df_reg.loc[:,'medIncome_log'] = np.log(df_reg['medIncome'] + 0.1)\n\n\n\ndf_reg\n\n\n\n\n\n\n\n\ncommunityname\nmedIncome\nViolentCrimesPerPop\nViolentCrimesPerPop_log\nmedIncome_log\n\n\n\n\n0\nLakewoodcity\n0.37\n0.20\n-1.203973\n-0.755023\n\n\n1\nTukwilacity\n0.31\n0.67\n-0.261365\n-0.891598\n\n\n2\nAberdeentown\n0.30\n0.43\n-0.634878\n-0.916291\n\n\n3\nWillingborotownship\n0.58\n0.12\n-1.514128\n-0.385662\n\n\n4\nBethlehemtownship\n0.50\n0.03\n-2.040221\n-0.510826\n\n\n...\n...\n...\n...\n...\n...\n\n\n1989\nTempleTerracecity\n0.42\n0.09\n-1.660731\n-0.653926\n\n\n1990\nSeasidecity\n0.28\n0.45\n-0.597837\n-0.967584\n\n\n1991\nWaterburytown\n0.31\n0.23\n-1.108663\n-0.891598\n\n\n1992\nWalthamcity\n0.44\n0.19\n-1.237874\n-0.616186\n\n\n1993\nOntariocity\n0.40\n0.48\n-0.544727\n-0.693147\n\n\n\n\n1994 rows × 5 columns\n\n\n\n\nimport seaborn as sns\nsns.jointplot(data = df_reg[['medIncome_log', 'ViolentCrimesPerPop_log']], \n x = 'ViolentCrimesPerPop_log', \n y = 'medIncome_log', kind='reg',\n marker = '.')\n\n\n\n\nIs log transforming our variables the right thing to 
do here?\nFit our regression to the log-transformed data.\n\nimport matplotlib.pyplot as plt\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn import metrics\n\nx = df_reg[['ViolentCrimesPerPop_log']]\ny = df_reg[['medIncome_log']]\n\nmodel = LinearRegression()\nmodel.fit(x, y)\n\ny_hat = model.predict(x)\nplt.plot(x, y,'o', alpha = 0.5)\nplt.plot(x, y_hat, 'r', alpha = 0.5)\n\nplt.xlabel('Violent Crimes Per Population')\nplt.ylabel('Median Income')\n\nprint (\"MSE:\", metrics.mean_squared_error(y_hat, y))\nprint (\"R^2:\", metrics.r2_score(y, y_hat))\nprint (\"var:\", y.var())\n\nMSE: 0.1531885348757034\nR^2: 0.22763497704356928\nvar: medIncome_log 0.198436\ndtype: float64\n\n\n\n\nHas our log transformation distorted the pattern in the data?\n\nx = df_reg[['ViolentCrimesPerPop']]\ny = df_reg[['medIncome']]\n\nmodel = LinearRegression()\nmodel.fit(x, y)\n\ny_hat = model.predict(x)\nplt.plot(x, y,'o', alpha = 0.5)\nplt.plot(x, y_hat, 'r', alpha = 0.5)\n\nplt.xlabel('Violent Crimes Per Population')\nplt.ylabel('Median Income')\n\nprint (\"MSE:\", metrics.mean_squared_error(y_hat, y))\nprint (\"R^2:\", metrics.r2_score(y, y_hat))\nprint (\"var:\", y.var())\n\nMSE: 0.03592636778157073\nR^2: 0.17996313165549482\nvar: medIncome 0.043833\ndtype: float64\n\n\n\n\nWhat is the relationship between violent crime and median income? Why might this be?\nAssuming the log data is fine, have we overfit the model? Remember that a good model (which accurately models the relationship between violent crimes per population) needs to be robust when faced with new data.\nK-fold cross-validation splits data into train and test subsets. We can then fit the regression to the training set and see how well it does for the test set.\n\nfrom sklearn.model_selection import KFold\n\nX = df_reg[['ViolentCrimesPerPop']]\ny = df_reg[['medIncome']]\n\n# Get four splits. Each split contains a\n# set of test indices and a set of training indices.\nkf = KFold(n_splits=4)\n\n\n# lists to store our statistics\nr_vals = []\nMSEs = []\nmedIncome_coef = []\n\nfor train_index, test_index in kf.split(X):\n # fit our model and extract statistics\n model = LinearRegression()\n model.fit(X.iloc[train_index], y.iloc[train_index])\n y_hat = model.predict(X.iloc[test_index])\n \n MSEs.append(metrics.mean_squared_error(y.iloc[test_index], y_hat))\n medIncome_coef.append(model.coef_[0][0])\n r_vals.append(metrics.r2_score(y.iloc[test_index], y_hat))\n\n\ndata = {'MSE' : MSEs, 'medIncome coefficient' : medIncome_coef, 'r squared' : r_vals}\npd.DataFrame(data)\n\n\n\n\n\n\n\n\nMSE\nmedIncome coefficient\nr squared\n\n\n\n\n0\n0.035727\n-0.403609\n0.130479\n\n\n1\n0.035904\n-0.389344\n0.162820\n\n\n2\n0.040777\n-0.353379\n0.200139\n\n\n3\n0.032255\n-0.378883\n0.182403\n\n\n\n\n\n\n\nDoes our model produce similar coefficients with subsets of the data?\nWe can do this using an inbuilt sklearn function (see here).\n\nfrom sklearn.model_selection import cross_val_score\nx = df_reg[['ViolentCrimesPerPop']]\ny = df_reg[['medIncome']]\n\nmodel = LinearRegression()\nmodel.fit(x, y)\n\nprint(cross_val_score(model, x, y, cv=4))\n\n[0.13047946 0.16281953 0.20013867 0.18240261]\n\n\nWhat do these values tell us about our model and data?\nYou might want to carry out multiple regression with more than one predictor variable, or reduce the number of dimensions, or perhaps address different questions using a clustering algorithm instead with all or a subset of features."
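The closing suggestion can be cross-validated in exactly the same way. A minimal sketch of a multiple regression, assuming the full df loaded at the start of this lab is still in memory (the particular predictor columns chosen here are only illustrative):

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Several predictors rather than one; ViolentCrimesPerPop as the response.
X_multi = df[['medIncome', 'PctUsePubTrans', 'PopDens']]
y_multi = df['ViolentCrimesPerPop']

multi_model = LinearRegression()

# Four-fold cross-validated R^2 scores, comparable to the single-predictor model above.
print(cross_val_score(multi_model, X_multi, y_multi, cv=4))

Comparing these scores with the single-predictor scores above indicates whether the extra variables help the model generalise rather than simply fit the training data.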
}, { "objectID": "content/labs/Lab_5/IM939_Lab_5_3.html", diff --git a/sitemap.xml b/sitemap.xml index c213962..2afae71 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -2,186 +2,186 @@ https://warwickcim.github.io/IM939_handbook/index.html - 2023-10-31T18:05:28.182Z + 2023-11-02T15:00:40.086Z https://warwickcim.github.io/IM939_handbook/content/about/teaching_staff.html - 2023-10-31T18:05:28.188Z + 2023-11-02T15:00:40.094Z https://warwickcim.github.io/IM939_handbook/content/about/im939.html - 2023-10-31T18:05:28.194Z + 2023-11-02T15:00:40.102Z https://warwickcim.github.io/IM939_handbook/content/about/teaching_materials.html - 2023-10-31T18:05:28.204Z + 2023-11-02T15:00:40.111Z https://warwickcim.github.io/IM939_handbook/content/about/conventions.html - 2023-10-31T18:05:28.229Z + 2023-11-02T15:00:40.144Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-01.html - 2023-10-31T18:05:28.236Z + 2023-11-02T15:00:40.152Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_1/IM939_Lab_1_1.html - 2023-10-31T18:05:28.248Z + 2023-11-02T15:00:40.166Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_1/IM939_Lab_1_2.html - 2023-10-31T18:05:28.256Z + 2023-11-02T15:00:40.177Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_1/IM939_Lab_1_3.html - 2023-10-31T18:05:28.270Z + 2023-11-02T15:00:40.193Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-02.html - 2023-10-31T18:05:28.275Z + 2023-11-02T15:00:40.201Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_2/IM939_Lab_2_1.html - 2023-10-31T18:05:28.307Z + 2023-11-02T15:00:40.238Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_2/IM939_Lab_2_2.html - 2023-10-31T18:05:28.319Z + 2023-11-02T15:00:40.252Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_2/IM939_Lab_2_3.html - 2023-10-31T18:05:28.334Z + 2023-11-02T15:00:40.268Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_2/IM939_Lab_2_4.html - 2023-10-31T18:05:28.346Z + 2023-11-02T15:00:40.283Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-03.html - 2023-10-31T18:05:28.353Z + 2023-11-02T15:00:40.291Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_3/IM939_Lab_3_1_Data_Processing_and_Summarization.html - 2023-10-31T18:05:28.380Z + 2023-11-02T15:00:40.324Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_3/IM939_Lab_3_2_Linear_Regression.html - 2023-10-31T18:05:28.399Z + 2023-11-02T15:00:40.346Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html - 2023-10-31T18:05:28.409Z + 2023-11-02T15:00:40.357Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-04.html - 2023-10-31T18:05:28.415Z + 2023-11-02T15:00:40.366Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_4/IM939_Session-04_PCA_playground.html - 2023-10-31T18:05:28.426Z + 2023-11-02T15:00:40.380Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_4/IM939_Lab_4_1_Iris.html - 2023-10-31T18:05:28.462Z + 2023-11-02T15:00:40.425Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_4/IM939_Lab_4_2_Crime.html - 2023-10-31T18:05:28.483Z + 2023-11-02T15:00:40.450Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_4/IM939_Lab_4_Exercises.html - 2023-10-31T18:05:28.496Z + 2023-11-02T15:00:40.465Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-05.html - 2023-10-31T18:05:28.501Z + 2023-11-02T15:00:40.472Z 
https://warwickcim.github.io/IM939_handbook/content/labs/Lab_5/week5_recap.html - 2023-10-31T18:05:28.509Z + 2023-11-02T15:00:40.480Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_5/IM939_Lab_5_1.html - 2023-10-31T18:05:28.521Z + 2023-11-02T15:00:40.506Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_5/IM939_Lab_5_2.html - 2023-10-31T18:05:28.535Z + 2023-11-02T15:00:40.523Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_5/IM939_Lab_5_3.html - 2023-10-31T18:05:28.551Z + 2023-11-02T15:00:40.543Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_5/IM939_lab_5_Exercise.html - 2023-10-31T18:05:28.557Z + 2023-11-02T15:00:40.549Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-06.html - 2023-10-31T18:05:28.563Z + 2023-11-02T15:00:40.557Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_6/IM939_Lab_6_1-illusions.html - 2023-10-31T18:05:28.578Z + 2023-11-02T15:00:40.575Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_6/IM939_Lab_6_2-AxisManipulation.html - 2023-10-31T18:05:28.595Z + 2023-11-02T15:00:40.596Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_6/IM939_Lab_6_3-Choropleths.html - 2023-10-31T18:05:28.804Z + 2023-11-02T15:00:40.834Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_6/IM939_Lab_6_4-Exercises.html - 2023-10-31T18:05:28.806Z + 2023-11-02T15:00:40.837Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_6/IM939_Lab_6_5-Simpsons_Paradox.html - 2023-10-31T18:05:28.828Z + 2023-11-02T15:00:40.864Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-07.html - 2023-10-31T18:05:28.834Z + 2023-11-02T15:00:40.871Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_7/IM939_Lab7-Part1.html - 2023-10-31T18:05:28.915Z + 2023-11-02T15:00:40.970Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_7/IM939_Lab7-Part2.html - 2023-10-31T18:05:28.927Z + 2023-11-02T15:00:40.986Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_7/IM939_Lab7-Part3.html - 2023-10-31T18:05:28.938Z + 2023-11-02T15:00:41.001Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_7/IM939_Lab7-Simpsons_Paradox2.html - 2023-10-31T18:05:28.959Z + 2023-11-02T15:00:41.028Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-08.html - 2023-10-31T18:05:28.967Z + 2023-11-02T15:00:41.037Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-08_WorkshopBrief.html - 2023-10-31T18:05:28.974Z + 2023-11-02T15:00:41.046Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-09.html - 2023-10-31T18:05:28.979Z + 2023-11-02T15:00:41.054Z https://warwickcim.github.io/IM939_handbook/content/references.html - 2023-10-31T18:05:28.987Z + 2023-11-02T15:00:41.063Z https://warwickcim.github.io/IM939_handbook/content/labs/labs_setup.html - 2023-10-31T18:05:28.995Z + 2023-11-02T15:00:41.072Z https://warwickcim.github.io/IM939_handbook/content/files-and-folders.html - 2023-10-31T18:05:29.000Z + 2023-11-02T15:00:41.079Z