diff --git a/.nojekyll b/.nojekyll index a502733..fc8afbc 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -eb45d49f \ No newline at end of file +da494773 \ No newline at end of file diff --git a/content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html b/content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html index 586500d..86367af 100644 --- a/content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html +++ b/content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html @@ -580,7 +580,7 @@

Table of contents

-

13  Exercise: Regression

+

13  Exercise: Regression

@@ -595,6 +595,10 @@

+
import warnings
+warnings.filterwarnings('ignore')
+

Now it’s your turn to prepare a linear regression model.

13.1 Scikit Learn

@@ -608,21 +612,24 @@

13.3 Reading Data

-
import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import seaborn as sns
+
import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
-
wine = pd.read_excel('data/raw/winequality-red_v2.xlsx', engine = 'openpyxl')
+
wine = pd.read_excel('data/raw/winequality-red_v2.xlsx', engine = 'openpyxl')
+
+#You might need to use encoding, then the code will look like:
+# wine = pd.read_excel('data/raw/winequality-red_v2.xlsx', engine = 'openpyxl', encoding='UTF-8')

13.4 Data exploration

Let’s check the data, their distributions and central tendencies.

-
print('shape:', wine.shape)
-wine.head()
+
print('shape:', wine.shape)
+wine.head()
shape: (1599, 12)
@@ -735,16 +742,16 @@

Use the lmplot() function from Seaborn to explore linear relationships. Input data must be in a Pandas DataFrame. To plot, we provide the predictor and response variable names along with the dataset.
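A minimal sketch of such a call is below; 'alcohol' and 'quality' are only assumed example columns from the wine dataframe, so swap in the variables you actually want to explore.

# hypothetical example columns - replace with your chosen predictor and response
sns.lmplot(data = wine, x = 'alcohol', y = 'quality')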

Did you find outliers or missing data? You can use the function np.unique to find the unique elements of an array.

-
?np.unique
+
?np.unique
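For example, a quick check might look like the sketch below ('quality' is an assumed example column); isnull() is another direct way to count missing values.

# unique values can reveal odd codings; isnull().sum() counts missing values per column
print(np.unique(wine['quality'].values))
print(wine.isnull().sum())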

Do you need to remove any cases?

-
 
+
 

Did you need to standardise the data?

If you standardised the data, try to plot it again.
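If you decide standardisation is needed, one possible sketch uses scikit-learn's StandardScaler to z-transform every column (whether this is appropriate is part of the exercise):

from sklearn.preprocessing import StandardScaler

# z-transform each column: subtract the mean and divide by the standard deviation
scaler = StandardScaler()
wine_std = pd.DataFrame(scaler.fit_transform(wine), columns = wine.columns)
wine_std.head()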

-
 
+
 

@@ -757,47 +764,47 @@

-
import scipy.stats
-scipy.stats.pearsonr(wine.???.values, wine.???.values)
+
import scipy.stats
+scipy.stats.pearsonr(wine.???.values, wine.???.values)
SyntaxError: invalid syntax (987973612.py, line 2)
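A completed call might look like the sketch below, where 'alcohol' and 'quality' are assumed example columns standing in for the ??? placeholders:

import scipy.stats
# returns the correlation coefficient and its p-value
scipy.stats.pearsonr(wine.alcohol.values, wine.quality.values)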

Using Scikit-learn, build a simple linear regression (OLS) model.

-
from sklearn.linear_model import LinearRegression
-
-est = LinearRegression(fit_intercept = True)
-
-x = wine[['???']]
-y = wine[['???']]
-
-est.fit(x, y)
-
-print("Coefficients:", est.coef_)
-print ("Intercept:", est.intercept_)
+
from sklearn.linear_model import LinearRegression
+
+est = LinearRegression(fit_intercept = True)
+
+x = wine[['???']]
+y = wine[['???']]
+
+est.fit(x, y)
+
+print("Coefficients:", est.coef_)
+print ("Intercept:", est.intercept_)
KeyError: "None of [Index(['???'], dtype='object')] are in the [columns]"

What is the model’s mean squared error (\(MSE\)) and the coefficient of determination (\(R^2\))?

-
from sklearn import metrics
-
-# Analysis for all months together.
-x = wdi[['???']]
-y = wdi[['???']]
-model = LinearRegression()
-model.fit(x, y)
-y_hat = model.predict(x)
-plt.plot(x, y,'o', alpha = 0.5)
-plt.plot(x, y_hat, 'r', alpha = 0.5)
-plt.xlabel('?')
-plt.ylabel('?')
-print ("MSE:", metrics.mean_squared_error(y_hat, y))
-print ("R^2:", metrics.r2_score(y_hat, y))
-print ("var:", y.var())
-plt.savefig("?.png", dpi = 300, bbox_inches = 'tight')
+
from sklearn import metrics
+
+# Analysis for all months together.
+x = wdi[['???']]
+y = wdi[['???']]
+model = LinearRegression()
+model.fit(x, y)
+y_hat = model.predict(x)
+plt.plot(x, y,'o', alpha = 0.5)
+plt.plot(x, y_hat, 'r', alpha = 0.5)
+plt.xlabel('?')
+plt.ylabel('?')
+print ("MSE:", metrics.mean_squared_error(y_hat, y))
+print ("R^2:", metrics.r2_score(y_hat, y))
+print ("var:", y.var())
+plt.savefig("?.png", dpi = 300, bbox_inches = 'tight')
NameError: name 'wdi' is not defined
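The cell above refers to a dataframe called wdi, which is presumably meant to be wine, hence the NameError. A completed version might look like the sketch below, with 'alcohol' and 'quality' again used as assumed example columns:

# hypothetical completion of the exercise cell - wdi replaced with wine
x = wine[['alcohol']]
y = wine[['quality']]
model = LinearRegression()
model.fit(x, y)
y_hat = model.predict(x)
plt.plot(x, y, 'o', alpha = 0.5)
plt.plot(x, y_hat, 'r', alpha = 0.5)
plt.xlabel('alcohol')
plt.ylabel('quality')
print("MSE:", metrics.mean_squared_error(y, y_hat))
print("R^2:", metrics.r2_score(y, y_hat))  # true values first, predictions second
print("var:", y.var())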
diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris.html b/content/labs/Lab_4/IM939_Lab_4_1_Iris.html index b8cde03..380675c 100644 --- a/content/labs/Lab_4/IM939_Lab_4_1_Iris.html +++ b/content/labs/Lab_4/IM939_Lab_4_1_Iris.html @@ -1651,13 +1651,13 @@

k_means.labels_
-
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-       1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,
-       0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0,
-       0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2], dtype=int32)
+
array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+       2, 2, 2, 2, 2, 2, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
+       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
+       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0], dtype=int32)

Each row has been assigned a label.

@@ -1689,7 +1689,7 @@

1 @@ -1697,7 +1697,7 @@

2 @@ -1705,7 +1705,7 @@

3 @@ -1713,7 +1713,7 @@

4 @@ -1721,7 +1721,7 @@

... @@ -1737,7 +1737,7 @@

146 @@ -1745,7 +1745,7 @@

147 @@ -1753,7 +1753,7 @@

148 @@ -1761,7 +1761,7 @@

149 @@ -1769,7 +1769,7 @@

k_means.cluster_centers_
-
array([[0.70726496, 0.4508547 , 0.79704476, 0.82478632],
-       [0.19611111, 0.595     , 0.07830508, 0.06083333],
-       [0.44125683, 0.30737705, 0.57571548, 0.54918033]])
+
array([[0.44125683, 0.30737705, 0.57571548, 0.54918033],
+       [0.70726496, 0.4508547 , 0.79704476, 0.82478632],
+       [0.19611111, 0.595     , 0.07830508, 0.06083333]])

It is tricky to plot these using seaborn but we can use a normal matplotlib scatter plot.

@@ -1851,7 +1851,7 @@

alpha = 1, color = 'black' )
-
<matplotlib.collections.PathCollection at 0x14fe387d0>
+
<matplotlib.collections.PathCollection at 0x162e90390>

@@ -1914,9 +1914,9 @@

0.625000 0.067797 0.041667 +2 1 -1 -4 +3 1 @@ -1924,9 +1924,9 @@

0.416667 0.067797 0.041667 +2 1 1 -0 2 @@ -1934,9 +1934,9 @@

0.500000 0.050847 0.041667 +2 1 1 -0 3 @@ -1944,9 +1944,9 @@

0.458333 0.084746 0.041667 +2 1 1 -0 4 @@ -1954,9 +1954,9 @@

0.666667 0.067797 0.041667 +2 1 -1 -4 +3 ... @@ -1974,9 +1974,9 @@

0.416667 0.711864 0.916667 +1 0 -0 -2 +4 146 @@ -1984,9 +1984,9 @@

0.208333 0.677966 0.750000 -2 0 -3 +0 +2 147 @@ -1994,9 +1994,9 @@

0.416667 0.711864 0.791667 +1 0 -0 -3 +2 148 @@ -2004,9 +2004,9 @@

0.583333 0.745763 0.916667 +1 0 -0 -2 +4 149 @@ -2014,9 +2014,9 @@

0.416667 0.694915 0.708333 -2 0 -3 +0 +2 @@ -2041,7 +2041,7 @@

k_means_5.inertia_
-
4.58977540011789
+
4.580948640117293

It looks like our k = 5 model captures the data well. Inertia, according to the sklearn documentation, is the Sum of squared distances of samples to their closest cluster center.
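As a sanity check, inertia can be computed by hand; the sketch below assumes iris_df holds the same normalised feature values the model was fitted on.

import numpy as np

# sum of squared distances from each point to its assigned cluster centre
X = iris_df.iloc[:, 0:4].values
centres = k_means_5.cluster_centers_
labels = k_means_5.labels_
print(np.sum((X - centres[labels]) ** 2))  # should be close to k_means_5.inertia_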

@@ -2315,9 +2315,9 @@

< 0.625000 0.067797 0.041667 +2 1 -1 -4 +3 -0.630703 0.107578 @@ -2327,9 +2327,9 @@

< 0.416667 0.067797 0.041667 +2 1 1 -0 -0.622905 -0.104260 @@ -2339,9 +2339,9 @@

< 0.500000 0.050847 0.041667 +2 1 1 -0 -0.669520 -0.051417 @@ -2351,9 +2351,9 @@

< 0.458333 0.084746 0.041667 +2 1 1 -0 -0.654153 -0.102885 @@ -2363,9 +2363,9 @@

< 0.666667 0.067797 0.041667 +2 1 -1 -4 +3 -0.648788 0.133488 @@ -2387,9 +2387,9 @@

< 0.416667 0.711864 0.916667 +1 0 -0 -2 +4 0.551462 0.059841 @@ -2399,9 +2399,9 @@

< 0.208333 0.677966 0.750000 -2 0 -3 +0 +2 0.407146 -0.171821 @@ -2411,9 +2411,9 @@

< 0.416667 0.711864 0.791667 +1 0 -0 -3 +2 0.447143 0.037560 @@ -2423,9 +2423,9 @@

< 0.583333 0.745763 0.916667 +1 0 -0 -2 +4 0.488208 0.149678 @@ -2435,9 +2435,9 @@

< 0.416667 0.694915 0.708333 -2 0 -3 +0 +2 0.312066 -0.031130 @@ -2497,9 +2497,9 @@

< 0.625000 0.067797 0.041667 +2 1 -1 -4 +3 -0.630703 0.107578 @@ -2509,9 +2509,9 @@

< 0.416667 0.067797 0.041667 +2 1 1 -0 -0.622905 -0.104260 @@ -2521,9 +2521,9 @@

< 0.500000 0.050847 0.041667 +2 1 1 -0 -0.669520 -0.051417 @@ -2533,9 +2533,9 @@

< 0.458333 0.084746 0.041667 +2 1 1 -0 -0.654153 -0.102885 @@ -2545,9 +2545,9 @@

< 0.666667 0.067797 0.041667 +2 1 -1 -4 +3 -0.648788 0.133488 @@ -2569,9 +2569,9 @@

< 0.416667 0.711864 0.916667 +1 0 -0 -2 +4 0.551462 0.059841 @@ -2581,9 +2581,9 @@

< 0.208333 0.677966 0.750000 -2 0 -3 +0 +2 0.407146 -0.171821 @@ -2593,9 +2593,9 @@

< 0.416667 0.711864 0.791667 +1 0 -0 -3 +2 0.447143 0.037560 @@ -2605,9 +2605,9 @@

< 0.583333 0.745763 0.916667 +1 0 -0 -2 +4 0.488208 0.149678 @@ -2617,9 +2617,9 @@

< 0.416667 0.694915 0.708333 -2 0 -3 +0 +2 0.312066 -0.031130 @@ -2675,12 +2675,12 @@

0.625000 0.067797 0.041667 +2 1 -1 -4 +3 -0.630703 0.107578 -1 +0 1 @@ -2688,12 +2688,12 @@

0.416667 0.067797 0.041667 +2 1 1 -0 -0.622905 -0.104260 -1 +0 2 @@ -2701,12 +2701,12 @@

0.500000 0.050847 0.041667 +2 1 1 -0 -0.669520 -0.051417 -1 +0 3 @@ -2714,12 +2714,12 @@

0.458333 0.084746 0.041667 +2 1 1 -0 -0.654153 -0.102885 -1 +0 4 @@ -2727,12 +2727,12 @@

0.666667 0.067797 0.041667 +2 1 -1 -4 +3 -0.648788 0.133488 -1 +0 ... @@ -2753,12 +2753,12 @@

0.416667 0.711864 0.916667 +1 0 -0 -2 +4 0.551462 0.059841 -0 +1 146 @@ -2766,9 +2766,9 @@

0.208333 0.677966 0.750000 -2 0 -3 +0 +2 0.407146 -0.171821 2 @@ -2779,12 +2779,12 @@

0.416667 0.711864 0.791667 +1 0 -0 -3 +2 0.447143 0.037560 -0 +1 148 @@ -2792,12 +2792,12 @@

0.583333 0.745763 0.916667 +1 0 -0 -2 +4 0.488208 0.149678 -0 +1 149 @@ -2805,9 +2805,9 @@

0.416667 0.694915 0.708333 -2 0 -3 +0 +2 0.312066 -0.031130 2 @@ -2853,7 +2853,7 @@

plt.xticks(ks) plt.show()
-

+

Three seems ok. We clearly want no more than three.

@@ -2878,10 +2878,10 @@

df.isna().sum()
-
sepal length (cm)    32
-sepal width (cm)     34
-petal length (cm)    37
-petal width (cm)     29
+
sepal length (cm)    29
+sepal width (cm)     21
+petal length (cm)    32
+petal width (cm)     21
 dtype: int64
@@ -2905,14 +2905,14 @@

0 5.1 -NaN -NaN +3.5 +1.4 0.2 1 4.9 -NaN +3.0 NaN 0.2 @@ -2927,15 +2927,15 @@

3 4.6 3.1 -1.5 NaN +0.2 4 -5.0 NaN +3.6 1.4 -NaN +0.2 ... @@ -2947,28 +2947,28 @@

145 6.7 -3.0 +NaN 5.2 2.3 146 -NaN +6.3 2.5 -5.0 +NaN 1.9 147 6.5 -NaN -NaN +3.0 +5.2 2.0 148 -6.2 NaN +3.4 5.4 2.3 @@ -2976,8 +2976,8 @@

149 5.9 3.0 -5.1 -1.8 +NaN +NaN @@ -3014,14 +3014,14 @@

0 5.1 -0.0 -0.0 +3.5 +1.4 0.2 1 4.9 -0.0 +3.0 0.0 0.2 @@ -3036,15 +3036,15 @@

3 4.6 3.1 -1.5 0.0 +0.2 4 -5.0 0.0 +3.6 1.4 -0.0 +0.2 ... @@ -3056,28 +3056,28 @@

145 6.7 -3.0 +0.0 5.2 2.3 146 -0.0 +6.3 2.5 -5.0 +0.0 1.9 147 6.5 -0.0 -0.0 +3.0 +5.2 2.0 148 -6.2 0.0 +3.4 5.4 2.3 @@ -3085,8 +3085,8 @@

149 5.9 3.0 -5.1 -1.8 +0.0 +0.0 @@ -3123,20 +3123,20 @@

<Axes: xlabel='c1', ylabel='c2'>
-

+

df_1_pca.explained_variance_
-
array([6.71803744, 4.89376791])
+
array([6.24279356, 4.84811544])
df_1_pca.components_
-
array([[-0.91235845,  0.02968512, -0.38161438, -0.14522853],
-       [-0.39939351,  0.05086373,  0.90393389,  0.14422629]])
+
array([[-0.86129917,  0.04084996, -0.48641492, -0.14105157],
+       [-0.50682662, -0.04550418,  0.84286268,  0.175039  ]])
@@ -3167,37 +3167,37 @@

0 5.100000 -3.00431 -3.90885 +3.500000 +1.400000 0.200000 1 4.900000 -3.00431 -3.90885 +3.000000 +3.877119 0.200000 2 -5.866102 -3.20000 -1.30000 +5.839669 +3.200000 +1.300000 0.200000 3 4.600000 -3.10000 -1.50000 -1.210744 +3.100000 +3.877119 +0.200000 4 -5.000000 -3.00431 -1.40000 -1.210744 +5.839669 +3.600000 +1.400000 +0.200000 ... @@ -3209,37 +3209,37 @@

145 6.700000 -3.00000 -5.20000 +3.054264 +5.200000 2.300000 146 -5.866102 -2.50000 -5.00000 +6.300000 +2.500000 +3.877119 1.900000 147 6.500000 -3.00431 -3.90885 +3.000000 +5.200000 2.000000 148 -6.200000 -3.00431 -5.40000 +5.839669 +3.400000 +5.400000 2.300000 149 5.900000 -3.00000 -5.10000 -1.800000 +3.000000 +3.877119 +1.205426 @@ -3280,14 +3280,14 @@

df_2_pca.explained_variance_
-
array([2.68417915, 0.33506061])
+
array([3.01818399, 0.26633671])
df_2_pca.components_
-
array([[ 0.33775908, -0.04345744,  0.87824143,  0.33574133],
-       [ 0.82803166,  0.20108365, -0.42517727,  0.30521014]])
+
array([[ 0.31417904, -0.06487468,  0.88369345,  0.34083528],
+       [ 0.89110506,  0.17000084, -0.37665661,  0.18751344]])
diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-20-output-1.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-20-output-1.png index 134a3b4..77f52f6 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-20-output-1.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-20-output-1.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-21-output-2.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-21-output-2.png index 709c0ba..6b0d389 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-21-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-21-output-2.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-25-output-2.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-25-output-2.png index 3bc769f..313727e 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-25-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-25-output-2.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-29-output-1.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-29-output-1.png index a5fe247..9c9b7ae 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-29-output-1.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-29-output-1.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-47-output-2.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-47-output-2.png index 3932a6d..3b5f16f 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-47-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-47-output-2.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-52-output-2.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-52-output-2.png index 50599b1..a00bf08 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-52-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-52-output-2.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-53-output-1.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-53-output-1.png index 02e1ffe..e1c18d2 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-53-output-1.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-53-output-1.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-59-output-1.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-59-output-1.png index 4ed7452..348399d 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-59-output-1.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-59-output-1.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-60-output-2.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-60-output-2.png index a4f6bc3..a397ab7 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-60-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-60-output-2.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-65-output-1.png 
b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-65-output-1.png index b1c2cf2..8a89879 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-65-output-1.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-65-output-1.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-66-output-2.png b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-66-output-2.png index d496c2b..ed10bb5 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-66-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_1_Iris_files/figure-html/cell-66-output-2.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-15-output-2.png b/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-15-output-2.png index 4a6e529..2505d8a 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-15-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-15-output-2.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-16-output-1.png b/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-16-output-1.png index 878f660..af515ab 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-16-output-1.png and b/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-16-output-1.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-18-output-2.png b/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-18-output-2.png index c3d7f56..322439b 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-18-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-18-output-2.png differ diff --git a/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-19-output-2.png b/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-19-output-2.png index f18e6d6..ed64a82 100644 Binary files a/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-19-output-2.png and b/content/labs/Lab_4/IM939_Lab_4_2_Crime_files/figure-html/cell-19-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1.html b/content/labs/Lab_5/IM939_Lab_5_1.html index 52b0099..ca70f26 100644 --- a/content/labs/Lab_5/IM939_Lab_5_1.html +++ b/content/labs/Lab_5/IM939_Lab_5_1.html @@ -115,11 +115,17 @@ "search-label": "Search" } } + + + - + + + + @@ -561,9 +567,12 @@

Table of contents

  • 21.1 Data Wrangling
  • 21.2 Cluster analysis
  • +
  • 21.3 Clusters and Ground Truth +
  • @@ -588,122 +597,512 @@

    21  Cortez et al. (2009) that we used in the past (Chapter 13) and you may be familiar with by now (but if you don’t, tou can find more information about it here: https://doi.org/10.24432/C56S3T).

    +

We are going to use the Wine Quality Dataset from Cortez et al. (2009) that you may be familiar with by now (but if you don’t, you can find more information about it here: https://doi.org/10.24432/C56S3T).

    21.1 Data Wrangling

    -

    As usual, we will start by looking at our data, and making transformations, if needed.

    -
    +
    import pandas as pd
     
    -df = pd.read_csv('data/wine.csv')
    -
    -df.head()
    +df = pd.read_csv('data/wine.csv')
    -
    -
    -
    - +

    Look at our data.

    +
    +
    df.head()
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Class labelAlcoholMalic acidAshAlcalinity of ashMagnesiumTotal phenolsFlavanoidsNonflavanoid phenolsProanthocyaninsColor intensityHueOD280/OD315 of diluted winesProline
    0114.231.712.4315.61272.803.060.282.295.641.043.921065
    1113.201.782.1411.21002.652.760.261.284.381.053.401050
    2113.162.362.6718.61012.803.240.302.815.681.033.171185
    3114.371.952.5016.81133.853.490.242.187.800.863.451480
    4113.242.592.8721.01182.802.690.391.824.321.042.93735
    +
    -
    -Tip
    -
    -

    There is a column called Class label that gives us the ground truth. The wines come from three different cultivars. Knowing the actual grouping helps us to identify how well our methods can capture this ground truth.

    -
    -
    -

    Following the data wrangling process that was summarised in Chapter 20, we should first get a sense of our data.

    -
    -
    df.describe()
    -
    -

As you can see, no variable has any missing data, but the scales of our features vary (e.g., Magnesium is in the 100s whereas Hue is in the low single digits).

    -

    Let’s visually inspect how features are distributed using a violin plot:

    -
    -
    import seaborn as sns
    -
    -df_long = df.melt(id_vars='Class label')
    -
    -sns.violinplot(data = df_long, x = 'variable', y = 'value')
    -
    -

Regrettably, this is not very useful right now, due to the different scales that we detected previously. In this case, it makes sense to normalise our data.

    -
    -
    from sklearn.preprocessing import MinMaxScaler
    -
    -# create a scaler object
    -scaler = MinMaxScaler()
    -
    -# fit and transform the data
    -df_norm = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)
    -
    -df_long = df_norm.melt(id_vars='Class label')
    -df_long
    -
    -
    -
    #create seaborn violin plot
    -my_plot = sns.violinplot(data = df_long, x = 'variable', y = 'value')
    -
    -#rotate x-axis labels
    -my_plot.set_xticklabels(my_plot.get_xticklabels(), rotation=90)
    +

    There is a column called Class label that gives us the ground truth. The wines come from three different cultivars. Knowing the actual grouping helps us to identify how well our methods can capture this ground truth.

    +

    Following our process above, we should first get a sense of our data.

    +
    +
    df.describe()
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Class labelAlcoholMalic acidAshAlcalinity of ashMagnesiumTotal phenolsFlavanoidsNonflavanoid phenolsProanthocyaninsColor intensityHueOD280/OD315 of diluted winesProline
    count178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000
    mean1.93820213.0006182.3363482.36651719.49494499.7415732.2951122.0292700.3618541.5908995.0580900.9574492.611685746.893258
    std0.7750350.8118271.1171460.2743443.33956414.2824840.6258510.9988590.1244530.5723592.3182860.2285720.709990314.907474
    min1.00000011.0300000.7400001.36000010.60000070.0000000.9800000.3400000.1300000.4100001.2800000.4800001.270000278.000000
    25%1.00000012.3625001.6025002.21000017.20000088.0000001.7425001.2050000.2700001.2500003.2200000.7825001.937500500.500000
    50%2.00000013.0500001.8650002.36000019.50000098.0000002.3550002.1350000.3400001.5550004.6900000.9650002.780000673.500000
    75%3.00000013.6775003.0825002.55750021.500000107.0000002.8000002.8750000.4375001.9500006.2000001.1200003.170000985.000000
    max3.00000014.8300005.8000003.23000030.000000162.0000003.8800005.0800000.6600003.58000013.0000001.7100004.0000001680.000000
    + +
    +
    +
    +

    No missing data. The scales of our features vary (e.g., Magnesium is in the 100s whereas Hue is in the low single digits).

    +

    How about our feature distributions?

    +
    +
    df_long = df.melt(id_vars='Class label')
    +
    +
    +
    import seaborn as sns
    +
    +sns.violinplot(data = df_long, x = 'variable', y = 'value')
    +
    +
    <Axes: xlabel='variable', ylabel='value'>
    +
    +
    +

    +
    +
    +

It makes sense to normalise our data.

    +
    +
    from sklearn.preprocessing import MinMaxScaler
    +
    +# create a scaler object
    +scaler = MinMaxScaler()
    +
    +# fit and transform the data
    +df_norm = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)
    +
    +df_long = df_norm.melt(id_vars='Class label')
    +df_long
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Class labelvariablevalue
    00.0Alcohol0.842105
    10.0Alcohol0.571053
    20.0Alcohol0.560526
    30.0Alcohol0.878947
    40.0Alcohol0.581579
    ............
    23091.0Proline0.329529
    23101.0Proline0.336662
    23111.0Proline0.397290
    23121.0Proline0.400856
    23131.0Proline0.201141
    + +

    2314 rows × 3 columns

    +
    +
    +
    +
    +
    #create seaborn violin plot
    +my_plot = sns.violinplot(data = df_long, x = 'variable', y = 'value')
    +
    +#rotate x-axis labels
    +my_plot.set_xticklabels(my_plot.get_xticklabels(), rotation=90)
    +
    +
    [Text(0, 0, 'Alcohol'),
    + Text(1, 0, 'Malic acid'),
    + Text(2, 0, 'Ash'),
    + Text(3, 0, 'Alcalinity of ash'),
    + Text(4, 0, 'Magnesium'),
    + Text(5, 0, 'Total phenols'),
    + Text(6, 0, 'Flavanoids'),
    + Text(7, 0, 'Nonflavanoid phenols'),
    + Text(8, 0, 'Proanthocyanins'),
    + Text(9, 0, 'Color intensity'),
    + Text(10, 0, 'Hue'),
    + Text(11, 0, 'OD280/OD315 of diluted wines'),
    + Text(12, 0, 'Proline ')]
    +
    +
    +

    +

    Are there any patterns?

    How about a pairplot?

    -
    -
    sns.pairplot(data = df_norm.iloc[:,1:])
    +
    +
    sns.pairplot(data = df_norm.iloc[:,1:])
    +
    +

    +

Hmm, a few interesting correlations. Some of our variables are skewed. We could apply some PCA here to look at fewer dimensions, or even log-transform some of the skewed variables.
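As a rough sketch of the log-transform idea (the chosen columns here are just illustrative examples, not a recommendation):

import numpy as np

# log1p handles zero values safely; applied only to a couple of skewed features
skewed_cols = ['Malic acid', 'Color intensity']
df_log = df_norm.copy()
df_log[skewed_cols] = np.log1p(df_log[skewed_cols])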

    21.2 Cluster analysis

For now we will just run a kmeans clustering and then check our results against the ground truth.

    -
    -

    21.2.1 Number of clusters

    +
    +

    21.2.1 Determining the number of clusters

Let’s decide how many clusters we need.

    -
    -
    from sklearn.cluster import KMeans
    -
    -ks = range(1, 10)
    -inertias = []
    -for k in ks:
    -    # Create a KMeans instance with k clusters: model
    -    model = KMeans(n_clusters=k, n_init = 10)
    -    
    -    # Fit model to samples
    -    model.fit(df.iloc[:,1:])
    -    
    -    # Append the inertia to the list of inertias
    -    inertias.append(model.inertia_)
    -
    -import matplotlib.pyplot as plt
    -
    -plt.plot(ks, inertias, '-o', color='black')
    -plt.xlabel('number of clusters, k')
    -plt.ylabel('inertia')
    -plt.xticks(ks)
    -plt.show()
    +
    +
    from sklearn.cluster import KMeans
    +
    +ks = range(1, 10)
    +inertias = []
    +for k in ks:
    +    # Create a KMeans instance with k clusters: model
    +    model = KMeans(n_clusters=k, n_init = 10)
    +    
    +    # Fit model to samples
    +    model.fit(df.iloc[:,1:])
    +    
    +    # Append the inertia to the list of inertias
    +    inertias.append(model.inertia_)
    +
    +import matplotlib.pyplot as plt
    +
    +plt.plot(ks, inertias, '-o', color='black')
    +plt.xlabel('number of clusters, k')
    +plt.ylabel('inertia')
    +plt.xticks(ks)
    +plt.show()
    +
    +

    +

    What happens if we use the normalised data instead?

    -
    -
    from sklearn.cluster import KMeans
    -
    -ks = range(1, 10)
    -inertias = []
    -for k in ks:
    -    # Create a KMeans instance with k clusters: model
    -    model = KMeans(n_clusters=k, n_init = 10)
    -    
    -    # Fit model to samples
    -    model.fit(df_norm.iloc[:,1:])
    -    
    -    # Append the inertia to the list of inertias
    -    inertias.append(model.inertia_)
    -
    -import matplotlib.pyplot as plt
    -
    -plt.plot(ks, inertias, '-o', color='black')
    -plt.xlabel('number of clusters, k')
    -plt.ylabel('inertia')
    -plt.xticks(ks)
    -plt.show()
    +
    +
    from sklearn.cluster import KMeans
    +
    +ks = range(1, 10)
    +inertias = []
    +for k in ks:
    +    # Create a KMeans instance with k clusters: model
    +    model = KMeans(n_clusters=k, n_init = 10)
    +    
    +    # Fit model to samples
    +    model.fit(df_norm.iloc[:,1:])
    +    
    +    # Append the inertia to the list of inertias
    +    inertias.append(model.inertia_)
    +
    +import matplotlib.pyplot as plt
    +
    +plt.plot(ks, inertias, '-o', color='black')
    +plt.xlabel('number of clusters, k')
    +plt.ylabel('inertia')
    +plt.xticks(ks)
    +plt.show()
    +
    +

    +
    @@ -719,39 +1118,324 @@

Three clusters seems about right (and matches our number of original labels).

    -
    -
    df['Class label'].value_counts()
    +
    +
    df['Class label'].value_counts()
    +
    +
    Class label
    +2    71
    +1    59
    +3    48
    +Name: count, dtype: int64
    +
    -
    -

    21.2.2 Calculate 3 clusters

    -

Now, we are going to calculate three clusters and store each observation’s cluster label in a new column of the original dataframe:

    -
    -
    # Create a KMeans instance with k clusters: model
    -k_means = KMeans(n_clusters=3)
    -
    -# Fit model to samples
    -df_k_means = k_means.fit(df.iloc[:,1:])
    -
    -# Create a new variable with the fited cluster label.
    -df['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:].values), index = df.index)
    -df
    +
    +

    21.2.2 Computing the clusters

    +
    +
    # Create a KMeans instance with k clusters: model
    +k_means = KMeans(n_clusters=3)
    +
    +# Fit model to samples
    +df_k_means = k_means.fit(df.iloc[:,1:])
    +
    +df['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:].values), index = df.index)
    +df
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Class labelAlcoholMalic acidAshAlcalinity of ashMagnesiumTotal phenolsFlavanoidsNonflavanoid phenolsProanthocyaninsColor intensityHueOD280/OD315 of diluted winesProlineThree clusters
    0114.231.712.4315.61272.803.060.282.295.641.043.9210651
    1113.201.782.1411.21002.652.760.261.284.381.053.4010501
    2113.162.362.6718.61012.803.240.302.815.681.033.1711851
    3114.371.952.5016.81133.853.490.242.187.800.863.4514801
    4113.242.592.8721.01182.802.690.391.824.321.042.937352
    ................................................
    173313.715.652.4520.5951.680.610.521.067.700.641.747402
    174313.403.912.4823.01021.800.750.431.417.300.701.567502
    175313.274.282.2620.01201.590.690.431.3510.200.591.568352
    176313.172.592.3720.01201.650.680.531.469.300.601.628402
    177314.134.102.7424.5962.050.760.561.359.200.611.605600
    + +

    178 rows × 15 columns

    +
    +
    -
    -

    21.2.3 Ground Truth Validation

    -

    Do our cluster labels match our ground truth? Did our cluster model capture reality?

    -
    -
    ct = pd.crosstab(df['Three clusters'], df['Class label'])
    -ct
    +
    +
    +

    21.3 Clusters and Ground Truth

    +

    Now that we have created three clusters, we may ask ourselves: Do our cluster labels match our ground truth? Did our cluster model capture reality?

    +
    +
    ct = pd.crosstab(df['Three clusters'], df['Class label'])
    +ct
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Class label123
    Three clusters
    005019
    14610
    2132029
    + +
    +

    It might be easier to see as a stacked plot (see this post).

    -
    -
    import matplotlib.pyplot as plt
    -import numpy as np
    -
    -ct.plot.bar(stacked=True)
    -plt.legend(title='Class label')
    +
    +
    import matplotlib.pyplot as plt
    +import numpy as np
    +
    +ct.plot.bar(stacked=True)
    +plt.legend(title='Class label')
    +
    +
    <matplotlib.legend.Legend at 0x1798f3e50>
    +
    +
    +

    +

    How has the kmeans model done compared to our ground truth?
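One way to quantify this agreement, not covered in the lab itself, is the adjusted Rand index, which compares two labellings while ignoring how the cluster numbers are permuted; a minimal sketch:

from sklearn.metrics import adjusted_rand_score

# 1.0 means perfect agreement, values near 0 mean roughly random assignment
print(adjusted_rand_score(df['Class label'], df['Three clusters']))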

    @@ -767,32 +1451,251 @@

    -
    df.iloc[:,1:14]
    +
    +

    21.3.1 Principal Components Analysis

    +

A way to overcome this ambiguity and evaluate the results is to look at visualisations of the results and compare. But this raises the question of what type of visualisation to use for looking at the clusters. An immediate option is to use scatterplots. However, it is not clear which axes to use for the clusters. A common method at this stage is to make use of PCA to get a 2D plane onto which we can project the data points and visualise them.

    +
    +
    df.iloc[:,1:14]
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    AlcoholMalic acidAshAlcalinity of ashMagnesiumTotal phenolsFlavanoidsNonflavanoid phenolsProanthocyaninsColor intensityHueOD280/OD315 of diluted winesProline
    014.231.712.4315.61272.803.060.282.295.641.043.921065
    113.201.782.1411.21002.652.760.261.284.381.053.401050
    213.162.362.6718.61012.803.240.302.815.681.033.171185
    314.371.952.5016.81133.853.490.242.187.800.863.451480
    413.242.592.8721.01182.802.690.391.824.321.042.93735
    ..........................................
    17313.715.652.4520.5951.680.610.521.067.700.641.74740
    17413.403.912.4823.01021.800.750.431.417.300.701.56750
    17513.274.282.2620.01201.590.690.431.3510.200.591.56835
    17613.172.592.3720.01201.650.680.531.469.300.601.62840
    17714.134.102.7424.5962.050.760.561.359.200.611.60560
    + +

    178 rows × 13 columns

    +
    +
    -
    -
    from sklearn.decomposition import PCA
    -
    -n_components = 2
    -
    -pca = PCA(n_components=n_components)
    -df_pca = pca.fit(df.iloc[:,1:14])
    -df_pca_vals = df_pca.transform(df.iloc[:,1:14])
    +
    +
    from sklearn.decomposition import PCA
    +
    +n_components = 2
    +
    +pca = PCA(n_components=n_components)
    +df_pca = pca.fit(df.iloc[:,1:14])
    +df_pca_vals = df_pca.transform(df.iloc[:,1:14])

    Grab our projections and plot along with our cluster names.

    -
    -
    df['c1'] = [item[0] for item in df_pca_vals]
    -df['c2'] = [item[1] for item in df_pca_vals]
    -
    -ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Class label')
    -ax.set_title('Known labels visualised over PCs')
    +
    +
    df['c1'] = [item[0] for item in df_pca_vals]
    +df['c2'] = [item[1] for item in df_pca_vals]
    +
    +ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Class label')
    +ax.set_title('Known labels visualised over PCs')
    +
    +
    Text(0.5, 1.0, 'Known labels visualised over PCs')
    +
    +
    +

    +

In the figure above, we coloured the points based on the actual labels. Comparing this with the algorithm’s results, we observe several misclassifications. So one may choose to use an alternative algorithm or devise a better distance metric.

    -
    -
    ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')
    -ax.set_title('Results of the algorithm visualised over PCs')
    +
    +
    ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')
    +ax.set_title('Results of the algorithm visualised over PCs')
    +
    +
    Text(0.5, 1.0, 'Results of the algorithm visualised over PCs')
    +
    +
    +

    +

This shows the parallelism between the clustering algorithm and PCA. By looking at the PCA loadings, we can find out what the x-axis means and try to interpret the clusters (we leave this as an additional exercise for those interested).
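A minimal sketch for that exercise, assuming df_pca and df from above, is to put the loadings into a labelled dataframe so each component can be read against the original features:

# rows are the 13 wine features, columns are the two principal components
loadings = pd.DataFrame(df_pca.components_.T, columns = ['c1', 'c2'], index = df.columns[1:14])
loadings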

How might you interpret the above plots? Did the kmeans model identify the ground truth?

    @@ -803,32 +1706,44 @@

    -
    # Create a KMeans instance with k clusters: model
    -k_means = KMeans(n_clusters=3, init='random', n_init = 10)
    -
    -# Fit model to samples
    -df_k_means = k_means.fit(df.iloc[:,1:14])
    -
    -df['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:14].values), index = df.index)
    -
    -ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')
    -ax.set_title('Results of the algorithm visualised over PCs')
    +
    +
    # Create a KMeans instance with k clusters: model
    +k_means = KMeans(n_clusters=3, init='random', n_init = 10)
    +
    +# Fit model to samples
    +df_k_means = k_means.fit(df.iloc[:,1:14])
    +
    +df['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:14].values), index = df.index)
    +
    +ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')
    +ax.set_title('Results of the algorithm visualised over PCs')
    +
    +
    Text(0.5, 1.0, 'Results of the algorithm visualised over PCs')
    +
    +
    +

    +

    How about with only 80% of the data?

    -
    -
    df_sample = df.sample(frac=0.8, replace=False)
    -
    -# Create a KMeans instance with k clusters: model
    -k_means = KMeans(n_clusters=3, init='random', n_init = 10)
    -
    -# Fit model to samples
    -df_k_means = k_means.fit(df_sample.iloc[:,1:14])
    -
    -df_sample['Three clusters'] = pd.Series(df_k_means.predict(df_sample.iloc[:,1:14].values), index = df_sample.index)
    -
    -ax = sns.scatterplot(data = df_sample, x = 'c1', y = 'c2', hue = 'Three clusters')
    -ax.set_title('Results of the algorithm visualised over PCs')
    +
    +
    df_sample = df.sample(frac=0.8, replace=False)
    +
    +# Create a KMeans instance with k clusters: model
    +k_means = KMeans(n_clusters=3, init='random', n_init = 10)
    +
    +# Fit model to samples
    +df_k_means = k_means.fit(df_sample.iloc[:,1:14])
    +
    +df_sample['Three clusters'] = pd.Series(df_k_means.predict(df_sample.iloc[:,1:14].values), index = df_sample.index)
    +
    +ax = sns.scatterplot(data = df_sample, x = 'c1', y = 'c2', hue = 'Three clusters')
    +ax.set_title('Results of the algorithm visualised over PCs')
    +
    +
    Text(0.5, 1.0, 'Results of the algorithm visualised over PCs')
    +
    +
    +

    +

We may want to automate the process of resampling the data or rerunning the model, then perhaps plotting the different inertia values or creating different plots.
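A rough sketch of what that automation could look like (refit on repeated 80% samples and collect the inertia values to gauge stability):

from sklearn.cluster import KMeans

inertias = []
for i in range(10):
    # draw a fresh 80% sample and refit the model
    sample = df.sample(frac = 0.8, replace = False)
    km = KMeans(n_clusters = 3, init = 'random', n_init = 10)
    km.fit(sample.iloc[:, 1:14])
    inertias.append(km.inertia_)
print(inertias)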

Do you think our clustering algorithm is stable and provides similar results even when some data is removed or the initial values are random?

    diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-10-output-1.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-10-output-1.png index c686cad..0e88bbd 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-10-output-1.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-10-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-11-output-1.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-11-output-1.png index 33a1e9a..0b09c4f 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-11-output-1.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-11-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-12-output-1.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-12-output-1.png index dc44232..7d26839 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-12-output-1.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-12-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-19-output-2.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-19-output-2.png index f6f3c4d..f9c04c5 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-19-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-19-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-20-output-2.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-20-output-2.png index 4769a92..f6f3c4d 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-20-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-20-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-21-output-2.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-21-output-2.png index b326702..d880af7 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-21-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-21-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-22-output-2.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-22-output-2.png index f33922d..5fecad0 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-22-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-22-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-7-output-2.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-7-output-2.png index dd4eb86..c08f8b5 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-7-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-7-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_2.html b/content/labs/Lab_5/IM939_Lab_5_2.html index ce7016e..25803e1 100644 --- a/content/labs/Lab_5/IM939_Lab_5_2.html +++ b/content/labs/Lab_5/IM939_Lab_5_2.html @@ -103,7 +103,7 @@ - + @@ -565,11 +565,11 @@

    22  here.

    We are going to examine the data, fit and then cross-validate a regression model.

    -
    +
    import pandas as pd
     df = pd.read_csv('data/censusCrimeClean.csv')
     df.head()
    -
    +
    @@ -732,10 +732,10 @@

    22  +
    df_reg = df[['communityname', 'medIncome', 'ViolentCrimesPerPop']]
     df_reg
    -
    +
    @@ -823,28 +823,42 @@

    22  here).

    -
    +
    import seaborn as sns
     sns.jointplot(data = df[['medIncome', 'ViolentCrimesPerPop']], 
                   x = 'ViolentCrimesPerPop', 
                   y = 'medIncome', kind='reg',
                   marker = '.')
    -

    +

We may want to z-transform or log-transform these scores as they are heavily skewed.

    -
    +
    import numpy as np
     
     # some values are 0 so 0.1 is added to prevent log giving us infinity
     # there may be a better way to do this!
     df_reg.loc[:, 'ViolentCrimesPerPop_log'] = np.log(df_reg['ViolentCrimesPerPop'] + 0.1)
     df_reg.loc[:,'medIncome_log'] = np.log(df_reg['medIncome'] + 0.1)
    +
    +
    /var/folders/7v/zl9mv52s3ls94kntlt_l9ryh0000gq/T/ipykernel_13528/3488182522.py:5: SettingWithCopyWarning: 
    +A value is trying to be set on a copy of a slice from a DataFrame.
    +Try using .loc[row_indexer,col_indexer] = value instead
    +
    +See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
    +  df_reg.loc[:, 'ViolentCrimesPerPop_log'] = np.log(df_reg['ViolentCrimesPerPop'] + 0.1)
    +/var/folders/7v/zl9mv52s3ls94kntlt_l9ryh0000gq/T/ipykernel_13528/3488182522.py:6: SettingWithCopyWarning: 
    +A value is trying to be set on a copy of a slice from a DataFrame.
    +Try using .loc[row_indexer,col_indexer] = value instead
    +
    +See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
    +  df_reg.loc[:,'medIncome_log'] = np.log(df_reg['medIncome'] + 0.1)
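One way to avoid the SettingWithCopyWarning above, sketched here rather than prescribed, is to take an explicit copy when subsetting so the new columns are set on an independent dataframe rather than a view:

# .copy() gives an independent dataframe, so assigning new columns raises no warning
df_reg = df[['communityname', 'medIncome', 'ViolentCrimesPerPop']].copy()
df_reg['ViolentCrimesPerPop_log'] = np.log(df_reg['ViolentCrimesPerPop'] + 0.1)
df_reg['medIncome_log'] = np.log(df_reg['medIncome'] + 0.1)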
    -
    -
    df_reg
    -
    +
    +
    +
    df_reg
    +
    @@ -955,39 +969,39 @@

    22  -
    import seaborn as sns
    -sns.jointplot(data = df_reg[['medIncome_log', 'ViolentCrimesPerPop_log']], 
    -              x = 'ViolentCrimesPerPop_log', 
    -              y = 'medIncome_log', kind='reg',
    -              marker = '.')
    +
    +
    import seaborn as sns
    +sns.jointplot(data = df_reg[['medIncome_log', 'ViolentCrimesPerPop_log']], 
    +              x = 'ViolentCrimesPerPop_log', 
    +              y = 'medIncome_log', kind='reg',
    +              marker = '.')
    -

    +

    Is log transforming our variables the right thing to do here?

    Fit our regression to the log transformed data.

    -
    -
    import matplotlib.pyplot as plt
    -from sklearn.linear_model import LinearRegression
    -from sklearn import metrics
    -
    -x = df_reg[['ViolentCrimesPerPop_log']]
    -y = df_reg[['medIncome_log']]
    -
    -model = LinearRegression()
    -model.fit(x, y)
    -
    -y_hat = model.predict(x)
    -plt.plot(x, y,'o', alpha = 0.5)
    -plt.plot(x, y_hat, 'r', alpha = 0.5)
    -
    -plt.xlabel('Violent Crimes Per Population')
    -plt.ylabel('Median Income')
    -
    -print ("MSE:", metrics.mean_squared_error(y_hat, y))
    -print ("R^2:", metrics.r2_score(y, y_hat))
    -print ("var:", y.var())
    +
    +
    import matplotlib.pyplot as plt
    +from sklearn.linear_model import LinearRegression
    +from sklearn import metrics
    +
    +x = df_reg[['ViolentCrimesPerPop_log']]
    +y = df_reg[['medIncome_log']]
    +
    +model = LinearRegression()
    +model.fit(x, y)
    +
    +y_hat = model.predict(x)
    +plt.plot(x, y,'o', alpha = 0.5)
    +plt.plot(x, y_hat, 'r', alpha = 0.5)
    +
    +plt.xlabel('Violent Crimes Per Population')
    +plt.ylabel('Median Income')
    +
    +print ("MSE:", metrics.mean_squared_error(y_hat, y))
    +print ("R^2:", metrics.r2_score(y, y_hat))
    +print ("var:", y.var())
    MSE: 0.1531885348757034
     R^2: 0.22763497704356928
    @@ -995,27 +1009,27 @@ 

    22  -

    +

    Has our log transformation distorted the pattern in the data?

    -
    -
    x = df_reg[['ViolentCrimesPerPop']]
    -y = df_reg[['medIncome']]
    -
    -model = LinearRegression()
    -model.fit(x, y)
    -
    -y_hat = model.predict(x)
    -plt.plot(x, y,'o', alpha = 0.5)
    -plt.plot(x, y_hat, 'r', alpha = 0.5)
    -
    -plt.xlabel('Violent Crimes Per Population')
    -plt.ylabel('Median Income')
    -
    -print ("MSE:", metrics.mean_squared_error(y_hat, y))
    -print ("R^2:", metrics.r2_score(y, y_hat))
    -print ("var:", y.var())
    +
    +
    x = df_reg[['ViolentCrimesPerPop']]
    +y = df_reg[['medIncome']]
    +
    +model = LinearRegression()
    +model.fit(x, y)
    +
    +y_hat = model.predict(x)
    +plt.plot(x, y,'o', alpha = 0.5)
    +plt.plot(x, y_hat, 'r', alpha = 0.5)
    +
    +plt.xlabel('Violent Crimes Per Population')
    +plt.ylabel('Median Income')
    +
    +print ("MSE:", metrics.mean_squared_error(y_hat, y))
    +print ("R^2:", metrics.r2_score(y, y_hat))
    +print ("var:", y.var())
    MSE: 0.03592636778157073
     R^2: 0.17996313165549482
    @@ -1023,42 +1037,42 @@ 

    22  -

    +

    What is the relationship between violent crime and median income? Why might this be?

Assuming the log data is fine, have we overfit the model? Remember that a good model (one which accurately models the relationship between violent crimes per population and median income) needs to be robust when faced with new data.

K-fold cross validation splits the data into train and test subsets. We can then fit the regression to the training set and see how well it does on the test set.

    +
    +
    from sklearn.model_selection import KFold
    +
    +X = df_reg[['ViolentCrimesPerPop']]
    +y = df_reg[['medIncome']]
    +
    +# get four splits, Each split contains a 
    +# test series and a train series.
    +kf = KFold(n_splits=4)
    +
    -
    from sklearn.model_selection import KFold
    -
    -X = df_reg[['ViolentCrimesPerPop']]
    -y = df_reg[['medIncome']]
    -
    -# get four splits, Each split contains a 
    -# test series and a train series.
    -kf = KFold(n_splits=4)
    +
    # lists to store our statistics
    +r_vals = []
    +MSEs = []
    +medIncome_coef = []
    +
    +for train_index, test_index in kf.split(X):
    +    # fit our model and extract statistics
    +    model = LinearRegression()
    +    model.fit(X.iloc[train_index], y.iloc[train_index])
    +    y_hat = model.predict(X.iloc[test_index])
    +    
    +    MSEs.append(metrics.mean_squared_error(y.iloc[test_index], y_hat))
    +    medIncome_coef.append(model.coef_[0][0])
    +    r_vals.append(metrics.r2_score(y.iloc[test_index], y_hat))
    -
    # lists to store our statistics
    -r_vals = []
    -MSEs = []
    -medIncome_coef = []
    -
    -for train_index, test_index in kf.split(X):
    -    # fit our model and extract statistics
    -    model = LinearRegression()
    -    model.fit(X.iloc[train_index], y.iloc[train_index])
    -    y_hat = model.predict(X.iloc[test_index])
    -    
    -    MSEs.append(metrics.mean_squared_error(y.iloc[test_index], y_hat))
    -    medIncome_coef.append(model.coef_[0][0])
    -    r_vals.append(metrics.r2_score(y.iloc[test_index], y_hat))
    -
    -
    -
    data = {'MSE' : MSEs, 'medIncome coefficient' : medIncome_coef, 'r squared' : r_vals}
    -pd.DataFrame(data)
    -
    +
    data = {'MSE' : MSEs, 'medIncome coefficient' : medIncome_coef, 'r squared' : r_vals}
    +pd.DataFrame(data)
    +
    @@ -1104,15 +1118,15 @@

    22  here).

    -
    -
    from sklearn.model_selection import cross_val_score
    -x = df_reg[['ViolentCrimesPerPop']]
    -y = df_reg[['medIncome']]
    -
    -model = LinearRegression()
    -model.fit(x, y)
    -
    -print(cross_val_score(model, x, y, cv=4))
    +
    +
    from sklearn.model_selection import cross_val_score
    +x = df_reg[['ViolentCrimesPerPop']]
    +y = df_reg[['medIncome']]
    +
    +model = LinearRegression()
    +model.fit(x, y)
    +
    +print(cross_val_score(model, x, y, cv=4))
    [0.13047946 0.16281953 0.20013867 0.18240261]
    diff --git a/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-4-output-1.png b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-4-output-1.png new file mode 100644 index 0000000..34793f3 Binary files /dev/null and b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-4-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-5-output-1.png b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-5-output-1.png index a770677..01322ba 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-5-output-1.png and b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-5-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-7-output-1.png b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-7-output-1.png new file mode 100644 index 0000000..a326fa3 Binary files /dev/null and b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-7-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-8-output-1.png b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-8-output-1.png index 86d02e7..f099855 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-8-output-1.png and b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-8-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-9-output-2.png b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-9-output-2.png index d4c20a4..96d96c5 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-9-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_2_files/figure-html/cell-9-output-2.png differ diff --git a/search.json b/search.json index 27493df..1291918 100644 --- a/search.json +++ b/search.json @@ -501,7 +501,7 @@ "href": "content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html#reading-data", "title": "13  Exercise: Regression", "section": "13.3 Reading Data", - "text": "13.3 Reading Data\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\n\n\nwine = pd.read_excel('data/raw/winequality-red_v2.xlsx', engine = 'openpyxl')" + "text": "13.3 Reading Data\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\n\n\nwine = pd.read_excel('data/raw/winequality-red_v2.xlsx', engine = 'openpyxl')\n\n#You might need to use encoding, then the code will look like:\n# wine = pd.read_excel('data/raw/winequality-red_v2.xlsx', engine = 'openpyxl', encoding='UTF-8')" }, { "objectID": "content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html#data-exploration", @@ -606,14 +606,14 @@ "href": "content/labs/Lab_4/IM939_Lab_4_1_Iris.html#do-this-yourself-check-if-we-need-to-do-any-normalisation-for-this-case", "title": "16  Lab: Dimension Reduction", "section": "16.4 Do-this-yourself: Check if we need to do any normalisation for this case?", - "text": "16.4 Do-this-yourself: Check if we need to do any normalisation for this case?\nWe have already looked at how the data looks, what are the descriptive statistics look like, see if we need to do anything more?\n\nk_means = KMeans(n_clusters = 3, init = 'random', n_init = 10)\n\nFit our kmeans model to the data\n\nk_means.fit(iris)\n\nKMeans(init='random', n_clusters=3, n_init=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. 
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.KMeansKMeans(init='random', n_clusters=3, n_init=10)\n\n\nThe algorithm has assigned the a label to each row.\n\nk_means.labels_\n\narray([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,\n 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0,\n 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2], dtype=int32)\n\n\nEach row has been assigned a label.\nTo tidy things up we should put everything into a dataframe.\n\niris_df['Three clusters'] = pd.Series(k_means.predict(iris_df.values), index = iris_df.index)\n\n\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n1\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n1\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n1\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n1\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n1\n\n\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n0\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n2\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n0\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n0\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n2\n\n\n\n\n150 rows × 5 columns\n\n\n\n\nsns.pairplot(iris_df, hue = 'Three clusters')\n\n\n\n\nThat seems quite nice. We can also do individual plots if preferred.\n\nsns.scatterplot(data = iris_df, x = 'sepal length (cm)', y = 'petal width (cm)', hue = 'Three clusters')\n\n<Axes: xlabel='sepal length (cm)', ylabel='petal width (cm)'>\n\n\n\n\n\nK-means works by clustering the data around central points (often called centroids, means or cluster centers). 
We can extract the cluster centres from the kmeans object.\n\nk_means.cluster_centers_\n\narray([[0.70726496, 0.4508547 , 0.79704476, 0.82478632],\n [0.19611111, 0.595 , 0.07830508, 0.06083333],\n [0.44125683, 0.30737705, 0.57571548, 0.54918033]])\n\n\nIt is tricky to plot these using seaborn but we can use a normal maplotlib scatter plot.\nLet us grab the groups.\n\ngroup1 = iris_df[iris_df['Three clusters'] == 0]\ngroup2 = iris_df[iris_df['Three clusters'] == 1]\ngroup3 = iris_df[iris_df['Three clusters'] == 2]\n\nGrab the centroids\n\nimport pandas as pd\n\ncentres = k_means.cluster_centers_\n\ndata = {'x': [centres[0][0], centres[1][0], centres[2][0]],\n 'y': [centres[0][3], centres[1][3], centres[2][3]]}\n\ndf = pd.DataFrame (data, columns = ['x', 'y'])\n\nCreate the plot\n\nimport matplotlib.pyplot as plt\n\n# Plot each group individually\nplt.scatter(\n x = group1['sepal length (cm)'], \n y = group1['petal width (cm)'], \n alpha = 0.1, color = 'blue'\n)\n\nplt.scatter(\n x = group2['sepal length (cm)'], \n y = group2['petal width (cm)'], \n alpha = 0.1, color = 'orange'\n)\n\nplt.scatter(\n x = group3['sepal length (cm)'], \n y = group3['petal width (cm)'], \n alpha = 0.1, color = 'red'\n)\n\n# Plot cluster centres\nplt.scatter(\n x = df['x'], \n y = df['y'], \n alpha = 1, color = 'black'\n)\n\n<matplotlib.collections.PathCollection at 0x14fe387d0>\n\n\n\n\n\n\n16.4.1 Number of clusters\nWhat happens if we change the number of clusters?\nTwo groups\n\nk_means_2 = KMeans(n_clusters = 2, init = 'random', n_init = 10)\nk_means_2.fit(iris)\niris_df['Two clusters'] = pd.Series(k_means_2.predict(iris_df.iloc[:,0:4].values), index = iris_df.index)\n\nNote that I have added a new column to the iris dataframe called ‘cluster 2 means’ and pass only our origonal 4 columns to the predict function (hence me using .iloc[:,0:4]).\nHow do our groupings look now (without plotting the cluster column)?\n\nsns.pairplot(iris_df.loc[:, iris_df.columns != 'Three clusters'], hue = 'Two clusters')\n\n\n\n\nHmm, does the data have more than two groups in it?\nPerhaps we should try 5 clusters instead.\n\nk_means_5 = KMeans(n_clusters = 5, init = 'random', n_init = 10)\nk_means_5.fit(iris)\niris_df['Five clusters'] = pd.Series(k_means_5.predict(iris_df.iloc[:,0:4].values), index = iris_df.index)\n\nPlot without the columns called ‘cluster’ and ‘Two cluster’\n\nsns.pairplot(iris_df.loc[:, (iris_df.columns != 'Three clusters') & (iris_df.columns != 'Two clusters')], hue = 'Five clusters')\n\n\n\n\n\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\nTwo clusters\nFive clusters\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n1\n1\n4\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n1\n1\n0\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n1\n1\n0\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n1\n1\n0\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n1\n1\n4\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n0\n0\n2\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n2\n0\n3\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n0\n0\n3\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n0\n0\n2\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n2\n0\n3\n\n\n\n\n150 rows × 7 columns\n\n\n\nWhich did best?\n\nk_means.inertia_\n\n6.982216473785234\n\n\n\nk_means_2.inertia_\n\n12.127790750538193\n\n\n\nk_means_5.inertia_\n\n4.58977540011789\n\n\nIt looks like our k = 5 model captures the data well. 
Intertia, looking at the sklearn documentation as the Sum of squared distances of samples to their closest cluster center..\nIf you want to dive further into this then Real Python’s practical guide to K-Means Clustering is quite good." + "text": "16.4 Do-this-yourself: Check if we need to do any normalisation for this case?\nWe have already looked at how the data looks, what are the descriptive statistics look like, see if we need to do anything more?\n\nk_means = KMeans(n_clusters = 3, init = 'random', n_init = 10)\n\nFit our kmeans model to the data\n\nk_means.fit(iris)\n\nKMeans(init='random', n_clusters=3, n_init=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.KMeansKMeans(init='random', n_clusters=3, n_init=10)\n\n\nThe algorithm has assigned the a label to each row.\n\nk_means.labels_\n\narray([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n 2, 2, 2, 2, 2, 2, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,\n 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,\n 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0], dtype=int32)\n\n\nEach row has been assigned a label.\nTo tidy things up we should put everything into a dataframe.\n\niris_df['Three clusters'] = pd.Series(k_means.predict(iris_df.values), index = iris_df.index)\n\n\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n2\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n2\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n2\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n2\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n2\n\n\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n1\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n0\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n1\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n1\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n0\n\n\n\n\n150 rows × 5 columns\n\n\n\n\nsns.pairplot(iris_df, hue = 'Three clusters')\n\n\n\n\nThat seems quite nice. We can also do individual plots if preferred.\n\nsns.scatterplot(data = iris_df, x = 'sepal length (cm)', y = 'petal width (cm)', hue = 'Three clusters')\n\n<Axes: xlabel='sepal length (cm)', ylabel='petal width (cm)'>\n\n\n\n\n\nK-means works by clustering the data around central points (often called centroids, means or cluster centers). 
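As a quick illustration (a sketch, assuming the iris_df and k_means objects from above), each centre is simply the mean of the rows assigned to that cluster, so we can reproduce the centres by hand:

```python
# Sketch: a cluster centre is just the mean of the rows assigned to that cluster.
import numpy as np

manual_centres = np.array([
    iris_df.iloc[:, 0:4][k_means.labels_ == label].mean().values
    for label in np.unique(k_means.labels_)
])
print(manual_centres)   # should match k_means.cluster_centers_ up to small numerical differences
```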
We can extract the cluster centres from the kmeans object.\n\nk_means.cluster_centers_\n\narray([[0.44125683, 0.30737705, 0.57571548, 0.54918033],\n [0.70726496, 0.4508547 , 0.79704476, 0.82478632],\n [0.19611111, 0.595 , 0.07830508, 0.06083333]])\n\n\nIt is tricky to plot these using seaborn but we can use a normal maplotlib scatter plot.\nLet us grab the groups.\n\ngroup1 = iris_df[iris_df['Three clusters'] == 0]\ngroup2 = iris_df[iris_df['Three clusters'] == 1]\ngroup3 = iris_df[iris_df['Three clusters'] == 2]\n\nGrab the centroids\n\nimport pandas as pd\n\ncentres = k_means.cluster_centers_\n\ndata = {'x': [centres[0][0], centres[1][0], centres[2][0]],\n 'y': [centres[0][3], centres[1][3], centres[2][3]]}\n\ndf = pd.DataFrame (data, columns = ['x', 'y'])\n\nCreate the plot\n\nimport matplotlib.pyplot as plt\n\n# Plot each group individually\nplt.scatter(\n x = group1['sepal length (cm)'], \n y = group1['petal width (cm)'], \n alpha = 0.1, color = 'blue'\n)\n\nplt.scatter(\n x = group2['sepal length (cm)'], \n y = group2['petal width (cm)'], \n alpha = 0.1, color = 'orange'\n)\n\nplt.scatter(\n x = group3['sepal length (cm)'], \n y = group3['petal width (cm)'], \n alpha = 0.1, color = 'red'\n)\n\n# Plot cluster centres\nplt.scatter(\n x = df['x'], \n y = df['y'], \n alpha = 1, color = 'black'\n)\n\n<matplotlib.collections.PathCollection at 0x162e90390>\n\n\n\n\n\n\n16.4.1 Number of clusters\nWhat happens if we change the number of clusters?\nTwo groups\n\nk_means_2 = KMeans(n_clusters = 2, init = 'random', n_init = 10)\nk_means_2.fit(iris)\niris_df['Two clusters'] = pd.Series(k_means_2.predict(iris_df.iloc[:,0:4].values), index = iris_df.index)\n\nNote that I have added a new column to the iris dataframe called ‘cluster 2 means’ and pass only our origonal 4 columns to the predict function (hence me using .iloc[:,0:4]).\nHow do our groupings look now (without plotting the cluster column)?\n\nsns.pairplot(iris_df.loc[:, iris_df.columns != 'Three clusters'], hue = 'Two clusters')\n\n\n\n\nHmm, does the data have more than two groups in it?\nPerhaps we should try 5 clusters instead.\n\nk_means_5 = KMeans(n_clusters = 5, init = 'random', n_init = 10)\nk_means_5.fit(iris)\niris_df['Five clusters'] = pd.Series(k_means_5.predict(iris_df.iloc[:,0:4].values), index = iris_df.index)\n\nPlot without the columns called ‘cluster’ and ‘Two cluster’\n\nsns.pairplot(iris_df.loc[:, (iris_df.columns != 'Three clusters') & (iris_df.columns != 'Two clusters')], hue = 'Five clusters')\n\n\n\n\n\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\nTwo clusters\nFive clusters\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n2\n1\n3\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n2\n1\n1\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n2\n1\n1\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n2\n1\n1\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n2\n1\n3\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n1\n0\n4\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n0\n0\n2\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n1\n0\n2\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n1\n0\n4\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n0\n0\n2\n\n\n\n\n150 rows × 7 columns\n\n\n\nWhich did best?\n\nk_means.inertia_\n\n6.982216473785234\n\n\n\nk_means_2.inertia_\n\n12.127790750538193\n\n\n\nk_means_5.inertia_\n\n4.580948640117293\n\n\nIt looks like our k = 5 model captures the data well. 
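The three .inertia_ values we just compared can also be reproduced by hand, which makes the definition below concrete. A small sketch, assuming the fitted k_means_5 model and the normalised features in iris_df:

```python
# Sketch: inertia is the sum of squared distances from each point to its assigned (closest) centre.
import numpy as np

X = iris_df.iloc[:, 0:4].values
assigned_centres = k_means_5.cluster_centers_[k_means_5.labels_]   # centre assigned to each row
print(((X - assigned_centres) ** 2).sum())   # should be very close to k_means_5.inertia_
```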
Intertia, looking at the sklearn documentation as the Sum of squared distances of samples to their closest cluster center..\nIf you want to dive further into this then Real Python’s practical guide to K-Means Clustering is quite good." }, { "objectID": "content/labs/Lab_4/IM939_Lab_4_1_Iris.html#principal-component-analysis-pca", "href": "content/labs/Lab_4/IM939_Lab_4_1_Iris.html#principal-component-analysis-pca", "title": "16  Lab: Dimension Reduction", "section": "16.5 Principal Component Analysis (PCA)", - "text": "16.5 Principal Component Analysis (PCA)\nPCA reduces the dimension of our data. The method derives point in an n dimentional space from our data which are uncorrelated.\nTo carry out a PCA on our Iris dataset where there are only two dimensions.\n\nfrom sklearn.decomposition import PCA\n\nn_components = 2\n\npca = PCA(n_components=n_components)\niris_pca = pca.fit(iris_df.iloc[:,0:4])\n\nWe can look at the components.\n\niris_pca.components_\n\narray([[ 0.42494212, -0.15074824, 0.61626702, 0.64568888],\n [ 0.42320271, 0.90396711, -0.06038308, -0.00983925]])\n\n\nThese components are intersting. You may want to look at a PennState article on interpreting PCA components.\nOur second column, ‘sepal width (cm)’ is positively correlated with our second principle component whereas the first column ‘sepal length (cm)’ is postively correlated with both.\nYou may want to consider:\n\nDo we need more than two components?\nIs it useful to keep sepal length (cm) in the dataset?\n\nWe can also examine the explained variance of the each principle component.\n\niris_pca.explained_variance_\n\narray([0.23245325, 0.0324682 ])\n\n\nA nice worked example showing the link between the explained variance and the component is here.\nOur first principle component explains a lot more of the variance of data then the second.\nAnother way to explore these indicators is to look at the explained_variance_ratio_ values. These present a similar information but provide them as percentage values so they are easier to interpret. You can also create a plot and see how these percentages add up. In this case, the first two components add up to 0.96. Which means the first two features are able to represent around 96% of the variation in the data, not bad. These values are not always this high.\nA high value that is close to 100% means that the PCA is able to represent much of the variance and they will be good representations of the data without losing a lot of that variance in the underlying features. 
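To see where those percentages come from (a small check that is an addition to the lab, assuming iris_pca and iris_df from above): each ratio is simply a component's explained variance divided by the total variance of the four original features.

```python
# Sketch: explained_variance_ratio_ = explained_variance_ / total variance of the data.
total_variance = iris_df.iloc[:, 0:4].var().sum()    # pandas .var() uses ddof=1, matching sklearn
print(iris_pca.explained_variance_ / total_variance)
print(iris_pca.explained_variance_ratio_)            # the two printouts should agree
```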
This of course is based on an assumption that variance is a good proxy about how informative a feature is.\n\niris_pca.explained_variance_ratio_\n\narray([0.84136038, 0.11751808])\n\n\n\nplt.plot(np.cumsum(pca.explained_variance_ratio_))\nplt.xlabel('number of components')\nplt.ylabel('cumulative explained variance');\n\n\n\n\n\n16.5.1 Dimension reduction\nFor our purposes, we are interested in using PCA for reducing the number of dimension in our data whilst preseving the maximal data variance.\nWe can extract the projected components from the model.\n\niris_pca_vals = pca.fit_transform(iris_df.iloc[:,0:4])\n\nThe numpy arrays contains the projected values.\n\ntype(iris_pca_vals)\n\nnumpy.ndarray\n\n\n\niris_pca_vals\n\narray([[-6.30702931e-01, 1.07577910e-01],\n [-6.22904943e-01, -1.04259833e-01],\n [-6.69520395e-01, -5.14170597e-02],\n [-6.54152759e-01, -1.02884871e-01],\n [-6.48788056e-01, 1.33487576e-01],\n [-5.35272778e-01, 2.89615724e-01],\n [-6.56537790e-01, 1.07244911e-02],\n [-6.25780499e-01, 5.71335411e-02],\n [-6.75643504e-01, -2.00703283e-01],\n [-6.45644619e-01, -6.72080097e-02],\n [-5.97408238e-01, 2.17151953e-01],\n [-6.38943190e-01, 3.25988375e-02],\n [-6.61612593e-01, -1.15605495e-01],\n [-7.51967943e-01, -1.71313322e-01],\n [-6.00371589e-01, 3.80240692e-01],\n [-5.52157227e-01, 5.15255982e-01],\n [-5.77053593e-01, 2.93709492e-01],\n [-6.03799228e-01, 1.07167941e-01],\n [-5.20483461e-01, 2.87627289e-01],\n [-6.12197555e-01, 2.19140388e-01],\n [-5.57674300e-01, 1.02109180e-01],\n [-5.79012675e-01, 1.81065123e-01],\n [-7.37784662e-01, 9.05588211e-02],\n [-5.06093857e-01, 2.79470846e-02],\n [-6.07607579e-01, 2.95285112e-02],\n [-5.90210587e-01, -9.45510863e-02],\n [-5.61527888e-01, 5.52901611e-02],\n [-6.08453780e-01, 1.18310099e-01],\n [-6.12617807e-01, 8.16682448e-02],\n [-6.38184784e-01, -5.44873860e-02],\n [-6.20099660e-01, -8.03970516e-02],\n [-5.24757301e-01, 1.03336126e-01],\n [-6.73044544e-01, 3.44711846e-01],\n [-6.27455379e-01, 4.18257508e-01],\n [-6.18740916e-01, -6.76179787e-02],\n [-6.44553756e-01, -1.51267253e-02],\n [-5.93932344e-01, 1.55623876e-01],\n [-6.87495707e-01, 1.22141914e-01],\n [-6.92369885e-01, -1.62014545e-01],\n [-6.13976551e-01, 6.88891719e-02],\n [-6.26048380e-01, 9.64357527e-02],\n [-6.09693996e-01, -4.14325957e-01],\n [-7.04932239e-01, -8.66839521e-02],\n [-5.14001659e-01, 9.21355196e-02],\n [-5.43513037e-01, 2.14636651e-01],\n [-6.07805187e-01, -1.16425433e-01],\n [-6.28656055e-01, 2.18526915e-01],\n [-6.70879139e-01, -6.41961326e-02],\n [-6.09212186e-01, 2.05396323e-01],\n [-6.29944525e-01, 2.04916869e-02],\n [ 2.79951766e-01, 1.79245790e-01],\n [ 2.15141376e-01, 1.10348921e-01],\n [ 3.22223106e-01, 1.27368010e-01],\n [ 5.94030131e-02, -3.28502275e-01],\n [ 2.62515235e-01, -2.95800761e-02],\n [ 1.03831043e-01, -1.21781742e-01],\n [ 2.44850362e-01, 1.33801733e-01],\n [-1.71529386e-01, -3.52976762e-01],\n [ 2.14230599e-01, 2.06607890e-02],\n [ 1.53249619e-02, -2.12494509e-01],\n [-1.13710323e-01, -4.93929201e-01],\n [ 1.37348380e-01, -2.06894998e-02],\n [ 4.39928190e-02, -3.06159511e-01],\n [ 1.92559767e-01, -3.95507760e-02],\n [-8.26091518e-03, -8.66610981e-02],\n [ 2.19485489e-01, 1.09383928e-01],\n [ 1.33272148e-01, -5.90267184e-02],\n [-5.75757060e-04, -1.42367733e-01],\n [ 2.54345249e-01, -2.89815304e-01],\n [-5.60800300e-03, -2.39572672e-01],\n [ 2.68168358e-01, 4.72705335e-02],\n [ 9.88208151e-02, -6.96420088e-02],\n [ 2.89086481e-01, -1.69157553e-01],\n [ 1.45033538e-01, -7.63961345e-02],\n [ 1.59287093e-01, 2.19853643e-04],\n [ 
2.13962718e-01, 5.99630005e-02],\n [ 2.91913782e-01, 4.04990109e-03],\n [ 3.69148997e-01, 6.43480720e-02],\n [ 1.86769115e-01, -4.96694916e-02],\n [-6.87697501e-02, -1.85648007e-01],\n [-2.15759776e-02, -2.87970157e-01],\n [-5.89248844e-02, -2.86536746e-01],\n [ 3.23412419e-02, -1.41140786e-01],\n [ 2.88906394e-01, -1.31550706e-01],\n [ 1.09664252e-01, -8.25379800e-02],\n [ 1.82266934e-01, 1.38247021e-01],\n [ 2.77724803e-01, 1.05903632e-01],\n [ 1.95615410e-01, -2.38550997e-01],\n [ 3.76839264e-02, -5.41130122e-02],\n [ 4.68406593e-02, -2.53171683e-01],\n [ 5.54365941e-02, -2.19190186e-01],\n [ 1.75833387e-01, -8.62037590e-04],\n [ 4.90676225e-02, -1.79829525e-01],\n [-1.53444261e-01, -3.78886428e-01],\n [ 6.69726607e-02, -1.68132343e-01],\n [ 3.30293747e-02, -4.29708545e-02],\n [ 6.62142547e-02, -8.10461198e-02],\n [ 1.35679197e-01, -2.32914079e-02],\n [-1.58634575e-01, -2.89139847e-01],\n [ 6.20502279e-02, -1.17687974e-01],\n [ 6.22771338e-01, 1.16807265e-01],\n [ 3.46009609e-01, -1.56291874e-01],\n [ 6.17986434e-01, 1.00519741e-01],\n [ 4.17789309e-01, -2.68903690e-02],\n [ 5.63621248e-01, 3.05994289e-02],\n [ 7.50122599e-01, 1.52133800e-01],\n [ 1.35857804e-01, -3.30462554e-01],\n [ 6.08945212e-01, 8.35018443e-02],\n [ 5.11020215e-01, -1.32575915e-01],\n [ 7.20608541e-01, 3.34580389e-01],\n [ 4.24135062e-01, 1.13914054e-01],\n [ 4.37723702e-01, -8.78049736e-02],\n [ 5.40793776e-01, 6.93466165e-02],\n [ 3.63226514e-01, -2.42764625e-01],\n [ 4.74246948e-01, -1.20676423e-01],\n [ 5.13932631e-01, 9.88816323e-02],\n [ 4.24670824e-01, 3.53096310e-02],\n [ 7.49026039e-01, 4.63778390e-01],\n [ 8.72194272e-01, 9.33798117e-03],\n [ 2.82963372e-01, -3.18443776e-01],\n [ 6.14733184e-01, 1.53566018e-01],\n [ 3.22133832e-01, -1.40500924e-01],\n [ 7.58030401e-01, 8.79453649e-02],\n [ 3.57235237e-01, -9.50568671e-02],\n [ 5.31036706e-01, 1.68539991e-01],\n [ 5.46962123e-01, 1.87812429e-01],\n [ 3.28704908e-01, -6.81237595e-02],\n [ 3.14783811e-01, -5.57223965e-03],\n [ 5.16585543e-01, -5.40299414e-02],\n [ 4.84826663e-01, 1.15348658e-01],\n [ 6.33043632e-01, 5.92290940e-02],\n [ 6.87490917e-01, 4.91179916e-01],\n [ 5.43489246e-01, -5.44399104e-02],\n [ 2.91133358e-01, -5.82085481e-02],\n [ 3.05410131e-01, -1.61757644e-01],\n [ 7.63507935e-01, 1.68186703e-01],\n [ 5.47805644e-01, 1.58976299e-01],\n [ 4.06585699e-01, 6.12192966e-02],\n [ 2.92534659e-01, -1.63044284e-02],\n [ 5.35871344e-01, 1.19790986e-01],\n [ 6.13864965e-01, 9.30029331e-02],\n [ 5.58343139e-01, 1.22041374e-01],\n [ 3.46009609e-01, -1.56291874e-01],\n [ 6.23819644e-01, 1.39763503e-01],\n [ 6.38651518e-01, 1.66900115e-01],\n [ 5.51461624e-01, 5.98413741e-02],\n [ 4.07146497e-01, -1.71820871e-01],\n [ 4.47142619e-01, 3.75600193e-02],\n [ 4.88207585e-01, 1.49677521e-01],\n [ 3.12066323e-01, -3.11303854e-02]])\n\n\nEach row corresponds to a row in our data.\n\niris_pca_vals.shape\n\n(150, 2)\n\n\n\niris_df.shape\n\n(150, 7)\n\n\nWe can add the component to our dataset. I prefer to keep everything in one table and it is not at all required. 
You can just assign the values whichever variables you prefer.\n\niris_df['c1'] = [item[0] for item in iris_pca_vals]\niris_df['c2'] = [item[1] for item in iris_pca_vals]\n\n\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\nTwo clusters\nFive clusters\nc1\nc2\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n1\n1\n4\n-0.630703\n0.107578\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n1\n1\n0\n-0.622905\n-0.104260\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n1\n1\n0\n-0.669520\n-0.051417\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n1\n1\n0\n-0.654153\n-0.102885\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n1\n1\n4\n-0.648788\n0.133488\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n0\n0\n2\n0.551462\n0.059841\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n2\n0\n3\n0.407146\n-0.171821\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n0\n0\n3\n0.447143\n0.037560\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n0\n0\n2\n0.488208\n0.149678\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n2\n0\n3\n0.312066\n-0.031130\n\n\n\n\n150 rows × 9 columns\n\n\n\nPlotting out our data on our new two component space.\n\nsns.scatterplot(data = iris_df, x = 'c1', y = 'c2')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\nWe have reduced our three dimensions to two.\nWe can also colour by our clusters. What does this show us and is it useful?\n\nsns.scatterplot(data = iris_df, x = 'c1', y = 'c2', hue = 'Three clusters')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\n\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\nTwo clusters\nFive clusters\nc1\nc2\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n1\n1\n4\n-0.630703\n0.107578\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n1\n1\n0\n-0.622905\n-0.104260\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n1\n1\n0\n-0.669520\n-0.051417\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n1\n1\n0\n-0.654153\n-0.102885\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n1\n1\n4\n-0.648788\n0.133488\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n0\n0\n2\n0.551462\n0.059841\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n2\n0\n3\n0.407146\n-0.171821\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n0\n0\n3\n0.447143\n0.037560\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n0\n0\n2\n0.488208\n0.149678\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n2\n0\n3\n0.312066\n-0.031130\n\n\n\n\n150 rows × 9 columns\n\n\n\n\n\n16.5.2 PCA to Clusters\nWe have reduced our 4D dataset to 2D whilst keeping the data variance. 
Reducing the data to fewer dimensions can help with the ‘curse of dimensionality’, reduce the change of overfitting a machine learning model (see here) and reduce the computational complexity of a model fit.\nPutting our new dimensions into a kMeans model\n\nk_means_pca = KMeans(n_clusters = 3, init = 'random', n_init = 10)\niris_pca_kmeans = k_means_pca.fit(iris_df.iloc[:,-2:])\n\n\ntype(iris_df.iloc[:,-2:].values)\n\nnumpy.ndarray\n\n\n\niris_df['PCA 3 clusters'] = pd.Series(k_means_pca.predict(iris_df.iloc[:,-2:].values), index = iris_df.index)\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\nTwo clusters\nFive clusters\nc1\nc2\nPCA 3 clusters\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n1\n1\n4\n-0.630703\n0.107578\n1\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n1\n1\n0\n-0.622905\n-0.104260\n1\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n1\n1\n0\n-0.669520\n-0.051417\n1\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n1\n1\n0\n-0.654153\n-0.102885\n1\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n1\n1\n4\n-0.648788\n0.133488\n1\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n0\n0\n2\n0.551462\n0.059841\n0\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n2\n0\n3\n0.407146\n-0.171821\n2\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n0\n0\n3\n0.447143\n0.037560\n0\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n0\n0\n2\n0.488208\n0.149678\n0\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n2\n0\n3\n0.312066\n-0.031130\n2\n\n\n\n\n150 rows × 10 columns\n\n\n\nAs we only have two dimensions we can easily plot this on a single scatterplot.\n\n# a different seaborn theme\n# see https://python-graph-gallery.com/104-seaborn-themes/\nsns.set_style(\"darkgrid\")\nsns.scatterplot(data = iris_df, x = 'c1', y = 'c2', hue = 'PCA 3 clusters')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\nI suspect having two clusters would work better. We should try a few different models.\nCopying the code from here we can fit multiple numbers of clusters.\n\nks = range(1, 10)\ninertias = [] # Create an empty list (will be populated later)\nfor k in ks:\n # Create a KMeans instance with k clusters: model\n model = KMeans(n_clusters=k, n_init = 10)\n \n # Fit model to samples\n model.fit(iris_df.iloc[:,-2:])\n \n # Append the inertia to the list of inertias\n inertias.append(model.inertia_)\n \nplt.plot(ks, inertias, '-o', color='black')\nplt.xlabel('number of clusters, k')\nplt.ylabel('inertia')\nplt.xticks(ks)\nplt.show()\n\n\n\n\nThree seems ok. We clearly want no more than three.\nThese types of plots show an point about model complexity. More free parameters in the model (here the number of clusters) will improve how well the model captures the data, often with reducing returns. However, a model which overfits the data will not be able to fit new data well - referred to overfitting. 
Randomish internet blogs introduce the topic pretty well, see here, and also wikipedia, see here.\n\n\n16.5.3 Missing values\nFinally, how we deal with missing values can impact the results of PCA and kMeans clustering.\nLets us load in the iris dataset again and randomly remove 10% of the data (see code from here).\n\nimport numpy as np\n\nx = load_iris()\n\n\niris_df = pd.DataFrame(x.data, columns = x.feature_names)\n\nmask = np.random.choice([True, False], size = iris_df.shape, p = [0.2, 0.8])\nmask[mask.all(1),-1] = 0\n\ndf = iris_df.mask(mask)\n\ndf.isna().sum()\n\nsepal length (cm) 32\nsepal width (cm) 34\npetal length (cm) 37\npetal width (cm) 29\ndtype: int64\n\n\n\ndf\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\n\n\n\n\n0\n5.1\nNaN\nNaN\n0.2\n\n\n1\n4.9\nNaN\nNaN\n0.2\n\n\n2\nNaN\n3.2\n1.3\n0.2\n\n\n3\n4.6\n3.1\n1.5\nNaN\n\n\n4\n5.0\nNaN\n1.4\nNaN\n\n\n...\n...\n...\n...\n...\n\n\n145\n6.7\n3.0\n5.2\n2.3\n\n\n146\nNaN\n2.5\n5.0\n1.9\n\n\n147\n6.5\nNaN\nNaN\n2.0\n\n\n148\n6.2\nNaN\n5.4\n2.3\n\n\n149\n5.9\n3.0\n5.1\n1.8\n\n\n\n\n150 rows × 4 columns\n\n\n\nAbout 20% of the data is randomly an NaN.\n\n16.5.3.1 Zeroing\nWe can 0 them and fit our models.\n\ndf_1 = df.copy()\ndf_1 = df_1.fillna(0)\n\n\ndf_1\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\n\n\n\n\n0\n5.1\n0.0\n0.0\n0.2\n\n\n1\n4.9\n0.0\n0.0\n0.2\n\n\n2\n0.0\n3.2\n1.3\n0.2\n\n\n3\n4.6\n3.1\n1.5\n0.0\n\n\n4\n5.0\n0.0\n1.4\n0.0\n\n\n...\n...\n...\n...\n...\n\n\n145\n6.7\n3.0\n5.2\n2.3\n\n\n146\n0.0\n2.5\n5.0\n1.9\n\n\n147\n6.5\n0.0\n0.0\n2.0\n\n\n148\n6.2\n0.0\n5.4\n2.3\n\n\n149\n5.9\n3.0\n5.1\n1.8\n\n\n\n\n150 rows × 4 columns\n\n\n\n\nk_means_zero = KMeans(n_clusters = 4, init = 'random', n_init = 10)\nk_means_zero.fit(df_1)\ndf_1['Four clusters'] = pd.Series(k_means_zero.predict(df_1.iloc[:,0:4].values), index = df_1.index)\nsns.pairplot(df_1, hue = 'Four clusters')\n\n\n\n\nWhat impact has zeroing the values had on our results?\nNow, onto PCA.\n\n# PCA analysis\nn_components = 2\n\npca = PCA(n_components=n_components)\ndf_1_pca = pca.fit(df_1.iloc[:,0:4])\n\n# Extract projected values\ndf_1_pca_vals = df_1_pca.transform(df_1.iloc[:,0:4])\ndf_1['c1'] = [item[0] for item in df_1_pca_vals]\ndf_1['c2'] = [item[1] for item in df_1_pca_vals]\n\nsns.scatterplot(data = df_1, x = 'c1', y = 'c2')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\n\ndf_1_pca.explained_variance_\n\narray([6.71803744, 4.89376791])\n\n\n\ndf_1_pca.components_\n\narray([[-0.91235845, 0.02968512, -0.38161438, -0.14522853],\n [-0.39939351, 0.05086373, 0.90393389, 0.14422629]])\n\n\n\n\n16.5.3.2 Replacing with the average\n\ndf_2 = df.copy()\nfor i in range(4):\n df_2.iloc[:,i] = df_2.iloc[:,i].fillna(df_2.iloc[:,i].mean())\n\n\ndf_2\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\n\n\n\n\n0\n5.100000\n3.00431\n3.90885\n0.200000\n\n\n1\n4.900000\n3.00431\n3.90885\n0.200000\n\n\n2\n5.866102\n3.20000\n1.30000\n0.200000\n\n\n3\n4.600000\n3.10000\n1.50000\n1.210744\n\n\n4\n5.000000\n3.00431\n1.40000\n1.210744\n\n\n...\n...\n...\n...\n...\n\n\n145\n6.700000\n3.00000\n5.20000\n2.300000\n\n\n146\n5.866102\n2.50000\n5.00000\n1.900000\n\n\n147\n6.500000\n3.00431\n3.90885\n2.000000\n\n\n148\n6.200000\n3.00431\n5.40000\n2.300000\n\n\n149\n5.900000\n3.00000\n5.10000\n1.800000\n\n\n\n\n150 rows × 4 columns\n\n\n\n\nk_means_zero = KMeans(n_clusters = 4, init = 'random', n_init = 10)\nk_means_zero.fit(df_2)\ndf_2['Four clusters'] = 
pd.Series(k_means_zero.predict(df_2.iloc[:,0:4].values), index = df_2.index)\nsns.pairplot(df_2, hue = 'Four clusters')\n\n\n\n\n\n# PCA analysis\nn_components = 2\n\npca = PCA(n_components=n_components)\ndf_2_pca = pca.fit(df_2.iloc[:,0:4])\n\n# Extract projected values\ndf_2_pca_vals = df_2_pca.transform(df_2.iloc[:,0:4])\ndf_2['c1'] = [item[0] for item in df_2_pca_vals]\ndf_2['c2'] = [item[1] for item in df_2_pca_vals]\n\nsns.scatterplot(data = df_2, x = 'c1', y = 'c2')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\n\ndf_2_pca.explained_variance_\n\narray([2.68417915, 0.33506061])\n\n\n\ndf_2_pca.components_\n\narray([[ 0.33775908, -0.04345744, 0.87824143, 0.33574133],\n [ 0.82803166, 0.20108365, -0.42517727, 0.30521014]])" + "text": "16.5 Principal Component Analysis (PCA)\nPCA reduces the dimension of our data. The method derives point in an n dimentional space from our data which are uncorrelated.\nTo carry out a PCA on our Iris dataset where there are only two dimensions.\n\nfrom sklearn.decomposition import PCA\n\nn_components = 2\n\npca = PCA(n_components=n_components)\niris_pca = pca.fit(iris_df.iloc[:,0:4])\n\nWe can look at the components.\n\niris_pca.components_\n\narray([[ 0.42494212, -0.15074824, 0.61626702, 0.64568888],\n [ 0.42320271, 0.90396711, -0.06038308, -0.00983925]])\n\n\nThese components are intersting. You may want to look at a PennState article on interpreting PCA components.\nOur second column, ‘sepal width (cm)’ is positively correlated with our second principle component whereas the first column ‘sepal length (cm)’ is postively correlated with both.\nYou may want to consider:\n\nDo we need more than two components?\nIs it useful to keep sepal length (cm) in the dataset?\n\nWe can also examine the explained variance of the each principle component.\n\niris_pca.explained_variance_\n\narray([0.23245325, 0.0324682 ])\n\n\nA nice worked example showing the link between the explained variance and the component is here.\nOur first principle component explains a lot more of the variance of data then the second.\nAnother way to explore these indicators is to look at the explained_variance_ratio_ values. These present a similar information but provide them as percentage values so they are easier to interpret. You can also create a plot and see how these percentages add up. In this case, the first two components add up to 0.96. Which means the first two features are able to represent around 96% of the variation in the data, not bad. These values are not always this high.\nA high value that is close to 100% means that the PCA is able to represent much of the variance and they will be good representations of the data without losing a lot of that variance in the underlying features. 
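When interpreting which original features drive each component, it can also help to label the loadings we printed earlier. A minimal sketch (an addition to the lab), assuming iris_pca and the feature columns of iris_df:

```python
# Sketch: label the loadings so each row is a principal component and each column a feature.
import pandas as pd

loadings = pd.DataFrame(iris_pca.components_,
                        columns = iris_df.columns[0:4],
                        index = ['PC1', 'PC2'])
print(loadings)
```

Reading across a row shows which features contribute most to that component.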
This of course is based on an assumption that variance is a good proxy about how informative a feature is.\n\niris_pca.explained_variance_ratio_\n\narray([0.84136038, 0.11751808])\n\n\n\nplt.plot(np.cumsum(pca.explained_variance_ratio_))\nplt.xlabel('number of components')\nplt.ylabel('cumulative explained variance');\n\n\n\n\n\n16.5.1 Dimension reduction\nFor our purposes, we are interested in using PCA for reducing the number of dimension in our data whilst preseving the maximal data variance.\nWe can extract the projected components from the model.\n\niris_pca_vals = pca.fit_transform(iris_df.iloc[:,0:4])\n\nThe numpy arrays contains the projected values.\n\ntype(iris_pca_vals)\n\nnumpy.ndarray\n\n\n\niris_pca_vals\n\narray([[-6.30702931e-01, 1.07577910e-01],\n [-6.22904943e-01, -1.04259833e-01],\n [-6.69520395e-01, -5.14170597e-02],\n [-6.54152759e-01, -1.02884871e-01],\n [-6.48788056e-01, 1.33487576e-01],\n [-5.35272778e-01, 2.89615724e-01],\n [-6.56537790e-01, 1.07244911e-02],\n [-6.25780499e-01, 5.71335411e-02],\n [-6.75643504e-01, -2.00703283e-01],\n [-6.45644619e-01, -6.72080097e-02],\n [-5.97408238e-01, 2.17151953e-01],\n [-6.38943190e-01, 3.25988375e-02],\n [-6.61612593e-01, -1.15605495e-01],\n [-7.51967943e-01, -1.71313322e-01],\n [-6.00371589e-01, 3.80240692e-01],\n [-5.52157227e-01, 5.15255982e-01],\n [-5.77053593e-01, 2.93709492e-01],\n [-6.03799228e-01, 1.07167941e-01],\n [-5.20483461e-01, 2.87627289e-01],\n [-6.12197555e-01, 2.19140388e-01],\n [-5.57674300e-01, 1.02109180e-01],\n [-5.79012675e-01, 1.81065123e-01],\n [-7.37784662e-01, 9.05588211e-02],\n [-5.06093857e-01, 2.79470846e-02],\n [-6.07607579e-01, 2.95285112e-02],\n [-5.90210587e-01, -9.45510863e-02],\n [-5.61527888e-01, 5.52901611e-02],\n [-6.08453780e-01, 1.18310099e-01],\n [-6.12617807e-01, 8.16682448e-02],\n [-6.38184784e-01, -5.44873860e-02],\n [-6.20099660e-01, -8.03970516e-02],\n [-5.24757301e-01, 1.03336126e-01],\n [-6.73044544e-01, 3.44711846e-01],\n [-6.27455379e-01, 4.18257508e-01],\n [-6.18740916e-01, -6.76179787e-02],\n [-6.44553756e-01, -1.51267253e-02],\n [-5.93932344e-01, 1.55623876e-01],\n [-6.87495707e-01, 1.22141914e-01],\n [-6.92369885e-01, -1.62014545e-01],\n [-6.13976551e-01, 6.88891719e-02],\n [-6.26048380e-01, 9.64357527e-02],\n [-6.09693996e-01, -4.14325957e-01],\n [-7.04932239e-01, -8.66839521e-02],\n [-5.14001659e-01, 9.21355196e-02],\n [-5.43513037e-01, 2.14636651e-01],\n [-6.07805187e-01, -1.16425433e-01],\n [-6.28656055e-01, 2.18526915e-01],\n [-6.70879139e-01, -6.41961326e-02],\n [-6.09212186e-01, 2.05396323e-01],\n [-6.29944525e-01, 2.04916869e-02],\n [ 2.79951766e-01, 1.79245790e-01],\n [ 2.15141376e-01, 1.10348921e-01],\n [ 3.22223106e-01, 1.27368010e-01],\n [ 5.94030131e-02, -3.28502275e-01],\n [ 2.62515235e-01, -2.95800761e-02],\n [ 1.03831043e-01, -1.21781742e-01],\n [ 2.44850362e-01, 1.33801733e-01],\n [-1.71529386e-01, -3.52976762e-01],\n [ 2.14230599e-01, 2.06607890e-02],\n [ 1.53249619e-02, -2.12494509e-01],\n [-1.13710323e-01, -4.93929201e-01],\n [ 1.37348380e-01, -2.06894998e-02],\n [ 4.39928190e-02, -3.06159511e-01],\n [ 1.92559767e-01, -3.95507760e-02],\n [-8.26091518e-03, -8.66610981e-02],\n [ 2.19485489e-01, 1.09383928e-01],\n [ 1.33272148e-01, -5.90267184e-02],\n [-5.75757060e-04, -1.42367733e-01],\n [ 2.54345249e-01, -2.89815304e-01],\n [-5.60800300e-03, -2.39572672e-01],\n [ 2.68168358e-01, 4.72705335e-02],\n [ 9.88208151e-02, -6.96420088e-02],\n [ 2.89086481e-01, -1.69157553e-01],\n [ 1.45033538e-01, -7.63961345e-02],\n [ 1.59287093e-01, 2.19853643e-04],\n [ 
2.13962718e-01, 5.99630005e-02],\n [ 2.91913782e-01, 4.04990109e-03],\n [ 3.69148997e-01, 6.43480720e-02],\n [ 1.86769115e-01, -4.96694916e-02],\n [-6.87697501e-02, -1.85648007e-01],\n [-2.15759776e-02, -2.87970157e-01],\n [-5.89248844e-02, -2.86536746e-01],\n [ 3.23412419e-02, -1.41140786e-01],\n [ 2.88906394e-01, -1.31550706e-01],\n [ 1.09664252e-01, -8.25379800e-02],\n [ 1.82266934e-01, 1.38247021e-01],\n [ 2.77724803e-01, 1.05903632e-01],\n [ 1.95615410e-01, -2.38550997e-01],\n [ 3.76839264e-02, -5.41130122e-02],\n [ 4.68406593e-02, -2.53171683e-01],\n [ 5.54365941e-02, -2.19190186e-01],\n [ 1.75833387e-01, -8.62037590e-04],\n [ 4.90676225e-02, -1.79829525e-01],\n [-1.53444261e-01, -3.78886428e-01],\n [ 6.69726607e-02, -1.68132343e-01],\n [ 3.30293747e-02, -4.29708545e-02],\n [ 6.62142547e-02, -8.10461198e-02],\n [ 1.35679197e-01, -2.32914079e-02],\n [-1.58634575e-01, -2.89139847e-01],\n [ 6.20502279e-02, -1.17687974e-01],\n [ 6.22771338e-01, 1.16807265e-01],\n [ 3.46009609e-01, -1.56291874e-01],\n [ 6.17986434e-01, 1.00519741e-01],\n [ 4.17789309e-01, -2.68903690e-02],\n [ 5.63621248e-01, 3.05994289e-02],\n [ 7.50122599e-01, 1.52133800e-01],\n [ 1.35857804e-01, -3.30462554e-01],\n [ 6.08945212e-01, 8.35018443e-02],\n [ 5.11020215e-01, -1.32575915e-01],\n [ 7.20608541e-01, 3.34580389e-01],\n [ 4.24135062e-01, 1.13914054e-01],\n [ 4.37723702e-01, -8.78049736e-02],\n [ 5.40793776e-01, 6.93466165e-02],\n [ 3.63226514e-01, -2.42764625e-01],\n [ 4.74246948e-01, -1.20676423e-01],\n [ 5.13932631e-01, 9.88816323e-02],\n [ 4.24670824e-01, 3.53096310e-02],\n [ 7.49026039e-01, 4.63778390e-01],\n [ 8.72194272e-01, 9.33798117e-03],\n [ 2.82963372e-01, -3.18443776e-01],\n [ 6.14733184e-01, 1.53566018e-01],\n [ 3.22133832e-01, -1.40500924e-01],\n [ 7.58030401e-01, 8.79453649e-02],\n [ 3.57235237e-01, -9.50568671e-02],\n [ 5.31036706e-01, 1.68539991e-01],\n [ 5.46962123e-01, 1.87812429e-01],\n [ 3.28704908e-01, -6.81237595e-02],\n [ 3.14783811e-01, -5.57223965e-03],\n [ 5.16585543e-01, -5.40299414e-02],\n [ 4.84826663e-01, 1.15348658e-01],\n [ 6.33043632e-01, 5.92290940e-02],\n [ 6.87490917e-01, 4.91179916e-01],\n [ 5.43489246e-01, -5.44399104e-02],\n [ 2.91133358e-01, -5.82085481e-02],\n [ 3.05410131e-01, -1.61757644e-01],\n [ 7.63507935e-01, 1.68186703e-01],\n [ 5.47805644e-01, 1.58976299e-01],\n [ 4.06585699e-01, 6.12192966e-02],\n [ 2.92534659e-01, -1.63044284e-02],\n [ 5.35871344e-01, 1.19790986e-01],\n [ 6.13864965e-01, 9.30029331e-02],\n [ 5.58343139e-01, 1.22041374e-01],\n [ 3.46009609e-01, -1.56291874e-01],\n [ 6.23819644e-01, 1.39763503e-01],\n [ 6.38651518e-01, 1.66900115e-01],\n [ 5.51461624e-01, 5.98413741e-02],\n [ 4.07146497e-01, -1.71820871e-01],\n [ 4.47142619e-01, 3.75600193e-02],\n [ 4.88207585e-01, 1.49677521e-01],\n [ 3.12066323e-01, -3.11303854e-02]])\n\n\nEach row corresponds to a row in our data.\n\niris_pca_vals.shape\n\n(150, 2)\n\n\n\niris_df.shape\n\n(150, 7)\n\n\nWe can add the component to our dataset. I prefer to keep everything in one table and it is not at all required. 
You can just assign the values whichever variables you prefer.\n\niris_df['c1'] = [item[0] for item in iris_pca_vals]\niris_df['c2'] = [item[1] for item in iris_pca_vals]\n\n\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\nTwo clusters\nFive clusters\nc1\nc2\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n2\n1\n3\n-0.630703\n0.107578\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n2\n1\n1\n-0.622905\n-0.104260\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n2\n1\n1\n-0.669520\n-0.051417\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n2\n1\n1\n-0.654153\n-0.102885\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n2\n1\n3\n-0.648788\n0.133488\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n1\n0\n4\n0.551462\n0.059841\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n0\n0\n2\n0.407146\n-0.171821\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n1\n0\n2\n0.447143\n0.037560\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n1\n0\n4\n0.488208\n0.149678\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n0\n0\n2\n0.312066\n-0.031130\n\n\n\n\n150 rows × 9 columns\n\n\n\nPlotting out our data on our new two component space.\n\nsns.scatterplot(data = iris_df, x = 'c1', y = 'c2')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\nWe have reduced our three dimensions to two.\nWe can also colour by our clusters. What does this show us and is it useful?\n\nsns.scatterplot(data = iris_df, x = 'c1', y = 'c2', hue = 'Three clusters')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\n\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\nTwo clusters\nFive clusters\nc1\nc2\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n2\n1\n3\n-0.630703\n0.107578\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n2\n1\n1\n-0.622905\n-0.104260\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n2\n1\n1\n-0.669520\n-0.051417\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n2\n1\n1\n-0.654153\n-0.102885\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n2\n1\n3\n-0.648788\n0.133488\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n1\n0\n4\n0.551462\n0.059841\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n0\n0\n2\n0.407146\n-0.171821\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n1\n0\n2\n0.447143\n0.037560\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n1\n0\n4\n0.488208\n0.149678\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n0\n0\n2\n0.312066\n-0.031130\n\n\n\n\n150 rows × 9 columns\n\n\n\n\n\n16.5.2 PCA to Clusters\nWe have reduced our 4D dataset to 2D whilst keeping the data variance. 
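One way to check how much structure the two components really keep (a sketch, not part of the original lab, assuming pca and iris_pca_vals from above) is to project back into the original four dimensions and look at the reconstruction error:

```python
# Sketch: map the 2D projection back to 4D and measure the mean squared reconstruction error.
import numpy as np

reconstructed = pca.inverse_transform(iris_pca_vals)    # shape (150, 4)
original = iris_df.iloc[:, 0:4].values
print(np.mean((original - reconstructed) ** 2))         # a small value means little variance was lost
```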
Reducing the data to fewer dimensions can help with the ‘curse of dimensionality’, reduce the change of overfitting a machine learning model (see here) and reduce the computational complexity of a model fit.\nPutting our new dimensions into a kMeans model\n\nk_means_pca = KMeans(n_clusters = 3, init = 'random', n_init = 10)\niris_pca_kmeans = k_means_pca.fit(iris_df.iloc[:,-2:])\n\n\ntype(iris_df.iloc[:,-2:].values)\n\nnumpy.ndarray\n\n\n\niris_df['PCA 3 clusters'] = pd.Series(k_means_pca.predict(iris_df.iloc[:,-2:].values), index = iris_df.index)\niris_df\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\nThree clusters\nTwo clusters\nFive clusters\nc1\nc2\nPCA 3 clusters\n\n\n\n\n0\n0.222222\n0.625000\n0.067797\n0.041667\n2\n1\n3\n-0.630703\n0.107578\n0\n\n\n1\n0.166667\n0.416667\n0.067797\n0.041667\n2\n1\n1\n-0.622905\n-0.104260\n0\n\n\n2\n0.111111\n0.500000\n0.050847\n0.041667\n2\n1\n1\n-0.669520\n-0.051417\n0\n\n\n3\n0.083333\n0.458333\n0.084746\n0.041667\n2\n1\n1\n-0.654153\n-0.102885\n0\n\n\n4\n0.194444\n0.666667\n0.067797\n0.041667\n2\n1\n3\n-0.648788\n0.133488\n0\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n145\n0.666667\n0.416667\n0.711864\n0.916667\n1\n0\n4\n0.551462\n0.059841\n1\n\n\n146\n0.555556\n0.208333\n0.677966\n0.750000\n0\n0\n2\n0.407146\n-0.171821\n2\n\n\n147\n0.611111\n0.416667\n0.711864\n0.791667\n1\n0\n2\n0.447143\n0.037560\n1\n\n\n148\n0.527778\n0.583333\n0.745763\n0.916667\n1\n0\n4\n0.488208\n0.149678\n1\n\n\n149\n0.444444\n0.416667\n0.694915\n0.708333\n0\n0\n2\n0.312066\n-0.031130\n2\n\n\n\n\n150 rows × 10 columns\n\n\n\nAs we only have two dimensions we can easily plot this on a single scatterplot.\n\n# a different seaborn theme\n# see https://python-graph-gallery.com/104-seaborn-themes/\nsns.set_style(\"darkgrid\")\nsns.scatterplot(data = iris_df, x = 'c1', y = 'c2', hue = 'PCA 3 clusters')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\nI suspect having two clusters would work better. We should try a few different models.\nCopying the code from here we can fit multiple numbers of clusters.\n\nks = range(1, 10)\ninertias = [] # Create an empty list (will be populated later)\nfor k in ks:\n # Create a KMeans instance with k clusters: model\n model = KMeans(n_clusters=k, n_init = 10)\n \n # Fit model to samples\n model.fit(iris_df.iloc[:,-2:])\n \n # Append the inertia to the list of inertias\n inertias.append(model.inertia_)\n \nplt.plot(ks, inertias, '-o', color='black')\nplt.xlabel('number of clusters, k')\nplt.ylabel('inertia')\nplt.xticks(ks)\nplt.show()\n\n\n\n\nThree seems ok. We clearly want no more than three.\nThese types of plots show an point about model complexity. More free parameters in the model (here the number of clusters) will improve how well the model captures the data, often with reducing returns. However, a model which overfits the data will not be able to fit new data well - referred to overfitting. 
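One rough way to see this with k-means (an extra sketch, not part of the lab, assuming the c1 and c2 columns created above) is to fit on one part of the data and score the held-out part; a model with many clusters tends to look better on the rows it was fitted to than on unseen rows:

```python
# Sketch: compare per-point fit quality on fitted vs held-out data for different k.
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

train, test = train_test_split(iris_df[['c1', 'c2']], test_size = 0.3, random_state = 0)

for k in [2, 3, 5, 9]:
    model = KMeans(n_clusters = k, n_init = 10).fit(train)
    # .score() returns the negative inertia on the given data; divide by n for a fair comparison
    print(k, -model.score(train) / len(train), -model.score(test) / len(test))
```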
Randomish internet blogs introduce the topic pretty well, see here, and also wikipedia, see here.\n\n\n16.5.3 Missing values\nFinally, how we deal with missing values can impact the results of PCA and kMeans clustering.\nLets us load in the iris dataset again and randomly remove 10% of the data (see code from here).\n\nimport numpy as np\n\nx = load_iris()\n\n\niris_df = pd.DataFrame(x.data, columns = x.feature_names)\n\nmask = np.random.choice([True, False], size = iris_df.shape, p = [0.2, 0.8])\nmask[mask.all(1),-1] = 0\n\ndf = iris_df.mask(mask)\n\ndf.isna().sum()\n\nsepal length (cm) 29\nsepal width (cm) 21\npetal length (cm) 32\npetal width (cm) 21\ndtype: int64\n\n\n\ndf\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\n\n\n\n\n0\n5.1\n3.5\n1.4\n0.2\n\n\n1\n4.9\n3.0\nNaN\n0.2\n\n\n2\nNaN\n3.2\n1.3\n0.2\n\n\n3\n4.6\n3.1\nNaN\n0.2\n\n\n4\nNaN\n3.6\n1.4\n0.2\n\n\n...\n...\n...\n...\n...\n\n\n145\n6.7\nNaN\n5.2\n2.3\n\n\n146\n6.3\n2.5\nNaN\n1.9\n\n\n147\n6.5\n3.0\n5.2\n2.0\n\n\n148\nNaN\n3.4\n5.4\n2.3\n\n\n149\n5.9\n3.0\nNaN\nNaN\n\n\n\n\n150 rows × 4 columns\n\n\n\nAbout 20% of the data is randomly an NaN.\n\n16.5.3.1 Zeroing\nWe can 0 them and fit our models.\n\ndf_1 = df.copy()\ndf_1 = df_1.fillna(0)\n\n\ndf_1\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\n\n\n\n\n0\n5.1\n3.5\n1.4\n0.2\n\n\n1\n4.9\n3.0\n0.0\n0.2\n\n\n2\n0.0\n3.2\n1.3\n0.2\n\n\n3\n4.6\n3.1\n0.0\n0.2\n\n\n4\n0.0\n3.6\n1.4\n0.2\n\n\n...\n...\n...\n...\n...\n\n\n145\n6.7\n0.0\n5.2\n2.3\n\n\n146\n6.3\n2.5\n0.0\n1.9\n\n\n147\n6.5\n3.0\n5.2\n2.0\n\n\n148\n0.0\n3.4\n5.4\n2.3\n\n\n149\n5.9\n3.0\n0.0\n0.0\n\n\n\n\n150 rows × 4 columns\n\n\n\n\nk_means_zero = KMeans(n_clusters = 4, init = 'random', n_init = 10)\nk_means_zero.fit(df_1)\ndf_1['Four clusters'] = pd.Series(k_means_zero.predict(df_1.iloc[:,0:4].values), index = df_1.index)\nsns.pairplot(df_1, hue = 'Four clusters')\n\n\n\n\nWhat impact has zeroing the values had on our results?\nNow, onto PCA.\n\n# PCA analysis\nn_components = 2\n\npca = PCA(n_components=n_components)\ndf_1_pca = pca.fit(df_1.iloc[:,0:4])\n\n# Extract projected values\ndf_1_pca_vals = df_1_pca.transform(df_1.iloc[:,0:4])\ndf_1['c1'] = [item[0] for item in df_1_pca_vals]\ndf_1['c2'] = [item[1] for item in df_1_pca_vals]\n\nsns.scatterplot(data = df_1, x = 'c1', y = 'c2')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\n\ndf_1_pca.explained_variance_\n\narray([6.24279356, 4.84811544])\n\n\n\ndf_1_pca.components_\n\narray([[-0.86129917, 0.04084996, -0.48641492, -0.14105157],\n [-0.50682662, -0.04550418, 0.84286268, 0.175039 ]])\n\n\n\n\n16.5.3.2 Replacing with the average\n\ndf_2 = df.copy()\nfor i in range(4):\n df_2.iloc[:,i] = df_2.iloc[:,i].fillna(df_2.iloc[:,i].mean())\n\n\ndf_2\n\n\n\n\n\n\n\n\nsepal length (cm)\nsepal width (cm)\npetal length (cm)\npetal width (cm)\n\n\n\n\n0\n5.100000\n3.500000\n1.400000\n0.200000\n\n\n1\n4.900000\n3.000000\n3.877119\n0.200000\n\n\n2\n5.839669\n3.200000\n1.300000\n0.200000\n\n\n3\n4.600000\n3.100000\n3.877119\n0.200000\n\n\n4\n5.839669\n3.600000\n1.400000\n0.200000\n\n\n...\n...\n...\n...\n...\n\n\n145\n6.700000\n3.054264\n5.200000\n2.300000\n\n\n146\n6.300000\n2.500000\n3.877119\n1.900000\n\n\n147\n6.500000\n3.000000\n5.200000\n2.000000\n\n\n148\n5.839669\n3.400000\n5.400000\n2.300000\n\n\n149\n5.900000\n3.000000\n3.877119\n1.205426\n\n\n\n\n150 rows × 4 columns\n\n\n\n\nk_means_zero = KMeans(n_clusters = 4, init = 'random', n_init = 10)\nk_means_zero.fit(df_2)\ndf_2['Four 
clusters'] = pd.Series(k_means_zero.predict(df_2.iloc[:,0:4].values), index = df_2.index)\nsns.pairplot(df_2, hue = 'Four clusters')\n\n\n\n\n\n# PCA analysis\nn_components = 2\n\npca = PCA(n_components=n_components)\ndf_2_pca = pca.fit(df_2.iloc[:,0:4])\n\n# Extract projected values\ndf_2_pca_vals = df_2_pca.transform(df_2.iloc[:,0:4])\ndf_2['c1'] = [item[0] for item in df_2_pca_vals]\ndf_2['c2'] = [item[1] for item in df_2_pca_vals]\n\nsns.scatterplot(data = df_2, x = 'c1', y = 'c2')\n\n<Axes: xlabel='c1', ylabel='c2'>\n\n\n\n\n\n\ndf_2_pca.explained_variance_\n\narray([3.01818399, 0.26633671])\n\n\n\ndf_2_pca.components_\n\narray([[ 0.31417904, -0.06487468, 0.88369345, 0.34083528],\n [ 0.89110506, 0.17000084, -0.37665661, 0.18751344]])" }, { "objectID": "content/labs/Lab_4/IM939_Lab_4_1_Iris.html#useful-resources", @@ -718,21 +718,28 @@ "href": "content/labs/Lab_5/IM939_Lab_5_1.html#data-wrangling", "title": "21  Lab: Clustering and Ground Truth", "section": "21.1 Data Wrangling", - "text": "21.1 Data Wrangling\nAs usual, we will start by looking at our data, and making transformations, if needed.\n\nimport pandas as pd\n\ndf = pd.read_csv('data/wine.csv')\n\ndf.head()\n\n\n\n\n\n\n\nTip\n\n\n\nThere is a column called Class label that gives us the ground truth. The wines come from three different cultivars. Knowing the actual grouping helps us to identify how well our methods can capture this ground truth.\n\n\nFollowing the data wrangling process that was summarised in Chapter 20, we should first get a sense of our data.\n\ndf.describe()\n\nAs you can see no variable has any missing data, but the scales of our features vary (e.g., Magnesium is in the 100s whereas Hue is in the low single digits).\nLet’s visually inspect how features are distributed using a violin plot:\n\nimport seaborn as sns\n\ndf_long = df.melt(id_vars='Class label')\n\nsns.violinplot(data = df_long, x = 'variable', y = 'value')\n\nRegretfully, this is not very useful right now, due to the different scales that we detected previously. In this case, it makes sense to normalise our data.\n\nfrom sklearn.preprocessing import MinMaxScaler\n\n# create a scaler object\nscaler = MinMaxScaler()\n\n# fit and transform the data\ndf_norm = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)\n\ndf_long = df_norm.melt(id_vars='Class label')\ndf_long\n\n\n#create seaborn violin plot\nmy_plot = sns.violinplot(data = df_long, x = 'variable', y = 'value')\n\n#rotate x-axis labels\nmy_plot.set_xticklabels(my_plot.get_xticklabels(), rotation=90)\n\nAre there any patterns?\nHow about a pairplot?\n\nsns.pairplot(data = df_norm.iloc[:,1:])\n\nHmm, a few interesting correlations. Some of our variables are skewed. We could apply some PCA here to look at fewer dimension or even log transform some of the skewed variables." 
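Following up on that last remark about log transforms, here is a rough sketch of what one could look like (an addition, assuming the df_norm dataframe created above; np.log1p is used because the normalised values include zeros, and 'Malic acid' and 'Color intensity' are just two plausibly skewed columns to try):

```python
# Sketch: log-transform two of the visibly skewed columns and re-draw the violin plot.
import numpy as np
import seaborn as sns

df_log = df_norm.copy()
for col in ['Malic acid', 'Color intensity']:   # hypothetical choice of skewed columns
    df_log[col] = np.log1p(df_log[col])

my_plot = sns.violinplot(data = df_log.melt(id_vars = 'Class label'), x = 'variable', y = 'value')
my_plot.set_xticklabels(my_plot.get_xticklabels(), rotation = 90)
```

Comparing this violin plot with the earlier one shows whether the transform has reduced the skew.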
+ "text": "21.1 Data Wrangling\n\nimport pandas as pd\n\ndf = pd.read_csv('data/wine.csv')\n\nLook at our data.\n\ndf.head()\n\n\n\n\n\n\n\n\nClass label\nAlcohol\nMalic acid\nAsh\nAlcalinity of ash\nMagnesium\nTotal phenols\nFlavanoids\nNonflavanoid phenols\nProanthocyanins\nColor intensity\nHue\nOD280/OD315 of diluted wines\nProline\n\n\n\n\n0\n1\n14.23\n1.71\n2.43\n15.6\n127\n2.80\n3.06\n0.28\n2.29\n5.64\n1.04\n3.92\n1065\n\n\n1\n1\n13.20\n1.78\n2.14\n11.2\n100\n2.65\n2.76\n0.26\n1.28\n4.38\n1.05\n3.40\n1050\n\n\n2\n1\n13.16\n2.36\n2.67\n18.6\n101\n2.80\n3.24\n0.30\n2.81\n5.68\n1.03\n3.17\n1185\n\n\n3\n1\n14.37\n1.95\n2.50\n16.8\n113\n3.85\n3.49\n0.24\n2.18\n7.80\n0.86\n3.45\n1480\n\n\n4\n1\n13.24\n2.59\n2.87\n21.0\n118\n2.80\n2.69\n0.39\n1.82\n4.32\n1.04\n2.93\n735\n\n\n\n\n\n\n\nThere is a column called Class label that gives us the ground truth. The wines come from three different cultivars. Knowing the actual grouping helps us to identify how well our methods can capture this ground truth.\nFollowing our process above, we should first get a sense of our data.\n\ndf.describe()\n\n\n\n\n\n\n\n\nClass label\nAlcohol\nMalic acid\nAsh\nAlcalinity of ash\nMagnesium\nTotal phenols\nFlavanoids\nNonflavanoid phenols\nProanthocyanins\nColor intensity\nHue\nOD280/OD315 of diluted wines\nProline\n\n\n\n\ncount\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n178.000000\n\n\nmean\n1.938202\n13.000618\n2.336348\n2.366517\n19.494944\n99.741573\n2.295112\n2.029270\n0.361854\n1.590899\n5.058090\n0.957449\n2.611685\n746.893258\n\n\nstd\n0.775035\n0.811827\n1.117146\n0.274344\n3.339564\n14.282484\n0.625851\n0.998859\n0.124453\n0.572359\n2.318286\n0.228572\n0.709990\n314.907474\n\n\nmin\n1.000000\n11.030000\n0.740000\n1.360000\n10.600000\n70.000000\n0.980000\n0.340000\n0.130000\n0.410000\n1.280000\n0.480000\n1.270000\n278.000000\n\n\n25%\n1.000000\n12.362500\n1.602500\n2.210000\n17.200000\n88.000000\n1.742500\n1.205000\n0.270000\n1.250000\n3.220000\n0.782500\n1.937500\n500.500000\n\n\n50%\n2.000000\n13.050000\n1.865000\n2.360000\n19.500000\n98.000000\n2.355000\n2.135000\n0.340000\n1.555000\n4.690000\n0.965000\n2.780000\n673.500000\n\n\n75%\n3.000000\n13.677500\n3.082500\n2.557500\n21.500000\n107.000000\n2.800000\n2.875000\n0.437500\n1.950000\n6.200000\n1.120000\n3.170000\n985.000000\n\n\nmax\n3.000000\n14.830000\n5.800000\n3.230000\n30.000000\n162.000000\n3.880000\n5.080000\n0.660000\n3.580000\n13.000000\n1.710000\n4.000000\n1680.000000\n\n\n\n\n\n\n\nNo missing data. 
The scales of our features vary (e.g., Magnesium is in the 100s whereas Hue is in the low single digits).\nHow about our feature distributions?\n\ndf_long = df.melt(id_vars='Class label')\n\n\nimport seaborn as sns\n\nsns.violinplot(data = df_long, x = 'variable', y = 'value')\n\n<Axes: xlabel='variable', ylabel='value'>\n\n\n\n\n\nMakes sense to normalise our data.\n\nfrom sklearn.preprocessing import MinMaxScaler\n\n# create a scaler object\nscaler = MinMaxScaler()\n\n# fit and transform the data\ndf_norm = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)\n\ndf_long = df_norm.melt(id_vars='Class label')\ndf_long\n\n\n\n\n\n\n\n\nClass label\nvariable\nvalue\n\n\n\n\n0\n0.0\nAlcohol\n0.842105\n\n\n1\n0.0\nAlcohol\n0.571053\n\n\n2\n0.0\nAlcohol\n0.560526\n\n\n3\n0.0\nAlcohol\n0.878947\n\n\n4\n0.0\nAlcohol\n0.581579\n\n\n...\n...\n...\n...\n\n\n2309\n1.0\nProline\n0.329529\n\n\n2310\n1.0\nProline\n0.336662\n\n\n2311\n1.0\nProline\n0.397290\n\n\n2312\n1.0\nProline\n0.400856\n\n\n2313\n1.0\nProline\n0.201141\n\n\n\n\n2314 rows × 3 columns\n\n\n\n\n#create seaborn violin plot\nmy_plot = sns.violinplot(data = df_long, x = 'variable', y = 'value')\n\n#rotate x-axis labels\nmy_plot.set_xticklabels(my_plot.get_xticklabels(), rotation=90)\n\n[Text(0, 0, 'Alcohol'),\n Text(1, 0, 'Malic acid'),\n Text(2, 0, 'Ash'),\n Text(3, 0, 'Alcalinity of ash'),\n Text(4, 0, 'Magnesium'),\n Text(5, 0, 'Total phenols'),\n Text(6, 0, 'Flavanoids'),\n Text(7, 0, 'Nonflavanoid phenols'),\n Text(8, 0, 'Proanthocyanins'),\n Text(9, 0, 'Color intensity'),\n Text(10, 0, 'Hue'),\n Text(11, 0, 'OD280/OD315 of diluted wines'),\n Text(12, 0, 'Proline ')]\n\n\n\n\n\nAre there any patterns?\nHow about a pairplot?\n\nsns.pairplot(data = df_norm.iloc[:,1:])\n\n\n\n\nHmm, a few interesting correlations. Some of our variables are skewed. We could apply some PCA here to look at fewer dimension or even log transform some of the skewed variables." }, { "objectID": "content/labs/Lab_5/IM939_Lab_5_1.html#cluster-analysis", "href": "content/labs/Lab_5/IM939_Lab_5_1.html#cluster-analysis", "title": "21  Lab: Clustering and Ground Truth", "section": "21.2 Cluster analysis", - "text": "21.2 Cluster analysis\nFor now we will just run a kmeans cluster and then check our results against the ground truth.\n\n21.2.1 Number of clusters\nLets decide how many clusters we need.\n\nfrom sklearn.cluster import KMeans\n\nks = range(1, 10)\ninertias = []\nfor k in ks:\n # Create a KMeans instance with k clusters: model\n model = KMeans(n_clusters=k, n_init = 10)\n \n # Fit model to samples\n model.fit(df.iloc[:,1:])\n \n # Append the inertia to the list of inertias\n inertias.append(model.inertia_)\n\nimport matplotlib.pyplot as plt\n\nplt.plot(ks, inertias, '-o', color='black')\nplt.xlabel('number of clusters, k')\nplt.ylabel('inertia')\nplt.xticks(ks)\nplt.show()\n\nWhat happens if we use the normalised data instead?\n\nfrom sklearn.cluster import KMeans\n\nks = range(1, 10)\ninertias = []\nfor k in ks:\n # Create a KMeans instance with k clusters: model\n model = KMeans(n_clusters=k, n_init = 10)\n \n # Fit model to samples\n model.fit(df_norm.iloc[:,1:])\n \n # Append the inertia to the list of inertias\n inertias.append(model.inertia_)\n\nimport matplotlib.pyplot as plt\n\nplt.plot(ks, inertias, '-o', color='black')\nplt.xlabel('number of clusters, k')\nplt.ylabel('inertia')\nplt.xticks(ks)\nplt.show()\n\n\n\n\n\n\n\nPause for thought\n\n\n\nBoth of the graphs are the same. 
Is that what you would expect?\n\n\nThree clusters seems about right (and matches our number of original labels).\n\ndf['Class label'].value_counts()\n\n\n\n21.2.2 Calculate 3 clusters\nNow, we are going to calculate three clusters and store each observation’s cluster labels into a variable within the original dataframe:\n\n# Create a KMeans instance with k clusters: model\nk_means = KMeans(n_clusters=3)\n\n# Fit model to samples\ndf_k_means = k_means.fit(df.iloc[:,1:])\n\n# Create a new variable with the fitted cluster label.\ndf['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:].values), index = df.index)\ndf\n\n\n\n21.2.3 Ground Truth Validation\nDo our cluster labels match our ground truth? Did our cluster model capture reality?\n\nct = pd.crosstab(df['Three clusters'], df['Class label'])\nct\n\nIt might be easier to see as a stacked plot (see this post).\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nct.plot.bar(stacked=True)\nplt.legend(title='Class label')\n\nHow has the kmeans model done compared to our ground truth?\n\n\n\n\n\n\nImportant\n\n\n\nWe need to be really careful here. We notice that it is not easily possible to compare the known class labels to clustering labels. The reason is that the clustering algorithm labels are just arbitrary and not assigned to any deterministic criteria. Each time you run the algorithm, you might get a different id for the labels. The reason is that the label itself doesn’t actually mean anything, what is important is the list of items that are in the same cluster and their relations.\n\n\nA way to overcome this ambiguity and evaluate the results is to look at visualisations of the results and compare. But this brings in the question of what type of visualisation to use for looking at the clusters.\nAn immediate alternative is to use scatterplots. However, it is not clear which axis to use for clustering. A common method to apply at this stage is to make use of PCA to get a 2D plane where we can project the data points and visualise them over this projection.\n\ndf.iloc[:,1:14]\n\n\nfrom sklearn.decomposition import PCA\n\nn_components = 2\n\npca = PCA(n_components=n_components)\ndf_pca = pca.fit(df.iloc[:,1:14])\ndf_pca_vals = df_pca.transform(df.iloc[:,1:14])\n\nGrab our projections and plot along with our cluster names.\n\ndf['c1'] = [item[0] for item in df_pca_vals]\ndf['c2'] = [item[1] for item in df_pca_vals]\n\nax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Class label')\nax.set_title('Known labels visualised over PCs')\n\nIn the figure above, we colored the points based on the actual labels, and we observe that there have been several misclassifications (i.e., in the algorithm’s results). So one may choose to use an alternative algorithm or devise a better distance metric.\n\nax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')\nax.set_title('Results of the algorithm visualised over PCs')\n\nThis shows the parallelism between the clustering algorithm and PCA. By looking at the PCA loadings, we can find out what the x-axis means and try to interpret the clusters (We leave this as an additional exercise for those interested).\nHow might you interpret the above plots? Did the kmeans model identify the ground truth?\nHow robust is our clustering? 
It may be that the kmeans algorithm becamse stuck or that a few outliers have biased the clustering.\nTwo ways to check are:\n\nRunning the model multiple times with different initial values.\nRemoving some data and running the modelling multiple times.\n\nRun the below cell a few times. What do you see?\n\n# Create a KMeans instance with k clusters: model\nk_means = KMeans(n_clusters=3, init='random', n_init = 10)\n\n# Fit model to samples\ndf_k_means = k_means.fit(df.iloc[:,1:14])\n\ndf['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:14].values), index = df.index)\n\nax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')\nax.set_title('Results of the algorithm visualised over PCs')\n\nHow about with only 80% of the data?\n\ndf_sample = df.sample(frac=0.8, replace=False)\n\n# Create a KMeans instance with k clusters: model\nk_means = KMeans(n_clusters=3, init='random', n_init = 10)\n\n# Fit model to samples\ndf_k_means = k_means.fit(df_sample.iloc[:,1:14])\n\ndf_sample['Three clusters'] = pd.Series(df_k_means.predict(df_sample.iloc[:,1:14].values), index = df_sample.index)\n\nax = sns.scatterplot(data = df_sample, x = 'c1', y = 'c2', hue = 'Three clusters')\nax.set_title('Results of the algorithm visualised over PCs')\n\nWe may want to automate the process of resampling the data or rerunning the model then perhaps plotting the different inertia values or creating different plots.\nDo you think our clustering algorithm is stable and provide similiar results even when some data is removed or the initial values are random?\nIf so, then is our algorithm capturing the ground truth?\n\n\n\n\nCortez, Paulo, A Cerdeira, F Almeida, T Matos, and J. Reis. 2009. “Wine Quality.” UCI Machine Learning Repository. https://doi.org/10.24432/C56S3T." + "text": "21.2 Cluster analysis\nFor now we will just run a kmeans cluster and then check our results against the ground truth.\n\n21.2.1 Determining the number of clusters\nLets decide how many clusters we need.\n\nfrom sklearn.cluster import KMeans\n\nks = range(1, 10)\ninertias = []\nfor k in ks:\n # Create a KMeans instance with k clusters: model\n model = KMeans(n_clusters=k, n_init = 10)\n \n # Fit model to samples\n model.fit(df.iloc[:,1:])\n \n # Append the inertia to the list of inertias\n inertias.append(model.inertia_)\n\nimport matplotlib.pyplot as plt\n\nplt.plot(ks, inertias, '-o', color='black')\nplt.xlabel('number of clusters, k')\nplt.ylabel('inertia')\nplt.xticks(ks)\nplt.show()\n\n\n\n\nWhat happens if we use the normalised data instead?\n\nfrom sklearn.cluster import KMeans\n\nks = range(1, 10)\ninertias = []\nfor k in ks:\n # Create a KMeans instance with k clusters: model\n model = KMeans(n_clusters=k, n_init = 10)\n \n # Fit model to samples\n model.fit(df_norm.iloc[:,1:])\n \n # Append the inertia to the list of inertias\n inertias.append(model.inertia_)\n\nimport matplotlib.pyplot as plt\n\nplt.plot(ks, inertias, '-o', color='black')\nplt.xlabel('number of clusters, k')\nplt.ylabel('inertia')\nplt.xticks(ks)\nplt.show()\n\n\n\n\n\n\n\n\n\n\nPause for thought\n\n\n\nBoth of the graphs are the same. 
Is that what you would expect?\n\n\nThree clusters seems about right (and matches our number of origonal labels).\n\ndf['Class label'].value_counts()\n\nClass label\n2 71\n1 59\n3 48\nName: count, dtype: int64\n\n\n\n\n21.2.2 Computing the clusters\n\n# Create a KMeans instance with k clusters: model\nk_means = KMeans(n_clusters=3)\n\n# Fit model to samples\ndf_k_means = k_means.fit(df.iloc[:,1:])\n\ndf['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:].values), index = df.index)\ndf\n\n\n\n\n\n\n\n\nClass label\nAlcohol\nMalic acid\nAsh\nAlcalinity of ash\nMagnesium\nTotal phenols\nFlavanoids\nNonflavanoid phenols\nProanthocyanins\nColor intensity\nHue\nOD280/OD315 of diluted wines\nProline\nThree clusters\n\n\n\n\n0\n1\n14.23\n1.71\n2.43\n15.6\n127\n2.80\n3.06\n0.28\n2.29\n5.64\n1.04\n3.92\n1065\n1\n\n\n1\n1\n13.20\n1.78\n2.14\n11.2\n100\n2.65\n2.76\n0.26\n1.28\n4.38\n1.05\n3.40\n1050\n1\n\n\n2\n1\n13.16\n2.36\n2.67\n18.6\n101\n2.80\n3.24\n0.30\n2.81\n5.68\n1.03\n3.17\n1185\n1\n\n\n3\n1\n14.37\n1.95\n2.50\n16.8\n113\n3.85\n3.49\n0.24\n2.18\n7.80\n0.86\n3.45\n1480\n1\n\n\n4\n1\n13.24\n2.59\n2.87\n21.0\n118\n2.80\n2.69\n0.39\n1.82\n4.32\n1.04\n2.93\n735\n2\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n173\n3\n13.71\n5.65\n2.45\n20.5\n95\n1.68\n0.61\n0.52\n1.06\n7.70\n0.64\n1.74\n740\n2\n\n\n174\n3\n13.40\n3.91\n2.48\n23.0\n102\n1.80\n0.75\n0.43\n1.41\n7.30\n0.70\n1.56\n750\n2\n\n\n175\n3\n13.27\n4.28\n2.26\n20.0\n120\n1.59\n0.69\n0.43\n1.35\n10.20\n0.59\n1.56\n835\n2\n\n\n176\n3\n13.17\n2.59\n2.37\n20.0\n120\n1.65\n0.68\n0.53\n1.46\n9.30\n0.60\n1.62\n840\n2\n\n\n177\n3\n14.13\n4.10\n2.74\n24.5\n96\n2.05\n0.76\n0.56\n1.35\n9.20\n0.61\n1.60\n560\n0\n\n\n\n\n178 rows × 15 columns" + }, + { + "objectID": "content/labs/Lab_5/IM939_Lab_5_1.html#clusters-and-ground-truth", + "href": "content/labs/Lab_5/IM939_Lab_5_1.html#clusters-and-ground-truth", + "title": "21  Lab: Clustering and Ground Truth", + "section": "21.3 Clusters and Ground Truth", + "text": "21.3 Clusters and Ground Truth\nNow that we have created three clusters, we may ask ourselves: Do our cluster labels match our ground truth? Did our cluster model capture reality?\n\nct = pd.crosstab(df['Three clusters'], df['Class label'])\nct\n\n\n\n\n\n\n\nClass label\n1\n2\n3\n\n\nThree clusters\n\n\n\n\n\n\n\n0\n0\n50\n19\n\n\n1\n46\n1\n0\n\n\n2\n13\n20\n29\n\n\n\n\n\n\n\nIt might be easier to see as a stacked plot (see this post).\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nct.plot.bar(stacked=True)\nplt.legend(title='Class label')\n\n<matplotlib.legend.Legend at 0x1798f3e50>\n\n\n\n\n\nHow has the kmeans model done compared to our ground truth?\n\n\n\n\n\n\nImportant\n\n\n\nWe need to be really careful here. We notice that it is not easily possible to compare the known class labels to clustering labels. The reason is that the clustering algorithm labels are just arbitrary and not assigned to any deterministic criteria. Each time you run the algorithm, you might get a different id for the labels. The reason is that the label itself doesn’t actually mean anything, what is important is the list of items that are in the same cluster and their relations.\n\n\n\n21.3.1 Principal Components Analysis\nA way to come over this ambiguity and evaluate the results is to look at a visualisations of the results and compare. But this brings in the question of what type of visualisation to use for looking at the clusters. An immediate alternative is to use scatterplots. 
However, it is not clear which axis to use for clustering. A common method to apply at this stage is to make use of PCA to get a 2D plane where we can project the data points and visualise them over this projection.\n\ndf.iloc[:,1:14]\n\n\n\n\n\n\n\n\nAlcohol\nMalic acid\nAsh\nAlcalinity of ash\nMagnesium\nTotal phenols\nFlavanoids\nNonflavanoid phenols\nProanthocyanins\nColor intensity\nHue\nOD280/OD315 of diluted wines\nProline\n\n\n\n\n0\n14.23\n1.71\n2.43\n15.6\n127\n2.80\n3.06\n0.28\n2.29\n5.64\n1.04\n3.92\n1065\n\n\n1\n13.20\n1.78\n2.14\n11.2\n100\n2.65\n2.76\n0.26\n1.28\n4.38\n1.05\n3.40\n1050\n\n\n2\n13.16\n2.36\n2.67\n18.6\n101\n2.80\n3.24\n0.30\n2.81\n5.68\n1.03\n3.17\n1185\n\n\n3\n14.37\n1.95\n2.50\n16.8\n113\n3.85\n3.49\n0.24\n2.18\n7.80\n0.86\n3.45\n1480\n\n\n4\n13.24\n2.59\n2.87\n21.0\n118\n2.80\n2.69\n0.39\n1.82\n4.32\n1.04\n2.93\n735\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n173\n13.71\n5.65\n2.45\n20.5\n95\n1.68\n0.61\n0.52\n1.06\n7.70\n0.64\n1.74\n740\n\n\n174\n13.40\n3.91\n2.48\n23.0\n102\n1.80\n0.75\n0.43\n1.41\n7.30\n0.70\n1.56\n750\n\n\n175\n13.27\n4.28\n2.26\n20.0\n120\n1.59\n0.69\n0.43\n1.35\n10.20\n0.59\n1.56\n835\n\n\n176\n13.17\n2.59\n2.37\n20.0\n120\n1.65\n0.68\n0.53\n1.46\n9.30\n0.60\n1.62\n840\n\n\n177\n14.13\n4.10\n2.74\n24.5\n96\n2.05\n0.76\n0.56\n1.35\n9.20\n0.61\n1.60\n560\n\n\n\n\n178 rows × 13 columns\n\n\n\n\nfrom sklearn.decomposition import PCA\n\nn_components = 2\n\npca = PCA(n_components=n_components)\ndf_pca = pca.fit(df.iloc[:,1:14])\ndf_pca_vals = df_pca.transform(df.iloc[:,1:14])\n\nGrab our projections and plot along with our cluster names.\n\ndf['c1'] = [item[0] for item in df_pca_vals]\ndf['c2'] = [item[1] for item in df_pca_vals]\n\nax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Class label')\nax.set_title('Known labels visualised over PCs')\n\nText(0.5, 1.0, 'Known labels visualised over PCs')\n\n\n\n\nIn the figure above, we colored the points based on the actual labels, and we observe that there have been several misclassifications (i.e., in the algorithm’s results). So one may choose to use an alternative algorithm or devise a better distance metric.\n\nax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')\nax.set_title('Results of the algorithm visualised over PCs')\n\nText(0.5, 1.0, 'Results of the algorithm visualised over PCs')\n\n\n\n\nThis shows the parallelism between the clustering algorithm and PCA. By looking at the PCA loadings, we can find out what the x-axis means and try to interpret the clusters (we leave this as an additional exercise for those interested).\nHow might you interpret the above plots? Did the kmeans model identify the ground truth?\nHow robust is our clustering? It may be that the kmeans algorithm became stuck or that a few outliers have biased the clustering.\nTwo ways to check are:\n\nRunning the model multiple times with different initial values.\nRemoving some data and rerunning the model multiple times.\n\nRun the cell below a few times. 
What do you see?\n\n# Create a KMeans instance with k clusters: model\nk_means = KMeans(n_clusters=3, init='random', n_init = 10)\n\n# Fit model to samples\ndf_k_means = k_means.fit(df.iloc[:,1:14])\n\ndf['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:14].values), index = df.index)\n\nax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')\nax.set_title('Results of the algorithm visualised over PCs')\n\nText(0.5, 1.0, 'Results of the algorithm visualised over PCs')\n\n\n\n\n\nHow about with only 80% of the data?\n\ndf_sample = df.sample(frac=0.8, replace=False)\n\n# Create a KMeans instance with k clusters: model\nk_means = KMeans(n_clusters=3, init='random', n_init = 10)\n\n# Fit model to samples\ndf_k_means = k_means.fit(df_sample.iloc[:,1:14])\n\ndf_sample['Three clusters'] = pd.Series(df_k_means.predict(df_sample.iloc[:,1:14].values), index = df_sample.index)\n\nax = sns.scatterplot(data = df_sample, x = 'c1', y = 'c2', hue = 'Three clusters')\nax.set_title('Results of the algorithm visualised over PCs')\n\nText(0.5, 1.0, 'Results of the algorithm visualised over PCs')\n\n\n\n\n\nWe may want to automate the process of resampling the data or rerunning the model then perhaps plotting the different inertia values or creating different plots.\nDo you think our clustering algorithm is stable and provide similiar results even when some data is removed or the initial values are random?\nIf so, then is our algorithm capturing the ground truth?\n\n\n\n\nCortez, Paulo, A Cerdeira, F Almeida, T Matos, and J. Reis. 2009. “Wine Quality.” UCI Machine Learning Repository. https://doi.org/10.24432/C56S3T." }, { "objectID": "content/labs/Lab_5/IM939_Lab_5_2.html", "href": "content/labs/Lab_5/IM939_Lab_5_2.html", "title": "22  Lab: Cross validation", "section": "", - "text": "Details of the crime dataset are here.\nWe are going to examine the data, fit and then cross-validate a regression model.\n\nimport pandas as pd\ndf = pd.read_csv('data/censusCrimeClean.csv')\ndf.head()\n\n\n\n\n\n\n\n\ncommunityname\nfold\npopulation\nhouseholdsize\nracepctblack\nracePctWhite\nracePctAsian\nracePctHisp\nagePct12t21\nagePct12t29\n...\nNumStreet\nPctForeignBorn\nPctBornSameState\nPctSameHouse85\nPctSameCity85\nPctSameState85\nLandArea\nPopDens\nPctUsePubTrans\nViolentCrimesPerPop\n\n\n\n\n0\nLakewoodcity\n1\n0.19\n0.33\n0.02\n0.90\n0.12\n0.17\n0.34\n0.47\n...\n0.0\n0.12\n0.42\n0.50\n0.51\n0.64\n0.12\n0.26\n0.20\n0.20\n\n\n1\nTukwilacity\n1\n0.00\n0.16\n0.12\n0.74\n0.45\n0.07\n0.26\n0.59\n...\n0.0\n0.21\n0.50\n0.34\n0.60\n0.52\n0.02\n0.12\n0.45\n0.67\n\n\n2\nAberdeentown\n1\n0.00\n0.42\n0.49\n0.56\n0.17\n0.04\n0.39\n0.47\n...\n0.0\n0.14\n0.49\n0.54\n0.67\n0.56\n0.01\n0.21\n0.02\n0.43\n\n\n3\nWillingborotownship\n1\n0.04\n0.77\n1.00\n0.08\n0.12\n0.10\n0.51\n0.50\n...\n0.0\n0.19\n0.30\n0.73\n0.64\n0.65\n0.02\n0.39\n0.28\n0.12\n\n\n4\nBethlehemtownship\n1\n0.01\n0.55\n0.02\n0.95\n0.09\n0.05\n0.38\n0.38\n...\n0.0\n0.11\n0.72\n0.64\n0.61\n0.53\n0.04\n0.09\n0.02\n0.03\n\n\n\n\n5 rows × 102 columns\n\n\n\nOne hundred features. Too many for us to visualise at once.\nInstead, we can pick out particular variables and carry out a linear regression. To make our work simple we will look at ViolentCrimesPerPop as our dependent variable and medIncome as our indpendent variable.\nWe may wonder if there is more violent crime in low income areas.\nLet us create a new dataframe containing our regression variables. 
We do not have to do this I find it makes our work clearer.\n\ndf_reg = df[['communityname', 'medIncome', 'ViolentCrimesPerPop']]\ndf_reg\n\n\n\n\n\n\n\n\ncommunityname\nmedIncome\nViolentCrimesPerPop\n\n\n\n\n0\nLakewoodcity\n0.37\n0.20\n\n\n1\nTukwilacity\n0.31\n0.67\n\n\n2\nAberdeentown\n0.30\n0.43\n\n\n3\nWillingborotownship\n0.58\n0.12\n\n\n4\nBethlehemtownship\n0.50\n0.03\n\n\n...\n...\n...\n...\n\n\n1989\nTempleTerracecity\n0.42\n0.09\n\n\n1990\nSeasidecity\n0.28\n0.45\n\n\n1991\nWaterburytown\n0.31\n0.23\n\n\n1992\nWalthamcity\n0.44\n0.19\n\n\n1993\nOntariocity\n0.40\n0.48\n\n\n\n\n1994 rows × 3 columns\n\n\n\nPlot our data (a nice page on plotting regressions with seaborn is here).\n\nimport seaborn as sns\nsns.jointplot(data = df[['medIncome', 'ViolentCrimesPerPop']], \n x = 'ViolentCrimesPerPop', \n y = 'medIncome', kind='reg',\n marker = '.')\n\n\n\n\nWe may want to z-transform or log these scores as they are heavily skewed.\n\nimport numpy as np\n\n# some values are 0 so 0.1 is added to prevent log giving us infinity\n# there may be a better way to do this!\ndf_reg.loc[:, 'ViolentCrimesPerPop_log'] = np.log(df_reg['ViolentCrimesPerPop'] + 0.1)\ndf_reg.loc[:,'medIncome_log'] = np.log(df_reg['medIncome'] + 0.1)\n\n\ndf_reg\n\n\n\n\n\n\n\n\ncommunityname\nmedIncome\nViolentCrimesPerPop\nViolentCrimesPerPop_log\nmedIncome_log\n\n\n\n\n0\nLakewoodcity\n0.37\n0.20\n-1.203973\n-0.755023\n\n\n1\nTukwilacity\n0.31\n0.67\n-0.261365\n-0.891598\n\n\n2\nAberdeentown\n0.30\n0.43\n-0.634878\n-0.916291\n\n\n3\nWillingborotownship\n0.58\n0.12\n-1.514128\n-0.385662\n\n\n4\nBethlehemtownship\n0.50\n0.03\n-2.040221\n-0.510826\n\n\n...\n...\n...\n...\n...\n...\n\n\n1989\nTempleTerracecity\n0.42\n0.09\n-1.660731\n-0.653926\n\n\n1990\nSeasidecity\n0.28\n0.45\n-0.597837\n-0.967584\n\n\n1991\nWaterburytown\n0.31\n0.23\n-1.108663\n-0.891598\n\n\n1992\nWalthamcity\n0.44\n0.19\n-1.237874\n-0.616186\n\n\n1993\nOntariocity\n0.40\n0.48\n-0.544727\n-0.693147\n\n\n\n\n1994 rows × 5 columns\n\n\n\n\nimport seaborn as sns\nsns.jointplot(data = df_reg[['medIncome_log', 'ViolentCrimesPerPop_log']], \n x = 'ViolentCrimesPerPop_log', \n y = 'medIncome_log', kind='reg',\n marker = '.')\n\n\n\n\nIs log transforming our variables the right thing to do here?\nFit our regression to the log transformed data.\n\nimport matplotlib.pyplot as plt\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn import metrics\n\nx = df_reg[['ViolentCrimesPerPop_log']]\ny = df_reg[['medIncome_log']]\n\nmodel = LinearRegression()\nmodel.fit(x, y)\n\ny_hat = model.predict(x)\nplt.plot(x, y,'o', alpha = 0.5)\nplt.plot(x, y_hat, 'r', alpha = 0.5)\n\nplt.xlabel('Violent Crimes Per Population')\nplt.ylabel('Median Income')\n\nprint (\"MSE:\", metrics.mean_squared_error(y_hat, y))\nprint (\"R^2:\", metrics.r2_score(y, y_hat))\nprint (\"var:\", y.var())\n\nMSE: 0.1531885348757034\nR^2: 0.22763497704356928\nvar: medIncome_log 0.198436\ndtype: float64\n\n\n\n\n\nHas our log transformation distorted the pattern in the data?\n\nx = df_reg[['ViolentCrimesPerPop']]\ny = df_reg[['medIncome']]\n\nmodel = LinearRegression()\nmodel.fit(x, y)\n\ny_hat = model.predict(x)\nplt.plot(x, y,'o', alpha = 0.5)\nplt.plot(x, y_hat, 'r', alpha = 0.5)\n\nplt.xlabel('Violent Crimes Per Population')\nplt.ylabel('Median Income')\n\nprint (\"MSE:\", metrics.mean_squared_error(y_hat, y))\nprint (\"R^2:\", metrics.r2_score(y, y_hat))\nprint (\"var:\", y.var())\n\nMSE: 0.03592636778157073\nR^2: 0.17996313165549482\nvar: medIncome 0.043833\ndtype: 
float64\n\n\n\n\n\nWhat is the relationship between violent crime and median income? Why might this be?\nAssuming the log data is fine, have we overfit the model? Remember that a good model (which accurately models the relationship between violent crimes per population) need to be robust when faced with new data.\nKfold cross validation splits data into train and test subsets. We can then fit the regression to the training set and see how well it does for the test set.\n\nfrom sklearn.model_selection import KFold\n\nX = df_reg[['ViolentCrimesPerPop']]\ny = df_reg[['medIncome']]\n\n# get four splits, Each split contains a \n# test series and a train series.\nkf = KFold(n_splits=4)\n\n\n# lists to store our statistics\nr_vals = []\nMSEs = []\nmedIncome_coef = []\n\nfor train_index, test_index in kf.split(X):\n # fit our model and extract statistics\n model = LinearRegression()\n model.fit(X.iloc[train_index], y.iloc[train_index])\n y_hat = model.predict(X.iloc[test_index])\n \n MSEs.append(metrics.mean_squared_error(y.iloc[test_index], y_hat))\n medIncome_coef.append(model.coef_[0][0])\n r_vals.append(metrics.r2_score(y.iloc[test_index], y_hat))\n\n\ndata = {'MSE' : MSEs, 'medIncome coefficient' : medIncome_coef, 'r squared' : r_vals}\npd.DataFrame(data)\n\n\n\n\n\n\n\n\nMSE\nmedIncome coefficient\nr squared\n\n\n\n\n0\n0.035727\n-0.403609\n0.130479\n\n\n1\n0.035904\n-0.389344\n0.162820\n\n\n2\n0.040777\n-0.353379\n0.200139\n\n\n3\n0.032255\n-0.378883\n0.182403\n\n\n\n\n\n\n\nDoes our model produce similiar coefficients with subsets of the data?\nWe can do this using an inbuild sklearn function (see here).\n\nfrom sklearn.model_selection import cross_val_score\nx = df_reg[['ViolentCrimesPerPop']]\ny = df_reg[['medIncome']]\n\nmodel = LinearRegression()\nmodel.fit(x, y)\n\nprint(cross_val_score(model, x, y, cv=4))\n\n[0.13047946 0.16281953 0.20013867 0.18240261]\n\n\nWhat do these values tell us about our model and data?\nYou might want to carry out multiple regression with more than one predictor variable, or reduce the number of dimensions, or perhaps address different questions using a clustering algorithm instead with all or a subset of features." + "text": "Details of the crime dataset are here.\nWe are going to examine the data, fit and then cross-validate a regression model.\n\nimport pandas as pd\ndf = pd.read_csv('data/censusCrimeClean.csv')\ndf.head()\n\n\n\n\n\n\n\n\ncommunityname\nfold\npopulation\nhouseholdsize\nracepctblack\nracePctWhite\nracePctAsian\nracePctHisp\nagePct12t21\nagePct12t29\n...\nNumStreet\nPctForeignBorn\nPctBornSameState\nPctSameHouse85\nPctSameCity85\nPctSameState85\nLandArea\nPopDens\nPctUsePubTrans\nViolentCrimesPerPop\n\n\n\n\n0\nLakewoodcity\n1\n0.19\n0.33\n0.02\n0.90\n0.12\n0.17\n0.34\n0.47\n...\n0.0\n0.12\n0.42\n0.50\n0.51\n0.64\n0.12\n0.26\n0.20\n0.20\n\n\n1\nTukwilacity\n1\n0.00\n0.16\n0.12\n0.74\n0.45\n0.07\n0.26\n0.59\n...\n0.0\n0.21\n0.50\n0.34\n0.60\n0.52\n0.02\n0.12\n0.45\n0.67\n\n\n2\nAberdeentown\n1\n0.00\n0.42\n0.49\n0.56\n0.17\n0.04\n0.39\n0.47\n...\n0.0\n0.14\n0.49\n0.54\n0.67\n0.56\n0.01\n0.21\n0.02\n0.43\n\n\n3\nWillingborotownship\n1\n0.04\n0.77\n1.00\n0.08\n0.12\n0.10\n0.51\n0.50\n...\n0.0\n0.19\n0.30\n0.73\n0.64\n0.65\n0.02\n0.39\n0.28\n0.12\n\n\n4\nBethlehemtownship\n1\n0.01\n0.55\n0.02\n0.95\n0.09\n0.05\n0.38\n0.38\n...\n0.0\n0.11\n0.72\n0.64\n0.61\n0.53\n0.04\n0.09\n0.02\n0.03\n\n\n\n\n5 rows × 102 columns\n\n\n\nOne hundred features. 
Too many for us to visualise at once.\nInstead, we can pick out particular variables and carry out a linear regression. To make our work simple we will look at ViolentCrimesPerPop as our dependent variable and medIncome as our indpendent variable.\nWe may wonder if there is more violent crime in low income areas.\nLet us create a new dataframe containing our regression variables. We do not have to do this I find it makes our work clearer.\n\ndf_reg = df[['communityname', 'medIncome', 'ViolentCrimesPerPop']]\ndf_reg\n\n\n\n\n\n\n\n\ncommunityname\nmedIncome\nViolentCrimesPerPop\n\n\n\n\n0\nLakewoodcity\n0.37\n0.20\n\n\n1\nTukwilacity\n0.31\n0.67\n\n\n2\nAberdeentown\n0.30\n0.43\n\n\n3\nWillingborotownship\n0.58\n0.12\n\n\n4\nBethlehemtownship\n0.50\n0.03\n\n\n...\n...\n...\n...\n\n\n1989\nTempleTerracecity\n0.42\n0.09\n\n\n1990\nSeasidecity\n0.28\n0.45\n\n\n1991\nWaterburytown\n0.31\n0.23\n\n\n1992\nWalthamcity\n0.44\n0.19\n\n\n1993\nOntariocity\n0.40\n0.48\n\n\n\n\n1994 rows × 3 columns\n\n\n\nPlot our data (a nice page on plotting regressions with seaborn is here).\n\nimport seaborn as sns\nsns.jointplot(data = df[['medIncome', 'ViolentCrimesPerPop']], \n x = 'ViolentCrimesPerPop', \n y = 'medIncome', kind='reg',\n marker = '.')\n\n\n\n\nWe may want to z-transform or log these scores as they are heavily skewed.\n\nimport numpy as np\n\n# some values are 0 so 0.1 is added to prevent log giving us infinity\n# there may be a better way to do this!\ndf_reg.loc[:, 'ViolentCrimesPerPop_log'] = np.log(df_reg['ViolentCrimesPerPop'] + 0.1)\ndf_reg.loc[:,'medIncome_log'] = np.log(df_reg['medIncome'] + 0.1)\n\n/var/folders/7v/zl9mv52s3ls94kntlt_l9ryh0000gq/T/ipykernel_13528/3488182522.py:5: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n df_reg.loc[:, 'ViolentCrimesPerPop_log'] = np.log(df_reg['ViolentCrimesPerPop'] + 0.1)\n/var/folders/7v/zl9mv52s3ls94kntlt_l9ryh0000gq/T/ipykernel_13528/3488182522.py:6: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n df_reg.loc[:,'medIncome_log'] = np.log(df_reg['medIncome'] + 0.1)\n\n\n\ndf_reg\n\n\n\n\n\n\n\n\ncommunityname\nmedIncome\nViolentCrimesPerPop\nViolentCrimesPerPop_log\nmedIncome_log\n\n\n\n\n0\nLakewoodcity\n0.37\n0.20\n-1.203973\n-0.755023\n\n\n1\nTukwilacity\n0.31\n0.67\n-0.261365\n-0.891598\n\n\n2\nAberdeentown\n0.30\n0.43\n-0.634878\n-0.916291\n\n\n3\nWillingborotownship\n0.58\n0.12\n-1.514128\n-0.385662\n\n\n4\nBethlehemtownship\n0.50\n0.03\n-2.040221\n-0.510826\n\n\n...\n...\n...\n...\n...\n...\n\n\n1989\nTempleTerracecity\n0.42\n0.09\n-1.660731\n-0.653926\n\n\n1990\nSeasidecity\n0.28\n0.45\n-0.597837\n-0.967584\n\n\n1991\nWaterburytown\n0.31\n0.23\n-1.108663\n-0.891598\n\n\n1992\nWalthamcity\n0.44\n0.19\n-1.237874\n-0.616186\n\n\n1993\nOntariocity\n0.40\n0.48\n-0.544727\n-0.693147\n\n\n\n\n1994 rows × 5 columns\n\n\n\n\nimport seaborn as sns\nsns.jointplot(data = df_reg[['medIncome_log', 'ViolentCrimesPerPop_log']], \n x = 'ViolentCrimesPerPop_log', \n y = 'medIncome_log', kind='reg',\n marker = '.')\n\n\n\n\nIs log transforming our variables the right thing to 
do here?\nFit our regression to the log-transformed data.\n\nimport matplotlib.pyplot as plt\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn import metrics\n\nx = df_reg[['ViolentCrimesPerPop_log']]\ny = df_reg[['medIncome_log']]\n\nmodel = LinearRegression()\nmodel.fit(x, y)\n\ny_hat = model.predict(x)\nplt.plot(x, y,'o', alpha = 0.5)\nplt.plot(x, y_hat, 'r', alpha = 0.5)\n\nplt.xlabel('Violent Crimes Per Population')\nplt.ylabel('Median Income')\n\nprint (\"MSE:\", metrics.mean_squared_error(y_hat, y))\nprint (\"R^2:\", metrics.r2_score(y, y_hat))\nprint (\"var:\", y.var())\n\nMSE: 0.1531885348757034\nR^2: 0.22763497704356928\nvar: medIncome_log 0.198436\ndtype: float64\n\n\n\n\nHas our log transformation distorted the pattern in the data?\n\nx = df_reg[['ViolentCrimesPerPop']]\ny = df_reg[['medIncome']]\n\nmodel = LinearRegression()\nmodel.fit(x, y)\n\ny_hat = model.predict(x)\nplt.plot(x, y,'o', alpha = 0.5)\nplt.plot(x, y_hat, 'r', alpha = 0.5)\n\nplt.xlabel('Violent Crimes Per Population')\nplt.ylabel('Median Income')\n\nprint (\"MSE:\", metrics.mean_squared_error(y_hat, y))\nprint (\"R^2:\", metrics.r2_score(y, y_hat))\nprint (\"var:\", y.var())\n\nMSE: 0.03592636778157073\nR^2: 0.17996313165549482\nvar: medIncome 0.043833\ndtype: float64\n\n\n\n\nWhat is the relationship between violent crime and median income? Why might this be?\nAssuming the log data is fine, have we overfit the model? Remember that a good model (which accurately models the relationship between violent crimes per population) needs to be robust when faced with new data.\nK-fold cross-validation splits data into train and test subsets. We can then fit the regression to the training set and see how well it does for the test set.\n\nfrom sklearn.model_selection import KFold\n\nX = df_reg[['ViolentCrimesPerPop']]\ny = df_reg[['medIncome']]\n\n# Get four splits. Each split contains a\n# set of test indices and a set of training indices.\nkf = KFold(n_splits=4)\n\n\n# lists to store our statistics\nr_vals = []\nMSEs = []\nmedIncome_coef = []\n\nfor train_index, test_index in kf.split(X):\n # fit our model and extract statistics\n model = LinearRegression()\n model.fit(X.iloc[train_index], y.iloc[train_index])\n y_hat = model.predict(X.iloc[test_index])\n \n MSEs.append(metrics.mean_squared_error(y.iloc[test_index], y_hat))\n medIncome_coef.append(model.coef_[0][0])\n r_vals.append(metrics.r2_score(y.iloc[test_index], y_hat))\n\n\ndata = {'MSE' : MSEs, 'medIncome coefficient' : medIncome_coef, 'r squared' : r_vals}\npd.DataFrame(data)\n\n\n\n\n\n\n\n\nMSE\nmedIncome coefficient\nr squared\n\n\n\n\n0\n0.035727\n-0.403609\n0.130479\n\n\n1\n0.035904\n-0.389344\n0.162820\n\n\n2\n0.040777\n-0.353379\n0.200139\n\n\n3\n0.032255\n-0.378883\n0.182403\n\n\n\n\n\n\n\nDoes our model produce similar coefficients with subsets of the data?\nWe can do this using an inbuilt sklearn function (see here).\n\nfrom sklearn.model_selection import cross_val_score\nx = df_reg[['ViolentCrimesPerPop']]\ny = df_reg[['medIncome']]\n\nmodel = LinearRegression()\nmodel.fit(x, y)\n\nprint(cross_val_score(model, x, y, cv=4))\n\n[0.13047946 0.16281953 0.20013867 0.18240261]\n\n\nWhat do these values tell us about our model and data?\nYou might want to carry out multiple regression with more than one predictor variable, or reduce the number of dimensions, or perhaps address different questions using a clustering algorithm instead with all or a subset of features."
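The closing suggestion can be cross-validated in exactly the same way. A minimal sketch of a multiple regression, assuming the full df loaded at the start of this lab is still in memory (the particular predictor columns chosen here are only illustrative):

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Several predictors rather than one; ViolentCrimesPerPop as the response.
X_multi = df[['medIncome', 'PctUsePubTrans', 'PopDens']]
y_multi = df['ViolentCrimesPerPop']

multi_model = LinearRegression()

# Four-fold cross-validated R^2 scores, comparable to the single-predictor model above.
print(cross_val_score(multi_model, X_multi, y_multi, cv=4))

Comparing these scores with the single-predictor scores above indicates whether the extra variables help the model generalise rather than simply fit the training data.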
}, { "objectID": "content/labs/Lab_5/IM939_Lab_5_3.html", diff --git a/sitemap.xml b/sitemap.xml index c213962..2afae71 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -2,186 +2,186 @@ https://warwickcim.github.io/IM939_handbook/index.html - 2023-10-31T18:05:28.182Z + 2023-11-02T15:00:40.086Z https://warwickcim.github.io/IM939_handbook/content/about/teaching_staff.html - 2023-10-31T18:05:28.188Z + 2023-11-02T15:00:40.094Z https://warwickcim.github.io/IM939_handbook/content/about/im939.html - 2023-10-31T18:05:28.194Z + 2023-11-02T15:00:40.102Z https://warwickcim.github.io/IM939_handbook/content/about/teaching_materials.html - 2023-10-31T18:05:28.204Z + 2023-11-02T15:00:40.111Z https://warwickcim.github.io/IM939_handbook/content/about/conventions.html - 2023-10-31T18:05:28.229Z + 2023-11-02T15:00:40.144Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-01.html - 2023-10-31T18:05:28.236Z + 2023-11-02T15:00:40.152Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_1/IM939_Lab_1_1.html - 2023-10-31T18:05:28.248Z + 2023-11-02T15:00:40.166Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_1/IM939_Lab_1_2.html - 2023-10-31T18:05:28.256Z + 2023-11-02T15:00:40.177Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_1/IM939_Lab_1_3.html - 2023-10-31T18:05:28.270Z + 2023-11-02T15:00:40.193Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-02.html - 2023-10-31T18:05:28.275Z + 2023-11-02T15:00:40.201Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_2/IM939_Lab_2_1.html - 2023-10-31T18:05:28.307Z + 2023-11-02T15:00:40.238Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_2/IM939_Lab_2_2.html - 2023-10-31T18:05:28.319Z + 2023-11-02T15:00:40.252Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_2/IM939_Lab_2_3.html - 2023-10-31T18:05:28.334Z + 2023-11-02T15:00:40.268Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_2/IM939_Lab_2_4.html - 2023-10-31T18:05:28.346Z + 2023-11-02T15:00:40.283Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-03.html - 2023-10-31T18:05:28.353Z + 2023-11-02T15:00:40.291Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_3/IM939_Lab_3_1_Data_Processing_and_Summarization.html - 2023-10-31T18:05:28.380Z + 2023-11-02T15:00:40.324Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_3/IM939_Lab_3_2_Linear_Regression.html - 2023-10-31T18:05:28.399Z + 2023-11-02T15:00:40.346Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html - 2023-10-31T18:05:28.409Z + 2023-11-02T15:00:40.357Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-04.html - 2023-10-31T18:05:28.415Z + 2023-11-02T15:00:40.366Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_4/IM939_Session-04_PCA_playground.html - 2023-10-31T18:05:28.426Z + 2023-11-02T15:00:40.380Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_4/IM939_Lab_4_1_Iris.html - 2023-10-31T18:05:28.462Z + 2023-11-02T15:00:40.425Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_4/IM939_Lab_4_2_Crime.html - 2023-10-31T18:05:28.483Z + 2023-11-02T15:00:40.450Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_4/IM939_Lab_4_Exercises.html - 2023-10-31T18:05:28.496Z + 2023-11-02T15:00:40.465Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-05.html - 2023-10-31T18:05:28.501Z + 2023-11-02T15:00:40.472Z 
https://warwickcim.github.io/IM939_handbook/content/labs/Lab_5/week5_recap.html - 2023-10-31T18:05:28.509Z + 2023-11-02T15:00:40.480Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_5/IM939_Lab_5_1.html - 2023-10-31T18:05:28.521Z + 2023-11-02T15:00:40.506Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_5/IM939_Lab_5_2.html - 2023-10-31T18:05:28.535Z + 2023-11-02T15:00:40.523Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_5/IM939_Lab_5_3.html - 2023-10-31T18:05:28.551Z + 2023-11-02T15:00:40.543Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_5/IM939_lab_5_Exercise.html - 2023-10-31T18:05:28.557Z + 2023-11-02T15:00:40.549Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-06.html - 2023-10-31T18:05:28.563Z + 2023-11-02T15:00:40.557Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_6/IM939_Lab_6_1-illusions.html - 2023-10-31T18:05:28.578Z + 2023-11-02T15:00:40.575Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_6/IM939_Lab_6_2-AxisManipulation.html - 2023-10-31T18:05:28.595Z + 2023-11-02T15:00:40.596Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_6/IM939_Lab_6_3-Choropleths.html - 2023-10-31T18:05:28.804Z + 2023-11-02T15:00:40.834Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_6/IM939_Lab_6_4-Exercises.html - 2023-10-31T18:05:28.806Z + 2023-11-02T15:00:40.837Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_6/IM939_Lab_6_5-Simpsons_Paradox.html - 2023-10-31T18:05:28.828Z + 2023-11-02T15:00:40.864Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-07.html - 2023-10-31T18:05:28.834Z + 2023-11-02T15:00:40.871Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_7/IM939_Lab7-Part1.html - 2023-10-31T18:05:28.915Z + 2023-11-02T15:00:40.970Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_7/IM939_Lab7-Part2.html - 2023-10-31T18:05:28.927Z + 2023-11-02T15:00:40.986Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_7/IM939_Lab7-Part3.html - 2023-10-31T18:05:28.938Z + 2023-11-02T15:00:41.001Z https://warwickcim.github.io/IM939_handbook/content/labs/Lab_7/IM939_Lab7-Simpsons_Paradox2.html - 2023-10-31T18:05:28.959Z + 2023-11-02T15:00:41.028Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-08.html - 2023-10-31T18:05:28.967Z + 2023-11-02T15:00:41.037Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-08_WorkshopBrief.html - 2023-10-31T18:05:28.974Z + 2023-11-02T15:00:41.046Z https://warwickcim.github.io/IM939_handbook/content/sessions/session-09.html - 2023-10-31T18:05:28.979Z + 2023-11-02T15:00:41.054Z https://warwickcim.github.io/IM939_handbook/content/references.html - 2023-10-31T18:05:28.987Z + 2023-11-02T15:00:41.063Z https://warwickcim.github.io/IM939_handbook/content/labs/labs_setup.html - 2023-10-31T18:05:28.995Z + 2023-11-02T15:00:41.072Z https://warwickcim.github.io/IM939_handbook/content/files-and-folders.html - 2023-10-31T18:05:29.000Z + 2023-11-02T15:00:41.079Z