import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import seaborn as sns
import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
diff --git a/.nojekyll b/.nojekyll index a502733..fc8afbc 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -eb45d49f \ No newline at end of file +da494773 \ No newline at end of file diff --git a/content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html b/content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html index 586500d..86367af 100644 --- a/content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html +++ b/content/labs/Lab_3/IM939_Lab_3_Exercise_Linear_Regression.html @@ -580,7 +580,7 @@
Now it’s your turn to prepare a linear regression model.
Let’s check the data, their distribution and central tendencies
shape: (1599, 12)
Use the lmplot() function from Seaborn to explore the linear relationship. Input data must be in a Pandas DataFrame. To plot them, we provide the predictor and response variable names along with the dataset.
Did you find outliers or missing data? You can use the function np.unique to find the unique elements of an array.
Do you need to remove any cases?
Did you need to standardize data?
If you standardized data, try to plot them again
@@ -757,47 +764,47 @@SyntaxError: invalid syntax (987973612.py, line 2)
using Scikit-learn, build a simple linear regression (OLS)
from sklearn.linear_model import LinearRegression
-
-est = LinearRegression(fit_intercept = True)
-
-x = wine[['???']]
-y = wine[['???']]
-
-est.fit(x, y)
-
-print("Coefficients:", est.coef_)
-print ("Intercept:", est.intercept_)
from sklearn.linear_model import LinearRegression
+
+est = LinearRegression(fit_intercept = True)
+
+x = wine[['???']]
+y = wine[['???']]
+
+est.fit(x, y)
+
+print("Coefficients:", est.coef_)
+print ("Intercept:", est.intercept_)
KeyError: "None of [Index(['???'], dtype='object')] are in the [columns]"
What is the model’s mean squared error (\(MSE\)) and the coefficient of determination (\(R^2\)) ?
from sklearn import metrics
-
-# Analysis for all months together.
-x = wdi[['???']]
-y = wdi[['???']]
-model = LinearRegression()
-model.fit(x, y)
-y_hat = model.predict(x)
-plt.plot(x, y,'o', alpha = 0.5)
-plt.plot(x, y_hat, 'r', alpha = 0.5)
-plt.xlabel('?')
-plt.ylabel('?')
-print ("MSE:", metrics.mean_squared_error(y_hat, y))
-print ("R^2:", metrics.r2_score(y_hat, y))
-print ("var:", y.var())
-plt.savefig("?.png", dpi = 300, bbox_inches = 'tight')
from sklearn import metrics
+
+# Analysis for all months together.
+x = wdi[['???']]
+y = wdi[['???']]
+model = LinearRegression()
+model.fit(x, y)
+y_hat = model.predict(x)
+plt.plot(x, y,'o', alpha = 0.5)
+plt.plot(x, y_hat, 'r', alpha = 0.5)
+plt.xlabel('?')
+plt.ylabel('?')
+print ("MSE:", metrics.mean_squared_error(y_hat, y))
+print ("R^2:", metrics.r2_score(y_hat, y))
+print ("var:", y.var())
+plt.savefig("?.png", dpi = 300, bbox_inches = 'tight')
NameError: name 'wdi' is not defined
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,
- 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0,
- 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2], dtype=int32)
+array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
+ 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
+ 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0], dtype=int32)
Each row has been assigned a label.
@@ -1689,7 +1689,7 @@array([[0.70726496, 0.4508547 , 0.79704476, 0.82478632],
- [0.19611111, 0.595 , 0.07830508, 0.06083333],
- [0.44125683, 0.30737705, 0.57571548, 0.54918033]])
+array([[0.44125683, 0.30737705, 0.57571548, 0.54918033],
+ [0.70726496, 0.4508547 , 0.79704476, 0.82478632],
+ [0.19611111, 0.595 , 0.07830508, 0.06083333]])
It is tricky to plot these using seaborn but we can use a normal matplotlib scatter plot.
@@ -1851,7 +1851,7 @@<matplotlib.collections.PathCollection at 0x14fe387d0>
+<matplotlib.collections.PathCollection at 0x162e90390>
4.58977540011789
+4.580948640117293
It looks like our k = 5 model captures the data well. Inertia is defined in the sklearn documentation as the sum of squared distances of samples to their closest cluster center.
@@ -2315,9 +2315,9 @@Three seems ok. We clearly want no more than three.
@@ -2878,10 +2878,10 @@sepal length (cm) 32
-sepal width (cm) 34
-petal length (cm) 37
-petal width (cm) 29
+sepal length (cm) 29
+sepal width (cm) 21
+petal length (cm) 32
+petal width (cm) 21
dtype: int64
<Axes: xlabel='c1', ylabel='c2'>
array([[-0.91235845, 0.02968512, -0.38161438, -0.14522853],
- [-0.39939351, 0.05086373, 0.90393389, 0.14422629]])
+array([[-0.86129917, 0.04084996, -0.48641492, -0.14105157],
+ [-0.50682662, -0.04550418, 0.84286268, 0.175039 ]])
array([2.68417915, 0.33506061])
+array([3.01818399, 0.26633671])
array([[ 0.33775908, -0.04345744, 0.87824143, 0.33574133],
- [ 0.82803166, 0.20108365, -0.42517727, 0.30521014]])
+array([[ 0.31417904, -0.06487468, 0.88369345, 0.34083528],
+ [ 0.89110506, 0.17000084, -0.37665661, 0.18751344]])
We are going to use the Wine Quality Dataset from Cortez et al. (2009) that you may be familiar with by now (but if you aren’t, you can find more information about it here: https://doi.org/10.24432/C56S3T).
As usual, we will start by looking at our data, and making transformations, if needed.
-Look at our data.
++ | Class label | +Alcohol | +Malic acid | +Ash | +Alcalinity of ash | +Magnesium | +Total phenols | +Flavanoids | +Nonflavanoid phenols | +Proanthocyanins | +Color intensity | +Hue | +OD280/OD315 of diluted wines | +Proline | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | +1 | +14.23 | +1.71 | +2.43 | +15.6 | +127 | +2.80 | +3.06 | +0.28 | +2.29 | +5.64 | +1.04 | +3.92 | +1065 | +
1 | +1 | +13.20 | +1.78 | +2.14 | +11.2 | +100 | +2.65 | +2.76 | +0.26 | +1.28 | +4.38 | +1.05 | +3.40 | +1050 | +
2 | +1 | +13.16 | +2.36 | +2.67 | +18.6 | +101 | +2.80 | +3.24 | +0.30 | +2.81 | +5.68 | +1.03 | +3.17 | +1185 | +
3 | +1 | +14.37 | +1.95 | +2.50 | +16.8 | +113 | +3.85 | +3.49 | +0.24 | +2.18 | +7.80 | +0.86 | +3.45 | +1480 | +
4 | +1 | +13.24 | +2.59 | +2.87 | +21.0 | +118 | +2.80 | +2.69 | +0.39 | +1.82 | +4.32 | +1.04 | +2.93 | +735 | +
There is a column called Class label
that gives us the ground truth. The wines come from three different cultivars. Knowing the actual grouping helps us to identify how well our methods can capture this ground truth.
Following the data wrangling process that was summarised in Chapter 20, we should first get a sense of our data.
- -As you can see no variable has any missing data, but the scales of our features vary (e.g., Magnesium
is in the 100s whereas Hue
is in the low single digits).
Let’s visually inspect how features are distributed using a violin plot:
-Regretfully, this is not very useful right now, due to the different scales that we detected previously. In this case, it makes sense to normalise our data.
-#create seaborn violin plot
-my_plot = sns.violinplot(data = df_long, x = 'variable', y = 'value')
-
-#rotate x-axis labels
-my_plot.set_xticklabels(my_plot.get_xticklabels(), rotation=90)
There is a column called Class label that gives us the ground truth. The wines come from three different cultivars. Knowing the actual grouping helps us to identify how well our methods can capture this ground truth.
+Following our process above, we should first get a sense of our data.
++ | Class label | +Alcohol | +Malic acid | +Ash | +Alcalinity of ash | +Magnesium | +Total phenols | +Flavanoids | +Nonflavanoid phenols | +Proanthocyanins | +Color intensity | +Hue | +OD280/OD315 of diluted wines | +Proline | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | +178.000000 | +178.000000 | +178.000000 | +178.000000 | +178.000000 | +178.000000 | +178.000000 | +178.000000 | +178.000000 | +178.000000 | +178.000000 | +178.000000 | +178.000000 | +178.000000 | +
mean | +1.938202 | +13.000618 | +2.336348 | +2.366517 | +19.494944 | +99.741573 | +2.295112 | +2.029270 | +0.361854 | +1.590899 | +5.058090 | +0.957449 | +2.611685 | +746.893258 | +
std | +0.775035 | +0.811827 | +1.117146 | +0.274344 | +3.339564 | +14.282484 | +0.625851 | +0.998859 | +0.124453 | +0.572359 | +2.318286 | +0.228572 | +0.709990 | +314.907474 | +
min | +1.000000 | +11.030000 | +0.740000 | +1.360000 | +10.600000 | +70.000000 | +0.980000 | +0.340000 | +0.130000 | +0.410000 | +1.280000 | +0.480000 | +1.270000 | +278.000000 | +
25% | +1.000000 | +12.362500 | +1.602500 | +2.210000 | +17.200000 | +88.000000 | +1.742500 | +1.205000 | +0.270000 | +1.250000 | +3.220000 | +0.782500 | +1.937500 | +500.500000 | +
50% | +2.000000 | +13.050000 | +1.865000 | +2.360000 | +19.500000 | +98.000000 | +2.355000 | +2.135000 | +0.340000 | +1.555000 | +4.690000 | +0.965000 | +2.780000 | +673.500000 | +
75% | +3.000000 | +13.677500 | +3.082500 | +2.557500 | +21.500000 | +107.000000 | +2.800000 | +2.875000 | +0.437500 | +1.950000 | +6.200000 | +1.120000 | +3.170000 | +985.000000 | +
max | +3.000000 | +14.830000 | +5.800000 | +3.230000 | +30.000000 | +162.000000 | +3.880000 | +5.080000 | +0.660000 | +3.580000 | +13.000000 | +1.710000 | +4.000000 | +1680.000000 | +
No missing data. The scales of our features vary (e.g., Magnesium is in the 100s whereas Hue is in the low single digits).
+How about our feature distributions?
+ +<Axes: xlabel='variable', ylabel='value'>
+Makes sense to normalise our data.
+from sklearn.preprocessing import MinMaxScaler
+
+# create a scaler object
+scaler = MinMaxScaler()
+
+# fit and transform the data
+df_norm = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)
+
+df_long = df_norm.melt(id_vars='Class label')
+df_long
+ | Class label | +variable | +value | +
---|---|---|---|
0 | +0.0 | +Alcohol | +0.842105 | +
1 | +0.0 | +Alcohol | +0.571053 | +
2 | +0.0 | +Alcohol | +0.560526 | +
3 | +0.0 | +Alcohol | +0.878947 | +
4 | +0.0 | +Alcohol | +0.581579 | +
... | +... | +... | +... | +
2309 | +1.0 | +Proline | +0.329529 | +
2310 | +1.0 | +Proline | +0.336662 | +
2311 | +1.0 | +Proline | +0.397290 | +
2312 | +1.0 | +Proline | +0.400856 | +
2313 | +1.0 | +Proline | +0.201141 | +
2314 rows × 3 columns
+#create seaborn violin plot
+my_plot = sns.violinplot(data = df_long, x = 'variable', y = 'value')
+
+#rotate x-axis labels
+my_plot.set_xticklabels(my_plot.get_xticklabels(), rotation=90)
[Text(0, 0, 'Alcohol'),
+ Text(1, 0, 'Malic acid'),
+ Text(2, 0, 'Ash'),
+ Text(3, 0, 'Alcalinity of ash'),
+ Text(4, 0, 'Magnesium'),
+ Text(5, 0, 'Total phenols'),
+ Text(6, 0, 'Flavanoids'),
+ Text(7, 0, 'Nonflavanoid phenols'),
+ Text(8, 0, 'Proanthocyanins'),
+ Text(9, 0, 'Color intensity'),
+ Text(10, 0, 'Hue'),
+ Text(11, 0, 'OD280/OD315 of diluted wines'),
+ Text(12, 0, 'Proline ')]
+Are there any patterns?
How about a pairplot?
-Hmm, a few interesting correlations. Some of our variables are skewed. We could apply some PCA here to look at fewer dimension or even log transform some of the skewed variables.
For now we will just run a kmeans cluster and then check our results against the ground truth.
-Lets decide how many clusters we need.
-from sklearn.cluster import KMeans
-
-ks = range(1, 10)
-inertias = []
-for k in ks:
- # Create a KMeans instance with k clusters: model
- model = KMeans(n_clusters=k, n_init = 10)
-
- # Fit model to samples
- model.fit(df.iloc[:,1:])
-
- # Append the inertia to the list of inertias
- inertias.append(model.inertia_)
-
-import matplotlib.pyplot as plt
-
-plt.plot(ks, inertias, '-o', color='black')
-plt.xlabel('number of clusters, k')
-plt.ylabel('inertia')
-plt.xticks(ks)
-plt.show()
from sklearn.cluster import KMeans
+
+ks = range(1, 10)
+inertias = []
+for k in ks:
+ # Create a KMeans instance with k clusters: model
+ model = KMeans(n_clusters=k, n_init = 10)
+
+ # Fit model to samples
+ model.fit(df.iloc[:,1:])
+
+ # Append the inertia to the list of inertias
+ inertias.append(model.inertia_)
+
+import matplotlib.pyplot as plt
+
+plt.plot(ks, inertias, '-o', color='black')
+plt.xlabel('number of clusters, k')
+plt.ylabel('inertia')
+plt.xticks(ks)
+plt.show()
What happens if we use the normalised data instead?
-from sklearn.cluster import KMeans
-
-ks = range(1, 10)
-inertias = []
-for k in ks:
- # Create a KMeans instance with k clusters: model
- model = KMeans(n_clusters=k, n_init = 10)
-
- # Fit model to samples
- model.fit(df_norm.iloc[:,1:])
-
- # Append the inertia to the list of inertias
- inertias.append(model.inertia_)
-
-import matplotlib.pyplot as plt
-
-plt.plot(ks, inertias, '-o', color='black')
-plt.xlabel('number of clusters, k')
-plt.ylabel('inertia')
-plt.xticks(ks)
-plt.show()
from sklearn.cluster import KMeans
+
+ks = range(1, 10)
+inertias = []
+for k in ks:
+ # Create a KMeans instance with k clusters: model
+ model = KMeans(n_clusters=k, n_init = 10)
+
+ # Fit model to samples
+ model.fit(df_norm.iloc[:,1:])
+
+ # Append the inertia to the list of inertias
+ inertias.append(model.inertia_)
+
+import matplotlib.pyplot as plt
+
+plt.plot(ks, inertias, '-o', color='black')
+plt.xlabel('number of clusters, k')
+plt.ylabel('inertia')
+plt.xticks(ks)
+plt.show()
Three clusters seems about right (and matches our number of original labels).
-Now, we are going to calculate three clusters and store each observation’s cluster labels into a variable within the original dataframe:
-# Create a KMeans instance with k clusters: model
-k_means = KMeans(n_clusters=3)
-
-# Fit model to samples
-df_k_means = k_means.fit(df.iloc[:,1:])
-
-# Create a new variable with the fited cluster label.
-df['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:].values), index = df.index)
-df
# Create a KMeans instance with k clusters: model
+k_means = KMeans(n_clusters=3)
+
+# Fit model to samples
+df_k_means = k_means.fit(df.iloc[:,1:])
+
+df['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:].values), index = df.index)
+df
+ | Class label | +Alcohol | +Malic acid | +Ash | +Alcalinity of ash | +Magnesium | +Total phenols | +Flavanoids | +Nonflavanoid phenols | +Proanthocyanins | +Color intensity | +Hue | +OD280/OD315 of diluted wines | +Proline | +Three clusters | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | +1 | +14.23 | +1.71 | +2.43 | +15.6 | +127 | +2.80 | +3.06 | +0.28 | +2.29 | +5.64 | +1.04 | +3.92 | +1065 | +1 | +
1 | +1 | +13.20 | +1.78 | +2.14 | +11.2 | +100 | +2.65 | +2.76 | +0.26 | +1.28 | +4.38 | +1.05 | +3.40 | +1050 | +1 | +
2 | +1 | +13.16 | +2.36 | +2.67 | +18.6 | +101 | +2.80 | +3.24 | +0.30 | +2.81 | +5.68 | +1.03 | +3.17 | +1185 | +1 | +
3 | +1 | +14.37 | +1.95 | +2.50 | +16.8 | +113 | +3.85 | +3.49 | +0.24 | +2.18 | +7.80 | +0.86 | +3.45 | +1480 | +1 | +
4 | +1 | +13.24 | +2.59 | +2.87 | +21.0 | +118 | +2.80 | +2.69 | +0.39 | +1.82 | +4.32 | +1.04 | +2.93 | +735 | +2 | +
... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +
173 | +3 | +13.71 | +5.65 | +2.45 | +20.5 | +95 | +1.68 | +0.61 | +0.52 | +1.06 | +7.70 | +0.64 | +1.74 | +740 | +2 | +
174 | +3 | +13.40 | +3.91 | +2.48 | +23.0 | +102 | +1.80 | +0.75 | +0.43 | +1.41 | +7.30 | +0.70 | +1.56 | +750 | +2 | +
175 | +3 | +13.27 | +4.28 | +2.26 | +20.0 | +120 | +1.59 | +0.69 | +0.43 | +1.35 | +10.20 | +0.59 | +1.56 | +835 | +2 | +
176 | +3 | +13.17 | +2.59 | +2.37 | +20.0 | +120 | +1.65 | +0.68 | +0.53 | +1.46 | +9.30 | +0.60 | +1.62 | +840 | +2 | +
177 | +3 | +14.13 | +4.10 | +2.74 | +24.5 | +96 | +2.05 | +0.76 | +0.56 | +1.35 | +9.20 | +0.61 | +1.60 | +560 | +0 | +
178 rows × 15 columns
+Do our cluster labels match our ground truth? Did our cluster model capture reality?
-Now that we have created three clusters, we may ask ourselves: Do our cluster labels match our ground truth? Did our cluster model capture reality?
+Class label | +1 | +2 | +3 | +
---|---|---|---|
Three clusters | ++ | + | + |
0 | +0 | +50 | +19 | +
1 | +46 | +1 | +0 | +
2 | +13 | +20 | +29 | +
It might be easier to see as a stacked plot (see this post).
-import matplotlib.pyplot as plt
-import numpy as np
-
-ct.plot.bar(stacked=True)
-plt.legend(title='Class label')
import matplotlib.pyplot as plt
+import numpy as np
+
+ct.plot.bar(stacked=True)
+plt.legend(title='Class label')
<matplotlib.legend.Legend at 0x1798f3e50>
+How has the kmeans model done compared to our ground truth?
A way to overcome this ambiguity and evaluate the results is to look at visualisations of the results and compare them. But this raises the question of what type of visualisation to use for looking at the clusters. An immediate alternative is to use scatterplots. However, it is not clear which axes to use for plotting the clusters. A common method to apply at this stage is to make use of PCA to get a 2D plane where we can project the data points and visualise them over this projection.
++ | Alcohol | +Malic acid | +Ash | +Alcalinity of ash | +Magnesium | +Total phenols | +Flavanoids | +Nonflavanoid phenols | +Proanthocyanins | +Color intensity | +Hue | +OD280/OD315 of diluted wines | +Proline | +
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | +14.23 | +1.71 | +2.43 | +15.6 | +127 | +2.80 | +3.06 | +0.28 | +2.29 | +5.64 | +1.04 | +3.92 | +1065 | +
1 | +13.20 | +1.78 | +2.14 | +11.2 | +100 | +2.65 | +2.76 | +0.26 | +1.28 | +4.38 | +1.05 | +3.40 | +1050 | +
2 | +13.16 | +2.36 | +2.67 | +18.6 | +101 | +2.80 | +3.24 | +0.30 | +2.81 | +5.68 | +1.03 | +3.17 | +1185 | +
3 | +14.37 | +1.95 | +2.50 | +16.8 | +113 | +3.85 | +3.49 | +0.24 | +2.18 | +7.80 | +0.86 | +3.45 | +1480 | +
4 | +13.24 | +2.59 | +2.87 | +21.0 | +118 | +2.80 | +2.69 | +0.39 | +1.82 | +4.32 | +1.04 | +2.93 | +735 | +
... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +
173 | +13.71 | +5.65 | +2.45 | +20.5 | +95 | +1.68 | +0.61 | +0.52 | +1.06 | +7.70 | +0.64 | +1.74 | +740 | +
174 | +13.40 | +3.91 | +2.48 | +23.0 | +102 | +1.80 | +0.75 | +0.43 | +1.41 | +7.30 | +0.70 | +1.56 | +750 | +
175 | +13.27 | +4.28 | +2.26 | +20.0 | +120 | +1.59 | +0.69 | +0.43 | +1.35 | +10.20 | +0.59 | +1.56 | +835 | +
176 | +13.17 | +2.59 | +2.37 | +20.0 | +120 | +1.65 | +0.68 | +0.53 | +1.46 | +9.30 | +0.60 | +1.62 | +840 | +
177 | +14.13 | +4.10 | +2.74 | +24.5 | +96 | +2.05 | +0.76 | +0.56 | +1.35 | +9.20 | +0.61 | +1.60 | +560 | +
178 rows × 13 columns
+from sklearn.decomposition import PCA
-
-n_components = 2
-
-pca = PCA(n_components=n_components)
-df_pca = pca.fit(df.iloc[:,1:14])
-df_pca_vals = df_pca.transform(df.iloc[:,1:14])
Grab our projections and plot along with our cluster names.
-df['c1'] = [item[0] for item in df_pca_vals]
-df['c2'] = [item[1] for item in df_pca_vals]
-
-ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Class label')
-ax.set_title('Known labels visualised over PCs')
df['c1'] = [item[0] for item in df_pca_vals]
+df['c2'] = [item[1] for item in df_pca_vals]
+
+ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Class label')
+ax.set_title('Known labels visualised over PCs')
Text(0.5, 1.0, 'Known labels visualised over PCs')
+In the figure above, we colored the points based on the actual labels. We observe that there have been several misclassifications in the figure above (i.e., in the algorithm’s results). So one may choose to use an alternative algorithm or devise a better distance metric.
-ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')
-ax.set_title('Results of the algorithm visualised over PCs')
ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')
+ax.set_title('Results of the algorithm visualised over PCs')
Text(0.5, 1.0, 'Results of the algorithm visualised over PCs')
+This shows the parallelism between the clustering algorithm and PCA. By looking at the PCA loadings, we can find out what the x-axis means and try to interpret the clusters (we leave this as an additional exercise for those interested).
How might you interpret the above plots? Did the kmeans model identify the ground truth?
@@ -803,32 +1706,44 @@# Create a KMeans instance with k clusters: model
-k_means = KMeans(n_clusters=3, init='random', n_init = 10)
-
-# Fit model to samples
-df_k_means = k_means.fit(df.iloc[:,1:14])
-
-df['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:14].values), index = df.index)
-
-ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')
-ax.set_title('Results of the algorithm visualised over PCs')
# Create a KMeans instance with k clusters: model
+k_means = KMeans(n_clusters=3, init='random', n_init = 10)
+
+# Fit model to samples
+df_k_means = k_means.fit(df.iloc[:,1:14])
+
+df['Three clusters'] = pd.Series(df_k_means.predict(df.iloc[:,1:14].values), index = df.index)
+
+ax = sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')
+ax.set_title('Results of the algorithm visualised over PCs')
Text(0.5, 1.0, 'Results of the algorithm visualised over PCs')
+How about with only 80% of the data?
-df_sample = df.sample(frac=0.8, replace=False)
-
-# Create a KMeans instance with k clusters: model
-k_means = KMeans(n_clusters=3, init='random', n_init = 10)
-
-# Fit model to samples
-df_k_means = k_means.fit(df_sample.iloc[:,1:14])
-
-df_sample['Three clusters'] = pd.Series(df_k_means.predict(df_sample.iloc[:,1:14].values), index = df_sample.index)
-
-ax = sns.scatterplot(data = df_sample, x = 'c1', y = 'c2', hue = 'Three clusters')
-ax.set_title('Results of the algorithm visualised over PCs')
df_sample = df.sample(frac=0.8, replace=False)
+
+# Create a KMeans instance with k clusters: model
+k_means = KMeans(n_clusters=3, init='random', n_init = 10)
+
+# Fit model to samples
+df_k_means = k_means.fit(df_sample.iloc[:,1:14])
+
+df_sample['Three clusters'] = pd.Series(df_k_means.predict(df_sample.iloc[:,1:14].values), index = df_sample.index)
+
+ax = sns.scatterplot(data = df_sample, x = 'c1', y = 'c2', hue = 'Three clusters')
+ax.set_title('Results of the algorithm visualised over PCs')
Text(0.5, 1.0, 'Results of the algorithm visualised over PCs')
+We may want to automate the process of resampling the data or rerunning the model then perhaps plotting the different inertia values or creating different plots.
Do you think our clustering algorithm is stable and provides similar results even when some data is removed or the initial values are random?
diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-10-output-1.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-10-output-1.png index c686cad..0e88bbd 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-10-output-1.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-10-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-11-output-1.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-11-output-1.png index 33a1e9a..0b09c4f 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-11-output-1.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-11-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-12-output-1.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-12-output-1.png index dc44232..7d26839 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-12-output-1.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-12-output-1.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-19-output-2.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-19-output-2.png index f6f3c4d..f9c04c5 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-19-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-19-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-20-output-2.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-20-output-2.png index 4769a92..f6f3c4d 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-20-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-20-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-21-output-2.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-21-output-2.png index b326702..d880af7 100644 
Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-21-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-21-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-22-output-2.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-22-output-2.png index f33922d..5fecad0 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-22-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-22-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-7-output-2.png b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-7-output-2.png index dd4eb86..c08f8b5 100644 Binary files a/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-7-output-2.png and b/content/labs/Lab_5/IM939_Lab_5_1_files/figure-html/cell-7-output-2.png differ diff --git a/content/labs/Lab_5/IM939_Lab_5_2.html b/content/labs/Lab_5/IM939_Lab_5_2.html index ce7016e..25803e1 100644 --- a/content/labs/Lab_5/IM939_Lab_5_2.html +++ b/content/labs/Lab_5/IM939_Lab_5_2.html @@ -103,7 +103,7 @@ - + @@ -565,11 +565,11 @@We are going to examine the data, fit and then cross-validate a regression model.
-import seaborn as sns
sns.jointplot(data = df[['medIncome', 'ViolentCrimesPerPop']],
x = 'ViolentCrimesPerPop',
y = 'medIncome', kind='reg',
marker = '.')
We may want to z-transform or log these scores as they are heavily skewed.
-import numpy as np
# some values are 0 so 0.1 is added to prevent log giving us infinity
# there may be a better way to do this!
df_reg.loc[:, 'ViolentCrimesPerPop_log'] = np.log(df_reg['ViolentCrimesPerPop'] + 0.1)
df_reg.loc[:,'medIncome_log'] = np.log(df_reg['medIncome'] + 0.1)
/var/folders/7v/zl9mv52s3ls94kntlt_l9ryh0000gq/T/ipykernel_13528/3488182522.py:5: SettingWithCopyWarning:
+A value is trying to be set on a copy of a slice from a DataFrame.
+Try using .loc[row_indexer,col_indexer] = value instead
+
+See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
+ df_reg.loc[:, 'ViolentCrimesPerPop_log'] = np.log(df_reg['ViolentCrimesPerPop'] + 0.1)
+/var/folders/7v/zl9mv52s3ls94kntlt_l9ryh0000gq/T/ipykernel_13528/3488182522.py:6: SettingWithCopyWarning:
+A value is trying to be set on a copy of a slice from a DataFrame.
+Try using .loc[row_indexer,col_indexer] = value instead
+
+See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
+ df_reg.loc[:,'medIncome_log'] = np.log(df_reg['medIncome'] + 0.1)
import seaborn as sns
-sns.jointplot(data = df_reg[['medIncome_log', 'ViolentCrimesPerPop_log']],
- x = 'ViolentCrimesPerPop_log',
- y = 'medIncome_log', kind='reg',
- marker = '.')
import seaborn as sns
+sns.jointplot(data = df_reg[['medIncome_log', 'ViolentCrimesPerPop_log']],
+ x = 'ViolentCrimesPerPop_log',
+ y = 'medIncome_log', kind='reg',
+ marker = '.')
Is log transforming our variables the right thing to do here?
Fit our regression to the log transformed data.
-import matplotlib.pyplot as plt
-from sklearn.linear_model import LinearRegression
-from sklearn import metrics
-
-x = df_reg[['ViolentCrimesPerPop_log']]
-y = df_reg[['medIncome_log']]
-
-model = LinearRegression()
-model.fit(x, y)
-
-y_hat = model.predict(x)
-plt.plot(x, y,'o', alpha = 0.5)
-plt.plot(x, y_hat, 'r', alpha = 0.5)
-
-plt.xlabel('Violent Crimes Per Population')
-plt.ylabel('Median Income')
-
-print ("MSE:", metrics.mean_squared_error(y_hat, y))
-print ("R^2:", metrics.r2_score(y, y_hat))
-print ("var:", y.var())
import matplotlib.pyplot as plt
+from sklearn.linear_model import LinearRegression
+from sklearn import metrics
+
+x = df_reg[['ViolentCrimesPerPop_log']]
+y = df_reg[['medIncome_log']]
+
+model = LinearRegression()
+model.fit(x, y)
+
+y_hat = model.predict(x)
+plt.plot(x, y,'o', alpha = 0.5)
+plt.plot(x, y_hat, 'r', alpha = 0.5)
+
+plt.xlabel('Violent Crimes Per Population')
+plt.ylabel('Median Income')
+
+print ("MSE:", metrics.mean_squared_error(y_hat, y))
+print ("R^2:", metrics.r2_score(y, y_hat))
+print ("var:", y.var())
MSE: 0.1531885348757034
R^2: 0.22763497704356928
@@ -995,27 +1009,27 @@ 22
-
+
Has our log transformation distorted the pattern in the data?
-x = df_reg[['ViolentCrimesPerPop']]
-y = df_reg[['medIncome']]
-
-model = LinearRegression()
-model.fit(x, y)
-
-y_hat = model.predict(x)
-plt.plot(x, y,'o', alpha = 0.5)
-plt.plot(x, y_hat, 'r', alpha = 0.5)
-
-plt.xlabel('Violent Crimes Per Population')
-plt.ylabel('Median Income')
-
-print ("MSE:", metrics.mean_squared_error(y_hat, y))
-print ("R^2:", metrics.r2_score(y, y_hat))
-print ("var:", y.var())
x = df_reg[['ViolentCrimesPerPop']]
+y = df_reg[['medIncome']]
+
+model = LinearRegression()
+model.fit(x, y)
+
+y_hat = model.predict(x)
+plt.plot(x, y,'o', alpha = 0.5)
+plt.plot(x, y_hat, 'r', alpha = 0.5)
+
+plt.xlabel('Violent Crimes Per Population')
+plt.ylabel('Median Income')
+
+print ("MSE:", metrics.mean_squared_error(y_hat, y))
+print ("R^2:", metrics.r2_score(y, y_hat))
+print ("var:", y.var())
MSE: 0.03592636778157073
R^2: 0.17996313165549482
@@ -1023,42 +1037,42 @@ 22
-
+
What is the relationship between violent crime and median income? Why might this be?
Assuming the log data is fine, have we overfit the model? Remember that a good model (one which accurately models the relationship between violent crimes per population and median income) needs to be robust when faced with new data.
Kfold cross validation splits data into train and test subsets. We can then fit the regression to the training set and see how well it does for the test set.
+from sklearn.model_selection import KFold
-
-X = df_reg[['ViolentCrimesPerPop']]
-y = df_reg[['medIncome']]
-
-# get four splits, Each split contains a
-# test series and a train series.
-kf = KFold(n_splits=4)
# lists to store our statistics
+r_vals = []
+MSEs = []
+medIncome_coef = []
+
+for train_index, test_index in kf.split(X):
+ # fit our model and extract statistics
+ model = LinearRegression()
+ model.fit(X.iloc[train_index], y.iloc[train_index])
+ y_hat = model.predict(X.iloc[test_index])
+
+ MSEs.append(metrics.mean_squared_error(y.iloc[test_index], y_hat))
+ medIncome_coef.append(model.coef_[0][0])
+ r_vals.append(metrics.r2_score(y.iloc[test_index], y_hat))
# lists to store our statistics
-r_vals = []
-MSEs = []
-medIncome_coef = []
-
-for train_index, test_index in kf.split(X):
- # fit our model and extract statistics
- model = LinearRegression()
- model.fit(X.iloc[train_index], y.iloc[train_index])
- y_hat = model.predict(X.iloc[test_index])
-
- MSEs.append(metrics.mean_squared_error(y.iloc[test_index], y_hat))
- medIncome_coef.append(model.coef_[0][0])
- r_vals.append(metrics.r2_score(y.iloc[test_index], y_hat))
data = {'MSE' : MSEs, 'medIncome coefficient' : medIncome_coef, 'r squared' : r_vals}
-pd.DataFrame(data)
data = {'MSE' : MSEs, 'medIncome coefficient' : medIncome_coef, 'r squared' : r_vals}
+pd.DataFrame(data)
from sklearn.model_selection import cross_val_score
-x = df_reg[['ViolentCrimesPerPop']]
-y = df_reg[['medIncome']]
-
-model = LinearRegression()
-model.fit(x, y)
-
-print(cross_val_score(model, x, y, cv=4))
from sklearn.model_selection import cross_val_score
+x = df_reg[['ViolentCrimesPerPop']]
+y = df_reg[['medIncome']]
+
+model = LinearRegression()
+model.fit(x, y)
+
+print(cross_val_score(model, x, y, cv=4))
[0.13047946 0.16281953 0.20013867 0.18240261]