diff --git a/content/labs/Lab_4/IM939_Lab_4_Exercises.ipynb b/content/labs/Lab_4/IM939_Lab_4_Exercises.ipynb index 9865b80..9a4e038 100644 --- a/content/labs/Lab_4/IM939_Lab_4_Exercises.ipynb +++ b/content/labs/Lab_4/IM939_Lab_4_Exercises.ipynb @@ -38,18 +38,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "invalid syntax (197875397.py, line 9)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[1], line 9\u001b[0;36m\u001b[0m\n\u001b[0;31m from sklearn.decomposition import S????ePCA\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", @@ -69,18 +60,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "invalid syntax (197674394.py, line 1)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[2], line 1\u001b[0;36m\u001b[0m\n\u001b[0;31m df.h??d()\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "df.h??d()" @@ -88,18 +70,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "invalid syntax (4010129658.py, line 3)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[3], line 3\u001b[0;36m\u001b[0m\n\u001b[0;31m sns.pair????(df)\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", @@ -108,6 +81,55 @@ "sns.pair????(df)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Normalisation\n", + "\n", + "Before you carry out any operation, you might want to perform some normalisation. This will ensure that some of the assumptions that the algorithms are making are met and also the results are not biased/determined by the different value ranges and variation ranges inherent in the data. \n", + "\n", + "Do try out the following steps **without** normalisation first and then come back to this, normalise the data and see the differences it makes using a **normalised** copy of the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import MinMaxScaler\n", + "\n", + "# first save the column names, we will create a new dataset with the scaled data\n", + "col_names = df.columns\n", + "\n", + "# This is the normalization function. \n", + "# We are using the MinMaxScaler which brings all the data between 0 and 1.\n", + "# Make use of other transformations offered by scikitlearn, experiment, note changes. \n", + "\n", + "# The last column of the data contains the \"quality\" labels/scores, we don't want to normalize them \n", + "# as they is sort of the \"dependent (or \"target\") variable and there is meaning in these scores. \n", + "# Let's normalize the first 11 columns which are our \"independent\" columns.\n", + "scaled_df = pd.DataFrame(MinMaxScaler().fit_transform(df.iloc[:, 0:11]))\n", + "\n", + "# now we want to add the \"quality\" values back in. We'll need them.\n", + "scaled_df = scaled_df.join(df.iloc[:, 11:12])\n", + "\n", + "# now we name the columns with the original column names. We do this because MinMaxScaler \n", + "# produces a data frame with no column names (don't ask me why..)\n", + "scaled_df.columns = col_names\n", + "\n", + "# let's have a look at what the data is looking like:\n", + "scaled_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Important note:** The rest of the code will continue to use the **non-normalised** version of the data. For now, just carry on with that and try running the operations with the non-normalised version. Once you are through and/or somewhere in the middle, try them out with the **normalised** data. See what this will change." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -117,18 +139,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "invalid syntax (2518476773.py, line 5)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[4], line 5\u001b[0;36m\u001b[0m\n\u001b[0;31m pca = PCA(n_??????????=n_components)\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", @@ -142,18 +155,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "invalid syntax (2292344659.py, line 1)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[5], line 1\u001b[0;36m\u001b[0m\n\u001b[0;31m df_pca_vals = df_pca.???_transform(df.iloc[:, 0:11])\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", @@ -164,18 +168,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "invalid syntax (321227352.py, line 1)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[6], line 1\u001b[0;36m\u001b[0m\n\u001b[0;31m sns.scatterplot(data = df, x = ?, y = ?, hue = 'quality')\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", @@ -184,21 +179,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'df' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mdf\u001b[49m\u001b[38;5;241m.\u001b[39mcolumns)\n\u001b[1;32m 2\u001b[0m df_pca\u001b[38;5;241m.\u001b[39mcomponents_\n", - "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", @@ -217,18 +200,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "invalid syntax (2237240520.py, line 2)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[8], line 2\u001b[0;36m\u001b[0m\n\u001b[0;31m df_s_pca = s_pca.fit(df.????[:, 0:11])\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", @@ -238,18 +212,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "invalid syntax (496583237.py, line 1)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[9], line 1\u001b[0;36m\u001b[0m\n\u001b[0;31m df_s_pca_vals = s_pca.fit_?????????(df.iloc[:, 0:11])\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", @@ -260,21 +225,9 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'sns' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[10], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msns\u001b[49m\u001b[38;5;241m.\u001b[39mscatterplot(data \u001b[38;5;241m=\u001b[39m df, x \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc1 spca\u001b[39m\u001b[38;5;124m'\u001b[39m, y \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc2 spca\u001b[39m\u001b[38;5;124m'\u001b[39m, hue \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mquality\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", - "\u001b[0;31mNameError\u001b[0m: name 'sns' is not defined" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", @@ -290,21 +243,9 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'TSNE' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m tsne_model \u001b[38;5;241m=\u001b[39m \u001b[43mTSNE\u001b[49m(n_components\u001b[38;5;241m=\u001b[39mn_components)\n\u001b[1;32m 2\u001b[0m df_tsne \u001b[38;5;241m=\u001b[39m tsne_model\u001b[38;5;241m.\u001b[39mfit(df\u001b[38;5;241m.\u001b[39miloc[:, \u001b[38;5;241m0\u001b[39m:\u001b[38;5;241m11\u001b[39m])\n", - "\u001b[0;31mNameError\u001b[0m: name 'TSNE' is not defined" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", @@ -314,18 +255,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "invalid syntax (3280343393.py, line 2)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[12], line 2\u001b[0;36m\u001b[0m\n\u001b[0;31m df['c1 tsne'] = [item[0] for item in ??_tsne_vals]\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", @@ -336,18 +268,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "invalid syntax (847552475.py, line 3)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[13], line 3\u001b[0;36m\u001b[0m\n\u001b[0;31m sns.scatterplot(data = ??, x = 'c1 tsne', y = 'c1 tsne', hue = 'quality')\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", @@ -360,91 +283,27 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "That looks concerning - there is a straight line. It looks like something in the data has caused the model to have issues.\n", - "\n", - "Does normalising the data sort out the issue?" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'df' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[14], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpreprocessing\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MinMaxScaler\n\u001b[0;32m----> 2\u001b[0m col_names \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[38;5;241m.\u001b[39mcolumns\n\u001b[1;32m 3\u001b[0m scaled_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(MinMaxScaler()\u001b[38;5;241m.\u001b[39mfit_transform(df))\n\u001b[1;32m 4\u001b[0m scaled_df\u001b[38;5;241m.\u001b[39mcolumns \u001b[38;5;241m=\u001b[39m col_names\n", - "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined" - ] - } - ], - "source": [ - "#| error: true\n", - "\n", - "from sklearn.preprocessing import MinMaxScaler\n", - "col_names = df.columns\n", - "scaled_df = pd.DataFrame(MinMaxScaler().fit_transform(df))\n", - "scaled_df.columns = col_names" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'TSNE' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[15], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m tsne_model \u001b[38;5;241m=\u001b[39m \u001b[43mTSNE\u001b[49m(n_components\u001b[38;5;241m=\u001b[39mn_components)\n\u001b[1;32m 3\u001b[0m scaled_df_tsne \u001b[38;5;241m=\u001b[39m tsne_model\u001b[38;5;241m.\u001b[39mfit(scaled_df\u001b[38;5;241m.\u001b[39miloc[:, \u001b[38;5;241m0\u001b[39m:\u001b[38;5;241m11\u001b[39m])\n\u001b[1;32m 4\u001b[0m scaled_df_tsne_vals \u001b[38;5;241m=\u001b[39m tsne_model\u001b[38;5;241m.\u001b[39mfit_transform(df\u001b[38;5;241m.\u001b[39miloc[:, \u001b[38;5;241m0\u001b[39m:\u001b[38;5;241m11\u001b[39m])\n", - "\u001b[0;31mNameError\u001b[0m: name 'TSNE' is not defined" - ] - } - ], - "source": [ - "#| error: true\n", + "That looks concerning - there is a straight line. It looks like something in the above code might not be correct.\n", "\n", - "tsne_model = TSNE(n_components=n_components)\n", + "Can you find out what that might be? \n", "\n", - "scaled_df_tsne = tsne_model.fit(scaled_df.iloc[:, 0:11])\n", - "scaled_df_tsne_vals = tsne_model.fit_transform(df.iloc[:, 0:11])\n", - "\n", - "scaled_df['c1 tsne'] = [item[0] for item in scaled_df_tsne_vals]\n", - "scaled_df['c2 tsne'] = [item[1] for item in scaled_df_tsne_vals]\n", - "\n", - "sns.scatterplot(data = scaled_df, x = 'c1 tsne', y = 'c1 tsne', hue = 'quality')" + "**Hint:** think about when you would get a straight line in a scatterplot?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Normalising the data makes no difference. It could be the model is getting stuck somehow. You could check the various attributes of the tsne fit object (tsne_model.fit), try using only a few columns and search google a lot - this could be a problem other have encountered.\n", + "Once you fixed the error above, you will notice a different structure to the ones you observed in the PCA runs. There isn't really a clear next step which of these projections one should adopt. \n", "\n", - "For now, we will use PCA components." + "For now, we will use PCA components. PCA would be a good choice if the interpretability of the components is important to us. Since PCA is a linear projection method, the components carry the weights of each raw feature which enable us to make inferences about the axes. However, if we are more interested in finding structures and identify groups of similar items, t-SNE might be a better projection to use since it emphasises proximity but the axes don't mean much since the layout is formed stochastically (fancy speak for saying that there is randomness in the algorithm and the layout will be different each time your run it)." ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "invalid syntax (4262587979.py, line 6)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[16], line 6\u001b[0;36m\u001b[0m\n\u001b[0;31m loadings = pd.?????????(data)\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", @@ -460,21 +319,9 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'loadings' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[17], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m loadings_sorted \u001b[38;5;241m=\u001b[39m \u001b[43mloadings\u001b[49m\u001b[38;5;241m.\u001b[39msort_values(by\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcomponent 2\u001b[39m\u001b[38;5;124m'\u001b[39m], ascending\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 2\u001b[0m loadings_sorted\u001b[38;5;241m.\u001b[39miloc[\u001b[38;5;241m1\u001b[39m:\u001b[38;5;241m10\u001b[39m,:]\n", - "\u001b[0;31mNameError\u001b[0m: name 'loadings' is not defined" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", @@ -491,28 +338,9 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Object `??? = KMeans(n_clusters=k)` not found.\n" - ] - }, - { - "ename": "NameError", - "evalue": "name 'model' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[18], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m get_ipython()\u001b[38;5;241m.\u001b[39mrun_line_magic(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpinfo2\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m??? = KMeans(n_clusters=k)\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 9\u001b[0m \u001b[38;5;66;03m# Fit model to samples\u001b[39;00m\n\u001b[0;32m---> 10\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241m.\u001b[39mfit(df[[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc2\u001b[39m\u001b[38;5;124m'\u001b[39m]])\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# Append the inertia to the list of inertias\u001b[39;00m\n\u001b[1;32m 13\u001b[0m inertias\u001b[38;5;241m.\u001b[39mappend(model\u001b[38;5;241m.\u001b[39minertia_)\n", - "\u001b[0;31mNameError\u001b[0m: name 'model' is not defined" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", @@ -541,18 +369,9 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "invalid syntax (2729138574.py, line 3)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[19], line 3\u001b[0;36m\u001b[0m\n\u001b[0;31m df['Three clusters'] = pd.Series(k_means_3.???????(df[['c1', 'c2']].values), index = df.index)\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", @@ -563,37 +382,32 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'sns' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[20], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msns\u001b[49m\u001b[38;5;241m.\u001b[39mscatterplot(data \u001b[38;5;241m=\u001b[39m df, x \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc1\u001b[39m\u001b[38;5;124m'\u001b[39m, y \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc2\u001b[39m\u001b[38;5;124m'\u001b[39m, hue \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mThree clusters\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", - "\u001b[0;31mNameError\u001b[0m: name 'sns' is not defined" - ] - } - ], + "outputs": [], "source": [ "#| error: true\n", "\n", "sns.scatterplot(data = df, x = 'c1', y = 'c2', hue = 'Three clusters')" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "Consider:\n", "\n", - "* Is that userful? \n", + "* Is that useful? \n", "* What might it mean?\n", "\n", - "Outside of this session you could try normalising the data (centering around the mean), clustering the raw data (and not the projections from PCA), trying to get tSNE working or using different numbers of components." + "Outside of this session go back to normalising the data and try out different methods for normalisation as well (e.g., centering around the mean), clustering the raw data (and not the projections from PCA), trying to get tSNE working or using different numbers of components." ] } ], diff --git a/content/labs/Lab_4/data/winequality-red_v2_(3).xlsx b/content/labs/Lab_4/data/winequality-red_v2.xlsx similarity index 100% rename from content/labs/Lab_4/data/winequality-red_v2_(3).xlsx rename to content/labs/Lab_4/data/winequality-red_v2.xlsx