diff --git a/src/multidim/notebooks/04_dependency.ipynb b/src/multidim/notebooks/04_dependency.ipynb index 79a3291..54fa0ad 100644 --- a/src/multidim/notebooks/04_dependency.ipynb +++ b/src/multidim/notebooks/04_dependency.ipynb @@ -589,7 +589,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.10.10" }, "vscode": { "interpreter": { diff --git a/src/multidim/notebooks/05_mds_outliers.ipynb b/src/multidim/notebooks/05_mds_outliers.ipynb index 574e3ac..b850504 100644 --- a/src/multidim/notebooks/05_mds_outliers.ipynb +++ b/src/multidim/notebooks/05_mds_outliers.ipynb @@ -1,515 +1,515 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "d8927acb", - "metadata": {}, - "source": [ - "# Analiza Wielowymiarowa - zajecia 05 - Skalowanie wielowymiarowe i obserwacje nietypowe" - ] - }, - { - "cell_type": "markdown", - "id": "e285e16f", - "metadata": {}, - "source": [ - "## Skalowanie wielowymiarowe" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b64e6d7f-c1c6-4248-8d3e-d51600c69de1", - "metadata": {}, - "outputs": [], - "source": [ - "from multidim.utils import resolve_stata, load_stata\n", - "\n", - "STATA_PATH, STATA_TYPE = resolve_stata(version = 18, stype = \"se\")\n", - "# make sure they are proper ones\n", - "STATA_PATH, STATA_TYPE" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1bf9afd9-feeb-41c4-ae20-d620098eb810", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "#load_stata(STATA_PATH, STATA_TYPE)\n", - "load_stata(STATA_PATH, \"se\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "086775bb", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import scipy\n", - "import sklearn\n", - "import numpy as np\n", - "import seaborn as sns\n", - "import scipy as sp\n", - "from scipy.stats import chi2\n", - "from matplotlib import pyplot as plt\n", - "from sklearn.cluster import AgglomerativeClustering\n", - "from multidim.funs import plot_dendrogram\n", - "from multidim.datasets import load_boston" - ] - }, - { - "cell_type": "markdown", - "id": "a8aecafd", - "metadata": {}, - "source": [ - "### Przykład 1." - ] - }, - { - "cell_type": "markdown", - "id": "02066827", - "metadata": {}, - "source": [ - "Dane i przyklad zostaly pozyczone z podrecznika Stata MV" - ] - }, - { - "cell_type": "markdown", - "id": "584ddf05", - "metadata": {}, - "source": [ - "Zaladowanie danych z internetu. Jezeli nie dziala, zbior danych jest w materialach do zajec\n", - "\n", - "Dane dotycza charakterystyk odzywczych platkow sniadanowych" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa7313eb", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata \n", - "use https://www.stata-press.com/data/r17/cerealnut\n", - "des " - ] - }, - { - "cell_type": "markdown", - "id": "69e4edb1", - "metadata": {}, - "source": [ - "Podstawowy opis statystyczny cech charakterystycznych płatkow sniadaniowych" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6be565a9", - "metadata": {}, - "outputs": [], - "source": [ - "%stata summarize calories-K, sep(4)" - ] - }, - { - "cell_type": "raw", - "id": "91eb0aef", - "metadata": {}, - "source": [ - "Zastepujemy odstepy w nazwach znakiem podreslenia dla czytelnosci wykresow" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "573323e2", - "metadata": {}, - "outputs": [], - "source": [ - "%stata replace brand = subinstr(brand,\" \",\"_\",.)" - ] - }, - { - "cell_type": "markdown", - "id": "4bdf01e8", - "metadata": {}, - "source": [ - "Polecenie mds wykonuje skalowanie wielowymiarowe na podstawie podanych charakterystyk\n", - "opcja \"id\" jest niezbedna, identyfikuje obserwacje" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2207f143", - "metadata": {}, - "outputs": [], - "source": [ - "%stata mds calories-K, id(brand)" - ] - }, - { - "cell_type": "markdown", - "id": "780c7b17", - "metadata": {}, - "source": [ - "Dwie pierwsze wartosci wlasne wyjasniaja 99,7% niepodobienstwa " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "77d1c505", - "metadata": {}, - "outputs": [], - "source": [ - "%stata mds calories-K, id(brand) std" - ] - }, - { - "cell_type": "markdown", - "id": "c19b33df", - "metadata": {}, - "source": [ - "opcja \"std\" przed wykonaniem obliczen standaryzuje wartosci charakterystyk" - ] - }, - { - "cell_type": "markdown", - "id": "cb365671", - "metadata": {}, - "source": [ - "## Obserwacje nietypowe" - ] - }, - { - "cell_type": "markdown", - "id": "33034ef9", - "metadata": {}, - "source": [ - "Wczytanie danych do pakietu Stata" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3deb87d3", - "metadata": {}, - "outputs": [], - "source": [ - "%stata load boston " - ] - }, - { - "cell_type": "markdown", - "id": "325fb836", - "metadata": {}, - "source": [ - "Wczytanie danych" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85e79dce", - "metadata": {}, - "outputs": [], - "source": [ - "boston = load_boston()\n", - "y = boston['MEDV'] \n", - "df = boston[['CRIM', 'ZN', 'INDUS', 'CHAS','NOX','RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT' ]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "721defa2", - "metadata": {}, - "outputs": [], - "source": [ - "print(boston)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ef1fb479", - "metadata": {}, - "outputs": [], - "source": [ - "#df.describe()\n", - "df_1 = df[['TAX', 'B']]\n", - "df_2 = df[['CRIM', 'ZN', 'INDUS', 'RM', 'AGE', 'DIS', 'RAD', 'PTRATIO','LSTAT']]\n", - "df_3 = df[['CHAS', 'NOX']]\n", - "\n", - "ax = sns.boxplot(data=df_2, orient=\"h\", palette=\"Set2\")\n", - "\n", - "ax = sns.boxplot(x=df[\"CRIM\"])\n", - "ax.set_xlabel('Crime rate per capita')\n" - ] - }, - { - "cell_type": "markdown", - "id": "5005b530", - "metadata": {}, - "source": [ - "Wizualizacja danych" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d393131", - "metadata": {}, - "outputs": [], - "source": [ - "#Wykres rozrzutu\n", - "ax = sns.scatterplot(x=\"LSTAT\", y=\"CRIM\", data=df)" - ] - }, - { - "cell_type": "markdown", - "id": "0819a2d4", - "metadata": {}, - "source": [ - "Odległość Mahalanobisa - funckcja zwracająca obserwacje odstające" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14bd9807", - "metadata": {}, - "outputs": [], - "source": [ - "def mahalanobis_method(df):\n", - " #M-Distance\n", - " x_minus_mu = df - np.mean(df, axis=0)\n", - " cov = np.cov(df.values.T) #Covariance\n", - " inv_covmat = sp.linalg.inv(cov) #Inverse covariance\n", - " left_term = np.dot(x_minus_mu, inv_covmat) \n", - " mahal = np.dot(left_term, x_minus_mu.T)\n", - " md = np.sqrt(mahal.diagonal())\n", - " \n", - " #Flag as outlier\n", - " outlier = []\n", - " #Cut-off point\n", - " C = np.sqrt(chi2.ppf((1-0.001), df=df.shape[1])) #degrees of freedom = number of variables\n", - " for index, value in enumerate(md):\n", - " if value > C:\n", - " outlier.append(index)\n", - " else:\n", - " continue\n", - " return outlier, md" - ] - }, - { - "cell_type": "markdown", - "id": "46d1b04c", - "metadata": {}, - "source": [ - "Obserwacje odstające według odległości Mahalanobisa: 2 wymiarowe" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e1e452f", - "metadata": {}, - "outputs": [], - "source": [ - "df_bivariate = df[['LSTAT', 'CRIM']]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd711c1f", - "metadata": {}, - "outputs": [], - "source": [ - "outliers_mahal_bi, md_bi = mahalanobis_method(df=df_bivariate)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "232e0200", - "metadata": {}, - "outputs": [], - "source": [ - "outliers_mahal_bi" - ] - }, - { - "cell_type": "markdown", - "id": "832e8fe8", - "metadata": {}, - "source": [ - "Obserwacje odstające według odległości Mahalanobisa: względem wszystkich zmiennych w zbiorze" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d5e2f332", - "metadata": {}, - "outputs": [], - "source": [ - "outliers_mahal, md = mahalanobis_method(df=df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa926fc0", - "metadata": {}, - "outputs": [], - "source": [ - "outliers_mahal" - ] - }, - { - "cell_type": "markdown", - "id": "d56794dc", - "metadata": {}, - "source": [ - "### Winsoryzacja" - ] - }, - { - "cell_type": "markdown", - "id": "bc68873c", - "metadata": {}, - "source": [ - "Stworzenie kopii danych" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2f657f64", - "metadata": {}, - "outputs": [], - "source": [ - "df_win = df.copy(deep=True)" - ] - }, - { - "cell_type": "markdown", - "id": "9c19abb3", - "metadata": {}, - "source": [ - "Winsoryzacja prawego ogona rozkładu " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28e6d114", - "metadata": {}, - "outputs": [], - "source": [ - "df_win['CRIM_wins_95%'] = sp.stats.mstats.winsorize(df['CRIM'], limits=(0, 0.05))\n", - "df_win['CRIM_wins_925%'] = sp.stats.mstats.winsorize(df['CRIM'], limits=(0, 0.075))" - ] - }, - { - "cell_type": "markdown", - "id": "0714427a", - "metadata": {}, - "source": [ - "Tu będzie opis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4ccb6502", - "metadata": {}, - "outputs": [], - "source": [ - "df_win.describe()" - ] - }, - { - "cell_type": "markdown", - "id": "863a476b", - "metadata": {}, - "source": [ - "Wykres rozkładu zmiennej Crime rate per capita by town" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23c1b673", - "metadata": {}, - "outputs": [], - "source": [ - "sns.distplot(df['CRIM'])" - ] - }, - { - "cell_type": "markdown", - "id": "0cb07301", - "metadata": {}, - "source": [ - "Wykresy po winsoryzacji " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "678bd5eb", - "metadata": {}, - "outputs": [], - "source": [ - "sns.distplot(df_win['CRIM_wins_95%'])\n", - "sns.distplot(df_win['CRIM_wins_925%'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d9bd4fd", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - }, - "vscode": { - "interpreter": { - "hash": "a8685faa0ed749449a0f1a8710c4e7cd8c1c7833bc8ac4d1844d25fbee35609f" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "cells": [ + { + "cell_type": "markdown", + "id": "d8927acb", + "metadata": {}, + "source": [ + "# Analiza Wielowymiarowa - zajecia 05 - Skalowanie wielowymiarowe i obserwacje nietypowe" + ] + }, + { + "cell_type": "markdown", + "id": "e285e16f", + "metadata": {}, + "source": [ + "## Skalowanie wielowymiarowe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b64e6d7f-c1c6-4248-8d3e-d51600c69de1", + "metadata": {}, + "outputs": [], + "source": [ + "from multidim.utils import resolve_stata, load_stata\n", + "\n", + "STATA_PATH, STATA_TYPE = resolve_stata(version = 18, stype = \"se\")\n", + "# make sure they are proper ones\n", + "STATA_PATH, STATA_TYPE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bf9afd9-feeb-41c4-ae20-d620098eb810", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#load_stata(STATA_PATH, STATA_TYPE)\n", + "load_stata(STATA_PATH, \"se\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "086775bb", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import scipy\n", + "import sklearn\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import scipy as sp\n", + "from scipy.stats import chi2\n", + "from matplotlib import pyplot as plt\n", + "from sklearn.cluster import AgglomerativeClustering\n", + "from multidim.funs import plot_dendrogram\n", + "from multidim.datasets import load_boston" + ] + }, + { + "cell_type": "markdown", + "id": "a8aecafd", + "metadata": {}, + "source": [ + "### Przykład 1." + ] + }, + { + "cell_type": "markdown", + "id": "02066827", + "metadata": {}, + "source": [ + "Dane i przyklad zostaly pozyczone z podrecznika Stata MV" + ] + }, + { + "cell_type": "markdown", + "id": "584ddf05", + "metadata": {}, + "source": [ + "Zaladowanie danych z internetu. Jezeli nie dziala, zbior danych jest w materialach do zajec\n", + "\n", + "Dane dotycza charakterystyk odzywczych platkow sniadanowych" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa7313eb", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata \n", + "use https://www.stata-press.com/data/r17/cerealnut\n", + "des " + ] + }, + { + "cell_type": "markdown", + "id": "69e4edb1", + "metadata": {}, + "source": [ + "Podstawowy opis statystyczny cech charakterystycznych płatkow sniadaniowych" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6be565a9", + "metadata": {}, + "outputs": [], + "source": [ + "%stata summarize calories-K, sep(4)" + ] + }, + { + "cell_type": "raw", + "id": "91eb0aef", + "metadata": {}, + "source": [ + "Zastepujemy odstepy w nazwach znakiem podreslenia dla czytelnosci wykresow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "573323e2", + "metadata": {}, + "outputs": [], + "source": [ + "%stata replace brand = subinstr(brand,\" \",\"_\",.)" + ] + }, + { + "cell_type": "markdown", + "id": "4bdf01e8", + "metadata": {}, + "source": [ + "Polecenie mds wykonuje skalowanie wielowymiarowe na podstawie podanych charakterystyk\n", + "opcja \"id\" jest niezbedna, identyfikuje obserwacje" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2207f143", + "metadata": {}, + "outputs": [], + "source": [ + "%stata mds calories-K, id(brand)" + ] + }, + { + "cell_type": "markdown", + "id": "780c7b17", + "metadata": {}, + "source": [ + "Dwie pierwsze wartosci wlasne wyjasniaja 99,7% niepodobienstwa " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77d1c505", + "metadata": {}, + "outputs": [], + "source": [ + "%stata mds calories-K, id(brand) std" + ] + }, + { + "cell_type": "markdown", + "id": "c19b33df", + "metadata": {}, + "source": [ + "opcja \"std\" przed wykonaniem obliczen standaryzuje wartosci charakterystyk" + ] + }, + { + "cell_type": "markdown", + "id": "cb365671", + "metadata": {}, + "source": [ + "## Obserwacje nietypowe" + ] + }, + { + "cell_type": "markdown", + "id": "33034ef9", + "metadata": {}, + "source": [ + "Wczytanie danych do pakietu Stata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3deb87d3", + "metadata": {}, + "outputs": [], + "source": [ + "%stata load boston " + ] + }, + { + "cell_type": "markdown", + "id": "325fb836", + "metadata": {}, + "source": [ + "Wczytanie danych" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85e79dce", + "metadata": {}, + "outputs": [], + "source": [ + "boston = load_boston()\n", + "y = boston['MEDV'] \n", + "df = boston[['CRIM', 'ZN', 'INDUS', 'CHAS','NOX','RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT' ]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "721defa2", + "metadata": {}, + "outputs": [], + "source": [ + "print(boston)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef1fb479", + "metadata": {}, + "outputs": [], + "source": [ + "#df.describe()\n", + "df_1 = df[['TAX', 'B']]\n", + "df_2 = df[['CRIM', 'ZN', 'INDUS', 'RM', 'AGE', 'DIS', 'RAD', 'PTRATIO','LSTAT']]\n", + "df_3 = df[['CHAS', 'NOX']]\n", + "\n", + "ax = sns.boxplot(data=df_2, orient=\"h\", palette=\"Set2\")\n", + "\n", + "ax = sns.boxplot(x=df[\"CRIM\"])\n", + "ax.set_xlabel('Crime rate per capita')\n" + ] + }, + { + "cell_type": "markdown", + "id": "5005b530", + "metadata": {}, + "source": [ + "Wizualizacja danych" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d393131", + "metadata": {}, + "outputs": [], + "source": [ + "#Wykres rozrzutu\n", + "ax = sns.scatterplot(x=\"LSTAT\", y=\"CRIM\", data=df)" + ] + }, + { + "cell_type": "markdown", + "id": "0819a2d4", + "metadata": {}, + "source": [ + "Odległość Mahalanobisa - funckcja zwracająca obserwacje odstające" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14bd9807", + "metadata": {}, + "outputs": [], + "source": [ + "def mahalanobis_method(df):\n", + " #M-Distance\n", + " x_minus_mu = df - np.mean(df, axis=0)\n", + " cov = np.cov(df.values.T) #Covariance\n", + " inv_covmat = sp.linalg.inv(cov) #Inverse covariance\n", + " left_term = np.dot(x_minus_mu, inv_covmat) \n", + " mahal = np.dot(left_term, x_minus_mu.T)\n", + " md = np.sqrt(mahal.diagonal())\n", + " \n", + " #Flag as outlier\n", + " outlier = []\n", + " #Cut-off point\n", + " C = np.sqrt(chi2.ppf((1-0.001), df=df.shape[1])) #degrees of freedom = number of variables\n", + " for index, value in enumerate(md):\n", + " if value > C:\n", + " outlier.append(index)\n", + " else:\n", + " continue\n", + " return outlier, md" + ] + }, + { + "cell_type": "markdown", + "id": "46d1b04c", + "metadata": {}, + "source": [ + "Obserwacje odstające według odległości Mahalanobisa: 2 wymiarowe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e1e452f", + "metadata": {}, + "outputs": [], + "source": [ + "df_bivariate = df[['LSTAT', 'CRIM']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd711c1f", + "metadata": {}, + "outputs": [], + "source": [ + "outliers_mahal_bi, md_bi = mahalanobis_method(df=df_bivariate)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "232e0200", + "metadata": {}, + "outputs": [], + "source": [ + "outliers_mahal_bi" + ] + }, + { + "cell_type": "markdown", + "id": "832e8fe8", + "metadata": {}, + "source": [ + "Obserwacje odstające według odległości Mahalanobisa: względem wszystkich zmiennych w zbiorze" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5e2f332", + "metadata": {}, + "outputs": [], + "source": [ + "outliers_mahal, md = mahalanobis_method(df=df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa926fc0", + "metadata": {}, + "outputs": [], + "source": [ + "outliers_mahal" + ] + }, + { + "cell_type": "markdown", + "id": "d56794dc", + "metadata": {}, + "source": [ + "### Winsoryzacja" + ] + }, + { + "cell_type": "markdown", + "id": "bc68873c", + "metadata": {}, + "source": [ + "Stworzenie kopii danych" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f657f64", + "metadata": {}, + "outputs": [], + "source": [ + "df_win = df.copy(deep=True)" + ] + }, + { + "cell_type": "markdown", + "id": "9c19abb3", + "metadata": {}, + "source": [ + "Winsoryzacja prawego ogona rozkładu " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28e6d114", + "metadata": {}, + "outputs": [], + "source": [ + "df_win['CRIM_wins_95%'] = sp.stats.mstats.winsorize(df['CRIM'], limits=(0, 0.05))\n", + "df_win['CRIM_wins_925%'] = sp.stats.mstats.winsorize(df['CRIM'], limits=(0, 0.075))" + ] + }, + { + "cell_type": "markdown", + "id": "0714427a", + "metadata": {}, + "source": [ + "Tu będzie opis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ccb6502", + "metadata": {}, + "outputs": [], + "source": [ + "df_win.describe()" + ] + }, + { + "cell_type": "markdown", + "id": "863a476b", + "metadata": {}, + "source": [ + "Wykres rozkładu zmiennej Crime rate per capita by town" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23c1b673", + "metadata": {}, + "outputs": [], + "source": [ + "sns.distplot(df['CRIM'])" + ] + }, + { + "cell_type": "markdown", + "id": "0cb07301", + "metadata": {}, + "source": [ + "Wykresy po winsoryzacji " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "678bd5eb", + "metadata": {}, + "outputs": [], + "source": [ + "sns.distplot(df_win['CRIM_wins_95%'])\n", + "sns.distplot(df_win['CRIM_wins_925%'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d9bd4fd", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + }, + "vscode": { + "interpreter": { + "hash": "a8685faa0ed749449a0f1a8710c4e7cd8c1c7833bc8ac4d1844d25fbee35609f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/src/multidim/notebooks/06_factor.ipynb b/src/multidim/notebooks/06_factor.ipynb index 0f928de..50783dd 100644 --- a/src/multidim/notebooks/06_factor.ipynb +++ b/src/multidim/notebooks/06_factor.ipynb @@ -1,862 +1,862 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "2a48c727", - "metadata": { - "tags": [] - }, - "source": [ - "# Analiza Wielowymiarowa - zajecia 6 - Analiza czynnikowa" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "32b52329", - "metadata": {}, - "outputs": [], - "source": [ - "from multidim.utils import resolve_stata, load_stata\n", - "\n", - "STATA_PATH, STATA_TYPE = resolve_stata(version = 18, stype = \"se\")\n", - "# make sure they are proper ones\n", - "STATA_PATH, STATA_TYPE" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "40123e5a", - "metadata": {}, - "outputs": [], - "source": [ - "load_stata(STATA_PATH, STATA_TYPE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1df4db13", - "metadata": {}, - "outputs": [], - "source": [ - "# Załadowanie bibliotek\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "#https://scikit-learn.org/stable/modules/decomposition.html#fa\n", - "#https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FactorAnalysis.html\n", - "from sklearn import decomposition\n", - "from sklearn.decomposition import FactorAnalysis\n", - "from sklearn.preprocessing import StandardScaler\n", - "#https://factor-analyzer.readthedocs.io/en/latest/factor_analyzer.html\n", - "from factor_analyzer import FactorAnalyzer\n", - "from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity\n", - "from factor_analyzer.factor_analyzer import calculate_kmo\n", - "\n", - "from sklearn.decomposition import PCA\n", - "from sklearn.decomposition import TruncatedSVD \n", - "from scipy.linalg import svd\n", - "\n", - "from multidim.funs import corr_mat\n" - ] - }, - { - "cell_type": "markdown", - "id": "3d84d8e6-cda6-49dc-b436-02eca4735e97", - "metadata": {}, - "source": [ - "## Analiza Czynnikowa" - ] - }, - { - "cell_type": "markdown", - "id": "e91bc599", - "metadata": {}, - "source": [ - "### Przyklad 1 -- zaczerpniety z materialow pani Natalii Nehrebeckiej" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c97bb918", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata\n", - "/*Ponizej przedstawimy analize czynnikowa metoda najwiekszej wiarogodnosci. Zaleca\n", - "sie na wstepie analize skladowych glownych, aby ustalic przyblizona liczbe czynnikow*/\n", - "\n", - "/*Dane wejsciowe do analizy czynnikowej moga miec postac macierzy kowariancji lub korelacji.\n", - "Posluzymy sie danymi pochodzacymi z badania przeprowadzonego na 123 osobach cierpiacych\n", - "z powodu silnych napadow bolu. Poproszono ich o wydanie opinii na skali od 1 do 6\n", - "(1-calkowicie sie zgadzam, 6-nie zgadzam sie) na temat 9 oswiadczen na temat bolu.\n", - "\n", - "Ponizej lista zmiennych:\n", - "1. To, czy bede cierpial z powodu bolu w przyszlosci zalezy od lekarza.\n", - "2. To, czy bede cierpial z powodu bolu, zalezy zwykle od tego, czy cos zrobilem lub nie\n", - " zrobilem.\n", - "3. To, czy bede cierpial z powodu bolu, zalezy od tego, co zrobi dla mnie lekarz.\n", - "4. Nie moge poradzic sobie z bolem, dopoki nie skorzystam z pomocy medycznej.\n", - "5. Jesli czuje bol, to jest to spowodowane tym, iz nie wykonywalem odpowiednich cwiczen lub\n", - " nieprawidlowo sie odzywialem.\n", - "6. Bol jest wynikiem zaniedbania.\n", - "7. Jestem calkowicie odpowiedzialny za moj bol.\n", - "8. Pozbycie sie bolu jest kontrolowane przez doktora.\n", - "9. Ludzie, ktorzy nigdy nie cierpia z powodu bolu, sa szczesciarzami.*/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf289888-8057-4819-bd98-29ad7de08189", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata -qui\n", - "matrix C = ( 1.0000, -0.0385, 0.6066, 0.4507, 0.0320, -0.2877, -0.2974, 0.4526, 0.2952 \\/*\n", - "*/ -0.0385, 1.0000, -0.0693, -0.1167, 0.4881, 0.4271, 0.3045, -0.3090, -0.1704 \\/*\n", - "*/ 0.6066, -0.0693, 1.000, 0.5916, 0.0317, -0.1336, -0.2404, 0.5886, 0.3165 \\ /*\n", - "*/0.4507, -0.1167, 0.5916, 1.0000, -0.0802, -0.2073, -0.1850, 0.6286, 0.3680 \\ /*\n", - "*/0.0320, 0.4881, 0.0317, -0.0802, 1.0000, 0.4731, 0.4138, -0.1397, -0.2367 \\ /*\n", - "*/-0.2877, 0.4271, -0.1336, -0.2073, 0.4731, 1.0000, 0.6346, -0.1329, -0.1541 \\ /*\n", - "*/-0.2974, 0.3045, -0.2404, -0.1850, 0.4138, 0.6346, 1.0000, -0.2599, -0.2893 \\ /*\n", - "*/0.4526, -0.3090, 0.5886, 0.6286, -0.1397, -0.1329, -0.2599, 1.0000, 0.4047 \\ /*\n", - "*/0.2952, -0.1704, 0.3165, 0.3680, -0.2367, -0.1541, -0.2893, 0.4047, 1.0000 )" - ] - }, - { - "cell_type": "markdown", - "id": "725b97fd-8d47-439f-9260-b97bd403c342", - "metadata": {}, - "source": [ - "Test Ilorazu Wiarogodności (ang. LR test) https://www.jstor.org/stable/2287400" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4efab7f6-a854-4dc4-8456-a961b42a3d1f", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata\n", - "/*Nie musimy nigdzie okreslac, iz na wejsciu mamy dane w postaci macierzy\n", - "korelacji. Rozpoczniemy od 2 czynnikow*/\n", - "/*Jesli wykorzystujemy dane w postaci macierzy korelacji, musimy okreslic liczbe obserwacji*/\n", - "\n", - "factormat C, n(123) names(p1 p2 p3 p4 p5 p6 p7 p8 p9) fac(2) ml" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b8064bb2-0926-4d81-9936-81423d61f07e", - "metadata": {}, - "outputs": [], - "source": [ - "C = np.array([[ 1.0000, -0.0385, 0.6066, 0.4507, 0.0320, -0.2877, -0.2974, 0.4526, 0.2952],\n", - "[-0.0385, 1.0000, -0.0693, -0.1167, 0.4881, 0.4271, 0.3045, -0.3090, -0.1704],\n", - "[0.6066, -0.0693, 1.000, 0.5916, 0.0317, -0.1336, -0.2404, 0.5886, 0.3165],\n", - "[0.4507, -0.1167, 0.5916, 1.0000, -0.0802, -0.2073, -0.1850, 0.6286, 0.3680],\n", - "[0.0320, 0.4881, 0.0317, -0.0802, 1.0000, 0.4731, 0.4138, -0.1397, -0.2367],\n", - "[-0.2877, 0.4271, -0.1336, -0.2073, 0.4731, 1.0000, 0.6346, -0.1329, -0.1541],\n", - "[-0.2974, 0.3045, -0.2404, -0.1850, 0.4138, 0.6346, 1.0000, -0.2599, -0.2893],\n", - "[0.4526, -0.3090, 0.5886, 0.6286, -0.1397, -0.1329, -0.2599, 1.0000, 0.4047],\n", - "[0.2952, -0.1704, 0.3165, 0.3680, -0.2367, -0.1541, -0.2893, 0.4047, 1.0000]])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c89669eb-a7bd-450e-9c6d-8fcba38342a1", - "metadata": {}, - "outputs": [], - "source": [ - "np.linalg.eigvals(C) # macierz oddatnio okreslona" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0405f2b3-b008-4cdc-ad78-1b42910ae576", - "metadata": {}, - "outputs": [], - "source": [ - "fa = FactorAnalyzer(rotation = None, is_corr_matrix = True, n_factors = 2, method = 'ml')\n", - "fa.fit(C)\n", - "\n", - "# GET EIGENVALUES\n", - "# Large values of the communalities will indicate that the fitting hyperplane (factors) is rather accurately reproducing the correlation matrix. \n", - "fa.get_uniquenesses(), fa.get_communalities()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b929250-f3aa-48ef-9f32-46f97962e6be", - "metadata": {}, - "outputs": [], - "source": [ - "fa.get_uniquenesses() + fa.get_communalities()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "930a859f-7cca-4352-8ca8-01d71be84a90", - "metadata": {}, - "outputs": [], - "source": [ - "fa.get_factor_variance()[0], fa.get_eigenvalues()[1][0:2]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1d9273bf-7487-48c5-bf64-9165d87e3c39", - "metadata": {}, - "outputs": [], - "source": [ - "nams = [ \"p\" + i for i in list(\"123456789\")]\n", - "loadings = pd.DataFrame(np.column_stack((fa.loadings_, fa.get_uniquenesses())))\n", - "loadings.index = nams\n", - "loadings.columns = [\"Factor1\", \"Factor2\", \"uniquenesses\"] \n", - "loadings" - ] - }, - { - "cell_type": "markdown", - "id": "5fc8b2c6", - "metadata": {}, - "source": [ - "Truncated SVD Directly" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dea69757", - "metadata": {}, - "outputs": [], - "source": [ - "tsvd = TruncatedSVD(2)\n", - "tsvd.fit(C)\n", - "loadings_direct_svd = tsvd.components_.T * np.sqrt(tsvd.explained_variance_)" - ] - }, - { - "cell_type": "markdown", - "id": "001e9054", - "metadata": {}, - "source": [ - "Correlation between FA loadings and direct Truncated SVD loadings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d0be0520", - "metadata": {}, - "outputs": [], - "source": [ - "np.diag(\n", - " corr_mat(\n", - " loadings_direct_svd,\n", - " fa.loadings_\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9f099d12-a22b-427b-9057-2071180da802", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata\n", - "\n", - "/*Wyniki testu -- okazuje sie ze 2 czyniki nie wystarcza (p-value = 0.0000<0.05)*/\n", - "//Probujemy z 3 czynnikami\n", - "\n", - "factormat C, n(123) names(p1 p2 p3 p4 p5 p6 p7 p8 p9) fac(3) ml\n", - "\n", - "/*Na poziomie istotnosci 0,05 brak podstaw do odrzucenia H0 zakladajacej, ze model\n", - "trzyczynnikowy jest adekwatny (wystarczajacy). p-value 0.1055>0.05*/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fac579d2-b14b-4edf-8e35-12542bf71dad", - "metadata": {}, - "outputs": [], - "source": [ - "fa = FactorAnalyzer(rotation = None, is_corr_matrix = True, n_factors = 3, method = 'ml')\n", - "fa.fit(C)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "114e4ecf-9c86-4d8f-972e-91a3c94f7541", - "metadata": {}, - "outputs": [], - "source": [ - "fa.get_factor_variance()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6cf19e06-3788-44c8-b866-7fd9f9364e44", - "metadata": {}, - "outputs": [], - "source": [ - "fa.get_eigenvalues()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2c19b8c7-f645-472a-b641-557fdbd8d086", - "metadata": {}, - "outputs": [], - "source": [ - "pd.Series(fa.get_eigenvalues()[1]).plot()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c946d1de-2f68-4cd0-8157-9abfe69801b9", - "metadata": {}, - "outputs": [], - "source": [ - "nams = [ \"p\" + i for i in list(\"123456789\")]\n", - "loadings = pd.DataFrame(np.column_stack((fa.loadings_, fa.get_uniquenesses())))\n", - "loadings.index = nams\n", - "loadings.columns = [\"Factor1\", \"Factor2\", \"Factor3\", \"uniquenesses\"] \n", - "loadings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3e12fe4e", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata\n", - "/*Sprobujemy nadac czynnikom interpretacje. Przeprowadzamy rotacje czynnikow*/\n", - "rotate, varimax" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5c8b4750-515e-4de4-8af8-8b405feca4e9", - "metadata": {}, - "outputs": [], - "source": [ - "fa = FactorAnalyzer(rotation='varimax', is_corr_matrix = True, n_factors = 3, method = 'ml')\n", - "fa.fit(C)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "022e0b74-ce17-47fb-9f2b-44d41937a461", - "metadata": {}, - "outputs": [], - "source": [ - "#GET EIGENVALUES\n", - "fa.get_uniquenesses(),fa.get_communalities()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4d3f528e-996d-4b3c-a2c3-7fa7dc49e756", - "metadata": {}, - "outputs": [], - "source": [ - "nams = [ \"p\" + i for i in list(\"123456789\")]\n", - "loadings = pd.DataFrame(np.column_stack((fa.loadings_, fa.get_uniquenesses())))\n", - "loadings.index = nams\n", - "loadings.columns = [\"Factor1\", \"Factor2\", \"Factor3\", \"uniquenesses\"] \n", - "loadings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0bdd798f-527f-48c7-9922-23490ca7a5bd", - "metadata": {}, - "outputs": [], - "source": [ - "fa.rotation_matrix_" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b166b19-77fc-43f5-85ff-365908907953", - "metadata": {}, - "outputs": [], - "source": [ - "np.matmul(fa.rotation_matrix_.T, fa.rotation_matrix_)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4a7b9f2-885e-404f-bec8-4eb3000ba251", - "metadata": {}, - "outputs": [], - "source": [ - "# https://www.tandfonline.com/doi/abs/10.1080/10705510701301891?journalCode=hsem20" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d0b147ca", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata\n", - "/*pierwszy czynnik - stwierdzenia 1, 3, 4 i 8 - wszystkie zwiazane z lekarzami; mozemy zinterpretowac jako \"kontrola lekarska bolu\"\n", - " drugi czynnik - stwierdzenia 6 i 7 - bol jako wynik wlasnych dzialan\n", - " trzeci czynnik - stwierdzenia 2 i 5 - znow bol jako wynik wlasnych dzialan.*/\n", - "\n", - "estat smc\n", - "/*oszacowanie czesci wspólnej ->\"communality\" (jaka czesc zmiennej Xi jest zwiazana z pozostalymi zmiennymi X)\n", - "szacowna jako kwadrat wspolczynnika korelacji wielorakiej\n", - "danej zmiennej z pozostalymi (czyli R2 z regresji tej zmiennej na pozostale)*/\n", - "\n", - "estat kmo\n", - "\n", - "/*statystyka adekwatnosci proby Kaiser-Meyer-Olkin.\n", - "Metoda ta polega na porownaniu korelacji i czastkowych korelacji pomiedzy zmiennymi.\n", - "Gdy korelacja czastkowa jest relatywnie wysoka w stosunku do zwyklej korelacji to KMO jest male,\n", - "co oznacza ze uzyskanie adekwatnego rozwiazania w przestrzeni malego wymiaru jest niewykonalne.\n", - "\n", - "Wielkosci wspolczynnika:\n", - "0.00 to 0.49 nie do przyjecia\n", - "0.50 to 0.59 bardzo slaby\n", - "0.60 to 0.69 slaby\n", - "0.70 to 0.79 umiarkowany\n", - "0.80 to 0.89 dobry\n", - "0.90 to 1.00 znakomity*/\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4839275e-2092-4a5e-80e9-362cbce99fcf", - "metadata": {}, - "outputs": [], - "source": [ - "# calculate_bartlett_sphericity(C) not for correlation matrix" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "44d01513-be7c-4b08-b292-c19dad083990", - "metadata": {}, - "outputs": [], - "source": [ - "# calculate_kmo(C) not for correlation matrix" - ] - }, - { - "cell_type": "markdown", - "id": "d6078e46", - "metadata": {}, - "source": [ - "### Przyklad 2 -- Indeks kapitalu spolecznego i problemy z analiza czynnikowa" - ] - }, - { - "cell_type": "markdown", - "id": "acaf4150-ae71-43d1-a509-30b589a372de", - "metadata": {}, - "source": [ - "#### Probujemy stworzyc indeks kapitalu spolecznego" - ] - }, - { - "cell_type": "markdown", - "id": "dd0aee94-451d-40ae-b534-fc4d53bd005a", - "metadata": {}, - "source": [ - "Dane oryginalnie pochodzily z badania World Values Survey" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "43507f16-99c6-469f-bcfb-50fc5db4c359", - "metadata": {}, - "outputs": [], - "source": [ - "from multidim.datasets import load_indeks_spol\n", - "F = load_indeks_spol()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35d74f92-5b9f-4ea5-94fd-076c1122cf86", - "metadata": {}, - "outputs": [], - "source": [ - "%%mata -m F\n", - "st_matrix(\"F\", F)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5e708caf-a0e8-407c-b4a4-61f6e057beba", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata\n", - "//METODA NAJWIEKSZEJ WIARYGODNOSCI\n", - "factormat F, n(35312) names(imp_family imp_friends imp_politics imp_church member_dis political_dis trust_family trust_ppers trust_neighbour trust_arel trust_firsttime trust_anation fair conf_church conf_forces conf_press conf_tv conf_labour conf_police conf_courts conf_govern conf_parties conf_parl religion_freq tradition help local) fac(7) ml\n", - "//za malo" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1fcde1ec-1bd7-4b92-9e6d-34fb18e59a41", - "metadata": {}, - "outputs": [], - "source": [ - "nams = [\"imp_family\", \"imp_friends\", \"imp_politics\", \"imp_church\", \"member_dis\", \"political_dis\", \"trust_family\",\n", - " \"trust_ppers\", \"trust_neighbour\", \"trust_arel\", \"trust_firsttime\", \"trust_anation\", \"fair\", \"conf_church\",\n", - " \"conf_forces\", \"conf_press\", \"conf_tv\", \"conf_labour\", \"conf_police\", \"conf_courts\", \"conf_govern\", \"conf_parties\",\n", - " \"conf_parl\", \"religion_freq\", \"tradition\", \"help\", \"local\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "60d7a783-7a42-467f-a657-e96aca25fe38", - "metadata": {}, - "outputs": [], - "source": [ - "n_factors = 7\n", - "fa = FactorAnalyzer(rotation=None, is_corr_matrix = True, n_factors = n_factors, method = 'ml')\n", - "fa.fit(F)\n", - "loadings = pd.DataFrame(np.column_stack((fa.loadings_, fa.get_uniquenesses())))\n", - "loadings.index = nams\n", - "loadings.columns = [ \"Factor\" + str(i + 1) for i in range(n_factors)] + [\"uniquenesses\"] \n", - "loadings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7643a085-d510-4b2c-a1f1-dfb706ab57d0", - "metadata": {}, - "outputs": [], - "source": [ - "pd.Series(fa.get_eigenvalues()[1]).plot()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e4363894-1a9d-4eb5-a5cd-a3bcb23ff77e", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata \n", - "factormat F, n(35312) names(imp_family imp_friends imp_politics imp_church member_dis political_dis trust_family trust_ppers trust_neighbour trust_arel trust_firsttime trust_anation fair conf_church conf_forces conf_press conf_tv conf_labour conf_police conf_courts conf_govern conf_parties conf_parl religion_freq tradition help local) fac(9) ml\n", - "//HEYWOOD CASE -- negative variance estimate" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1bf940f2-4e0f-4836-8a71-b638d8cb946d", - "metadata": {}, - "outputs": [], - "source": [ - "n_factors = 9\n", - "fa = FactorAnalyzer(rotation=None, is_corr_matrix = True, n_factors = n_factors, method = 'ml')\n", - "fa.fit(F)\n", - "loadings = pd.DataFrame(np.column_stack((fa.loadings_, fa.get_uniquenesses())))\n", - "loadings.index = nams\n", - "loadings.columns = [ \"Factor\" + str(i + 1) for i in range(n_factors)] + [\"uniquenesses\"] \n", - "loadings" - ] - }, - { - "cell_type": "markdown", - "id": "3c002bcf-1fdb-4d6e-85eb-d53269117348", - "metadata": {}, - "source": [ - "### Analiza czynnikowa (FA) przy wykorzytaniu Analizy głównych składowych (PCA) (optymalizacja)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9529f72f", - "metadata": {}, - "outputs": [], - "source": [ - "#%%stata\n", - "#factormat F, n(35312) names(imp_family imp_friends imp_politics imp_church member_dis political_dis trust_family trust_ppers trust_neighbour trust_arel trust_firsttime trust_anation fair conf_church conf_forces conf_press conf_tv conf_labour conf_police conf_courts conf_govern conf_parties conf_parl religion_freq tradition help local) fac(4) pcf\n", - "#//za malo\n", - "#\n", - "#factormat F, n(35312) names(imp_family imp_friends imp_politics imp_church member_dis political_dis trust_family trust_ppers trust_neighbour trust_arel trust_firsttime trust_anation fair conf_church conf_forces conf_press conf_tv conf_labour conf_police conf_courts conf_govern conf_parties conf_parl religion_freq tradition help local) fac(27) pcf\n", - "#//tez nie" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a69f19ab-2eb3-4559-a12f-7068e4887f60", - "metadata": {}, - "outputs": [], - "source": [ - "# n_factors = 10\n", - "# fa = FactorAnalyzer(rotation=None, is_corr_matrix = True, n_factors = n_factors, method = 'principal')\n", - "# fa.fit(F)\n", - "# loadings = pd.DataFrame(np.column_stack((fa.loadings_, fa.get_uniquenesses())))\n", - "# loadings.index = nams\n", - "# loadings.columns = [ \"Factor\" + str(i + 1) for i in range(n_factors)] + [\"uniquenesses\"] \n", - "# loadings" - ] - }, - { - "cell_type": "markdown", - "id": "f5549f40", - "metadata": {}, - "source": [ - "## Przyklad 3 - Scores" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "253695bd", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "from sklearn.decomposition import FactorAnalysis\n", - "from sklearn.preprocessing import StandardScaler, scale\n", - "from scipy.stats import rankdata\n", - "\n", - "from factor_analyzer import FactorAnalyzer\n", - "from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity\n", - "from factor_analyzer.factor_analyzer import calculate_kmo" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7dcf1d83", - "metadata": {}, - "outputs": [], - "source": [ - "from multidim.datasets import load_seul1988\n", - "seul1988 = load_seul1988()\n", - "seul1988 = seul1988.sample(seul1988.shape[0])\n", - "seul1988 = seul1988.query(\"wynik >= 6000\")\n", - "seul1988[\"Subject\"] = list(range(1, seul1988.shape[0]+1, 1))\n", - "seul1988_copy = seul1988.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d6c9a4d", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "seul1988 = seul1988[~np.isnan(seul1988).any(axis=1)]\n", - "seul1988_normal = scale(seul1988)\n", - "fa = FactorAnalysis(n_components = 3, tol = 0.001, svd_method = \"lapack\")\n", - "X = seul1988_normal[:, :-2]\n", - "fa.fit(X)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "24b90c06", - "metadata": {}, - "outputs": [], - "source": [ - "comps = fa.transform(X)" - ] - }, - { - "cell_type": "markdown", - "id": "14d23f7a", - "metadata": {}, - "source": [ - "Interpretacja dla kazdego czynnika ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dce0cffe", - "metadata": {}, - "outputs": [], - "source": [ - "cc = corr_mat(comps, seul1988.iloc[:, :-2]).T\n", - "cc" - ] - }, - { - "cell_type": "markdown", - "id": "13871178", - "metadata": {}, - "source": [ - "Jest duzo roznych metod budowania rankingu." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa4ac1f8", - "metadata": {}, - "outputs": [], - "source": [ - "scores1a = comps[:,0]\n", - "order1a = rankdata(scores1a, \"max\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fbec394e", - "metadata": {}, - "outputs": [], - "source": [ - "scores1b = comps.sum(axis = 1)\n", - "order1b = rankdata(scores1b, \"max\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7a01aae5", - "metadata": {}, - "outputs": [], - "source": [ - "scores2 = fa.score_samples(X)\n", - "order2 = rankdata(scores2, \"max\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4a6e13b2", - "metadata": {}, - "outputs": [], - "source": [ - "fa2 = FactorAnalysis(n_components = 1, tol = 0.001, svd_method = \"lapack\")\n", - "fa2.fit(X)\n", - "comps = fa2.transform(X)\n", - "scores1c = comps.ravel()\n", - "order1c = rankdata(scores1c, \"max\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5440c686", - "metadata": {}, - "outputs": [], - "source": [ - "res = pd.DataFrame({\n", - " 'subject': seul1988['Subject'],\n", - " 'wynik': seul1988['wynik'], \n", - " 'scores_one': scores1c,\n", - " 'rank_one': order1c,\n", - " 'scores_first': scores1a,\n", - " 'rank_first': order1a,\n", - " 'scores_sum': scores1b,\n", - " 'rank_sum': order1b,\n", - " 'scores_loglike': scores2,\n", - " 'rank_loglike': order2\n", - "})\n", - "res['rank_wynik'] = rankdata(res[\"wynik\"].values, \"max\")\n", - "res" - ] - }, - { - "cell_type": "markdown", - "id": "ffa88034", - "metadata": {}, - "source": [ - "Correlation between different rankings - ABS is needed as we do not know sense (zwrot)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3aa48fd6", - "metadata": {}, - "outputs": [], - "source": [ - "from multidim.funs import corr_mat\n", - "rank_cols = [\"rank_one\", \"rank_first\", \"rank_sum\", \"rank_loglike\", \"rank_wynik\"]\n", - "# ABS\n", - "np.abs(np.corrcoef(res.loc[:,rank_cols].T))[-1,:]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a777eb20", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - }, - "vscode": { - "interpreter": { - "hash": "aa9d594cf5340018b76a70eeac417329866dfddee009818da806e7b1b5f80883" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "cells": [ + { + "cell_type": "markdown", + "id": "2a48c727", + "metadata": { + "tags": [] + }, + "source": [ + "# Analiza Wielowymiarowa - zajecia 6 - Analiza czynnikowa" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32b52329", + "metadata": {}, + "outputs": [], + "source": [ + "from multidim.utils import resolve_stata, load_stata\n", + "\n", + "STATA_PATH, STATA_TYPE = resolve_stata(version = 18, stype = \"se\")\n", + "# make sure they are proper ones\n", + "STATA_PATH, STATA_TYPE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40123e5a", + "metadata": {}, + "outputs": [], + "source": [ + "load_stata(STATA_PATH, STATA_TYPE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1df4db13", + "metadata": {}, + "outputs": [], + "source": [ + "# Załadowanie bibliotek\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "#https://scikit-learn.org/stable/modules/decomposition.html#fa\n", + "#https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FactorAnalysis.html\n", + "from sklearn import decomposition\n", + "from sklearn.decomposition import FactorAnalysis\n", + "from sklearn.preprocessing import StandardScaler\n", + "#https://factor-analyzer.readthedocs.io/en/latest/factor_analyzer.html\n", + "from factor_analyzer import FactorAnalyzer\n", + "from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity\n", + "from factor_analyzer.factor_analyzer import calculate_kmo\n", + "\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.decomposition import TruncatedSVD \n", + "from scipy.linalg import svd\n", + "\n", + "from multidim.funs import corr_mat\n" + ] + }, + { + "cell_type": "markdown", + "id": "3d84d8e6-cda6-49dc-b436-02eca4735e97", + "metadata": {}, + "source": [ + "## Analiza Czynnikowa" + ] + }, + { + "cell_type": "markdown", + "id": "e91bc599", + "metadata": {}, + "source": [ + "### Przyklad 1 -- zaczerpniety z materialow pani Natalii Nehrebeckiej" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c97bb918", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "/*Ponizej przedstawimy analize czynnikowa metoda najwiekszej wiarogodnosci. Zaleca\n", + "sie na wstepie analize skladowych glownych, aby ustalic przyblizona liczbe czynnikow*/\n", + "\n", + "/*Dane wejsciowe do analizy czynnikowej moga miec postac macierzy kowariancji lub korelacji.\n", + "Posluzymy sie danymi pochodzacymi z badania przeprowadzonego na 123 osobach cierpiacych\n", + "z powodu silnych napadow bolu. Poproszono ich o wydanie opinii na skali od 1 do 6\n", + "(1-calkowicie sie zgadzam, 6-nie zgadzam sie) na temat 9 oswiadczen na temat bolu.\n", + "\n", + "Ponizej lista zmiennych:\n", + "1. To, czy bede cierpial z powodu bolu w przyszlosci zalezy od lekarza.\n", + "2. To, czy bede cierpial z powodu bolu, zalezy zwykle od tego, czy cos zrobilem lub nie\n", + " zrobilem.\n", + "3. To, czy bede cierpial z powodu bolu, zalezy od tego, co zrobi dla mnie lekarz.\n", + "4. Nie moge poradzic sobie z bolem, dopoki nie skorzystam z pomocy medycznej.\n", + "5. Jesli czuje bol, to jest to spowodowane tym, iz nie wykonywalem odpowiednich cwiczen lub\n", + " nieprawidlowo sie odzywialem.\n", + "6. Bol jest wynikiem zaniedbania.\n", + "7. Jestem calkowicie odpowiedzialny za moj bol.\n", + "8. Pozbycie sie bolu jest kontrolowane przez doktora.\n", + "9. Ludzie, ktorzy nigdy nie cierpia z powodu bolu, sa szczesciarzami.*/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf289888-8057-4819-bd98-29ad7de08189", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata -qui\n", + "matrix C = ( 1.0000, -0.0385, 0.6066, 0.4507, 0.0320, -0.2877, -0.2974, 0.4526, 0.2952 \\/*\n", + "*/ -0.0385, 1.0000, -0.0693, -0.1167, 0.4881, 0.4271, 0.3045, -0.3090, -0.1704 \\/*\n", + "*/ 0.6066, -0.0693, 1.000, 0.5916, 0.0317, -0.1336, -0.2404, 0.5886, 0.3165 \\ /*\n", + "*/0.4507, -0.1167, 0.5916, 1.0000, -0.0802, -0.2073, -0.1850, 0.6286, 0.3680 \\ /*\n", + "*/0.0320, 0.4881, 0.0317, -0.0802, 1.0000, 0.4731, 0.4138, -0.1397, -0.2367 \\ /*\n", + "*/-0.2877, 0.4271, -0.1336, -0.2073, 0.4731, 1.0000, 0.6346, -0.1329, -0.1541 \\ /*\n", + "*/-0.2974, 0.3045, -0.2404, -0.1850, 0.4138, 0.6346, 1.0000, -0.2599, -0.2893 \\ /*\n", + "*/0.4526, -0.3090, 0.5886, 0.6286, -0.1397, -0.1329, -0.2599, 1.0000, 0.4047 \\ /*\n", + "*/0.2952, -0.1704, 0.3165, 0.3680, -0.2367, -0.1541, -0.2893, 0.4047, 1.0000 )" + ] + }, + { + "cell_type": "markdown", + "id": "725b97fd-8d47-439f-9260-b97bd403c342", + "metadata": {}, + "source": [ + "Test Ilorazu Wiarogodności (ang. LR test) https://www.jstor.org/stable/2287400" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4efab7f6-a854-4dc4-8456-a961b42a3d1f", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "/*Nie musimy nigdzie okreslac, iz na wejsciu mamy dane w postaci macierzy\n", + "korelacji. Rozpoczniemy od 2 czynnikow*/\n", + "/*Jesli wykorzystujemy dane w postaci macierzy korelacji, musimy okreslic liczbe obserwacji*/\n", + "\n", + "factormat C, n(123) names(p1 p2 p3 p4 p5 p6 p7 p8 p9) fac(2) ml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8064bb2-0926-4d81-9936-81423d61f07e", + "metadata": {}, + "outputs": [], + "source": [ + "C = np.array([[ 1.0000, -0.0385, 0.6066, 0.4507, 0.0320, -0.2877, -0.2974, 0.4526, 0.2952],\n", + "[-0.0385, 1.0000, -0.0693, -0.1167, 0.4881, 0.4271, 0.3045, -0.3090, -0.1704],\n", + "[0.6066, -0.0693, 1.000, 0.5916, 0.0317, -0.1336, -0.2404, 0.5886, 0.3165],\n", + "[0.4507, -0.1167, 0.5916, 1.0000, -0.0802, -0.2073, -0.1850, 0.6286, 0.3680],\n", + "[0.0320, 0.4881, 0.0317, -0.0802, 1.0000, 0.4731, 0.4138, -0.1397, -0.2367],\n", + "[-0.2877, 0.4271, -0.1336, -0.2073, 0.4731, 1.0000, 0.6346, -0.1329, -0.1541],\n", + "[-0.2974, 0.3045, -0.2404, -0.1850, 0.4138, 0.6346, 1.0000, -0.2599, -0.2893],\n", + "[0.4526, -0.3090, 0.5886, 0.6286, -0.1397, -0.1329, -0.2599, 1.0000, 0.4047],\n", + "[0.2952, -0.1704, 0.3165, 0.3680, -0.2367, -0.1541, -0.2893, 0.4047, 1.0000]])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c89669eb-a7bd-450e-9c6d-8fcba38342a1", + "metadata": {}, + "outputs": [], + "source": [ + "np.linalg.eigvals(C) # macierz oddatnio okreslona" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0405f2b3-b008-4cdc-ad78-1b42910ae576", + "metadata": {}, + "outputs": [], + "source": [ + "fa = FactorAnalyzer(rotation = None, is_corr_matrix = True, n_factors = 2, method = 'ml')\n", + "fa.fit(C)\n", + "\n", + "# GET EIGENVALUES\n", + "# Large values of the communalities will indicate that the fitting hyperplane (factors) is rather accurately reproducing the correlation matrix. \n", + "fa.get_uniquenesses(), fa.get_communalities()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b929250-f3aa-48ef-9f32-46f97962e6be", + "metadata": {}, + "outputs": [], + "source": [ + "fa.get_uniquenesses() + fa.get_communalities()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "930a859f-7cca-4352-8ca8-01d71be84a90", + "metadata": {}, + "outputs": [], + "source": [ + "fa.get_factor_variance()[0], fa.get_eigenvalues()[1][0:2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d9273bf-7487-48c5-bf64-9165d87e3c39", + "metadata": {}, + "outputs": [], + "source": [ + "nams = [ \"p\" + i for i in list(\"123456789\")]\n", + "loadings = pd.DataFrame(np.column_stack((fa.loadings_, fa.get_uniquenesses())))\n", + "loadings.index = nams\n", + "loadings.columns = [\"Factor1\", \"Factor2\", \"uniquenesses\"] \n", + "loadings" + ] + }, + { + "cell_type": "markdown", + "id": "5fc8b2c6", + "metadata": {}, + "source": [ + "Truncated SVD Directly" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dea69757", + "metadata": {}, + "outputs": [], + "source": [ + "tsvd = TruncatedSVD(2)\n", + "tsvd.fit(C)\n", + "loadings_direct_svd = tsvd.components_.T * np.sqrt(tsvd.explained_variance_)" + ] + }, + { + "cell_type": "markdown", + "id": "001e9054", + "metadata": {}, + "source": [ + "Correlation between FA loadings and direct Truncated SVD loadings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0be0520", + "metadata": {}, + "outputs": [], + "source": [ + "np.diag(\n", + " corr_mat(\n", + " loadings_direct_svd,\n", + " fa.loadings_\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f099d12-a22b-427b-9057-2071180da802", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "\n", + "/*Wyniki testu -- okazuje sie ze 2 czyniki nie wystarcza (p-value = 0.0000<0.05)*/\n", + "//Probujemy z 3 czynnikami\n", + "\n", + "factormat C, n(123) names(p1 p2 p3 p4 p5 p6 p7 p8 p9) fac(3) ml\n", + "\n", + "/*Na poziomie istotnosci 0,05 brak podstaw do odrzucenia H0 zakladajacej, ze model\n", + "trzyczynnikowy jest adekwatny (wystarczajacy). p-value 0.1055>0.05*/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fac579d2-b14b-4edf-8e35-12542bf71dad", + "metadata": {}, + "outputs": [], + "source": [ + "fa = FactorAnalyzer(rotation = None, is_corr_matrix = True, n_factors = 3, method = 'ml')\n", + "fa.fit(C)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "114e4ecf-9c86-4d8f-972e-91a3c94f7541", + "metadata": {}, + "outputs": [], + "source": [ + "fa.get_factor_variance()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6cf19e06-3788-44c8-b866-7fd9f9364e44", + "metadata": {}, + "outputs": [], + "source": [ + "fa.get_eigenvalues()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c19b8c7-f645-472a-b641-557fdbd8d086", + "metadata": {}, + "outputs": [], + "source": [ + "pd.Series(fa.get_eigenvalues()[1]).plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c946d1de-2f68-4cd0-8157-9abfe69801b9", + "metadata": {}, + "outputs": [], + "source": [ + "nams = [ \"p\" + i for i in list(\"123456789\")]\n", + "loadings = pd.DataFrame(np.column_stack((fa.loadings_, fa.get_uniquenesses())))\n", + "loadings.index = nams\n", + "loadings.columns = [\"Factor1\", \"Factor2\", \"Factor3\", \"uniquenesses\"] \n", + "loadings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e12fe4e", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "/*Sprobujemy nadac czynnikom interpretacje. Przeprowadzamy rotacje czynnikow*/\n", + "rotate, varimax" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c8b4750-515e-4de4-8af8-8b405feca4e9", + "metadata": {}, + "outputs": [], + "source": [ + "fa = FactorAnalyzer(rotation='varimax', is_corr_matrix = True, n_factors = 3, method = 'ml')\n", + "fa.fit(C)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "022e0b74-ce17-47fb-9f2b-44d41937a461", + "metadata": {}, + "outputs": [], + "source": [ + "#GET EIGENVALUES\n", + "fa.get_uniquenesses(),fa.get_communalities()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d3f528e-996d-4b3c-a2c3-7fa7dc49e756", + "metadata": {}, + "outputs": [], + "source": [ + "nams = [ \"p\" + i for i in list(\"123456789\")]\n", + "loadings = pd.DataFrame(np.column_stack((fa.loadings_, fa.get_uniquenesses())))\n", + "loadings.index = nams\n", + "loadings.columns = [\"Factor1\", \"Factor2\", \"Factor3\", \"uniquenesses\"] \n", + "loadings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0bdd798f-527f-48c7-9922-23490ca7a5bd", + "metadata": {}, + "outputs": [], + "source": [ + "fa.rotation_matrix_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b166b19-77fc-43f5-85ff-365908907953", + "metadata": {}, + "outputs": [], + "source": [ + "np.matmul(fa.rotation_matrix_.T, fa.rotation_matrix_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4a7b9f2-885e-404f-bec8-4eb3000ba251", + "metadata": {}, + "outputs": [], + "source": [ + "# https://www.tandfonline.com/doi/abs/10.1080/10705510701301891?journalCode=hsem20" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0b147ca", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "/*pierwszy czynnik - stwierdzenia 1, 3, 4 i 8 - wszystkie zwiazane z lekarzami; mozemy zinterpretowac jako \"kontrola lekarska bolu\"\n", + " drugi czynnik - stwierdzenia 6 i 7 - bol jako wynik wlasnych dzialan\n", + " trzeci czynnik - stwierdzenia 2 i 5 - znow bol jako wynik wlasnych dzialan.*/\n", + "\n", + "estat smc\n", + "/*oszacowanie czesci wspólnej ->\"communality\" (jaka czesc zmiennej Xi jest zwiazana z pozostalymi zmiennymi X)\n", + "szacowna jako kwadrat wspolczynnika korelacji wielorakiej\n", + "danej zmiennej z pozostalymi (czyli R2 z regresji tej zmiennej na pozostale)*/\n", + "\n", + "estat kmo\n", + "\n", + "/*statystyka adekwatnosci proby Kaiser-Meyer-Olkin.\n", + "Metoda ta polega na porownaniu korelacji i czastkowych korelacji pomiedzy zmiennymi.\n", + "Gdy korelacja czastkowa jest relatywnie wysoka w stosunku do zwyklej korelacji to KMO jest male,\n", + "co oznacza ze uzyskanie adekwatnego rozwiazania w przestrzeni malego wymiaru jest niewykonalne.\n", + "\n", + "Wielkosci wspolczynnika:\n", + "0.00 to 0.49 nie do przyjecia\n", + "0.50 to 0.59 bardzo slaby\n", + "0.60 to 0.69 slaby\n", + "0.70 to 0.79 umiarkowany\n", + "0.80 to 0.89 dobry\n", + "0.90 to 1.00 znakomity*/\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4839275e-2092-4a5e-80e9-362cbce99fcf", + "metadata": {}, + "outputs": [], + "source": [ + "# calculate_bartlett_sphericity(C) not for correlation matrix" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44d01513-be7c-4b08-b292-c19dad083990", + "metadata": {}, + "outputs": [], + "source": [ + "# calculate_kmo(C) not for correlation matrix" + ] + }, + { + "cell_type": "markdown", + "id": "d6078e46", + "metadata": {}, + "source": [ + "### Przyklad 2 -- Indeks kapitalu spolecznego i problemy z analiza czynnikowa" + ] + }, + { + "cell_type": "markdown", + "id": "acaf4150-ae71-43d1-a509-30b589a372de", + "metadata": {}, + "source": [ + "#### Probujemy stworzyc indeks kapitalu spolecznego" + ] + }, + { + "cell_type": "markdown", + "id": "dd0aee94-451d-40ae-b534-fc4d53bd005a", + "metadata": {}, + "source": [ + "Dane oryginalnie pochodzily z badania World Values Survey" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43507f16-99c6-469f-bcfb-50fc5db4c359", + "metadata": {}, + "outputs": [], + "source": [ + "from multidim.datasets import load_indeks_spol\n", + "F = load_indeks_spol()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35d74f92-5b9f-4ea5-94fd-076c1122cf86", + "metadata": {}, + "outputs": [], + "source": [ + "%%mata -m F\n", + "st_matrix(\"F\", F)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e708caf-a0e8-407c-b4a4-61f6e057beba", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "//METODA NAJWIEKSZEJ WIARYGODNOSCI\n", + "factormat F, n(35312) names(imp_family imp_friends imp_politics imp_church member_dis political_dis trust_family trust_ppers trust_neighbour trust_arel trust_firsttime trust_anation fair conf_church conf_forces conf_press conf_tv conf_labour conf_police conf_courts conf_govern conf_parties conf_parl religion_freq tradition help local) fac(7) ml\n", + "//za malo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fcde1ec-1bd7-4b92-9e6d-34fb18e59a41", + "metadata": {}, + "outputs": [], + "source": [ + "nams = [\"imp_family\", \"imp_friends\", \"imp_politics\", \"imp_church\", \"member_dis\", \"political_dis\", \"trust_family\",\n", + " \"trust_ppers\", \"trust_neighbour\", \"trust_arel\", \"trust_firsttime\", \"trust_anation\", \"fair\", \"conf_church\",\n", + " \"conf_forces\", \"conf_press\", \"conf_tv\", \"conf_labour\", \"conf_police\", \"conf_courts\", \"conf_govern\", \"conf_parties\",\n", + " \"conf_parl\", \"religion_freq\", \"tradition\", \"help\", \"local\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60d7a783-7a42-467f-a657-e96aca25fe38", + "metadata": {}, + "outputs": [], + "source": [ + "n_factors = 7\n", + "fa = FactorAnalyzer(rotation=None, is_corr_matrix = True, n_factors = n_factors, method = 'ml')\n", + "fa.fit(F)\n", + "loadings = pd.DataFrame(np.column_stack((fa.loadings_, fa.get_uniquenesses())))\n", + "loadings.index = nams\n", + "loadings.columns = [ \"Factor\" + str(i + 1) for i in range(n_factors)] + [\"uniquenesses\"] \n", + "loadings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7643a085-d510-4b2c-a1f1-dfb706ab57d0", + "metadata": {}, + "outputs": [], + "source": [ + "pd.Series(fa.get_eigenvalues()[1]).plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4363894-1a9d-4eb5-a5cd-a3bcb23ff77e", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata \n", + "factormat F, n(35312) names(imp_family imp_friends imp_politics imp_church member_dis political_dis trust_family trust_ppers trust_neighbour trust_arel trust_firsttime trust_anation fair conf_church conf_forces conf_press conf_tv conf_labour conf_police conf_courts conf_govern conf_parties conf_parl religion_freq tradition help local) fac(9) ml\n", + "//HEYWOOD CASE -- negative variance estimate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bf940f2-4e0f-4836-8a71-b638d8cb946d", + "metadata": {}, + "outputs": [], + "source": [ + "n_factors = 9\n", + "fa = FactorAnalyzer(rotation=None, is_corr_matrix = True, n_factors = n_factors, method = 'ml')\n", + "fa.fit(F)\n", + "loadings = pd.DataFrame(np.column_stack((fa.loadings_, fa.get_uniquenesses())))\n", + "loadings.index = nams\n", + "loadings.columns = [ \"Factor\" + str(i + 1) for i in range(n_factors)] + [\"uniquenesses\"] \n", + "loadings" + ] + }, + { + "cell_type": "markdown", + "id": "3c002bcf-1fdb-4d6e-85eb-d53269117348", + "metadata": {}, + "source": [ + "### Analiza czynnikowa (FA) przy wykorzytaniu Analizy głównych składowych (PCA) (optymalizacja)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9529f72f", + "metadata": {}, + "outputs": [], + "source": [ + "#%%stata\n", + "#factormat F, n(35312) names(imp_family imp_friends imp_politics imp_church member_dis political_dis trust_family trust_ppers trust_neighbour trust_arel trust_firsttime trust_anation fair conf_church conf_forces conf_press conf_tv conf_labour conf_police conf_courts conf_govern conf_parties conf_parl religion_freq tradition help local) fac(4) pcf\n", + "#//za malo\n", + "#\n", + "#factormat F, n(35312) names(imp_family imp_friends imp_politics imp_church member_dis political_dis trust_family trust_ppers trust_neighbour trust_arel trust_firsttime trust_anation fair conf_church conf_forces conf_press conf_tv conf_labour conf_police conf_courts conf_govern conf_parties conf_parl religion_freq tradition help local) fac(27) pcf\n", + "#//tez nie" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a69f19ab-2eb3-4559-a12f-7068e4887f60", + "metadata": {}, + "outputs": [], + "source": [ + "# n_factors = 10\n", + "# fa = FactorAnalyzer(rotation=None, is_corr_matrix = True, n_factors = n_factors, method = 'principal')\n", + "# fa.fit(F)\n", + "# loadings = pd.DataFrame(np.column_stack((fa.loadings_, fa.get_uniquenesses())))\n", + "# loadings.index = nams\n", + "# loadings.columns = [ \"Factor\" + str(i + 1) for i in range(n_factors)] + [\"uniquenesses\"] \n", + "# loadings" + ] + }, + { + "cell_type": "markdown", + "id": "f5549f40", + "metadata": {}, + "source": [ + "## Przyklad 3 - Scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "253695bd", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from sklearn.decomposition import FactorAnalysis\n", + "from sklearn.preprocessing import StandardScaler, scale\n", + "from scipy.stats import rankdata\n", + "\n", + "from factor_analyzer import FactorAnalyzer\n", + "from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity\n", + "from factor_analyzer.factor_analyzer import calculate_kmo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7dcf1d83", + "metadata": {}, + "outputs": [], + "source": [ + "from multidim.datasets import load_seul1988\n", + "seul1988 = load_seul1988()\n", + "seul1988 = seul1988.sample(seul1988.shape[0])\n", + "seul1988 = seul1988.query(\"wynik >= 6000\")\n", + "seul1988[\"Subject\"] = list(range(1, seul1988.shape[0]+1, 1))\n", + "seul1988_copy = seul1988.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d6c9a4d", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "seul1988 = seul1988[~np.isnan(seul1988).any(axis=1)]\n", + "seul1988_normal = scale(seul1988)\n", + "fa = FactorAnalysis(n_components = 3, tol = 0.001, svd_method = \"lapack\")\n", + "X = seul1988_normal[:, :-2]\n", + "fa.fit(X)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24b90c06", + "metadata": {}, + "outputs": [], + "source": [ + "comps = fa.transform(X)" + ] + }, + { + "cell_type": "markdown", + "id": "14d23f7a", + "metadata": {}, + "source": [ + "Interpretacja dla kazdego czynnika ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dce0cffe", + "metadata": {}, + "outputs": [], + "source": [ + "cc = corr_mat(comps, seul1988.iloc[:, :-2]).T\n", + "cc" + ] + }, + { + "cell_type": "markdown", + "id": "13871178", + "metadata": {}, + "source": [ + "Jest duzo roznych metod budowania rankingu." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa4ac1f8", + "metadata": {}, + "outputs": [], + "source": [ + "scores1a = comps[:,0]\n", + "order1a = rankdata(scores1a, \"max\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbec394e", + "metadata": {}, + "outputs": [], + "source": [ + "scores1b = comps.sum(axis = 1)\n", + "order1b = rankdata(scores1b, \"max\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a01aae5", + "metadata": {}, + "outputs": [], + "source": [ + "scores2 = fa.score_samples(X)\n", + "order2 = rankdata(scores2, \"max\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a6e13b2", + "metadata": {}, + "outputs": [], + "source": [ + "fa2 = FactorAnalysis(n_components = 1, tol = 0.001, svd_method = \"lapack\")\n", + "fa2.fit(X)\n", + "comps = fa2.transform(X)\n", + "scores1c = comps.ravel()\n", + "order1c = rankdata(scores1c, \"max\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5440c686", + "metadata": {}, + "outputs": [], + "source": [ + "res = pd.DataFrame({\n", + " 'subject': seul1988['Subject'],\n", + " 'wynik': seul1988['wynik'], \n", + " 'scores_one': scores1c,\n", + " 'rank_one': order1c,\n", + " 'scores_first': scores1a,\n", + " 'rank_first': order1a,\n", + " 'scores_sum': scores1b,\n", + " 'rank_sum': order1b,\n", + " 'scores_loglike': scores2,\n", + " 'rank_loglike': order2\n", + "})\n", + "res['rank_wynik'] = rankdata(res[\"wynik\"].values, \"max\")\n", + "res" + ] + }, + { + "cell_type": "markdown", + "id": "ffa88034", + "metadata": {}, + "source": [ + "Correlation between different rankings - ABS is needed as we do not know sense (zwrot)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3aa48fd6", + "metadata": {}, + "outputs": [], + "source": [ + "from multidim.funs import corr_mat\n", + "rank_cols = [\"rank_one\", \"rank_first\", \"rank_sum\", \"rank_loglike\", \"rank_wynik\"]\n", + "# ABS\n", + "np.abs(np.corrcoef(res.loc[:,rank_cols].T))[-1,:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a777eb20", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + }, + "vscode": { + "interpreter": { + "hash": "aa9d594cf5340018b76a70eeac417329866dfddee009818da806e7b1b5f80883" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/src/multidim/notebooks/06_pca.ipynb b/src/multidim/notebooks/06_pca.ipynb index ee49dd3..8f7dc07 100644 --- a/src/multidim/notebooks/06_pca.ipynb +++ b/src/multidim/notebooks/06_pca.ipynb @@ -1,967 +1,967 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "2a48c727", - "metadata": { - "tags": [] - }, - "source": [ - "# Analiza Wielowymiarowa - zajecia 6 - Analiza głównych składowych (PCA)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "32b52329", - "metadata": {}, - "outputs": [], - "source": [ - "from multidim.utils import resolve_stata, load_stata\n", - "\n", - "STATA_PATH, STATA_TYPE = resolve_stata(version = 17, stype = \"se\")\n", - "# make sure they are proper ones\n", - "STATA_PATH, STATA_TYPE" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "40123e5a", - "metadata": {}, - "outputs": [], - "source": [ - "load_stata(STATA_PATH, STATA_TYPE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1df4db13", - "metadata": {}, - "outputs": [], - "source": [ - "# Załadowanie bibliotek\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "from sklearn.decomposition import PCA\n", - "from scipy.linalg import svd\n", - "\n", - "from sklearn import datasets\n", - "\n", - "import statsmodels.api as sm\n", - "from statsmodels.stats.outliers_influence import variance_inflation_factor\n", - "\n", - "\n", - "from multidim.funs import corr_mat" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f5417c75", - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(1234)" - ] - }, - { - "cell_type": "markdown", - "id": "9979aab1-fd4c-4c58-9f07-632efd3bff5c", - "metadata": {}, - "source": [ - "## Przykład 1 - Dekompozycja głównych składowych (SVD) a wartości własne (Eigenvalues) and Internals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2f8470d0-1e77-409a-a2e3-8578580f53a1", - "metadata": {}, - "outputs": [], - "source": [ - "iris_set = datasets.load_iris()\n", - "iris_x = iris_set[\"data\"]\n", - "# CENTRING\n", - "iris_x_centred = iris_set[\"data\"] - np.mean(iris_set[\"data\"], axis = 0)\n", - "#SCALING\n", - "iris_x_scaled = (iris_set[\"data\"] - np.mean(iris_set[\"data\"], axis = 0))/np.std(iris_set[\"data\"], axis = 0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5e811df0-5797-460c-90db-c938d06d36ce", - "metadata": {}, - "outputs": [], - "source": [ - "# almost always centring\n", - "# in most scenerios full scaling\n", - "# correlation matrix = scaling" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e126cc6a-d481-4add-b637-c51ad30c24dc", - "metadata": {}, - "outputs": [], - "source": [ - "# when X CENTRED -> X`X/n = COV(X)\n", - "np.allclose((iris_x_centred.T @ iris_x_centred) / (iris_x_centred.shape[0] - 1), np.cov(iris_x_centred.T, ddof = 1))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b06ff9c-8155-4a8f-8c68-37c5e60334e1", - "metadata": {}, - "outputs": [], - "source": [ - "# when X SCALED -> Corr(X) = COV(X)\n", - "np.allclose(np.corrcoef(iris_x_scaled.T), np.cov(iris_x_scaled.T, ddof=0))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "debad822-40b7-424b-a40a-bb48e6cdabb1", - "metadata": {}, - "outputs": [], - "source": [ - "# SVD -> X = U * S * Vt\n", - "u, s, vh = np.linalg.svd(iris_x_scaled, full_matrices = False)\n", - "np.allclose(iris_x_scaled, u @ np.diag(s) @ vh)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "99d7348e-76c1-4d53-ba40-52bdb0103fd8", - "metadata": {}, - "outputs": [], - "source": [ - "# Eigen -> C = V * S * Vt\n", - "s_eigen, v_eigen = np.linalg.eigh(np.corrcoef(iris_x.T))\n", - "np.allclose(np.corrcoef(iris_x.T), v_eigen @ np.diag(s_eigen) @ v_eigen.T)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ac221e9e", - "metadata": {}, - "outputs": [], - "source": [ - "# Unit disc\n", - "# https://en.wikipedia.org/wiki/Singular_value_decomposition#/media/File:Singular-Value-Decomposition.svg\n", - "# SVD could be run for square or long shape\n", - "corr_iris_x = np.corrcoef(iris_x.T)\n", - "u_s, s_s, vh_s = np.linalg.svd(corr_iris_x, full_matrices = False)\n", - "assert np.allclose(\n", - " np.diag(np.ones((4))) @ corr_iris_x, \n", - " corr_iris_x\n", - ")\n", - "assert np.allclose(\n", - " np.diag(np.ones((4))) @ u_s @ np.diag(s_s) @ vh_s, \n", - " corr_iris_x\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b3de809-2cd4-4406-a539-fc25a3c23128", - "metadata": {}, - "outputs": [], - "source": [ - "# principal components\n", - "pca_svd = pd.DataFrame(iris_x_scaled @ vh.T)\n", - "pca_svd_alt = pd.DataFrame(u @ np.diag(s))\n", - "assert np.allclose(pca_svd, pca_svd_alt)\n", - "pca_eigen = pd.DataFrame(iris_x_scaled @ v_eigen)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92a51609", - "metadata": {}, - "outputs": [], - "source": [ - "# direction is stable (kierunek) but not sense (zwrot)\n", - "# from sklearn.utils.extmath import svd_flip\n", - "# Sign correction to ensure deterministic output from SVD.\n", - "# Adjusts the columns of u and the rows of v such that the loadings in the\n", - "# columns in u that are largest in absolute value are always positive." - ] - }, - { - "cell_type": "markdown", - "id": "126b29fa", - "metadata": {}, - "source": [ - "Ładunki czynnikowe (ang. Loadings )\n", - "\n", - "Korelacje między oryginalnymi zmiennymi a składowymi głównymi (correlations between the original variables and the principal components)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ece668ca-3ff2-4687-8533-ea53512909ff", - "metadata": {}, - "outputs": [], - "source": [ - "pca_nams = ['PC1', 'PC2', 'PC3', 'PC4']\n", - "feature_nams = iris_set[\"feature_names\"]\n", - "# loadings from sklearn\n", - "pca = PCA(n_components = 4)\n", - "X = pca.fit_transform(iris_x_scaled)\n", - "loadings1 = pd.DataFrame(pca.components_.T * np.sqrt(pca.explained_variance_), index = pca_nams, columns = feature_nams)\n", - "# Loadings from numpy svd decomposition\n", - "loadings2 = pd.DataFrame(vh.T * s/np.sqrt(iris_x_scaled.shape[0] - 1), index = pca_nams, columns = feature_nams)\n", - "# Loadings by directly getting correlations between variables and prinicpal components\n", - "loadings3 = pd.DataFrame(corr_mat(pca_svd, iris_x_scaled).values, columns = pca_nams, index = feature_nams)\n", - "loadings1.T, loadings2.T, loadings3" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c148d158-2316-4537-af2c-b6ee0f031614", - "metadata": {}, - "outputs": [], - "source": [ - "# e.g. first PCA component == original data * first rotation\n", - "assert np.allclose(\n", - " np.round(pca_svd.iloc[:,0].values, 2), \n", - " np.round(vh.T[:, 0] @ iris_x_scaled.T, 2)\n", - ")\n", - "ev = pd.DataFrame(vh.T)\n", - "ev.index = iris_set.feature_names\n", - "ev" - ] - }, - { - "cell_type": "markdown", - "id": "ab7ae6bb", - "metadata": {}, - "source": [ - "Korelacje pomiędzy czynnikami głównymi (ang. correlation between principal components)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d866375c-ac54-4099-b77c-4535b4ac91e6", - "metadata": {}, - "outputs": [], - "source": [ - "pca_svd.corr(), pca_eigen.corr()" - ] - }, - { - "cell_type": "markdown", - "id": "46c2e1e7", - "metadata": {}, - "source": [ - "Wartości własne (ang. Eigenvalues)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "02808a2d-3b65-462c-99f0-b56783822723", - "metadata": {}, - "outputs": [], - "source": [ - "index_eigen = [i[0] for i in sorted(enumerate(s_eigen), reverse=True, key = lambda x: x[1])]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f0611c6f-ab43-4ffe-898e-c3d9e5c2e606", - "metadata": {}, - "outputs": [], - "source": [ - "pd.DataFrame(s**2 / (iris_x_scaled.shape[0]), s_eigen[index_eigen])" - ] - }, - { - "cell_type": "markdown", - "id": "0da92302", - "metadata": {}, - "source": [ - "Pierwsze wiersze czynników głównych (ang. First rows of principal components)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "482ee345-da56-427b-b67d-e9bd706999a0", - "metadata": {}, - "outputs": [], - "source": [ - "pca_svd.head(), pca_eigen[index_eigen].head()" - ] - }, - { - "cell_type": "markdown", - "id": "b80815e0-c5ba-44a6-ab03-2be923f3599b", - "metadata": {}, - "source": [ - "### PCA wykres na podstawie 2 pierwszych komponentow\n", - "\n", - "Prosze pamietac ze informacja o odmianach iris-ow (y) nie byla wykorzystana przy obliczaniu PCA" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48cbc312-366e-4946-8d91-c0e21ed7a437", - "metadata": {}, - "outputs": [], - "source": [ - "pca_svd.plot(\n", - " x = 0,\n", - " y = 1, \n", - " color = pd.Series(iris_set[\"target\"]).map({0: \"b\", 1: \"r\", 2: \"y\"}), \n", - " kind = \"scatter\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "b58d0e58", - "metadata": {}, - "source": [ - "Iloraz wiariancji pokazuje udział kolejnych komponentow" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5494a6fc-0bd5-4606-a775-3aefc6829df0", - "metadata": {}, - "outputs": [], - "source": [ - "s_eigen[index_eigen] / np.sum(s_eigen),\\\n", - "s**2 / np.sum(s**2)" - ] - }, - { - "cell_type": "markdown", - "id": "5eda8fd6-30fd-426f-a102-341716622462", - "metadata": {}, - "source": [ - "## Przyklad 2 -- seul1988 -- wyniki dziesiecioboju mezczyzn w Seulu w 1988" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d48de20f-c1bf-42a4-9be4-25cb9ee922a2", - "metadata": {}, - "outputs": [], - "source": [ - "from multidim.datasets import load_seul1988\n", - "seul1988 = load_seul1988()\n", - "seul1988_copy = seul1988.copy()" - ] - }, - { - "cell_type": "markdown", - "id": "ffce172b-2765-49ff-bb3a-9d24de7e8e9f", - "metadata": {}, - "source": [ - "Sprawdzamy czy w zbiorze danych sa obserwacje nietypowe - zmienna wynik" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5826a1e-ee1d-4a93-b523-7a22c229fef8", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata -d seul1988_copy -force\n", - "des" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22c503b4", - "metadata": {}, - "outputs": [], - "source": [ - "seul1988.shape, seul1988.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d03dbf91", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata\n", - "graph box wynik, title(\"Wykres pudelkowy\")\n", - "/*Istnieje jedna potencjalna obserwacja nietypowa.*/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3e6d72e0", - "metadata": {}, - "outputs": [], - "source": [ - "seul1988[\"wynik\"].plot(kind = \"box\")" - ] - }, - { - "cell_type": "markdown", - "id": "b1b6b789-daab-4eee-92ff-26c0a9b0c0f3", - "metadata": {}, - "source": [ - "Przygotowujemy zbior danych do analizy -- usuwamy obserwacje potencjalnie nietypowa oraz przemnazamy wyniki uzyskane w konkurencjach biegowych przez -1, tak aby najnizsza wartosc oznaczala najgorszy wynik" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5607ab08-a701-445b-959c-9853da16d4d6", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata -qui\n", - "drop if wynik < 6000\n", - "gen bieg100_1 = bieg100 * (-1)\n", - "gen bieg400_1 = bieg400 * (-1)\n", - "gen plotki_1 = plotki * (-1)\n", - "gen bieg1500_1 = bieg1500 * (-1)\n", - "\n", - "replace bieg100 = bieg100_1\n", - "replace bieg400 = bieg400_1\n", - "replace bieg1500 = bieg1500_1\n", - "replace plotki = plotki_1\n", - "drop bieg100_1 bieg400_1 plotki_1 bieg1500_1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88b71fcb-5f8f-403e-af00-632ea6043e4b", - "metadata": {}, - "outputs": [], - "source": [ - "seul1988 = seul1988.query(\"wynik >= 6000\")\n", - "seul1988[\"bieg100\"] = seul1988.bieg100 * -1\n", - "seul1988[\"bieg400\"] = seul1988.bieg400 * -1\n", - "seul1988[\"bieg1500\"] = seul1988.bieg1500 * -1\n", - "seul1988[\"plotki\"] = seul1988.plotki * -1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ffeb575-ebfe-4781-a0a6-98c04c51eb99", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata\n", - "/*Przeprowadzamy analize glownych skladowych. Domyslnie wykorzystywana jest macierz korelacji*/\n", - "corr bieg100-bieg1500\n", - "summarize bieg100-bieg1500" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "be7d3bbf-bac3-4db7-b286-13c8e289465f", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata\n", - "/*Przeprowadzamy analize glownych skladowych. Domyslnie wykorzystywana jest macierz korelacji*/\n", - "pca bieg100-bieg1500\n", - "\n", - "/*Tylko pierwsze dwie glowne skladowe maja wartosci wlasne wieksze od 1 (co jest rownoznaczne z wyjasniona wariancja wieksza niz srednia) i wyjasniaja ponad\n", - "60% calkowitej wariancji.*/\n", - "\n", - "/*Interpretacja dwoch pierwszych skladowych:\n", - "pierwsza -- mierzy osiagniety wynik (wszystkie wspolczynniki dodatnie)\n", - "druga -- ma wysokie wartosci dla konkurencji zwiazanych z rzucaniem i sila (kula, dysk, oszczep) i\n", - "duze ujemne dla wytrzymalosciowych (biegi na 400 i 1500 metrow)*/\n", - "\n", - "/*Okreslenie liczby skladowych glownych, ktore dobrze opisuja wariancje wyjsciowych zmiennych*/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9db8f841-a5a9-4aad-a8ee-5b8ac169af18", - "metadata": {}, - "outputs": [], - "source": [ - "cols_x = ['bieg100', 'skok_w_dal', 'rzut_kula', 'skok_wzwyz', \n", - " 'bieg400', 'plotki', 'rzut_dysk', 'tyczka', 'oszczep', 'bieg1500']\n", - "seul1988_x = seul1988[cols_x]\n", - "centred_seul1988_x = pd.DataFrame(seul1988_x - np.mean(seul1988_x), columns=cols_x)\n", - "scaled_seul1988_x = pd.DataFrame(seul1988_x - np.mean(seul1988_x), columns=cols_x) / np.std(seul1988_x)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4adf576e-3d83-4be7-9a0e-ecb672605702", - "metadata": {}, - "outputs": [], - "source": [ - "# seul1988_x.corr()\n", - "# seul1988_x.describe().T" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bb55dd35-ef04-495f-9038-e3914b77f2e4", - "metadata": {}, - "outputs": [], - "source": [ - "# https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/decomposition/_pca.py\n", - "pca = PCA(svd_solver = 'full')\n", - "# SVD\n", - "# X = U * S * Vt\n", - "U, s, Vt = np.linalg.svd(scaled_seul1988_x, full_matrices = False)" - ] - }, - { - "cell_type": "markdown", - "id": "bc4bec22", - "metadata": {}, - "source": [ - "Główne składowe (ang. Principal components)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6c209621-5957-4c4e-b2e9-af9669bcaa90", - "metadata": {}, - "outputs": [], - "source": [ - "# principal components\n", - "# X_principals = X * V = U * S * Vt * V = U * S\n", - "pca_components = pca.fit_transform(scaled_seul1988_x)\n", - "np.abs(np.round(pca_components, 2)), np.abs(np.round(np.matmul(scaled_seul1988_x, Vt.T), 2))\n", - "pca.explained_variance_ratio_, (s**2 / np.sum(s**2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d5a65e03-1690-4374-93a8-79b4b244b0ee", - "metadata": {}, - "outputs": [], - "source": [ - "# rotation matrix\n", - "# np.abs needed\n", - "np.abs(np.round(Vt.T, 3)), np.abs(np.round(pca.components_.T, 3))\n", - "pd.DataFrame(Vt.T)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd5832e7", - "metadata": {}, - "outputs": [], - "source": [ - "%stata predict comp1-comp2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a9d56cd6-1d90-4135-8e3a-7e88dc82bef9", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata\n", - "/*Wykres osypiska*/\n", - "screeplot, mean /*Wykres kolejnych wartosci wlasnych; \"mean\" - zostaje naniesiona prosta odpowiadajaca sredniej z wartosci wlasnych*/\n", - "screeplot, ci /*Dodatkowo zostaje nalozony przedzial ufnosci*/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4e62e5da-2cb7-4229-8198-a6eac4bba6ce", - "metadata": {}, - "outputs": [], - "source": [ - "pd.Series(s**2 / seul1988_x.shape[0]).plot()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ba70d846-f71a-4cc1-b72a-c8884ea0dd27", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata\n", - "/*Wykres wspólczynników dla poszczegolnych zmiennych - przydatne przy interpretacji skladowych glownych*/\n", - "loadingplot /*Tylko dla dwóch pierwszych wektorów wlasnych*/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ac3a5367-7e37-4743-aab4-65de787a8d43", - "metadata": {}, - "outputs": [], - "source": [ - "loadings = pd.DataFrame(Vt.T * s / np.sqrt(seul1988_x.shape[0] - 1))\n", - "ax = loadings.plot.scatter(0,1)\n", - "cols = seul1988_x.columns.tolist()\n", - "for i in range(loadings.shape[0]):\n", - " ax.text(x = loadings.iloc[i,0], y = loadings.iloc[i,1], s = cols[i])" - ] - }, - { - "cell_type": "markdown", - "id": "80cdc522", - "metadata": {}, - "source": [ - "Wykres dwoch pierwszych czynników głównych\n", - "First two prinicipal components vs not known during extraction \"wynik\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b088bb6-b516-4a1e-b39e-919949af027b", - "metadata": {}, - "outputs": [], - "source": [ - "pd.DataFrame(pca_components).plot.scatter(0, 1, c = seul1988[\"wynik\"], colormap = \"winter\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d8a31923-1ffb-4495-a0f2-fea751522a99", - "metadata": {}, - "outputs": [], - "source": [ - "#%%stata\n", - "#mvtest normality bieg100-bieg1500, stats(all)\n", - "#pca bieg100-bieg1500, cov vce(normal)\n", - "#testparm bieg100-bieg1500, equal eq(Comp1)\n", - "#\n", - "#/*niestety, nie jest spelnione zalozenie o wielowymiarowym rozkladzie normalnym*/\n", - "#\n", - "#/*Dalej analizujemy tylko dwie pierwsze glowne skladowe. */" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "854e72f5-2e44-4458-b3fb-806850f0926d", - "metadata": {}, - "outputs": [], - "source": [ - "#%%stata\n", - "#/*Jeszcze wyznaczmy macierz korelacji*/\n", - "#pwcorr bieg100-bieg1500 wynik comp2 comp1\n", - "#sum bieg100-bieg1500 wynik comp2 comp1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6bdd4c8-e16f-4e32-be7d-cbcd4fa92ccc", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata\n", - "//Weryfikacja poprawności przeprowadzonej analizy głównych składowych (PCA)\n", - "\n", - "/*Aby sprawdzic, czy wystarczajaco dobrze odtworzylismy zmiennosc wykorzystamy \"reszty\" -\n", - "roznice pomiedzy zaobserwowanymi korelacjami a tymi odtworzonymi za pomoca tylko kilku pierwszych skladowych*/\n", - "\n", - "pca bieg100 - bieg1500, components(3) /*Rozwiazanie skladajace sie z 3 pierwszych skladowych glownych*/\n", - "estat residual, fit /*Macierz \"resztowa\" oraz macierz korelacji odtworzona za pomoca 3 pierwszych skladowych*/\n", - "corr bieg100 - bieg1500" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dab5a5f5-c28a-465c-9775-8c7baf803566", - "metadata": {}, - "outputs": [], - "source": [ - "cor_original = np.corrcoef(scaled_seul1988_x.T)\n", - "# set to zero not used principal components\n", - "s3 = np.concatenate([s[:3], [0] * 7]) \n", - "# Eigen -> C = V * (S**2/ (n-1)) * Vt\n", - "cor_pca3 = Vt.T @ (np.diag(s3 ** 2) / (seul1988.shape[0] - 1)) @ Vt\n", - "pd.DataFrame(cor_original).round(3),\\\n", - "pd.DataFrame(cor_pca3).round(3),\\\n", - "pd.DataFrame(cor_original - cor_pca3).round(3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "615b26da-f093-4c3e-aa84-2463d9ba7758", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata\n", - "/*Sprawdzenie, czy zmienne wejsciowe sa ze soba skorelowane.\n", - "Czyli sprawdzamy czy ma sens w ogole przeprwowadzenie analizy skladowych glownych*/\n", - "\n", - "/*1. Pierwszy sposob to analiza R^2*/\n", - "estat smc\n", - "reg bieg100 skok_w_dal - bieg1500 /*sprawdzenie, skad sie biora te wartosci*/\n", - "\n", - "//WNIOSEK: \"Skok wzwyz\" wykazuje najmniejsza zaleznosc z pozostalymi konkurencjami" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cdbfbfc4-d34f-42d9-a2a3-d07c3154b858", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata\n", - "/*2. Drugi sposób: \"anti-image correlation\" (minus korelacja czastkowa).*/\n", - "\n", - "/*Korelacja czastkowa pokazuje \"czysta\" zaleznosc miedzy dwoma zmiennymi, traktujac pozostale jako stale.\n", - "Dazymy do uzyskania malej korelacji czastkowej!\n", - "Jesli wiele tych korelacji jest relatywnie duzych, to zaleznosc pomiedzy niektorymi zmiennymi nie zalezy od poziomu pozostalych zmiennych.\n", - "Tym samym moze byc trudno uzyskac wlasciwe rozwiazanie malego wymiaru. */\n", - "\n", - "pca bieg100 - bieg1500\n", - "estat anti, nocov /*\"anti-image correlation\" (minus korelacja czastkowa)*/\n", - "pcorr bieg100 skok_w_dal - bieg1500 /*Korelacje czastkowe zmiennej bieg100 z pozostalymi*/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4abb002a-e035-466a-87fe-03c0cddac5d1", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata\n", - "/*3. Trzeci sposób: statystyka adekwatnosci proby Kaiser-Meyer-Olkin.*/\n", - "// pierwsza linia kodu to rezultat uruchamiania Stata przez Jupytera, w Stata ponowne szacowanie \n", - "// analizy skladowych glownych jest zbędne\n", - "qui pca bieg100 - bieg1500\n", - "estat kmo\n", - "\n", - "/*Bez zmiennej \"skok_wzwyz\"*/\n", - "pca bieg100 - kula bieg400 - bieg1500\n", - "estat kmo /*Wielkosc statystyki ulegla nieznacznej poprawie*/\n", - "\n", - "/*Wykres wspolczynnikow dla poszczegolnych zmiennych -- przydatne przy interpretacji skladowych glownych*/\n", - "pca bieg100 - bieg1500\n", - "loadingplot, comp(3) combined /*3 pierwsze skladowe glowne*/\n", - "\n", - "pca bieg100 - kula bieg400 - bieg1500\n", - "loadingplot, comp(3) combined /*bez \"skoku wzwyz\"*/" - ] - }, - { - "cell_type": "markdown", - "id": "9e6747ac-5d0f-4bbc-9512-768200f58c5a", - "metadata": {}, - "source": [ - "## Przykład 3 - Regresja Liniowa" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b8be9cde-e17d-472c-a025-f41f417ee8bb", - "metadata": {}, - "outputs": [], - "source": [ - "X = seul1988_x.copy()\n", - "X[\"const\"] = 1\n", - "y = seul1988[\"wynik\"]\n", - "ols = sm.OLS(y, X)\n", - "ols_result = ols.fit()\n", - "print(ols_result.summary())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af806163-9973-4d8a-9ec6-3c9ab4180980", - "metadata": {}, - "outputs": [], - "source": [ - "# VIF\n", - "pd.DataFrame({\n", - " \"variable\": seul1988_x.columns, \n", - " \"VIF\": [variance_inflation_factor(seul1988_x, i) for i in range(seul1988_x.shape[1])]\n", - "})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "480db3f3-06db-4b32-9d18-b1bf06dd32d7", - "metadata": {}, - "outputs": [], - "source": [ - "x_skok = centred_seul1988_x[['skok_w_dal', 'skok_wzwyz', 'tyczka']]\n", - "x_bieg = centred_seul1988_x[['bieg100','bieg400', 'bieg1500', 'plotki']]\n", - "x_atlet = centred_seul1988_x[['rzut_kula','rzut_dysk', 'oszczep']]\n", - "\n", - "svd_skok = np.linalg.svd(x_skok, full_matrices = False)\n", - "svd_bieg = np.linalg.svd(x_bieg, full_matrices = False)\n", - "svd_atlet = np.linalg.svd(x_atlet, full_matrices = False)\n", - "\n", - "print(svd_skok[1]**2/np.sum(svd_skok[1]**2))\n", - "print(svd_bieg[1]**2/np.sum(svd_bieg[1]**2))\n", - "print(svd_atlet[1]**2/np.sum(svd_atlet[1]**2))\n", - "\n", - "pca_skok = x_skok @ svd_skok[2].T\n", - "pca_bieg = x_bieg @ svd_bieg[2].T\n", - "pca_atlet = x_atlet @ svd_atlet[2].T" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a0a39b2-2a54-4dde-804e-0ed4891dd874", - "metadata": {}, - "outputs": [], - "source": [ - "svd_skok[2].T[:, 0],\\\n", - "svd_bieg[2].T[:, 0],\\\n", - "svd_atlet[2].T[:, 0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2d09ffe8-8e26-4c78-b57c-b52a7ad34d1a", - "metadata": {}, - "outputs": [], - "source": [ - "# We do not now the sense (zwrot)\n", - "X = pd.concat([pca_skok.iloc[:,0]*-1, pca_bieg.iloc[:,0], pca_atlet.iloc[:,0]*-1], axis = 1)\n", - "X.columns = [\"skok\", \"bieg\", \"atlet\"]\n", - "print(pd.DataFrame({\"variable\": X.columns, \"VIF\": [variance_inflation_factor(X, i) for i in range(X.shape[1])]}))\n", - "X[\"const\"] = 1\n", - "y = seul1988[\"wynik\"]\n", - "ols = sm.OLS(y, X)\n", - "ols_result = ols.fit()\n", - "print(ols_result.summary())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "059094ba-81c0-46f0-baed-4b5b5a7d2607", - "metadata": {}, - "outputs": [], - "source": [ - "# Interpretacja?" - ] - }, - { - "cell_type": "markdown", - "id": "dc3dddb4", - "metadata": {}, - "source": [ - "## Przyklad 4 -- Objawy depresji (Dane z Diagnoza Spoleczna 2011) PCA" - ] - }, - { - "cell_type": "markdown", - "id": "1c018614", - "metadata": {}, - "source": [ - "1. Wczytaj zbior danych \"depresja.dta\"\n", - "2. Przeprowadz analize glownych skladowych.\n", - "3. Utworz wykres osypiska Ile powinismy wyroznic glownych skladowych?\n", - "4. Przeprowadz jeszcze raz analize, ale tylko dla ustalonej w punkcie 3 liczby glownych skladowych. Zapisz glowne skladowe oraz wyjsciowe zmienne w pliku.\n", - "5. Zinterpretuj glowne skladowe. Interpretacje poprzyj macierza korelacji.\n", - "6. Sprawdz, czy zmienne wejsciowe sa ze soba skorelowane (zweryfikuj, czy ma sens w ogole przeprwowadzenie analizy skladowych glownych).\n", - " Uzyj: R^2, \"anti-image correlation\" oraz statystyki adekwatnosci Kaisera-Meyera-Olkina.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4809038e-0403-4cec-a24e-5e94830b9eff", - "metadata": {}, - "outputs": [], - "source": [ - "from multidim.datasets import load_depresja\n", - "depresja = load_depresja()\n", - "depresja_copy = depresja.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8f55bf0b-cbbe-448c-9148-4ebaf79d84d2", - "metadata": {}, - "outputs": [], - "source": [ - "%%stata -d depresja_copy -force\n", - "des" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "363b60c2", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - }, - "vscode": { - "interpreter": { - "hash": "aa9d594cf5340018b76a70eeac417329866dfddee009818da806e7b1b5f80883" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "cells": [ + { + "cell_type": "markdown", + "id": "2a48c727", + "metadata": { + "tags": [] + }, + "source": [ + "# Analiza Wielowymiarowa - zajecia 6 - Analiza głównych składowych (PCA)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32b52329", + "metadata": {}, + "outputs": [], + "source": [ + "from multidim.utils import resolve_stata, load_stata\n", + "\n", + "STATA_PATH, STATA_TYPE = resolve_stata(version = 17, stype = \"se\")\n", + "# make sure they are proper ones\n", + "STATA_PATH, STATA_TYPE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40123e5a", + "metadata": {}, + "outputs": [], + "source": [ + "load_stata(STATA_PATH, STATA_TYPE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1df4db13", + "metadata": {}, + "outputs": [], + "source": [ + "# Załadowanie bibliotek\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from sklearn.decomposition import PCA\n", + "from scipy.linalg import svd\n", + "\n", + "from sklearn import datasets\n", + "\n", + "import statsmodels.api as sm\n", + "from statsmodels.stats.outliers_influence import variance_inflation_factor\n", + "\n", + "\n", + "from multidim.funs import corr_mat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5417c75", + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(1234)" + ] + }, + { + "cell_type": "markdown", + "id": "9979aab1-fd4c-4c58-9f07-632efd3bff5c", + "metadata": {}, + "source": [ + "## Przykład 1 - Dekompozycja głównych składowych (SVD) a wartości własne (Eigenvalues) and Internals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f8470d0-1e77-409a-a2e3-8578580f53a1", + "metadata": {}, + "outputs": [], + "source": [ + "iris_set = datasets.load_iris()\n", + "iris_x = iris_set[\"data\"]\n", + "# CENTRING\n", + "iris_x_centred = iris_set[\"data\"] - np.mean(iris_set[\"data\"], axis = 0)\n", + "#SCALING\n", + "iris_x_scaled = (iris_set[\"data\"] - np.mean(iris_set[\"data\"], axis = 0))/np.std(iris_set[\"data\"], axis = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e811df0-5797-460c-90db-c938d06d36ce", + "metadata": {}, + "outputs": [], + "source": [ + "# almost always centring\n", + "# in most scenerios full scaling\n", + "# correlation matrix = scaling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e126cc6a-d481-4add-b637-c51ad30c24dc", + "metadata": {}, + "outputs": [], + "source": [ + "# when X CENTRED -> X`X/n = COV(X)\n", + "np.allclose((iris_x_centred.T @ iris_x_centred) / (iris_x_centred.shape[0] - 1), np.cov(iris_x_centred.T, ddof = 1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b06ff9c-8155-4a8f-8c68-37c5e60334e1", + "metadata": {}, + "outputs": [], + "source": [ + "# when X SCALED -> Corr(X) = COV(X)\n", + "np.allclose(np.corrcoef(iris_x_scaled.T), np.cov(iris_x_scaled.T, ddof=0))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "debad822-40b7-424b-a40a-bb48e6cdabb1", + "metadata": {}, + "outputs": [], + "source": [ + "# SVD -> X = U * S * Vt\n", + "u, s, vh = np.linalg.svd(iris_x_scaled, full_matrices = False)\n", + "np.allclose(iris_x_scaled, u @ np.diag(s) @ vh)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99d7348e-76c1-4d53-ba40-52bdb0103fd8", + "metadata": {}, + "outputs": [], + "source": [ + "# Eigen -> C = V * S * Vt\n", + "s_eigen, v_eigen = np.linalg.eigh(np.corrcoef(iris_x.T))\n", + "np.allclose(np.corrcoef(iris_x.T), v_eigen @ np.diag(s_eigen) @ v_eigen.T)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac221e9e", + "metadata": {}, + "outputs": [], + "source": [ + "# Unit disc\n", + "# https://en.wikipedia.org/wiki/Singular_value_decomposition#/media/File:Singular-Value-Decomposition.svg\n", + "# SVD could be run for square or long shape\n", + "corr_iris_x = np.corrcoef(iris_x.T)\n", + "u_s, s_s, vh_s = np.linalg.svd(corr_iris_x, full_matrices = False)\n", + "assert np.allclose(\n", + " np.diag(np.ones((4))) @ corr_iris_x, \n", + " corr_iris_x\n", + ")\n", + "assert np.allclose(\n", + " np.diag(np.ones((4))) @ u_s @ np.diag(s_s) @ vh_s, \n", + " corr_iris_x\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b3de809-2cd4-4406-a539-fc25a3c23128", + "metadata": {}, + "outputs": [], + "source": [ + "# principal components\n", + "pca_svd = pd.DataFrame(iris_x_scaled @ vh.T)\n", + "pca_svd_alt = pd.DataFrame(u @ np.diag(s))\n", + "assert np.allclose(pca_svd, pca_svd_alt)\n", + "pca_eigen = pd.DataFrame(iris_x_scaled @ v_eigen)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92a51609", + "metadata": {}, + "outputs": [], + "source": [ + "# direction is stable (kierunek) but not sense (zwrot)\n", + "# from sklearn.utils.extmath import svd_flip\n", + "# Sign correction to ensure deterministic output from SVD.\n", + "# Adjusts the columns of u and the rows of v such that the loadings in the\n", + "# columns in u that are largest in absolute value are always positive." + ] + }, + { + "cell_type": "markdown", + "id": "126b29fa", + "metadata": {}, + "source": [ + "Ładunki czynnikowe (ang. Loadings )\n", + "\n", + "Korelacje między oryginalnymi zmiennymi a składowymi głównymi (correlations between the original variables and the principal components)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ece668ca-3ff2-4687-8533-ea53512909ff", + "metadata": {}, + "outputs": [], + "source": [ + "pca_nams = ['PC1', 'PC2', 'PC3', 'PC4']\n", + "feature_nams = iris_set[\"feature_names\"]\n", + "# loadings from sklearn\n", + "pca = PCA(n_components = 4)\n", + "X = pca.fit_transform(iris_x_scaled)\n", + "loadings1 = pd.DataFrame(pca.components_.T * np.sqrt(pca.explained_variance_), index = pca_nams, columns = feature_nams)\n", + "# Loadings from numpy svd decomposition\n", + "loadings2 = pd.DataFrame(vh.T * s/np.sqrt(iris_x_scaled.shape[0] - 1), index = pca_nams, columns = feature_nams)\n", + "# Loadings by directly getting correlations between variables and prinicpal components\n", + "loadings3 = pd.DataFrame(corr_mat(pca_svd, iris_x_scaled).values, columns = pca_nams, index = feature_nams)\n", + "loadings1.T, loadings2.T, loadings3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c148d158-2316-4537-af2c-b6ee0f031614", + "metadata": {}, + "outputs": [], + "source": [ + "# e.g. first PCA component == original data * first rotation\n", + "assert np.allclose(\n", + " np.round(pca_svd.iloc[:,0].values, 2), \n", + " np.round(vh.T[:, 0] @ iris_x_scaled.T, 2)\n", + ")\n", + "ev = pd.DataFrame(vh.T)\n", + "ev.index = iris_set.feature_names\n", + "ev" + ] + }, + { + "cell_type": "markdown", + "id": "ab7ae6bb", + "metadata": {}, + "source": [ + "Korelacje pomiędzy czynnikami głównymi (ang. correlation between principal components)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d866375c-ac54-4099-b77c-4535b4ac91e6", + "metadata": {}, + "outputs": [], + "source": [ + "pca_svd.corr(), pca_eigen.corr()" + ] + }, + { + "cell_type": "markdown", + "id": "46c2e1e7", + "metadata": {}, + "source": [ + "Wartości własne (ang. Eigenvalues)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02808a2d-3b65-462c-99f0-b56783822723", + "metadata": {}, + "outputs": [], + "source": [ + "index_eigen = [i[0] for i in sorted(enumerate(s_eigen), reverse=True, key = lambda x: x[1])]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0611c6f-ab43-4ffe-898e-c3d9e5c2e606", + "metadata": {}, + "outputs": [], + "source": [ + "pd.DataFrame(s**2 / (iris_x_scaled.shape[0]), s_eigen[index_eigen])" + ] + }, + { + "cell_type": "markdown", + "id": "0da92302", + "metadata": {}, + "source": [ + "Pierwsze wiersze czynników głównych (ang. First rows of principal components)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "482ee345-da56-427b-b67d-e9bd706999a0", + "metadata": {}, + "outputs": [], + "source": [ + "pca_svd.head(), pca_eigen[index_eigen].head()" + ] + }, + { + "cell_type": "markdown", + "id": "b80815e0-c5ba-44a6-ab03-2be923f3599b", + "metadata": {}, + "source": [ + "### PCA wykres na podstawie 2 pierwszych komponentow\n", + "\n", + "Prosze pamietac ze informacja o odmianach iris-ow (y) nie byla wykorzystana przy obliczaniu PCA" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48cbc312-366e-4946-8d91-c0e21ed7a437", + "metadata": {}, + "outputs": [], + "source": [ + "pca_svd.plot(\n", + " x = 0,\n", + " y = 1, \n", + " color = pd.Series(iris_set[\"target\"]).map({0: \"b\", 1: \"r\", 2: \"y\"}), \n", + " kind = \"scatter\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b58d0e58", + "metadata": {}, + "source": [ + "Iloraz wiariancji pokazuje udział kolejnych komponentow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5494a6fc-0bd5-4606-a775-3aefc6829df0", + "metadata": {}, + "outputs": [], + "source": [ + "s_eigen[index_eigen] / np.sum(s_eigen),\\\n", + "s**2 / np.sum(s**2)" + ] + }, + { + "cell_type": "markdown", + "id": "5eda8fd6-30fd-426f-a102-341716622462", + "metadata": {}, + "source": [ + "## Przyklad 2 -- seul1988 -- wyniki dziesiecioboju mezczyzn w Seulu w 1988" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d48de20f-c1bf-42a4-9be4-25cb9ee922a2", + "metadata": {}, + "outputs": [], + "source": [ + "from multidim.datasets import load_seul1988\n", + "seul1988 = load_seul1988()\n", + "seul1988_copy = seul1988.copy()" + ] + }, + { + "cell_type": "markdown", + "id": "ffce172b-2765-49ff-bb3a-9d24de7e8e9f", + "metadata": {}, + "source": [ + "Sprawdzamy czy w zbiorze danych sa obserwacje nietypowe - zmienna wynik" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5826a1e-ee1d-4a93-b523-7a22c229fef8", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata -d seul1988_copy -force\n", + "des" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22c503b4", + "metadata": {}, + "outputs": [], + "source": [ + "seul1988.shape, seul1988.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d03dbf91", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "graph box wynik, title(\"Wykres pudelkowy\")\n", + "/*Istnieje jedna potencjalna obserwacja nietypowa.*/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e6d72e0", + "metadata": {}, + "outputs": [], + "source": [ + "seul1988[\"wynik\"].plot(kind = \"box\")" + ] + }, + { + "cell_type": "markdown", + "id": "b1b6b789-daab-4eee-92ff-26c0a9b0c0f3", + "metadata": {}, + "source": [ + "Przygotowujemy zbior danych do analizy -- usuwamy obserwacje potencjalnie nietypowa oraz przemnazamy wyniki uzyskane w konkurencjach biegowych przez -1, tak aby najnizsza wartosc oznaczala najgorszy wynik" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5607ab08-a701-445b-959c-9853da16d4d6", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata -qui\n", + "drop if wynik < 6000\n", + "gen bieg100_1 = bieg100 * (-1)\n", + "gen bieg400_1 = bieg400 * (-1)\n", + "gen plotki_1 = plotki * (-1)\n", + "gen bieg1500_1 = bieg1500 * (-1)\n", + "\n", + "replace bieg100 = bieg100_1\n", + "replace bieg400 = bieg400_1\n", + "replace bieg1500 = bieg1500_1\n", + "replace plotki = plotki_1\n", + "drop bieg100_1 bieg400_1 plotki_1 bieg1500_1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88b71fcb-5f8f-403e-af00-632ea6043e4b", + "metadata": {}, + "outputs": [], + "source": [ + "seul1988 = seul1988.query(\"wynik >= 6000\")\n", + "seul1988[\"bieg100\"] = seul1988.bieg100 * -1\n", + "seul1988[\"bieg400\"] = seul1988.bieg400 * -1\n", + "seul1988[\"bieg1500\"] = seul1988.bieg1500 * -1\n", + "seul1988[\"plotki\"] = seul1988.plotki * -1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ffeb575-ebfe-4781-a0a6-98c04c51eb99", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "/*Przeprowadzamy analize glownych skladowych. Domyslnie wykorzystywana jest macierz korelacji*/\n", + "corr bieg100-bieg1500\n", + "summarize bieg100-bieg1500" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be7d3bbf-bac3-4db7-b286-13c8e289465f", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "/*Przeprowadzamy analize glownych skladowych. Domyslnie wykorzystywana jest macierz korelacji*/\n", + "pca bieg100-bieg1500\n", + "\n", + "/*Tylko pierwsze dwie glowne skladowe maja wartosci wlasne wieksze od 1 (co jest rownoznaczne z wyjasniona wariancja wieksza niz srednia) i wyjasniaja ponad\n", + "60% calkowitej wariancji.*/\n", + "\n", + "/*Interpretacja dwoch pierwszych skladowych:\n", + "pierwsza -- mierzy osiagniety wynik (wszystkie wspolczynniki dodatnie)\n", + "druga -- ma wysokie wartosci dla konkurencji zwiazanych z rzucaniem i sila (kula, dysk, oszczep) i\n", + "duze ujemne dla wytrzymalosciowych (biegi na 400 i 1500 metrow)*/\n", + "\n", + "/*Okreslenie liczby skladowych glownych, ktore dobrze opisuja wariancje wyjsciowych zmiennych*/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9db8f841-a5a9-4aad-a8ee-5b8ac169af18", + "metadata": {}, + "outputs": [], + "source": [ + "cols_x = ['bieg100', 'skok_w_dal', 'rzut_kula', 'skok_wzwyz', \n", + " 'bieg400', 'plotki', 'rzut_dysk', 'tyczka', 'oszczep', 'bieg1500']\n", + "seul1988_x = seul1988[cols_x]\n", + "centred_seul1988_x = pd.DataFrame(seul1988_x - np.mean(seul1988_x), columns=cols_x)\n", + "scaled_seul1988_x = pd.DataFrame(seul1988_x - np.mean(seul1988_x), columns=cols_x) / np.std(seul1988_x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4adf576e-3d83-4be7-9a0e-ecb672605702", + "metadata": {}, + "outputs": [], + "source": [ + "# seul1988_x.corr()\n", + "# seul1988_x.describe().T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb55dd35-ef04-495f-9038-e3914b77f2e4", + "metadata": {}, + "outputs": [], + "source": [ + "# https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/decomposition/_pca.py\n", + "pca = PCA(svd_solver = 'full')\n", + "# SVD\n", + "# X = U * S * Vt\n", + "U, s, Vt = np.linalg.svd(scaled_seul1988_x, full_matrices = False)" + ] + }, + { + "cell_type": "markdown", + "id": "bc4bec22", + "metadata": {}, + "source": [ + "Główne składowe (ang. Principal components)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c209621-5957-4c4e-b2e9-af9669bcaa90", + "metadata": {}, + "outputs": [], + "source": [ + "# principal components\n", + "# X_principals = X * V = U * S * Vt * V = U * S\n", + "pca_components = pca.fit_transform(scaled_seul1988_x)\n", + "np.abs(np.round(pca_components, 2)), np.abs(np.round(np.matmul(scaled_seul1988_x, Vt.T), 2))\n", + "pca.explained_variance_ratio_, (s**2 / np.sum(s**2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5a65e03-1690-4374-93a8-79b4b244b0ee", + "metadata": {}, + "outputs": [], + "source": [ + "# rotation matrix\n", + "# np.abs needed\n", + "np.abs(np.round(Vt.T, 3)), np.abs(np.round(pca.components_.T, 3))\n", + "pd.DataFrame(Vt.T)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd5832e7", + "metadata": {}, + "outputs": [], + "source": [ + "%stata predict comp1-comp2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9d56cd6-1d90-4135-8e3a-7e88dc82bef9", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "/*Wykres osypiska*/\n", + "screeplot, mean /*Wykres kolejnych wartosci wlasnych; \"mean\" - zostaje naniesiona prosta odpowiadajaca sredniej z wartosci wlasnych*/\n", + "screeplot, ci /*Dodatkowo zostaje nalozony przedzial ufnosci*/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e62e5da-2cb7-4229-8198-a6eac4bba6ce", + "metadata": {}, + "outputs": [], + "source": [ + "pd.Series(s**2 / seul1988_x.shape[0]).plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba70d846-f71a-4cc1-b72a-c8884ea0dd27", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "/*Wykres wspólczynników dla poszczegolnych zmiennych - przydatne przy interpretacji skladowych glownych*/\n", + "loadingplot /*Tylko dla dwóch pierwszych wektorów wlasnych*/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac3a5367-7e37-4743-aab4-65de787a8d43", + "metadata": {}, + "outputs": [], + "source": [ + "loadings = pd.DataFrame(Vt.T * s / np.sqrt(seul1988_x.shape[0] - 1))\n", + "ax = loadings.plot.scatter(0,1)\n", + "cols = seul1988_x.columns.tolist()\n", + "for i in range(loadings.shape[0]):\n", + " ax.text(x = loadings.iloc[i,0], y = loadings.iloc[i,1], s = cols[i])" + ] + }, + { + "cell_type": "markdown", + "id": "80cdc522", + "metadata": {}, + "source": [ + "Wykres dwoch pierwszych czynników głównych\n", + "First two prinicipal components vs not known during extraction \"wynik\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b088bb6-b516-4a1e-b39e-919949af027b", + "metadata": {}, + "outputs": [], + "source": [ + "pd.DataFrame(pca_components).plot.scatter(0, 1, c = seul1988[\"wynik\"], colormap = \"winter\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8a31923-1ffb-4495-a0f2-fea751522a99", + "metadata": {}, + "outputs": [], + "source": [ + "#%%stata\n", + "#mvtest normality bieg100-bieg1500, stats(all)\n", + "#pca bieg100-bieg1500, cov vce(normal)\n", + "#testparm bieg100-bieg1500, equal eq(Comp1)\n", + "#\n", + "#/*niestety, nie jest spelnione zalozenie o wielowymiarowym rozkladzie normalnym*/\n", + "#\n", + "#/*Dalej analizujemy tylko dwie pierwsze glowne skladowe. */" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "854e72f5-2e44-4458-b3fb-806850f0926d", + "metadata": {}, + "outputs": [], + "source": [ + "#%%stata\n", + "#/*Jeszcze wyznaczmy macierz korelacji*/\n", + "#pwcorr bieg100-bieg1500 wynik comp2 comp1\n", + "#sum bieg100-bieg1500 wynik comp2 comp1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6bdd4c8-e16f-4e32-be7d-cbcd4fa92ccc", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "//Weryfikacja poprawności przeprowadzonej analizy głównych składowych (PCA)\n", + "\n", + "/*Aby sprawdzic, czy wystarczajaco dobrze odtworzylismy zmiennosc wykorzystamy \"reszty\" -\n", + "roznice pomiedzy zaobserwowanymi korelacjami a tymi odtworzonymi za pomoca tylko kilku pierwszych skladowych*/\n", + "\n", + "pca bieg100 - bieg1500, components(3) /*Rozwiazanie skladajace sie z 3 pierwszych skladowych glownych*/\n", + "estat residual, fit /*Macierz \"resztowa\" oraz macierz korelacji odtworzona za pomoca 3 pierwszych skladowych*/\n", + "corr bieg100 - bieg1500" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dab5a5f5-c28a-465c-9775-8c7baf803566", + "metadata": {}, + "outputs": [], + "source": [ + "cor_original = np.corrcoef(scaled_seul1988_x.T)\n", + "# set to zero not used principal components\n", + "s3 = np.concatenate([s[:3], [0] * 7]) \n", + "# Eigen -> C = V * (S**2/ (n-1)) * Vt\n", + "cor_pca3 = Vt.T @ (np.diag(s3 ** 2) / (seul1988.shape[0] - 1)) @ Vt\n", + "pd.DataFrame(cor_original).round(3),\\\n", + "pd.DataFrame(cor_pca3).round(3),\\\n", + "pd.DataFrame(cor_original - cor_pca3).round(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "615b26da-f093-4c3e-aa84-2463d9ba7758", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "/*Sprawdzenie, czy zmienne wejsciowe sa ze soba skorelowane.\n", + "Czyli sprawdzamy czy ma sens w ogole przeprwowadzenie analizy skladowych glownych*/\n", + "\n", + "/*1. Pierwszy sposob to analiza R^2*/\n", + "estat smc\n", + "reg bieg100 skok_w_dal - bieg1500 /*sprawdzenie, skad sie biora te wartosci*/\n", + "\n", + "//WNIOSEK: \"Skok wzwyz\" wykazuje najmniejsza zaleznosc z pozostalymi konkurencjami" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdbfbfc4-d34f-42d9-a2a3-d07c3154b858", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "/*2. Drugi sposób: \"anti-image correlation\" (minus korelacja czastkowa).*/\n", + "\n", + "/*Korelacja czastkowa pokazuje \"czysta\" zaleznosc miedzy dwoma zmiennymi, traktujac pozostale jako stale.\n", + "Dazymy do uzyskania malej korelacji czastkowej!\n", + "Jesli wiele tych korelacji jest relatywnie duzych, to zaleznosc pomiedzy niektorymi zmiennymi nie zalezy od poziomu pozostalych zmiennych.\n", + "Tym samym moze byc trudno uzyskac wlasciwe rozwiazanie malego wymiaru. */\n", + "\n", + "pca bieg100 - bieg1500\n", + "estat anti, nocov /*\"anti-image correlation\" (minus korelacja czastkowa)*/\n", + "pcorr bieg100 skok_w_dal - bieg1500 /*Korelacje czastkowe zmiennej bieg100 z pozostalymi*/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4abb002a-e035-466a-87fe-03c0cddac5d1", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "/*3. Trzeci sposób: statystyka adekwatnosci proby Kaiser-Meyer-Olkin.*/\n", + "// pierwsza linia kodu to rezultat uruchamiania Stata przez Jupytera, w Stata ponowne szacowanie \n", + "// analizy skladowych glownych jest zbędne\n", + "qui pca bieg100 - bieg1500\n", + "estat kmo\n", + "\n", + "/*Bez zmiennej \"skok_wzwyz\"*/\n", + "pca bieg100 - kula bieg400 - bieg1500\n", + "estat kmo /*Wielkosc statystyki ulegla nieznacznej poprawie*/\n", + "\n", + "/*Wykres wspolczynnikow dla poszczegolnych zmiennych -- przydatne przy interpretacji skladowych glownych*/\n", + "pca bieg100 - bieg1500\n", + "loadingplot, comp(3) combined /*3 pierwsze skladowe glowne*/\n", + "\n", + "pca bieg100 - kula bieg400 - bieg1500\n", + "loadingplot, comp(3) combined /*bez \"skoku wzwyz\"*/" + ] + }, + { + "cell_type": "markdown", + "id": "9e6747ac-5d0f-4bbc-9512-768200f58c5a", + "metadata": {}, + "source": [ + "## Przykład 3 - Regresja Liniowa" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8be9cde-e17d-472c-a025-f41f417ee8bb", + "metadata": {}, + "outputs": [], + "source": [ + "X = seul1988_x.copy()\n", + "X[\"const\"] = 1\n", + "y = seul1988[\"wynik\"]\n", + "ols = sm.OLS(y, X)\n", + "ols_result = ols.fit()\n", + "print(ols_result.summary())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af806163-9973-4d8a-9ec6-3c9ab4180980", + "metadata": {}, + "outputs": [], + "source": [ + "# VIF\n", + "pd.DataFrame({\n", + " \"variable\": seul1988_x.columns, \n", + " \"VIF\": [variance_inflation_factor(seul1988_x, i) for i in range(seul1988_x.shape[1])]\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "480db3f3-06db-4b32-9d18-b1bf06dd32d7", + "metadata": {}, + "outputs": [], + "source": [ + "x_skok = centred_seul1988_x[['skok_w_dal', 'skok_wzwyz', 'tyczka']]\n", + "x_bieg = centred_seul1988_x[['bieg100','bieg400', 'bieg1500', 'plotki']]\n", + "x_atlet = centred_seul1988_x[['rzut_kula','rzut_dysk', 'oszczep']]\n", + "\n", + "svd_skok = np.linalg.svd(x_skok, full_matrices = False)\n", + "svd_bieg = np.linalg.svd(x_bieg, full_matrices = False)\n", + "svd_atlet = np.linalg.svd(x_atlet, full_matrices = False)\n", + "\n", + "print(svd_skok[1]**2/np.sum(svd_skok[1]**2))\n", + "print(svd_bieg[1]**2/np.sum(svd_bieg[1]**2))\n", + "print(svd_atlet[1]**2/np.sum(svd_atlet[1]**2))\n", + "\n", + "pca_skok = x_skok @ svd_skok[2].T\n", + "pca_bieg = x_bieg @ svd_bieg[2].T\n", + "pca_atlet = x_atlet @ svd_atlet[2].T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a0a39b2-2a54-4dde-804e-0ed4891dd874", + "metadata": {}, + "outputs": [], + "source": [ + "svd_skok[2].T[:, 0],\\\n", + "svd_bieg[2].T[:, 0],\\\n", + "svd_atlet[2].T[:, 0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d09ffe8-8e26-4c78-b57c-b52a7ad34d1a", + "metadata": {}, + "outputs": [], + "source": [ + "# We do not now the sense (zwrot)\n", + "X = pd.concat([pca_skok.iloc[:,0]*-1, pca_bieg.iloc[:,0], pca_atlet.iloc[:,0]*-1], axis = 1)\n", + "X.columns = [\"skok\", \"bieg\", \"atlet\"]\n", + "print(pd.DataFrame({\"variable\": X.columns, \"VIF\": [variance_inflation_factor(X, i) for i in range(X.shape[1])]}))\n", + "X[\"const\"] = 1\n", + "y = seul1988[\"wynik\"]\n", + "ols = sm.OLS(y, X)\n", + "ols_result = ols.fit()\n", + "print(ols_result.summary())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "059094ba-81c0-46f0-baed-4b5b5a7d2607", + "metadata": {}, + "outputs": [], + "source": [ + "# Interpretacja?" + ] + }, + { + "cell_type": "markdown", + "id": "dc3dddb4", + "metadata": {}, + "source": [ + "## Przyklad 4 -- Objawy depresji (Dane z Diagnoza Spoleczna 2011) PCA" + ] + }, + { + "cell_type": "markdown", + "id": "1c018614", + "metadata": {}, + "source": [ + "1. Wczytaj zbior danych \"depresja.dta\"\n", + "2. Przeprowadz analize glownych skladowych.\n", + "3. Utworz wykres osypiska Ile powinismy wyroznic glownych skladowych?\n", + "4. Przeprowadz jeszcze raz analize, ale tylko dla ustalonej w punkcie 3 liczby glownych skladowych. Zapisz glowne skladowe oraz wyjsciowe zmienne w pliku.\n", + "5. Zinterpretuj glowne skladowe. Interpretacje poprzyj macierza korelacji.\n", + "6. Sprawdz, czy zmienne wejsciowe sa ze soba skorelowane (zweryfikuj, czy ma sens w ogole przeprwowadzenie analizy skladowych glownych).\n", + " Uzyj: R^2, \"anti-image correlation\" oraz statystyki adekwatnosci Kaisera-Meyera-Olkina.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4809038e-0403-4cec-a24e-5e94830b9eff", + "metadata": {}, + "outputs": [], + "source": [ + "from multidim.datasets import load_depresja\n", + "depresja = load_depresja()\n", + "depresja_copy = depresja.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f55bf0b-cbbe-448c-9148-4ebaf79d84d2", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata -d depresja_copy -force\n", + "des" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "363b60c2", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + }, + "vscode": { + "interpreter": { + "hash": "aa9d594cf5340018b76a70eeac417329866dfddee009818da806e7b1b5f80883" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/src/multidim/notebooks/08_discriminant.ipynb b/src/multidim/notebooks/08_discriminant.ipynb index cb5b648..356830d 100644 --- a/src/multidim/notebooks/08_discriminant.ipynb +++ b/src/multidim/notebooks/08_discriminant.ipynb @@ -11,13 +11,13 @@ { "cell_type": "code", "execution_count": null, - "id": "45a6e5ed", + "id": "f0bf89ff-9232-478a-bf5e-f75f4580b914", "metadata": {}, "outputs": [], "source": [ "from multidim.utils import resolve_stata, load_stata\n", "\n", - "STATA_PATH, STATA_TYPE = resolve_stata(version = 18, stype = \"se\")\n", + "STATA_PATH, STATA_TYPE = resolve_stata(version = 17, stype = \"se\")\n", "# make sure they are proper ones\n", "STATA_PATH, STATA_TYPE" ] @@ -25,7 +25,7 @@ { "cell_type": "code", "execution_count": null, - "id": "832e6b55", + "id": "cf3ccf91", "metadata": {}, "outputs": [], "source": [ @@ -39,11 +39,8 @@ "metadata": {}, "outputs": [], "source": [ - "from pystata import stata\n", "import pandas as pd\n", "import numpy as np\n", - "import scipy\n", - "import sklearn\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.pipeline import Pipeline\n", @@ -89,7 +86,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e23142e0", + "id": "92ce7c16", "metadata": {}, "outputs": [], "source": [ @@ -99,80 +96,166 @@ { "cell_type": "code", "execution_count": null, - "id": "46f97c9e-28cd-4859-b226-13df84dca0fc", + "id": "1207744c", "metadata": {}, "outputs": [], "source": [ - "%%stata\n", - "/// opis zbioru\n", - "des\n", - "/// Rozpoczynamy analize od podstawowych statystyk zmiennych psychologicznych\n", - "summarize outdoor social conservative\n", - "/// Podstawowe statystyki w podziale na typ wykonywanej pracy\n", - "/// opcja stat definiuje statystki do wyswietlenia\n", - "/// opcja col(stat) okresla ze maja one byc wyswietlone w kolumnach tabeli\n", - "tabstat outdoor social conservative, by(job) stat(n mean sd min max) col(stat)\n", - "/// Tablica korelacji. Opcja sig wyswietla wartosc p dla hipotezy o braku korelacji\n", - "pwcorr outdoor social conservative, sig\n", - "/// Analiza rozkladu zmiennej grupujacej\n", - "tabulate job" + "discrim = pd.read_stata(\"https://stats.idre.ucla.edu/stat/stata/dae/discrim.dta\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41a7c8a8", + "metadata": {}, + "outputs": [], + "source": [ + "%stata des" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ece2ebfc", + "metadata": {}, + "outputs": [], + "source": [ + "discrim.dtypes" ] }, { "cell_type": "markdown", - "id": "994d8d28-dd4e-4c39-ace0-f8e5bbe789d6", + "id": "6d24b1f9", "metadata": {}, "source": [ - "Kanoniczna analiza dyskryminacyjna \n", - "Wykonanie polecenia powoduje wykonanie kanonicznej analizy dyskryminacyjnej \n", - "Wyniki sa prezentowane w 6 tabelach \n", - "Tabela 1 - wartosci wlasne i statystyczna istotnosc kierunkow dyskryminacji \n", - "Tabela 2 - standaryzowane oszacowania kanonicznych wspolczynnikow dyskryminacji \n", - "Tabela 3 - tabela ladunkow kanonicznych \n", - "Tabela 4 - etykiety grup \n", - "Tabela 5 - srednie wartosci zmiennych kanonicznych dla grup \n", - "Tabela 6 - tabela klasyfikacji przy zalozonym jednostajnym rozkladzie a-priori. \n", - "Przyjmowany rozklad a-priori mozna zmienic wykorzystujac opcje prior " + " Rozpoczynamy analize od podstawowych statystyk zmiennych psychologicznych" ] }, { "cell_type": "code", "execution_count": null, - "id": "e6003535-c5de-4343-bf6c-3a123cd27405", + "id": "490723f6", "metadata": {}, "outputs": [], "source": [ - "discrim = pd.read_stata(\"https://stats.idre.ucla.edu/stat/stata/dae/discrim.dta\")" + "%stata summarize outdoor social conservative" ] }, { "cell_type": "code", "execution_count": null, - "id": "888d38fe-2782-4ee1-a3b0-54dfb7b1a662", + "id": "dd375532", "metadata": {}, "outputs": [], "source": [ "discrim.describe()" ] }, + { + "cell_type": "markdown", + "id": "10a2d432", + "metadata": {}, + "source": [ + "Podstawowe statystyki w podziale na typ wykonywanej pracy\n", + "- opcja stat definiuje statystki do wyswietlenia\n", + "- opcja col(stat) okresla ze maja one byc wyswietlone w kolumnach tabeli" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "be83605d-621f-471c-976d-e3cea071ce25", + "id": "18368fd5", + "metadata": {}, + "outputs": [], + "source": [ + "%stata tabstat outdoor social conservative, by(job) stat(n mean sd min max) col(stat)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9df183de", + "metadata": {}, + "outputs": [], + "source": [ + "discrim.groupby([\"job\"])[[\"outdoor\", \"social\", \"conservative\"]].describe().T" + ] + }, + { + "cell_type": "markdown", + "id": "5a08c73c", + "metadata": {}, + "source": [ + "Tablica korelacji. Opcja sig wyswietla wartosc p dla hipotezy o braku korelacji" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "958b3d6c", + "metadata": {}, + "outputs": [], + "source": [ + "%stata pwcorr outdoor social conservative, sig" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9b9fdce", "metadata": {}, "outputs": [], "source": [ "discrim[[\"outdoor\", \"social\", \"conservative\"]].corr()" ] }, + { + "cell_type": "markdown", + "id": "d61f9c60", + "metadata": {}, + "source": [ + "Analiza rozkladu zmiennej grupujacej\n" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "f3070b3d-3d6d-4e1d-aa0c-ef2f28f44796", + "id": "3de68133", "metadata": {}, "outputs": [], "source": [ - "discrim.job.value_counts()" + "\n", + "%stata tabulate job" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bafc5058", + "metadata": {}, + "outputs": [], + "source": [ + "tab = pd.DataFrame({\"Freq\": discrim[\"job\"].value_counts()})\n", + "tab[\"Percent\"] = tab / tab.sum() * 100\n", + "tab[\"Cum\"] = tab[\"Percent\"].cumsum()\n", + "tab" + ] + }, + { + "cell_type": "markdown", + "id": "994d8d28-dd4e-4c39-ace0-f8e5bbe789d6", + "metadata": {}, + "source": [ + "Kanoniczna analiza dyskryminacyjna \n", + "Wykonanie polecenia powoduje wykonanie kanonicznej analizy dyskryminacyjnej \n", + "Wyniki sa prezentowane w 6 tabelach \n", + "Tabela 1 - wartosci wlasne i statystyczna istotnosc kierunkow dyskryminacji \n", + "Tabela 2 - standaryzowane oszacowania kanonicznych wspolczynnikow dyskryminacji \n", + "Tabela 3 - tabela ladunkow kanonicznych \n", + "Tabela 4 - etykiety grup \n", + "Tabela 5 - srednie wartosci zmiennych kanonicznych dla grup \n", + "Tabela 6 - tabela klasyfikacji przy zalozonym jednostajnym rozkladzie a-priori. \n", + "Przyjmowany rozklad a-priori mozna zmienic wykorzystujac opcje prior " ] }, { @@ -202,8 +285,23 @@ "metadata": {}, "outputs": [], "source": [ - "%%stata\n", - "candisc outdoor social conservative, group(job)" + "%stata candisc outdoor social conservative, group(job)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ef48887", + "metadata": {}, + "outputs": [], + "source": [ + "y = discrim.job\n", + "X = discrim.loc[:, [\"outdoor\", \"social\", \"conservative\"]]\n", + "lda = LinearDiscriminantAnalysis(solver = 'eigen')\n", + "lda.fit(X, y)\n", + "print(lda.explained_variance_ratio_)\n", + "print(confusion_matrix(y, lda.predict(X)))\n", + "print(classification_report(y, lda.predict(X)))" ] }, { @@ -216,6 +314,18 @@ "%stata estat classfunctions" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4613bcc", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "print(lda.classes_)\n", + "print(lda.coef_.T, lda.intercept_)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -234,6 +344,15 @@ "loadingplot" ] }, + { + "cell_type": "markdown", + "id": "15b668ff", + "metadata": {}, + "source": [ + "Te same wyniki mozna uzyskac poleceniem **discrim lda**\n", + "W przypadku tego polecenia Stata wyswietla wylacznie tablice klasyfikacji" + ] + }, { "cell_type": "code", "execution_count": null, @@ -241,10 +360,7 @@ "metadata": {}, "outputs": [], "source": [ - "%%stata\n", - "/// Te same wyniki mozna uzyskac poleceniem discrim lda\n", - "/// W przypadku tego polecenia Stata wyswietla wylacznie tablice klasyfikacji\n", - "discrim lda outdoor social conservative, group(job)" + "%stata discrim lda outdoor social conservative, group(job)" ] }, { @@ -285,72 +401,71 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "90069c81-26ff-4d2a-ad91-f392d3eca488", + "cell_type": "markdown", + "id": "55945b1d", "metadata": {}, - "outputs": [], "source": [ - "y = discrim.job\n", - "X = discrim.loc[:, [\"outdoor\", \"social\", \"conservative\"]]" + "### Przykład 2.\n" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "507d9ebe-7b62-47dd-9f04-05bfcbe40a2a", + "cell_type": "markdown", + "id": "0c2f6711", "metadata": {}, - "outputs": [], "source": [ - "lda = LinearDiscriminantAnalysis(solver = 'eigen')\n", - "lda.fit(X, y)\n", - "print(lda.classes_)\n", - "lda.coef_.T, lda.intercept_" + "Jest to przyklad analizy dyskryminacyjnej, ktory mozna znalezc w niemal kazdym \n", + "podreczniku analizy wielowymiarowej. Zostal opracowany przez Fishera w 1936 roku\n", + "Analiza obejmuje trzy odmiany irysow i cztery zmienne opisujace ich cechy:\n", + "- Dlugosc platka [cm] (petal lenght)\n", + "- Szerokosc platka [cm] (petal width)\n", + "- Dlugosc listka kielicha [cm]\n", + "- Szerokosc listka kielicha [cm]" ] }, { "cell_type": "code", "execution_count": null, - "id": "ccad4d6a-e57a-4ac6-b693-a60d574c2c3a", + "id": "5b8a60ef", "metadata": {}, "outputs": [], "source": [ - "print(classification_report(y, lda.predict(X)))\n", - "confusion_matrix(y, lda.predict(X))" + "from multidim.datasets import load_iris\n", + "iris = load_iris()\n", + "iris.columns = [\"seplen\", \"sepwid\", \"petlen\", \"petwid\", \"iris\"]\n", + "iris[\"iris\"] = iris[\"iris\"].astype(\"category\")" ] }, { "cell_type": "markdown", - "id": "55945b1d", + "id": "78e98567", "metadata": {}, "source": [ - "### Przykład 2.\n" + "Wykorzystujemy dane z pythona w STATA poprzez argument -d" ] }, { - "cell_type": "markdown", - "id": "0c2f6711", + "cell_type": "code", + "execution_count": null, + "id": "1678c0a9-1f67-4661-ad8e-2265f7351472", "metadata": {}, + "outputs": [], "source": [ - "Jest to przyklad analizy dyskryminacyjnej, ktory mozna znalezc w niemal kazdym \n", - "podreczniku analizy wielowymiarowej. Zostal opracowany przez Fishera w 1936 roku\n", - "Analiza obejmuje trzy odmiany irysow i cztery zmienne opisujace ich cechy:\n", - "- Dlugosc platka [cm] (petal lenght)\n", - "- Szerokosc platka [cm] (petal width)\n", - "- Dlugosc listka kielicha [cm]\n", - "- Szerokosc listka kielicha [cm]" + "%%stata -d iris -force\n", + "di \"Fisher example iris data\"\n", + "egen giris = group(iris)" ] }, { "cell_type": "code", "execution_count": null, - "id": "1678c0a9-1f67-4661-ad8e-2265f7351472", + "id": "b4a3d501", "metadata": {}, "outputs": [], "source": [ - "%%stata\n", - "di \"Fisher example iris data\"\n", - "use ../../dane/iris.dta, clear" + "from multidim.datasets import load_iris\n", + "iris = load_iris()\n", + "iris.columns = [\"seplen\", \"sepwid\", \"petlen\", \"petwid\", \"iris\"]\n", + "iris[\"iris\"] = iris[\"iris\"].astype(\"category\")" ] }, { @@ -360,12 +475,9 @@ "metadata": {}, "outputs": [], "source": [ - "iris = pd.read_stata(\"../../dane/iris.dta\")\n", - "# iris_set = datasets.load_iris()\n", - "# iris_x = iris_set[\"data\"]\n", - "# iris_y = iris_set[\"target\"]\n", - "iris_x = iris.iloc[:,1:]\n", - "iris_y = iris.iloc[:,0].values\n", + "\n", + "iris_x = iris.iloc[:,:-1].values\n", + "iris_y = iris.iloc[:,-1].values\n", "# SCALING\n", "# W pewnych przypadkach istotna bedzie standaryzacja\n", "iris_x_scaled = (iris_x - np.mean(iris_x, axis = 0)) / np.std(iris_x, axis = 0)" @@ -378,11 +490,27 @@ "metadata": {}, "outputs": [], "source": [ - "%%stata\n", - "/// Opis danych\n", - "des\n", - "/// Podstawowe statystyki opisowe\n", - "bysort iris: su seplen sepwid petlen petwid " + "%stata des" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d2e8cee", + "metadata": {}, + "outputs": [], + "source": [ + "iris.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0cbefa7", + "metadata": {}, + "outputs": [], + "source": [ + "%stata bysort giris: su seplen sepwid petlen petwid " ] }, { @@ -397,52 +525,163 @@ "describe().T" ] }, + { + "cell_type": "markdown", + "id": "f5eec8eb", + "metadata": {}, + "source": [ + "Analiza dyskryminacji" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "8e9e87dc-86ec-4301-b0a8-dd320847271a", + "id": "7967a8ce", "metadata": {}, "outputs": [], "source": [ - "%%stata\n", - "/// Analiza dyskryminacji\n", - "discrim lda seplen sepwid petlen petwid, group(iris)\n", - "/// Tabela nieprawidlowo klasyfikowanych obserwacji\n", - "estat list, misclassified\n", - "/// wagi\n", - "estat classfunctions\n", - "/// Tabela niestandaryzowanych i standaryzowanych oszacowan kanonicznych wspolczynnikow dyskrminacji\n", - "estat loadings, unstandardized standardized\n", - "loadingplot\n", - "scoreplot, msymbol(i)\n", - "/// Uzyskanie klasyfikacji\n", - "predict klasyfikacja\n", - "/// Tabela klasyfikacji\n", - "tab iris klasyfikacja" + "%stata discrim lda seplen sepwid petlen petwid, group(giris)" ] }, { "cell_type": "code", "execution_count": null, - "id": "fd1cb87a-9cd8-43e3-95a4-62ed2b33a323", + "id": "219f66e0", "metadata": {}, "outputs": [], "source": [ "lda = LinearDiscriminantAnalysis(solver = 'eigen')\n", - "lda.fit(iris_x.values, iris_y)\n", + "lda.fit(iris_x, iris_y)" + ] + }, + { + "cell_type": "markdown", + "id": "e35dc518", + "metadata": {}, + "source": [ + "Tabela nieprawidlowo klasyfikowanych obserwacji\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fef8bcb2", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "%stata estat list, misclassified" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6adf93a", + "metadata": {}, + "outputs": [], + "source": [ + "# misclassified\n", + "np.where(iris_y != lda.predict(iris_x))" + ] + }, + { + "cell_type": "markdown", + "id": "f1034660", + "metadata": {}, + "source": [ + "Wagi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4172e3d5", + "metadata": {}, + "outputs": [], + "source": [ + "%stata estat classfunctions\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb792407", + "metadata": {}, + "outputs": [], + "source": [ + "\n", "print(lda.classes_)\n", "lda.coef_.T, lda.intercept_" ] }, + { + "cell_type": "markdown", + "id": "a3382dba", + "metadata": {}, + "source": [ + "/// Tabela niestandaryzowanych i standaryzowanych oszacowan kanonicznych wspolczynnikow dyskrminacji" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "c991130b-4944-4ad6-bea9-83fb7a0f4a98", + "id": "be08af4e", "metadata": {}, "outputs": [], "source": [ - "# misclassified\n", - "np.where(iris_y != lda.predict(iris_x.values))" + "%stata estat loadings, unstandardized standardized\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d1d2100", + "metadata": {}, + "outputs": [], + "source": [ + "lda.scalings_[:, 0:2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccf0ceef", + "metadata": {}, + "outputs": [], + "source": [ + "%stata loadingplot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5732cc8d", + "metadata": {}, + "outputs": [], + "source": [ + "%stata scoreplot, msymbol(i)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebb8f44f", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata \n", + "predict klasyfikacja\n", + "tab iris klasyfikacja" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "923fb626", + "metadata": {}, + "outputs": [], + "source": [ + "confusion_matrix(iris_y, lda.predict(iris_x))" ] }, { @@ -548,16 +787,6 @@ "Na podstawie danych z badania Diagnoza Spoleczna 2011 bedziemy analizowac stosunek obywateli do euro" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "15a5693b", - "metadata": {}, - "outputs": [], - "source": [ - "%stata use ../../dane/euro.dta, clear" - ] - }, { "cell_type": "code", "execution_count": null, @@ -565,7 +794,9 @@ "metadata": {}, "outputs": [], "source": [ - "euro = pd.read_stata(\"../../dane/euro.dta\")" + "from multidim.datasets import load_euro\n", + "euro = load_euro()\n", + "euro[\"wiek\"] = euro[\"wiek\"].astype(\"int\")" ] }, { @@ -575,7 +806,8 @@ "metadata": {}, "outputs": [], "source": [ - "%stata des" + "%%stata -d euro -force\n", + "des" ] }, { @@ -661,16 +893,46 @@ { "cell_type": "code", "execution_count": null, - "id": "37a367ac-beac-4a37-8654-dd65d1933eb4", + "id": "19cda014", "metadata": {}, "outputs": [], "source": [ "%%stata\n", "/// Zbadaj stosunek do euro metoda najblizszych sasiadow\n", "// losujemy 10% probe prosta by przyspieszyc obliczenia\n", - "sample 10\n", - "// Zastanow sie jaki rozklad a-priori wybrac aby zbadac poprawnosc klasyfikacji\n", - "discrim knn wiek klm kobieta wyksz zna_angielski partia_PO partia_PiS, group(euro) k(5)" + "sample 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bc438e7", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "egen geuro = group(euro)\n", + "egen gwyksz = group(wyksz)\n", + "egen gklm = group(klm)" + ] + }, + { + "cell_type": "markdown", + "id": "35092761", + "metadata": {}, + "source": [ + "Zastanow sie jaki rozklad a-priori wybrac aby zbadac poprawnosc klasyfikacji\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37a367ac-beac-4a37-8654-dd65d1933eb4", + "metadata": {}, + "outputs": [], + "source": [ + "%%stata\n", + "discrim knn wiek gklm kobieta gwyksz zna_angielski partia_PO partia_PiS, group(geuro) k(5)" ] }, { @@ -718,11 +980,19 @@ "source": [ "print(classification_report(y, preds))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "badbc79c", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3.10.6 ('.venv': venv)", "language": "python", "name": "python3" }, @@ -736,7 +1006,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.8" + }, + "vscode": { + "interpreter": { + "hash": "a8685faa0ed749449a0f1a8710c4e7cd8c1c7833bc8ac4d1844d25fbee35609f" + } } }, "nbformat": 4, diff --git a/src/multidim/notebooks/09_divisive.ipynb b/src/multidim/notebooks/09_divisive.ipynb index 2714557..0800d72 100644 --- a/src/multidim/notebooks/09_divisive.ipynb +++ b/src/multidim/notebooks/09_divisive.ipynb @@ -17,7 +17,7 @@ "source": [ "from multidim.utils import resolve_stata, load_stata\n", "\n", - "STATA_PATH, STATA_TYPE = resolve_stata(version = 18, stype = \"se\")\n", + "STATA_PATH, STATA_TYPE = resolve_stata(version = 17, stype = \"se\")\n", "# make sure they are proper ones\n", "STATA_PATH, STATA_TYPE" ] @@ -25,10 +25,8 @@ { "cell_type": "code", "execution_count": null, - "id": "e2f5fa31-b268-43c7-b43f-bb42ddae8e97", - "metadata": { - "tags": [] - }, + "id": "8a22419d", + "metadata": {}, "outputs": [], "source": [ "load_stata(STATA_PATH, STATA_TYPE)" @@ -42,15 +40,11 @@ "outputs": [], "source": [ "import pandas as pd\n", - "import scipy\n", - "import sklearn\n", "from sklearn.cluster import KMeans\n", "import numpy as np\n", "from collections import Counter\n", "from collections import defaultdict\n", - "from sklearn.metrics import calinski_harabasz_score\n", - "\n", - "import pyAesCrypt" + "from sklearn.metrics import calinski_harabasz_score" ] }, { @@ -85,25 +79,11 @@ { "cell_type": "code", "execution_count": null, - "id": "8fadfa59-30ac-4243-b8a0-e2d198e904d6", - "metadata": {}, - "outputs": [], - "source": [ - "if not os.path.isfile(\"../../dane/uscities.dta\"):\n", - " password = spec[\"password_pyaescrypt\"]\n", - " if password is None:\n", - " password = input(\"password: \")\n", - " pyAesCrypt.decryptFile(\"../../dane/uscities.dta.aes\", \"../../dane/uscities.dta\", password)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "639cbf85", + "id": "589cc854", "metadata": {}, "outputs": [], "source": [ - "%stata use ../../dane/uscities.dta, clear" + "from multidim.datasets import load_uscities" ] }, { @@ -113,7 +93,8 @@ "metadata": {}, "outputs": [], "source": [ - "uscities = pd.read_stata(\"../../dane/uscities.dta\")" + "uscities = load_uscities()\n", + "uscities_copy = uscities.copy()" ] }, { @@ -123,7 +104,7 @@ "metadata": {}, "outputs": [], "source": [ - "%%stata\n", + "%%stata -d uscities_copy\n", "// precipitaion = opady\n", "des\n", "sum\n", @@ -162,7 +143,7 @@ "source": [ "%%stata\n", "foreach zmienna of varlist so2 temp manuf pop wind precip days {\n", - "egen `zmienna'_s = std(`zmienna')\n", + " egen `zmienna'_s = std(`zmienna')\n", "}" ] }, @@ -220,7 +201,8 @@ " cc = str(i)\n", " kmeans = KMeans(n_clusters=i, random_state=1234, init = \"random\").fit(X)\n", " # print(kmeans.labels_)\n", - " res[cc][\"labels\"] = Counter(sorted(kmeans.labels_))\n", + " res[cc][\"labels\"] = kmeans.labels_\n", + " res[cc][\"count_labels\"] = Counter(sorted(kmeans.labels_))\n", " dfs = pd.DataFrame(kmeans.cluster_centers_.T)\n", " dfs.index = x_names_s\n", " res[cc][\"centres\"] = dfs\n", @@ -267,17 +249,6 @@ "dla podzialu zbiory danych o zadeklarowanej nazwie" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "35346661", - "metadata": {}, - "outputs": [], - "source": [ - "for c in cmean_l['r(names)'].split(\" \"):\n", - " stata.run('cluster stop {}'.format(c))" - ] - }, { "cell_type": "code", "execution_count": null, @@ -285,10 +256,10 @@ "metadata": {}, "outputs": [], "source": [ - "#%%stata\n", - "#forvalues i=2(1)6 {\n", - "#cluster stop cl`i'_mean\n", - "#}" + "%%stata\n", + "forvalues i=2(1)6 {\n", + " cluster stop cl`i'_mean\n", + "}" ] }, { @@ -343,7 +314,7 @@ "metadata": {}, "outputs": [], "source": [ - "c_eret_temp, c_sret_temp, c_ret_temp" + "# c_eret_temp, c_sret_temp, c_ret_temp" ] }, { @@ -374,12 +345,11 @@ { "cell_type": "code", "execution_count": null, - "id": "eb25a6b5", + "id": "27831626", "metadata": {}, "outputs": [], "source": [ - "for c in cmedian_l['r(names)'].split(\" \"):\n", - " stata.run('cluster stop {}'.format(c))" + "%stata cluster drop *" ] }, { @@ -393,16 +363,6 @@ "Zmieniamy metryke na miejska." ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "2753e359", - "metadata": {}, - "outputs": [], - "source": [ - "%stata cluster drop *" - ] - }, { "cell_type": "code", "execution_count": null, @@ -475,7 +435,7 @@ "cluster kmedian so2_s temp_s manuf_s pop_s wind_s precip_s days_s, k(6) name(cl6_median) measure(Linfinity)\n", "\n", "forvalues i=2(1)6 {\n", - "cluster stop cl`i'_median\n", + " cluster stop cl`i'_median\n", "}" ] }, @@ -499,6 +459,16 @@ "%stata net search silhouette" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "29485790", + "metadata": {}, + "outputs": [], + "source": [ + "%stata ssc install silhouette" + ] + }, { "cell_type": "markdown", "id": "772ceee8", @@ -550,9 +520,49 @@ "metadata": {}, "outputs": [], "source": [ - "%%stata\n", - "sort town\n", - "silhouette cl25_median, dist(dist) idvar(town) " + "%stata sort town" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20f57803", + "metadata": {}, + "outputs": [], + "source": [ + "%stata silhouette cl2_median, dist(dist) idvar(town) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8c152ce", + "metadata": {}, + "outputs": [], + "source": [ + "%stata silhouette cl5_median, dist(dist) idvar(town) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b50ec2e", + "metadata": {}, + "outputs": [], + "source": [ + "%stata version 17" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fd47093", + "metadata": {}, + "outputs": [], + "source": [ + "from yellowbrick.cluster import silhouette_visualizer\n", + "\n", + "silhouette_visualizer(KMeans(5), X, colors='yellowbrick')" ] }, { @@ -579,7 +589,7 @@ "metadata": {}, "outputs": [], "source": [ - "%stata use ../../dane/nauczyciele.dta, clear" + "from multidim.datasets import load_nauczyciele" ] }, { @@ -589,7 +599,18 @@ "metadata": {}, "outputs": [], "source": [ - "nauczyciele = pd.read_stata(\"../../dane/nauczyciele.dta\")" + "nauczyciele = load_nauczyciele()\n", + "nauczyciele_copy = nauczyciele.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ea0f446", + "metadata": {}, + "outputs": [], + "source": [ + "%stata clear" ] }, { @@ -599,7 +620,7 @@ "metadata": {}, "outputs": [], "source": [ - "%%stata\n", + "%%stata -d nauczyciele_copy\n", "// Obejrzyjmy dane.\n", "describe\n", "summarize" @@ -638,7 +659,7 @@ "%%stata\n", "// Wybor poczatkowej liczby skupien. Poniewaz mamy dwie cechy nauczycieli, zalozylismy ze beda co najmniej 4.\n", "forvalues i=4(1)9 {\n", - "cluster kmeans staz_c_std czas0205_std, k(`i') name(means_6_`i')\n", + " cluster kmeans staz_c_std czas0205_std, k(`i') name(means_6_`i')\n", "}" ] }, @@ -691,7 +712,7 @@ "source": [ "%%stata\n", "forvalues i=4(1)9 {\n", - "cluster stop means_6_`i'\n", + " cluster stop means_6_`i'\n", "}" ] }, @@ -756,7 +777,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3.10.8 ('.venv': venv)", "language": "python", "name": "python3" }, @@ -770,7 +791,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.6" + }, + "vscode": { + "interpreter": { + "hash": "a8685faa0ed749449a0f1a8710c4e7cd8c1c7833bc8ac4d1844d25fbee35609f" + } } }, "nbformat": 4, diff --git a/src/multidim/notebooks/10_hierarchical.ipynb b/src/multidim/notebooks/10_hierarchical.ipynb index fa8ee3d..53708f2 100644 --- a/src/multidim/notebooks/10_hierarchical.ipynb +++ b/src/multidim/notebooks/10_hierarchical.ipynb @@ -17,7 +17,7 @@ "source": [ "from multidim.utils import resolve_stata, load_stata\n", "\n", - "STATA_PATH, STATA_TYPE = resolve_stata(version = 18, stype = \"se\")\n", + "STATA_PATH, STATA_TYPE = resolve_stata(version = 17, stype = \"se\")\n", "# make sure they are proper ones\n", "STATA_PATH, STATA_TYPE" ] @@ -999,7 +999,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3.10.6 ('.venv': venv)", "language": "python", "name": "python3" }, @@ -1013,7 +1013,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.6" }, "vscode": { "interpreter": { diff --git a/src/multidim/notebooks/11_modern_clustering.ipynb b/src/multidim/notebooks/11_modern_clustering.ipynb index ac11346..4befe71 100644 --- a/src/multidim/notebooks/11_modern_clustering.ipynb +++ b/src/multidim/notebooks/11_modern_clustering.ipynb @@ -1,832 +1,832 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "2a48c727", - "metadata": { - "tags": [] - }, - "source": [ - "# Analiza Wielowymiarowa - zajecia 11 - Metody współczesne" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "53cb6590", - "metadata": {}, - "outputs": [], - "source": [ - "# Do tych zajęć nie ma kodów Staty, metody nie są oprogramowane" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "313a653d", - "metadata": {}, - "outputs": [], - "source": [ - "# Załadowanie bibliotek\n", - "import pandas as pd\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from sklearn.mixture import GaussianMixture\n", - "from sklearn.cluster import DBSCAN\n", - "from sklearn.cluster import KMeans\n", - "from sklearn import datasets\n", - "\n", - "from scipy.linalg import svd\n", - "\n", - "from multidim.datasets import load_iris" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6bfe09c", - "metadata": {}, - "outputs": [], - "source": [ - "CAT_COLORS = np.array([\"red\", \"blue\", \"yellow\", \"purple\", \"green\", \"orange\", \"black\", \"pink\", \"brown\", \"gray\", \"cyan\", \"magenta\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f5417c75", - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(1234)" - ] - }, - { - "cell_type": "markdown", - "id": "203271d0", - "metadata": {}, - "source": [ - "### Metody analizy skupień - sklearn\n", - "\n", - "Source: https://scikit-learn.org/stable/modules/clustering.html" - ] - }, - { - "cell_type": "markdown", - "id": "ee457577", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "

Method name

Parameters

Scalability

Usecase

Geometry (metric used)

K-Means

number of clusters

Very large n_samples, medium n_clusters with\n", - "MiniBatch code

General-purpose, even cluster size, flat geometry,\n", - "not too many clusters, inductive

Distances between points

Affinity propagation

damping, sample preference

Not scalable with n_samples

Many clusters, uneven cluster size, non-flat geometry, inductive

Graph distance (e.g. nearest-neighbor graph)

Mean-shift

bandwidth

Not scalable with n_samples

Many clusters, uneven cluster size, non-flat geometry, inductive

Distances between points

Spectral clustering

number of clusters

Medium n_samples, small n_clusters

Few clusters, even cluster size, non-flat geometry, transductive

Graph distance (e.g. nearest-neighbor graph)

Ward hierarchical clustering

number of clusters or distance threshold

Large n_samples and n_clusters

Many clusters, possibly connectivity constraints, transductive

Distances between points

Agglomerative clustering

number of clusters or distance threshold, linkage type, distance

Large n_samples and n_clusters

Many clusters, possibly connectivity constraints, non Euclidean\n", - "distances, transductive

Any pairwise distance

DBSCAN

neighborhood size

Very large n_samples, medium n_clusters

Non-flat geometry, uneven cluster sizes, outlier removal,\n", - "transductive

Distances between nearest points

OPTICS

minimum cluster membership

Very large n_samples, large n_clusters

Non-flat geometry, uneven cluster sizes, variable cluster density,\n", - "outlier removal, transductive

Distances between points

Gaussian mixtures

many

Not scalable

Flat geometry, good for density estimation, inductive

Mahalanobis distances to centers

BIRCH

branching factor, threshold, optional global clusterer.

Large n_clusters and n_samples

Large dataset, outlier removal, data reduction, inductive

Euclidean distance between points

Bisecting K-Means

number of clusters

Very large n_samples, medium n_clusters

General-purpose, even cluster size, flat geometry,\n", - "no empty clusters, inductive, hierarchical

Distances between points

" - ] - }, - { - "cell_type": "markdown", - "id": "d8a21aec", - "metadata": {}, - "source": [ - "![](https://scikit-learn.org/stable/_images/sphx_glr_plot_cluster_comparison_001.png)" - ] - }, - { - "cell_type": "markdown", - "id": "88ca1629", - "metadata": {}, - "source": [ - "### Skoncentrujemy sie na metodach GMM oraz DBSCAN. W tym celu wykorzystamy zbiór danych Iris oraz okregow." - ] - }, - { - "cell_type": "markdown", - "id": "dc2529d7", - "metadata": {}, - "source": [ - "### 1-D Example Data" - ] - }, - { - "cell_type": "markdown", - "id": "7be06648", - "metadata": {}, - "source": [ - "- Czym są etykiety miekkie (soft) oraz twarde (hard)?\n", - "- Kiedy mowimy o latent variable (zmienna ukryta)? Zmienna ukryta - której nie można bezpośrednio zaobserwować, ale można ją wywnioskować z obserwowanych zmiennych i parametrów.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c8125ef", - "metadata": {}, - "outputs": [], - "source": [ - "A = np.random.normal(size = 100, loc = 0, scale = 1)\n", - "B = np.random.normal(size = 100, loc = 5, scale = 2)" - ] - }, - { - "cell_type": "markdown", - "id": "6157574e", - "metadata": {}, - "source": [ - "Wykres danych przy zalożeniu że znane są ich etykiety:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "760237e3", - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "plt.figure()\n", - "plt.plot(A, np.zeros(100), \"o\", c = \"red\", alpha = 0.2)\n", - "plt.plot(B, np.zeros(100), \"o\", c = \"blue\", alpha = 0.2)\n", - "plt.yticks([])\n", - "plt.show()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "492688a1", - "metadata": {}, - "outputs": [], - "source": [ - "sns.set_style('whitegrid')\n", - "sns.kdeplot(A, bw_method=0.5, c = \"red\")\n", - "sns.kdeplot(B, bw_method=0.5, c = \"blue\")" - ] - }, - { - "cell_type": "markdown", - "id": "32d89829", - "metadata": {}, - "source": [ - "Wykres danych przy zalożeniu że nieznane są ich etykiety:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e57b5396", - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "plt.figure()\n", - "plt.plot(A, np.zeros(100), \"o\", c = \"black\", alpha = 0.2)\n", - "plt.plot(B, np.zeros(100), \"o\", c = \"black\", alpha = 0.2)\n", - "plt.yticks([])\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1c44a0c9", - "metadata": {}, - "outputs": [], - "source": [ - "sns.kdeplot(np.concatenate([A, B]), bw_method=0.5)" - ] - }, - { - "cell_type": "markdown", - "id": "f041b0c3", - "metadata": {}, - "source": [ - "### Iris Data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e1a38926", - "metadata": {}, - "outputs": [], - "source": [ - "iris = load_iris()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "33d9315f", - "metadata": {}, - "outputs": [], - "source": [ - "g = sns.PairGrid(iris, hue=\"Species\")\n", - "g.map_diag(sns.histplot)\n", - "g.map_offdiag(sns.scatterplot)\n", - "g.add_legend()" - ] - }, - { - "cell_type": "markdown", - "id": "864e43ae", - "metadata": {}, - "source": [ - "Standaryzacja danych oraz redukcja wymiaru do 2D." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "660b8255", - "metadata": {}, - "outputs": [], - "source": [ - "iris_x = iris.iloc[:, 0:4].values\n", - "iris_x = (iris_x - iris_x.mean(axis = 0)) / iris_x.std(axis = 0)\n", - "iris_y = iris.iloc[:, 4].values\n", - "iris_svd = np.linalg.svd(iris_x, full_matrices = False)\n", - "iris_x_pca = np.matmul(iris_x, iris_svd[2])\n", - "iris_x_pca_x = iris_x_pca[:, 0]\n", - "iris_x_pca_y = iris_x_pca[:, 1]" - ] - }, - { - "cell_type": "markdown", - "id": "33941348", - "metadata": {}, - "source": [ - "Wykres danych przy zalożeniu że znane są ich etykiety:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fef4c946", - "metadata": {}, - "outputs": [], - "source": [ - "c = pd.Series(iris_y).map({\"setosa\": 0, \"versicolor\": 1, \"virginica\": 2}).values\n", - "plt.scatter(iris_x_pca_x, iris_x_pca_y, s=10, color = np.array([\"red\", \"blue\", \"yellow\"])[c])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50e3ffef", - "metadata": {}, - "outputs": [], - "source": [ - "for ir in range(3):\n", - " sns.kdeplot(x = iris_x_pca_x[c == ir], y = iris_x_pca_y[c == ir], bw_method=0.5, color = CAT_COLORS[ir])" - ] - }, - { - "cell_type": "markdown", - "id": "277990bf", - "metadata": {}, - "source": [ - "Wykres danych przy zalożeniu że nieznane są ich etykiety:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f81aeb9b", - "metadata": {}, - "outputs": [], - "source": [ - "plt.scatter(iris_x_pca_x, iris_x_pca_y, s=10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9284a686", - "metadata": {}, - "outputs": [], - "source": [ - "for ir in range(3):\n", - " sns.kdeplot(x = iris_x_pca_x, y = iris_x_pca_y, bw_method=0.5)" - ] - }, - { - "cell_type": "markdown", - "id": "7f72aebb", - "metadata": {}, - "source": [ - "## K-means - irysy\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc04e6af", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.cluster import KMeans\n", - "kmeans = KMeans(n_clusters=3, random_state=1234)\n", - "kmeans.fit(iris_x_pca)\n", - "kmeans_labels = kmeans.predict(iris_x_pca)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0de960d", - "metadata": {}, - "outputs": [], - "source": [ - "# plot the cluster assignments and cluster centers\n", - "plt.scatter(iris_x_pca_x, iris_x_pca_y, c=kmeans_labels, s=10, cmap='viridis')" - ] - }, - { - "cell_type": "markdown", - "id": "d82ed0bc", - "metadata": {}, - "source": [ - "## GMM - Gaussian Mixture Model - irysy" - ] - }, - { - "cell_type": "markdown", - "id": "f34a8b35", - "metadata": {}, - "source": [ - "Dla przypomnienia algorytm EM dla GMM jest następujący:\n", - "\n", - "1. Wybierz liczbę skupień $K$.\n", - "2. Wybierz początkowe wartości $\\mu_1, \\ldots, \\mu_K$ oraz $\\Sigma_1, \\ldots, \\Sigma_K$.\n", - "3. Wykonaj kroki E i M aż do zbieżności." - ] - }, - { - "cell_type": "markdown", - "id": "1748bdc3", - "metadata": {}, - "source": [ - "Dla kroku \"Wybierz początkowe wartości $\\mu_1, \\ldots, \\mu_K$ oraz $\\Sigma_1, \\ldots, \\Sigma_K$.\", kluczowe jest podanie właściwej metody inicjalizacji. W bibliotece sklearn możliwe są następujące metody: 'random, 'random_from_data', 'kmeans', 'kmeans++'. Wartości domyślne to 'kmeans'.\n", - "\n", - "Opis metod inicjalizacji:\n", - "\n", - " * 'random' - losowe wartości z rozkładu jednostajnego\n", - " * 'random_from_data' - losowe wartości z rozkładu jednostajnego z zakresu danych\n", - " * 'kmeans' - wykorzystanie algorytmu k-means\n", - " * 'kmeans++' - wykorzystanie algorytmu k-means++ - [wybór punktów startowych w sposób zachłanny, tak aby odległość między punktami była jak największa](https://en.wikipedia.org/wiki/K-means%2B%2B)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b0acf880", - "metadata": {}, - "outputs": [], - "source": [ - "init_params = ['random', 'random_from_data', 'kmeans', 'k-means++']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0df9a88c", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "RANDOM_SEED = 1234\n", - "fig, ax = plt.subplots(2, 2, sharex='all', sharey='all')\n", - "for e, init_param in enumerate(init_params):\n", - " gm = GaussianMixture(n_components=3, random_state=RANDOM_SEED, verbose = 1, init_params=init_param, ).fit(iris_x_pca)\n", - " gm_labels = gm.predict(iris_x_pca)\n", - " ax.ravel()[e].scatter(iris_x_pca_x, iris_x_pca_y, s=10, color = CAT_COLORS[gm_labels])\n", - " ax.take(e).set_title(init_param)\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d075be9e", - "metadata": {}, - "outputs": [], - "source": [ - "RANDOM_SEED = 123456\n", - "\n", - "fig, ax = plt.subplots(2, 2, sharex='all', sharey='all')\n", - "for e, init_param in enumerate(init_params):\n", - " gm = GaussianMixture(n_components=3, random_state=RANDOM_SEED, verbose = 1, init_params=init_param, ).fit(iris_x_pca)\n", - " gm_labels = gm.predict(iris_x_pca)\n", - " ax.ravel()[e].scatter(iris_x_pca_x, iris_x_pca_y, s=10, color = CAT_COLORS[gm_labels])\n", - " ax.take(e).set_title(init_param)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "ffec011b", - "metadata": {}, - "source": [ - "GMM bazujace na k-means dla inicjalizacji" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b79fdf51", - "metadata": {}, - "outputs": [], - "source": [ - "gm = GaussianMixture(n_components=3, random_state=1234, verbose = 1, init_params=\"kmeans\").fit(iris_x_pca)\n", - "gm_labels = gm.predict(iris_x_pca)\n", - "g = sns.PairGrid(pd.DataFrame({'PC1': iris_x_pca[:, 0], 'PC2': iris_x_pca[:, 1], 'labels': gm_labels}), hue='labels')\n", - "g.map_upper(sns.scatterplot)\n", - "g.map_lower(sns.kdeplot)\n", - "g.map_diag(sns.kdeplot, lw=3, legend=False)" - ] - }, - { - "cell_type": "markdown", - "id": "bf198b4b", - "metadata": {}, - "source": [ - "soft labels - prawdopodobieństwo przynależności do danej grupy dla kazdej obserwacji w modelu GMM" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fe1e35e6", - "metadata": {}, - "outputs": [], - "source": [ - "pd.DataFrame(np.round(gm.predict_proba(iris_x_pca), 3), columns = ['Prob1','Prob2','Prob3']).\\\n", - " assign(obs = list(range(iris.shape[0]))).\\\n", - " sample(10, random_state = 1234).\\\n", - " sort_values(by = 'obs')" - ] - }, - { - "cell_type": "markdown", - "id": "b513345a", - "metadata": {}, - "source": [ - "GMM bazujace na losowym mechanizmie dla inicjalizacji" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "82c6f557", - "metadata": {}, - "outputs": [], - "source": [ - "gm = GaussianMixture(n_components=3, random_state=1234, verbose = 1, init_params=\"random\").fit(iris_x_pca)\n", - "gm_labels = gm.predict(iris_x_pca)\n", - "g = sns.PairGrid(pd.DataFrame({'PC1': iris_x_pca[:, 0], 'PC2': iris_x_pca[:, 1], 'labels': gm_labels}), hue='labels')\n", - "g.map_upper(sns.scatterplot)\n", - "g.map_lower(sns.kdeplot)\n", - "g.map_diag(sns.kdeplot, lw=3, legend=False)" - ] - }, - { - "cell_type": "markdown", - "id": "1477d7b8", - "metadata": {}, - "source": [ - "Macierz kowariancji w GMM moze byc:\n", - "\n", - " * diagonalna - macierz kowariancji jest macierza diagonalna\n", - " * pelna - macierz kowariancji jest macierza pelna\n", - " * diagonalna z ograniczeniami - macierz kowariancji jest macierza diagonalna z ograniczeniami na wartosci na diagonie\n", - " * pelna z ograniczeniami - macierz kowariancji jest macierza pelna z ograniczeniami na wartosci na diagonie" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e19da1f6", - "metadata": {}, - "outputs": [], - "source": [ - "# https://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_covariances.html\n", - "# covariances types: full, tied, diag, spherical" - ] - }, - { - "cell_type": "markdown", - "id": "789be179", - "metadata": {}, - "source": [ - "![](https://scikit-learn.org/stable/_images/sphx_glr_plot_gmm_covariances_001.png)\n" - ] - }, - { - "cell_type": "markdown", - "id": "723e407c", - "metadata": {}, - "source": [ - "## DBSCAN - irysy" - ] - }, - { - "cell_type": "markdown", - "id": "842d9509", - "metadata": {}, - "source": [ - "Dla przypomnienia DBSCAN wymaga podania dwóch głownych parametrów: eps oraz min_samples. \n", - "\n", - "- eps określa maksymalną odległość między punktami, aby zostały uznane za sąsiadów. \n", - "- min_samples określa minimalną liczbę punktów w sąsiedztwie punktu, aby został uznany za punkt centralny. Punkty, które nie spełniają tych warunków, są uznawane za szum.\n", - "\n", - "oraz dodatkowych jak:\n", - "\n", - "- metric - metryka odległościowa, domyślnie euklidesowa\n", - "\n", - "\n", - "Nie ma koniecznosci podania liczby klastrow." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d8f03dbf", - "metadata": {}, - "outputs": [], - "source": [ - "EPS = 1\n", - "MIN_SAMPLES = 5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92beb979", - "metadata": {}, - "outputs": [], - "source": [ - "dbscan = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES, ).fit(iris_x_pca)\n", - "dbscan_labels = dbscan.labels_" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "056d6f66", - "metadata": {}, - "outputs": [], - "source": [ - "dbscan_labels_unique = np.unique(dbscan_labels)\n", - "assert len(dbscan_labels_unique) == 3\n", - "print(dbscan_labels_unique)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6f69d33e", - "metadata": {}, - "outputs": [], - "source": [ - "plt.scatter(iris_x_pca_x, iris_x_pca_y, s=10, color = CAT_COLORS[dbscan_labels + 1])" - ] - }, - { - "cell_type": "markdown", - "id": "5bfcb521", - "metadata": {}, - "source": [ - "## Example - Circles data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18f0056c", - "metadata": {}, - "outputs": [], - "source": [ - "n_samples = 500\n", - "noisy_circles_x, noisy_circles_y = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05)\n", - "plt.scatter(noisy_circles_x[:, 0], noisy_circles_x[:, 1], s=10, color = CAT_COLORS[noisy_circles_y])" - ] - }, - { - "cell_type": "markdown", - "id": "478cc10f", - "metadata": {}, - "source": [ - "### K-Means - circles" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e68bc969", - "metadata": {}, - "outputs": [], - "source": [ - "kmeans = KMeans(n_clusters=2, random_state=1234).fit(noisy_circles_x)\n", - "kmeans_labels = kmeans.predict(noisy_circles_x)\n", - "plt.scatter(noisy_circles_x[:, 0], noisy_circles_x[:, 1], s=10, color = CAT_COLORS[kmeans_labels])" - ] - }, - { - "cell_type": "markdown", - "id": "53ca57e3", - "metadata": {}, - "source": [ - "### GMM - circles" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "49938ed7", - "metadata": {}, - "outputs": [], - "source": [ - "gm = GaussianMixture(n_components=2, random_state=1234, init_params=\"kmeans\").fit(noisy_circles_x)\n", - "gm_labels = gm.predict(noisy_circles_x)\n", - "plt.scatter(noisy_circles_x[:, 0], noisy_circles_x[:, 1], s=10, color = CAT_COLORS[gm_labels])" - ] - }, - { - "cell_type": "markdown", - "id": "507f1afd", - "metadata": {}, - "source": [ - "### DBSCAN - circles" - ] - }, - { - "cell_type": "markdown", - "id": "97f74068", - "metadata": {}, - "source": [ - "**Prosze przetestujcie z innymi wartosciami parametrow eps oraz min_samples.**\n", - "\n", - "Łatwo osiągnąć sytuacje, w której otrzymujemy wyniki niezgodne z oczekiwaniami." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59e82d35", - "metadata": {}, - "outputs": [], - "source": [ - "EPS = 0.2 # or 0.3 to get 2 clusters\n", - "MIN_SAMPLES = 10 # or 30 to get 2 clusters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ae78ee4a", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "dbscan = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES, ).fit(noisy_circles_x)\n", - "dbscan_labels = dbscan.labels_" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4cbc6048", - "metadata": {}, - "outputs": [], - "source": [ - "dbscan_labels_unique = np.unique(dbscan_labels)\n", - "print(dbscan_labels_unique)\n", - "plt.scatter(noisy_circles_x[:, 0], noisy_circles_x[:, 1], s=10, color = CAT_COLORS[dbscan_labels + 1])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ca24657", - "metadata": {}, - "outputs": [], - "source": [ - "# OPTICS" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - }, - "vscode": { - "interpreter": { - "hash": "a8685faa0ed749449a0f1a8710c4e7cd8c1c7833bc8ac4d1844d25fbee35609f" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "id": "2a48c727", + "metadata": { + "tags": [] + }, + "source": [ + "# Analiza Wielowymiarowa - zajecia 11 - Modern Clustering" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53cb6590", + "metadata": {}, + "outputs": [], + "source": [ + "# NO STATA code on this lecture" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "313a653d", + "metadata": {}, + "outputs": [], + "source": [ + "# Załadowanie bibliotek\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "from sklearn.mixture import GaussianMixture\n", + "from sklearn.cluster import DBSCAN\n", + "from sklearn.cluster import KMeans\n", + "from sklearn import datasets\n", + "\n", + "from scipy.linalg import svd\n", + "\n", + "from multidim.datasets import load_iris" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6bfe09c", + "metadata": {}, + "outputs": [], + "source": [ + "CAT_COLORS = np.array([\"red\", \"blue\", \"yellow\", \"purple\", \"green\", \"orange\", \"black\", \"pink\", \"brown\", \"gray\", \"cyan\", \"magenta\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5417c75", + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(1234)" + ] + }, + { + "cell_type": "markdown", + "id": "203271d0", + "metadata": {}, + "source": [ + "### Metody analizy skupień - sklearn\n", + "\n", + "Source: https://scikit-learn.org/stable/modules/clustering.html" + ] + }, + { + "cell_type": "markdown", + "id": "ee457577", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "

Method name

Parameters

Scalability

Usecase

Geometry (metric used)

K-Means

number of clusters

Very large n_samples, medium n_clusters with\n", + "MiniBatch code

General-purpose, even cluster size, flat geometry,\n", + "not too many clusters, inductive

Distances between points

Affinity propagation

damping, sample preference

Not scalable with n_samples

Many clusters, uneven cluster size, non-flat geometry, inductive

Graph distance (e.g. nearest-neighbor graph)

Mean-shift

bandwidth

Not scalable with n_samples

Many clusters, uneven cluster size, non-flat geometry, inductive

Distances between points

Spectral clustering

number of clusters

Medium n_samples, small n_clusters

Few clusters, even cluster size, non-flat geometry, transductive

Graph distance (e.g. nearest-neighbor graph)

Ward hierarchical clustering

number of clusters or distance threshold

Large n_samples and n_clusters

Many clusters, possibly connectivity constraints, transductive

Distances between points

Agglomerative clustering

number of clusters or distance threshold, linkage type, distance

Large n_samples and n_clusters

Many clusters, possibly connectivity constraints, non Euclidean\n", + "distances, transductive

Any pairwise distance

DBSCAN

neighborhood size

Very large n_samples, medium n_clusters

Non-flat geometry, uneven cluster sizes, outlier removal,\n", + "transductive

Distances between nearest points

OPTICS

minimum cluster membership

Very large n_samples, large n_clusters

Non-flat geometry, uneven cluster sizes, variable cluster density,\n", + "outlier removal, transductive

Distances between points

Gaussian mixtures

many

Not scalable

Flat geometry, good for density estimation, inductive

Mahalanobis distances to centers

BIRCH

branching factor, threshold, optional global clusterer.

Large n_clusters and n_samples

Large dataset, outlier removal, data reduction, inductive

Euclidean distance between points

Bisecting K-Means

number of clusters

Very large n_samples, medium n_clusters

General-purpose, even cluster size, flat geometry,\n", + "no empty clusters, inductive, hierarchical

Distances between points

" + ] + }, + { + "cell_type": "markdown", + "id": "d8a21aec", + "metadata": {}, + "source": [ + "![](https://scikit-learn.org/stable/_images/sphx_glr_plot_cluster_comparison_001.png)" + ] + }, + { + "cell_type": "markdown", + "id": "88ca1629", + "metadata": {}, + "source": [ + "### Skoncentrujemy sie na metodach GMM oraz DBSCAN. W tym celu wykorzystamy zbiór danych Iris oraz okregow." + ] + }, + { + "cell_type": "markdown", + "id": "dc2529d7", + "metadata": {}, + "source": [ + "### 1-D Example Data" + ] + }, + { + "cell_type": "markdown", + "id": "7be06648", + "metadata": {}, + "source": [ + "- Czym są etykiety miekkie (soft) oraz twarde (hard)?\n", + "- Kiedy mowimy o latent variable (zmienna ukryta)? Zmienna ukryta - której nie można bezpośrednio zaobserwować, ale można ją wywnioskować z obserwowanych zmiennych i parametrów.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c8125ef", + "metadata": {}, + "outputs": [], + "source": [ + "A = np.random.normal(size = 100, loc = 0, scale = 1)\n", + "B = np.random.normal(size = 100, loc = 5, scale = 2)" + ] + }, + { + "cell_type": "markdown", + "id": "6157574e", + "metadata": {}, + "source": [ + "Wykres danych przy zalożeniu że znane są ich etykiety:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "760237e3", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "plt.figure()\n", + "plt.plot(A, np.zeros(100), \"o\", c = \"red\", alpha = 0.2)\n", + "plt.plot(B, np.zeros(100), \"o\", c = \"blue\", alpha = 0.2)\n", + "plt.yticks([])\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "492688a1", + "metadata": {}, + "outputs": [], + "source": [ + "sns.set_style('whitegrid')\n", + "sns.kdeplot(A, bw_method=0.5, c = \"red\")\n", + "sns.kdeplot(B, bw_method=0.5, c = \"blue\")" + ] + }, + { + "cell_type": "markdown", + "id": "32d89829", + "metadata": {}, + "source": [ + "Wykres danych przy zalożeniu że nieznane są ich etykiety:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e57b5396", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "plt.figure()\n", + "plt.plot(A, np.zeros(100), \"o\", c = \"black\", alpha = 0.2)\n", + "plt.plot(B, np.zeros(100), \"o\", c = \"black\", alpha = 0.2)\n", + "plt.yticks([])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c44a0c9", + "metadata": {}, + "outputs": [], + "source": [ + "sns.kdeplot(np.concatenate([A, B]), bw_method=0.5)" + ] + }, + { + "cell_type": "markdown", + "id": "f041b0c3", + "metadata": {}, + "source": [ + "### Iris Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1a38926", + "metadata": {}, + "outputs": [], + "source": [ + "iris = load_iris()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33d9315f", + "metadata": {}, + "outputs": [], + "source": [ + "g = sns.PairGrid(iris, hue=\"Species\")\n", + "g.map_diag(sns.histplot)\n", + "g.map_offdiag(sns.scatterplot)\n", + "g.add_legend()" + ] + }, + { + "cell_type": "markdown", + "id": "864e43ae", + "metadata": {}, + "source": [ + "Standaryzacja danych oraz redukcja wymiaru do 2D." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "660b8255", + "metadata": {}, + "outputs": [], + "source": [ + "iris_x = iris.iloc[:, 0:4].values\n", + "iris_x = (iris_x - iris_x.mean(axis = 0)) / iris_x.std(axis = 0)\n", + "iris_y = iris.iloc[:, 4].values\n", + "iris_svd = np.linalg.svd(iris_x, full_matrices = False)\n", + "iris_x_pca = np.matmul(iris_x, iris_svd[2])\n", + "iris_x_pca_x = iris_x_pca[:, 0]\n", + "iris_x_pca_y = iris_x_pca[:, 1]" + ] + }, + { + "cell_type": "markdown", + "id": "33941348", + "metadata": {}, + "source": [ + "Wykres danych przy zalożeniu że znane są ich etykiety:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fef4c946", + "metadata": {}, + "outputs": [], + "source": [ + "c = pd.Series(iris_y).map({\"setosa\": 0, \"versicolor\": 1, \"virginica\": 2}).values\n", + "plt.scatter(iris_x_pca_x, iris_x_pca_y, s=10, color = np.array([\"red\", \"blue\", \"yellow\"])[c])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50e3ffef", + "metadata": {}, + "outputs": [], + "source": [ + "for ir in range(3):\n", + " sns.kdeplot(x = iris_x_pca_x[c == ir], y = iris_x_pca_y[c == ir], bw_method=0.5, color = CAT_COLORS[ir])" + ] + }, + { + "cell_type": "markdown", + "id": "277990bf", + "metadata": {}, + "source": [ + "Wykres danych przy zalożeniu że nieznane są ich etykiety:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f81aeb9b", + "metadata": {}, + "outputs": [], + "source": [ + "plt.scatter(iris_x_pca_x, iris_x_pca_y, s=10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9284a686", + "metadata": {}, + "outputs": [], + "source": [ + "for ir in range(3):\n", + " sns.kdeplot(x = iris_x_pca_x, y = iris_x_pca_y, bw_method=0.5)" + ] + }, + { + "cell_type": "markdown", + "id": "7f72aebb", + "metadata": {}, + "source": [ + "## K-means - irysy\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc04e6af", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import KMeans\n", + "kmeans = KMeans(n_clusters=3, random_state=1234)\n", + "kmeans.fit(iris_x_pca)\n", + "kmeans_labels = kmeans.predict(iris_x_pca)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0de960d", + "metadata": {}, + "outputs": [], + "source": [ + "# plot the cluster assignments and cluster centers\n", + "plt.scatter(iris_x_pca_x, iris_x_pca_y, c=kmeans_labels, s=10, cmap='viridis')" + ] + }, + { + "cell_type": "markdown", + "id": "d82ed0bc", + "metadata": {}, + "source": [ + "## GMM - Gaussian Mixture Model - irysy" + ] + }, + { + "cell_type": "markdown", + "id": "f34a8b35", + "metadata": {}, + "source": [ + "Dla przypomnienia algorytm EM dla GMM jest następujący:\n", + "\n", + "1. Wybierz liczbę skupień $K$.\n", + "2. Wybierz początkowe wartości $\\mu_1, \\ldots, \\mu_K$ oraz $\\Sigma_1, \\ldots, \\Sigma_K$.\n", + "3. Wykonaj kroki E i M aż do zbieżności." + ] + }, + { + "cell_type": "markdown", + "id": "1748bdc3", + "metadata": {}, + "source": [ + "Dla kroku \"Wybierz początkowe wartości $\\mu_1, \\ldots, \\mu_K$ oraz $\\Sigma_1, \\ldots, \\Sigma_K$.\", kluczowe jest podanie właściwej metody inicjalizacji. W bibliotece sklearn możliwe są następujące metody: 'random, 'random_from_data', 'kmeans', 'kmeans++'. Wartości domyślne to 'kmeans'.\n", + "\n", + "Opis metod inicjalizacji:\n", + "\n", + " * 'random' - losowe wartości z rozkładu jednostajnego\n", + " * 'random_from_data' - losowe wartości z rozkładu jednostajnego z zakresu danych\n", + " * 'kmeans' - wykorzystanie algorytmu k-means\n", + " * 'kmeans++' - wykorzystanie algorytmu k-means++ - [wybór punktów startowych w sposób zachłanny, tak aby odległość między punktami była jak największa](https://en.wikipedia.org/wiki/K-means%2B%2B)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0acf880", + "metadata": {}, + "outputs": [], + "source": [ + "init_params = ['random', 'random_from_data', 'kmeans', 'k-means++']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0df9a88c", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "RANDOM_SEED = 1234\n", + "fig, ax = plt.subplots(2, 2, sharex='all', sharey='all')\n", + "for e, init_param in enumerate(init_params):\n", + " gm = GaussianMixture(n_components=3, random_state=RANDOM_SEED, verbose = 1, init_params=init_param, ).fit(iris_x_pca)\n", + " gm_labels = gm.predict(iris_x_pca)\n", + " ax.ravel()[e].scatter(iris_x_pca_x, iris_x_pca_y, s=10, color = CAT_COLORS[gm_labels])\n", + " ax.take(e).set_title(init_param)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d075be9e", + "metadata": {}, + "outputs": [], + "source": [ + "RANDOM_SEED = 123456\n", + "\n", + "fig, ax = plt.subplots(2, 2, sharex='all', sharey='all')\n", + "for e, init_param in enumerate(init_params):\n", + " gm = GaussianMixture(n_components=3, random_state=RANDOM_SEED, verbose = 1, init_params=init_param, ).fit(iris_x_pca)\n", + " gm_labels = gm.predict(iris_x_pca)\n", + " ax.ravel()[e].scatter(iris_x_pca_x, iris_x_pca_y, s=10, color = CAT_COLORS[gm_labels])\n", + " ax.take(e).set_title(init_param)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "ffec011b", + "metadata": {}, + "source": [ + "GMM bazujace na k-means dla inicjalizacji" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b79fdf51", + "metadata": {}, + "outputs": [], + "source": [ + "gm = GaussianMixture(n_components=3, random_state=1234, verbose = 1, init_params=\"kmeans\").fit(iris_x_pca)\n", + "gm_labels = gm.predict(iris_x_pca)\n", + "g = sns.PairGrid(pd.DataFrame({'PC1': iris_x_pca[:, 0], 'PC2': iris_x_pca[:, 1], 'labels': gm_labels}), hue='labels')\n", + "g.map_upper(sns.scatterplot)\n", + "g.map_lower(sns.kdeplot)\n", + "g.map_diag(sns.kdeplot, lw=3, legend=False)" + ] + }, + { + "cell_type": "markdown", + "id": "bf198b4b", + "metadata": {}, + "source": [ + "soft labels - prawdopodobieństwo przynależności do danej grupy dla kazdej obserwacji w modelu GMM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe1e35e6", + "metadata": {}, + "outputs": [], + "source": [ + "pd.DataFrame(np.round(gm.predict_proba(iris_x_pca), 3), columns = ['Prob1','Prob2','Prob3']).\\\n", + " assign(obs = list(range(iris.shape[0]))).\\\n", + " sample(10, random_state = 1234).\\\n", + " sort_values(by = 'obs')" + ] + }, + { + "cell_type": "markdown", + "id": "b513345a", + "metadata": {}, + "source": [ + "GMM bazujace na losowym mechanizmie dla inicjalizacji" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82c6f557", + "metadata": {}, + "outputs": [], + "source": [ + "gm = GaussianMixture(n_components=3, random_state=1234, verbose = 1, init_params=\"random\").fit(iris_x_pca)\n", + "gm_labels = gm.predict(iris_x_pca)\n", + "g = sns.PairGrid(pd.DataFrame({'PC1': iris_x_pca[:, 0], 'PC2': iris_x_pca[:, 1], 'labels': gm_labels}), hue='labels')\n", + "g.map_upper(sns.scatterplot)\n", + "g.map_lower(sns.kdeplot)\n", + "g.map_diag(sns.kdeplot, lw=3, legend=False)" + ] + }, + { + "cell_type": "markdown", + "id": "1477d7b8", + "metadata": {}, + "source": [ + "Macierz kowariancji w GMM moze byc:\n", + "\n", + " * diagonalna - macierz kowariancji jest macierza diagonalna\n", + " * pelna - macierz kowariancji jest macierza pelna\n", + " * diagonalna z ograniczeniami - macierz kowariancji jest macierza diagonalna z ograniczeniami na wartosci na diagonie\n", + " * pelna z ograniczeniami - macierz kowariancji jest macierza pelna z ograniczeniami na wartosci na diagonie" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e19da1f6", + "metadata": {}, + "outputs": [], + "source": [ + "# https://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_covariances.html\n", + "# covariances types: full, tied, diag, spherical" + ] + }, + { + "cell_type": "markdown", + "id": "789be179", + "metadata": {}, + "source": [ + "![](https://scikit-learn.org/stable/_images/sphx_glr_plot_gmm_covariances_001.png)\n" + ] + }, + { + "cell_type": "markdown", + "id": "723e407c", + "metadata": {}, + "source": [ + "## DBSCAN - irysy" + ] + }, + { + "cell_type": "markdown", + "id": "842d9509", + "metadata": {}, + "source": [ + "Dla przypomnienia DBSCAN wymaga podania dwóch głownych parametrów: eps oraz min_samples. \n", + "\n", + "- eps określa maksymalną odległość między punktami, aby zostały uznane za sąsiadów. \n", + "- min_samples określa minimalną liczbę punktów w sąsiedztwie punktu, aby został uznany za punkt centralny. Punkty, które nie spełniają tych warunków, są uznawane za szum.\n", + "\n", + "oraz dodatkowych jak:\n", + "\n", + "- metric - metryka odległościowa, domyślnie euklidesowa\n", + "\n", + "\n", + "Nie ma koniecznosci podania liczby klastrow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8f03dbf", + "metadata": {}, + "outputs": [], + "source": [ + "EPS = 1\n", + "MIN_SAMPLES = 5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92beb979", + "metadata": {}, + "outputs": [], + "source": [ + "dbscan = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES, ).fit(iris_x_pca)\n", + "dbscan_labels = dbscan.labels_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "056d6f66", + "metadata": {}, + "outputs": [], + "source": [ + "dbscan_labels_unique = np.unique(dbscan_labels)\n", + "assert len(dbscan_labels_unique) == 3\n", + "print(dbscan_labels_unique)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f69d33e", + "metadata": {}, + "outputs": [], + "source": [ + "plt.scatter(iris_x_pca_x, iris_x_pca_y, s=10, color = CAT_COLORS[dbscan_labels + 1])" + ] + }, + { + "cell_type": "markdown", + "id": "5bfcb521", + "metadata": {}, + "source": [ + "## Example - Circles data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18f0056c", + "metadata": {}, + "outputs": [], + "source": [ + "n_samples = 500\n", + "noisy_circles_x, noisy_circles_y = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05)\n", + "plt.scatter(noisy_circles_x[:, 0], noisy_circles_x[:, 1], s=10, color = CAT_COLORS[noisy_circles_y])" + ] + }, + { + "cell_type": "markdown", + "id": "478cc10f", + "metadata": {}, + "source": [ + "### K-Means - circles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e68bc969", + "metadata": {}, + "outputs": [], + "source": [ + "kmeans = KMeans(n_clusters=2, random_state=1234).fit(noisy_circles_x)\n", + "kmeans_labels = kmeans.predict(noisy_circles_x)\n", + "plt.scatter(noisy_circles_x[:, 0], noisy_circles_x[:, 1], s=10, color = CAT_COLORS[kmeans_labels])" + ] + }, + { + "cell_type": "markdown", + "id": "53ca57e3", + "metadata": {}, + "source": [ + "### GMM - circles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49938ed7", + "metadata": {}, + "outputs": [], + "source": [ + "gm = GaussianMixture(n_components=2, random_state=1234, init_params=\"kmeans\").fit(noisy_circles_x)\n", + "gm_labels = gm.predict(noisy_circles_x)\n", + "plt.scatter(noisy_circles_x[:, 0], noisy_circles_x[:, 1], s=10, color = CAT_COLORS[gm_labels])" + ] + }, + { + "cell_type": "markdown", + "id": "507f1afd", + "metadata": {}, + "source": [ + "### DBSCAN - circles" + ] + }, + { + "cell_type": "markdown", + "id": "97f74068", + "metadata": {}, + "source": [ + "**Prosze przetestujcie z innymi wartosciami parametrow eps oraz min_samples.**\n", + "\n", + "Łatwo osiągnąć sytuacje, w której otrzymujemy wyniki niezgodne z oczekiwaniami." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59e82d35", + "metadata": {}, + "outputs": [], + "source": [ + "EPS = 0.2 # or 0.3 to get 2 clusters\n", + "MIN_SAMPLES = 10 # or 30 to get 2 clusters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae78ee4a", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "dbscan = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES, ).fit(noisy_circles_x)\n", + "dbscan_labels = dbscan.labels_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4cbc6048", + "metadata": {}, + "outputs": [], + "source": [ + "dbscan_labels_unique = np.unique(dbscan_labels)\n", + "print(dbscan_labels_unique)\n", + "plt.scatter(noisy_circles_x[:, 0], noisy_circles_x[:, 1], s=10, color = CAT_COLORS[dbscan_labels + 1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ca24657", + "metadata": {}, + "outputs": [], + "source": [ + "# OPTICS" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.8 ('.venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "vscode": { + "interpreter": { + "hash": "a8685faa0ed749449a0f1a8710c4e7cd8c1c7833bc8ac4d1844d25fbee35609f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 }