From aab797b4e00272f35f5a02fa2fc9e6c8ba451539 Mon Sep 17 00:00:00 2001 From: Aaron Cruz Date: Fri, 28 Apr 2023 22:11:55 -0400 Subject: [PATCH] Add files via upload Second commit, did some additional cleaning of this dataframe. --- Python-Basic-Data-Cleaning.ipynb | 489 +++++++++++++++++++++++++++++-- 1 file changed, 457 insertions(+), 32 deletions(-) diff --git a/Python-Basic-Data-Cleaning.ipynb b/Python-Basic-Data-Cleaning.ipynb index 81b7df3..6efe415 100644 --- a/Python-Basic-Data-Cleaning.ipynb +++ b/Python-Basic-Data-Cleaning.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "id": "dd24ccc0", "metadata": {}, "outputs": [], @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "id": "e75c7c42", "metadata": {}, "outputs": [ @@ -221,7 +221,7 @@ "[5 rows x 82 columns]" ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -236,7 +236,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "89727911", "metadata": {}, "outputs": [], @@ -251,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "122bfa3b", "metadata": {}, "outputs": [], @@ -271,7 +271,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "b962446e", "metadata": {}, "outputs": [ @@ -477,7 +477,7 @@ "[5 rows x 32 columns]" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -489,7 +489,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "id": "5f2a4bac", "metadata": {}, "outputs": [], @@ -501,7 +501,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "id": "19f80a57", "metadata": {}, "outputs": [ @@ -735,7 +735,7 @@ "[5 rows x 31 columns]" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -746,7 +746,7 @@ }, 
{ "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "id": "5dba9a9c", "metadata": {}, "outputs": [ @@ -787,7 +787,7 @@ "Name: 526301100, dtype: object" ] }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -799,7 +799,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "id": "791a379b", "metadata": {}, "outputs": [], @@ -810,7 +810,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "id": "dd76c23f", "metadata": {}, "outputs": [ @@ -1212,7 +1212,7 @@ "[370 rows x 31 columns]" ] }, - "execution_count": 11, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1223,7 +1223,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "id": "d0ab5134", "metadata": {}, "outputs": [ @@ -1264,7 +1264,7 @@ "dtype: object" ] }, - "execution_count": 12, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1332,7 +1332,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "id": "52ff9cea", "metadata": {}, "outputs": [ @@ -1383,7 +1383,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "id": "a7531b07", "metadata": {}, "outputs": [], @@ -1394,7 +1394,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "id": "23aac6d1", "metadata": {}, "outputs": [ @@ -1445,7 +1445,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 18, "id": "4443cdeb", "metadata": {}, "outputs": [ @@ -1672,7 +1672,7 @@ " AllPub\n", " Inside\n", " Gtl\n", - " Old Town\n", + " OldTown\n", " 1Fam\n", " ...\n", " 1566\n", @@ -1696,7 +1696,7 @@ " AllPub\n", " Inside\n", " Gtl\n", - " Old Town\n", + " OldTown\n", " 1Fam\n", " ...\n", " 1182\n", @@ -1720,7 +1720,7 @@ " AllPub\n", " Inside\n", " Gtl\n", - " Old Town\n", + " OldTown\n", " 1Fam\n", " ...\n", " 3608\n", @@ -1744,7 +1744,7 @@ " AllPub\n", " 
Inside\n", " Gtl\n", - " Old Town\n", + " OldTown\n", " 2fmCon\n", " ...\n", " 1118\n", @@ -1768,7 +1768,7 @@ " AllPub\n", " Inside\n", " Gtl\n", - " Old Town\n", + " OldTown\n", " 1Fam\n", " ...\n", " 1103\n", @@ -1810,11 +1810,11 @@ "534152100 Inside Gtl Northwest Ames 1Fam ... 1714 \n", "534176230 Inside Gtl Northwest Ames 1Fam ... 1645 \n", "... ... ... ... ... ... ... \n", - "902204120 Inside Gtl Old Town 1Fam ... 1566 \n", - "902206090 Inside Gtl Old Town 1Fam ... 1182 \n", - "902400110 Inside Gtl Old Town 1Fam ... 3608 \n", - "903229040 Inside Gtl Old Town 2fmCon ... 1118 \n", - "903234030 Inside Gtl Old Town 1Fam ... 1103 \n", + "902204120 Inside Gtl OldTown 1Fam ... 1566 \n", + "902206090 Inside Gtl OldTown 1Fam ... 1182 \n", + "902400110 Inside Gtl OldTown 1Fam ... 3608 \n", + "903229040 Inside Gtl OldTown 2fmCon ... 1118 \n", + "903234030 Inside Gtl OldTown 1Fam ... 1103 \n", "\n", " Bedroom AbvGr TotRms AbvGrd Paved Drive Wood Deck SF Fence \\\n", "PID \n", @@ -1847,7 +1847,7 @@ "[87 rows x 31 columns]" ] }, - "execution_count": 24, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1860,6 +1860,431 @@ "df.replace('NWAmes', 'Northwest Ames')" ] }, + { + "cell_type": "code", + "execution_count": 24, + "id": "ee7aa7be", + "metadata": {}, + "outputs": [], + "source": [ + "# Convert columns to lower case, replacing spaces with hyphens.\n", + "df.columns = df.columns.str.lower()\n", + "df.columns = df.columns.str.replace(' ', '-')" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "067a4df9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orderms-subclassms-zoninglot-frontagelot-areautilitieslot-configland-slopeneighborhoodbldg-type...gr-liv-areabedroom-abvgrtotrms-abvgrdpaved-drivewood-deck-sffencemo-soldyr-soldsale-typesaleprice
PID
5273021102020RL85.013175AllPubInsideGtlNWAmes1Fam...207337Y349MnPrv22010WD210000
5273581402120RL105.011751AllPubInsideGtlNWAmes1Fam...184437Y0MnPrv12010COD190000
5273582002285RL85.010625AllPubInsideGtlNWAmes1Fam...117336Y0MnPrv12010WD170000
53415210011560RL80.010421AllPubInsideGtlNWAmes1Fam...171437Y228MnPrv32010WD196500
53417623011760RL80.09600AllPubInsideGtlNWAmes1Fam...164547Y0GdWo62010WD171000
..................................................................
902204120265350RM50.06000AllPubInsideGtlOldTown1Fam...156657Y24MnPrv82006WD139000
902206090265550RM56.09576AllPubInsideGtlOldTown1Fam...118235Y0GdWo52006WD120000
902400110266775RM90.022950AllPubInsideGtlOldTown1Fam...3608412Y0GdPrv62006WD475000
9032290402675190RM65.07800AllPubInsideGtlOldTown2fmCon...111835Y0MnPrv52006WD119900
903234030268330RM50.06000AllPubInsideGtlOldTown1Fam...110325Y166MnPrv72006WD110500
\n", + "

87 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " order ms-subclass ms-zoning lot-frontage lot-area utilities \\\n", + "PID \n", + "527302110 20 20 RL 85.0 13175 AllPub \n", + "527358140 21 20 RL 105.0 11751 AllPub \n", + "527358200 22 85 RL 85.0 10625 AllPub \n", + "534152100 115 60 RL 80.0 10421 AllPub \n", + "534176230 117 60 RL 80.0 9600 AllPub \n", + "... ... ... ... ... ... ... \n", + "902204120 2653 50 RM 50.0 6000 AllPub \n", + "902206090 2655 50 RM 56.0 9576 AllPub \n", + "902400110 2667 75 RM 90.0 22950 AllPub \n", + "903229040 2675 190 RM 65.0 7800 AllPub \n", + "903234030 2683 30 RM 50.0 6000 AllPub \n", + "\n", + " lot-config land-slope neighborhood bldg-type ... gr-liv-area \\\n", + "PID ... \n", + "527302110 Inside Gtl NWAmes 1Fam ... 2073 \n", + "527358140 Inside Gtl NWAmes 1Fam ... 1844 \n", + "527358200 Inside Gtl NWAmes 1Fam ... 1173 \n", + "534152100 Inside Gtl NWAmes 1Fam ... 1714 \n", + "534176230 Inside Gtl NWAmes 1Fam ... 1645 \n", + "... ... ... ... ... ... ... \n", + "902204120 Inside Gtl OldTown 1Fam ... 1566 \n", + "902206090 Inside Gtl OldTown 1Fam ... 1182 \n", + "902400110 Inside Gtl OldTown 1Fam ... 3608 \n", + "903229040 Inside Gtl OldTown 2fmCon ... 1118 \n", + "903234030 Inside Gtl OldTown 1Fam ... 1103 \n", + "\n", + " bedroom-abvgr totrms-abvgrd paved-drive wood-deck-sf fence \\\n", + "PID \n", + "527302110 3 7 Y 349 MnPrv \n", + "527358140 3 7 Y 0 MnPrv \n", + "527358200 3 6 Y 0 MnPrv \n", + "534152100 3 7 Y 228 MnPrv \n", + "534176230 4 7 Y 0 GdWo \n", + "... ... ... ... ... ... \n", + "902204120 5 7 Y 24 MnPrv \n", + "902206090 3 5 Y 0 GdWo \n", + "902400110 4 12 Y 0 GdPrv \n", + "903229040 3 5 Y 0 MnPrv \n", + "903234030 2 5 Y 166 MnPrv \n", + "\n", + " mo-sold yr-sold sale-type saleprice \n", + "PID \n", + "527302110 2 2010 WD 210000 \n", + "527358140 1 2010 COD 190000 \n", + "527358200 1 2010 WD 170000 \n", + "534152100 3 2010 WD 196500 \n", + "534176230 6 2010 WD 171000 \n", + "... ... ... ... ... 
\n", + "902204120 8 2006 WD 139000 \n", + "902206090 5 2006 WD 120000 \n", + "902400110 6 2006 WD 475000 \n", + "903229040 5 2006 WD 119900 \n", + "903234030 7 2006 WD 110500 \n", + "\n", + "[87 rows x 31 columns]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, { "cell_type": "code", "execution_count": 19,