From 853605b2591b45519a805d2c3bdeac9aabdda01e Mon Sep 17 00:00:00 2001 From: NovaVolunteer Date: Mon, 18 Sep 2023 09:40:11 -0400 Subject: [PATCH] Updated Lab --- .gitignore | 2 ++ 02_function_basics/Class_II_Python.ipynb | 4 ++-- 05_ML_Concepts_II_Data_Prep/ml_bootcamp.py | 23 +++++++++++++--------- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index c2ae83f..746a8fd 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ README.html in_class_example_knitr.html Lab_Overview.html Decision_Tree_Lab_Answers.Rmd +03_Communications/presentation_example.html +03_Communications/presentation_example.html diff --git a/02_function_basics/Class_II_Python.ipynb b/02_function_basics/Class_II_Python.ipynb index 574c5dc..b57a53d 100644 --- a/02_function_basics/Class_II_Python.ipynb +++ b/02_function_basics/Class_II_Python.ipynb @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "317858ce", "metadata": {}, "outputs": [ @@ -1035,7 +1035,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.17" + "version": "3.10.12" }, "vscode": { "interpreter": { diff --git a/05_ML_Concepts_II_Data_Prep/ml_bootcamp.py b/05_ML_Concepts_II_Data_Prep/ml_bootcamp.py index 5bac72d..f9571c5 100644 --- a/05_ML_Concepts_II_Data_Prep/ml_bootcamp.py +++ b/05_ML_Concepts_II_Data_Prep/ml_bootcamp.py @@ -33,18 +33,22 @@ #read in the cereal dataset, you should have this locally or you can use the URL linking to the class repo below cereal = pd.read_csv("https://raw.githubusercontent.com/UVADS/DS-3001/main/data/cereal.csv") -print(cereal.info()) # Let's check the structure of the dataset and see if we have any issues with variable classes +cereal.info() # Let's check the structure of the dataset and see if we have any issues with variable classes #usually it's converting things to category # %% -#Looks like columns 1,2,11 and 12 need to be converted to category +#Looks like columns 11 and 12 need to be converted to category -Column_index_list = [1,2,11,12] -cereal.iloc[:,Column_index_list]= cereal.iloc[:,Column_index_list].astype('category') +cols = ["type","mfr","vitamins","shelf"] +cereal[cols]= cereal[cols].astype('category') #iloc accesses the index of a dataframe, bypassing having to manually type in the names of each column -print(cereal.dtypes) #another way of checking the structure of the dataset. Simpler, but does not give an index +#convert type variable in category variable +#cereal.type = cereal.type.astype('category') #this is the same as the above code, but for a single column + + +cereal.dtypes #another way of checking the structure of the dataset. Simpler, but does not give an index # %% #Let's take a closer look at mfr @@ -63,6 +67,7 @@ print(cereal.mfr.value_counts()) #This is a lot better + # %% print(cereal.type.value_counts()) #looks good @@ -110,11 +115,11 @@ # %% # Next let's one-hot encode those categorical variables -category_list = list(cereal.select_dtypes('category')) #select function to find the categorical variables and create a list +#category_list = list(cereal.select_dtypes('category')) #select function to find the categorical variables and create a list -cereal_1h = pd.get_dummies(cereal, columns = category_list) +#cereal_1h = pd.get_dummies(cereal, columns = category_list) #get_dummies encodes categorical variables into binary by adding in indicator column for each group of a category and assigning it 0 if false or 1 if true -print(cereal_1h) #see the difference? This is one-hot encoding! +cereal_1h.info() #see the difference? This is one-hot encoding! # %% [markdown] # ### Baseline/Prevalance @@ -131,7 +136,7 @@ cereal_1h['rating_f'] = pd.cut(cereal_1h.rating, bins = [-1,0.43,1], labels =[0,1]) #If we want two segments we input three numbers, start, cut and stop values -print(cereal_1h) #notice the new column rating_f, it is now binary based on if the continuous value is above 0.43 or not +cereal.info() #notice the new column rating_f, it is now binary based on if the continuous value is above 0.43 or not # %% #So now let's check the prevalence