Updated Lab

UVADS · Sep 18, 2023 · 853605b · 853605b
1 parent 39ebb12
commit 853605b
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 11 deletions.
diff --git a/.gitignore b/.gitignore
@@ -14,3 +14,5 @@ README.html
 in_class_example_knitr.html
 Lab_Overview.html
 Decision_Tree_Lab_Answers.Rmd
+03_Communications/presentation_example.html
+03_Communications/presentation_example.html
diff --git a/02_function_basics/Class_II_Python.ipynb b/02_function_basics/Class_II_Python.ipynb
@@ -50,7 +50,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "317858ce",
    "metadata": {},
    "outputs": [
@@ -1035,7 +1035,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.17"
+   "version": "3.10.12"
   },
   "vscode": {
    "interpreter": {

diff --git a/05_ML_Concepts_II_Data_Prep/ml_bootcamp.py b/05_ML_Concepts_II_Data_Prep/ml_bootcamp.py
@@ -33,18 +33,22 @@
 #read in the cereal dataset, you should have this locally or you can use the URL linking to the class repo below
 cereal = pd.read_csv("https://raw.githubusercontent.com/UVADS/DS-3001/main/data/cereal.csv")
 
-print(cereal.info()) # Let's check the structure of the dataset and see if we have any issues with variable classes
+cereal.info() # Let's check the structure of the dataset and see if we have any issues with variable classes
 #usually it's converting things to category
 
 
 # %%
-#Looks like columns 1,2,11 and 12 need to be converted to category
+#Looks like the type, mfr, vitamins and shelf columns need to be converted to category
 
-Column_index_list = [1,2,11,12]
-cereal.iloc[:,Column_index_list]= cereal.iloc[:,Column_index_list].astype('category') 
+cols = ["type","mfr","vitamins","shelf"]
+cereal[cols]= cereal[cols].astype('category') 
 #selecting with a list of column names converts all of the listed columns at once; iloc would instead access columns by integer position, bypassing having to manually type in the names of each column
 
-print(cereal.dtypes) #another way of checking the structure of the dataset. Simpler, but does not give an index
+#convert the type variable into a category variable
+#cereal.type = cereal.type.astype('category') #this is the same as the above code, but for a single column
+
+
+cereal.dtypes #another way of checking the structure of the dataset. Simpler, but does not give an index
 
 # %%
 #Let's take a closer look at mfr
@@ -63,6 +67,7 @@
 
 print(cereal.mfr.value_counts()) #This is a lot better
 
+
 # %%
 print(cereal.type.value_counts()) #looks good
 
@@ -110,11 +115,11 @@
 # %%
 # Next let's one-hot encode those categorical variables
 
-category_list = list(cereal.select_dtypes('category')) #select function to find the categorical variables and create a list  
+#category_list = list(cereal.select_dtypes('category')) #select function to find the categorical variables and create a list  
 
-cereal_1h = pd.get_dummies(cereal, columns = category_list) 
+#cereal_1h = pd.get_dummies(cereal, columns = category_list) 
 #get_dummies encodes categorical variables into binary by adding an indicator column for each group of a category and assigning it 0 if false or 1 if true
-print(cereal_1h) #see the difference? This is one-hot encoding!
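+cereal_1h.info() #see the difference? This is one-hot encoding! NOTE(review): the get_dummies line that creates cereal_1h is commented out above - uncomment it before running this cell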
 
 # %% [markdown]
 # ### Baseline/Prevalence 
@@ -131,7 +136,7 @@
 cereal_1h['rating_f'] = pd.cut(cereal_1h.rating, bins = [-1,0.43,1], labels =[0,1])
 #If we want two segments we input three numbers, start, cut and stop values
 
-print(cereal_1h) #notice the new column rating_f, it is now binary based on if the continuous value is above 0.43 or not
+cereal.info() #NOTE(review): the new column rating_f was added to cereal_1h, not cereal, so inspect cereal_1h to see it; it is now binary based on if the continuous value is above 0.43 or not
 
 # %%
 #So now let's check the prevalence