Skip to content

Commit

Permalink
Updated Lab
Browse files Browse the repository at this point in the history
  • Loading branch information
NovaVolunteer committed Sep 18, 2023
1 parent 39ebb12 commit 853605b
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 11 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@ README.html
in_class_example_knitr.html
Lab_Overview.html
Decision_Tree_Lab_Answers.Rmd
03_Communications/presentation_example.html
03_Communications/presentation_example.html
4 changes: 2 additions & 2 deletions 02_function_basics/Class_II_Python.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "317858ce",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -1035,7 +1035,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.17"
"version": "3.10.12"
},
"vscode": {
"interpreter": {
Expand Down
23 changes: 14 additions & 9 deletions 05_ML_Concepts_II_Data_Prep/ml_bootcamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,18 +33,22 @@
#read in the cereal dataset, you should have this locally or you can use the URL linking to the class repo below
cereal = pd.read_csv("https://raw.githubusercontent.com/UVADS/DS-3001/main/data/cereal.csv")

print(cereal.info()) # Let's check the structure of the dataset and see if we have any issues with variable classes
cereal.info() # Let's check the structure of the dataset and see if we have any issues with variable classes
#usually it's converting things to category


# %%
#Looks like columns 1,2,11 and 12 need to be converted to category
#Looks like columns 11 and 12 need to be converted to category

Column_index_list = [1,2,11,12]
cereal.iloc[:,Column_index_list]= cereal.iloc[:,Column_index_list].astype('category')
cols = ["type","mfr","vitamins","shelf"]
cereal[cols]= cereal[cols].astype('category')
#iloc selects columns by their integer position, which avoids having to manually type in the name of each column

print(cereal.dtypes) #another way of checking the structure of the dataset. Simpler, but does not give an index
#convert the type variable into a category variable
#cereal.type = cereal.type.astype('category') #this is the same as the above code, but for a single column


cereal.dtypes #another way of checking the structure of the dataset. Simpler, but does not give an index

# %%
#Let's take a closer look at mfr
Expand All @@ -63,6 +67,7 @@

print(cereal.mfr.value_counts()) #This is a lot better


# %%
print(cereal.type.value_counts()) #looks good

Expand Down Expand Up @@ -110,11 +115,11 @@
# %%
# Next let's one-hot encode those categorical variables

category_list = list(cereal.select_dtypes('category')) #select function to find the categorical variables and create a list
#category_list = list(cereal.select_dtypes('category')) #select function to find the categorical variables and create a list

cereal_1h = pd.get_dummies(cereal, columns = category_list)
#cereal_1h = pd.get_dummies(cereal, columns = category_list)
#get_dummies encodes categorical variables into binary by adding an indicator column for each group of a category and assigning it 0 if false or 1 if true
print(cereal_1h) #see the difference? This is one-hot encoding!
cereal_1h.info() #see the difference? This is one-hot encoding!

# %% [markdown]
# ### Baseline/Prevalence
Expand All @@ -131,7 +136,7 @@
cereal_1h['rating_f'] = pd.cut(cereal_1h.rating, bins = [-1,0.43,1], labels =[0,1])
#If we want two segments we input three numbers, start, cut and stop values

print(cereal_1h) #notice the new column rating_f, it is now binary based on if the continuous value is above 0.43 or not
cereal.info() #notice the new column rating_f, it is now binary based on if the continuous value is above 0.43 or not

# %%
#So now let's check the prevalence
Expand Down

0 comments on commit 853605b

Please sign in to comment.