# mushroom_classification.py
# Forked from ACM-Research/Coding-Challenge-S22.
# (Source listing on GitHub: 114 lines, 86 loc, 3.02 KB.)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# Setup plotting
import matplotlib.pyplot as plt

# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-*' and newer releases reject the old names, so use
# whichever spelling this installation provides. If neither is available
# the default style is kept rather than aborting the script over a
# purely cosmetic setting.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

# Set Matplotlib defaults
plt.rc('figure', autolayout=True)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=18, titlepad=10)
plt.rc('animation', html='html5')
# Load the dataset
import pandas as pd
from IPython.display import display

shrooms = pd.read_csv('./mushrooms.csv')

# Binary-encode the target column: 'p' -> 0, 'e' -> 1.
# (Series.map leaves any label missing from the mapping as NaN.)
class_codes = {'p': 0, 'e': 1}
shrooms['class'] = shrooms['class'].map(class_codes)

# Show the first rows as a quick sanity check
display(shrooms.head())

# Separate the target (y) from the input columns (X)
y = shrooms['class'].copy()
X = shrooms.drop(columns='class')
# Set up preprocessing pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# Fixed seed so the train/validation split -- and therefore the reported
# validation metrics -- is the same on every run.
SPLIT_SEED = 0

# Categorical input columns; each one is imputed and then one-hot encoded.
features_cat = [
    "cap-shape", "cap-surface", "cap-color", "bruises",
    "odor", "gill-attachment", "gill-spacing", "gill-size",
    "gill-color", "stalk-shape", "stalk-root",
    "stalk-surface-above-ring", "stalk-surface-below-ring",
    "stalk-color-above-ring", "stalk-color-below-ring",
    "veil-type", "veil-color", "ring-number", "ring-type",
    "spore-print-color", "population", "habitat",
]

# Missing values become the literal category "NA"; categories that were
# not seen while fitting are ignored by the encoder instead of raising.
transformer_cat = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OneHotEncoder(handle_unknown='ignore'),
)
preprocessor = make_column_transformer(
    (transformer_cat, features_cat),
)

# Stratify - make sure classes are evenly represented across splits
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, stratify=y, train_size=0.75, random_state=SPLIT_SEED,
)

# Fit the preprocessing on the training split only, then apply the fitted
# transformation to the validation split.
X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)

# The column transformer returns a sparse matrix only when the encoded
# data is sparse enough (its `sparse_threshold`); otherwise it already
# returns a dense array, which has no `.toarray()`. Densify only if needed.
if hasattr(X_train, "toarray"):
    X_train = X_train.toarray()
if hasattr(X_valid, "toarray"):
    X_valid = X_valid.toarray()
# Setup ML model
from tensorflow import keras
from tensorflow.keras import layers, Sequential, callbacks

# One input per column of the preprocessed training matrix
input_shape = [X_train.shape[1]]

# Two hidden ReLU layers followed by a single sigmoid unit for the binary
# target. The input is declared with an explicit Input object: passing
# `input_shape=` to the first Dense layer is discouraged by current Keras
# releases and triggers a warning there.
model = Sequential([
    keras.Input(shape=tuple(input_shape)),
    layers.Dense(256, activation='relu'),
    layers.Dense(256, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

# Stop once the validation loss has not improved by at least `min_delta`
# for `patience` consecutive epochs, and roll back to the best weights.
# 'val_loss' is the callback's default monitor; it is spelled out here so
# the stopping criterion is visible.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)
# Fit the network. `epochs` is only an upper bound: the early-stopping
# callback may end training sooner.
# Add verbose=0 to the options below to hide the per-epoch output.
fit_options = dict(
    validation_data=(X_valid, y_valid),
    batch_size=512,
    epochs=200,
    callbacks=[early_stopping],
)
history = model.fit(X_train, y_train, **fit_options)
# Plot results of model training
history_df = pd.DataFrame(history.history)

# One figure for the loss curves and one for the accuracy curves
for curve_columns in (['loss', 'val_loss'],
                      ['binary_accuracy', 'val_binary_accuracy']):
    history_df[curve_columns].plot()

# The best loss and best accuracy are taken independently, so they may
# come from different epochs.
best_val_loss = history_df['val_loss'].min()
best_val_accuracy = history_df['val_binary_accuracy'].max()
print(f"Best Validation Loss: {best_val_loss:0.5f}"
      f"\nBest Validation Accuracy: {best_val_accuracy:0.5f}")

plt.show()
# model.save('saved_model/my_model')