This repository has been archived by the owner on Aug 25, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 8
/
train.py
251 lines (219 loc) · 8.66 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
import joblib
from typing import Tuple
import bdrk
import numpy as np
import pandas as pd
from bdrk.model_analyzer import ModelAnalyzer, ModelTypes
from boxkite.monitoring.collector import (
BaselineMetricCollector,
FeatureHistogramCollector,
InferenceHistogramCollector
)
from boxkite.monitoring.encoder import MetricEncoder
from environs import Env
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Job configuration, read from environment variables when the module is
# imported. No defaults are supplied for any of these variables.
env = Env()
OUTPUT_MODEL_PATH = env("OUTPUT_MODEL_PATH")
TRAIN_DATA_PATH = env("TRAIN_DATA_PATH")
TEST_DATA_PATH = env("TEST_DATA_PATH")
# Inverse regularization strength passed to the logistic regression model.
C = env.float("C")

# Fairness configuration passed to the model analyzer, keyed by the name of
# the protected feature column (`large_rings` is derived in `load_dataset`).
CONFIG_FAI = {
    "large_rings": {
        # Feature values that identify the privileged group, and its label.
        "privileged_attribute_values": [1],
        "privileged_group_name": "Large",
        # Feature values that identify the unprivileged group, and its label.
        "unprivileged_attribute_values": [0],
        "unprivileged_group_name": "Small",
    },
}
def load_dataset(filepath: str,
                 target: str) -> Tuple[pd.core.frame.DataFrame,
                                       np.ndarray]:
    """
    Read a CSV file and split it into a feature frame and a target array.

    A binary `large_rings` column (1 when `Rings` > 10, otherwise 0) is
    added to the features. Rows containing any missing value are removed,
    and a warning is printed when that happens. The surviving rows keep
    their original index labels.

    :param filepath: Path of the CSV file to load
    :type filepath: str
    :param target: Name of the target column
    :type target: str
    :return: The features (every column except the target) and the target
        values
    :rtype: tuple[pandas.core.frame.DataFrame, numpy.ndarray]
    """
    frame = pd.read_csv(filepath)
    frame['large_rings'] = (frame['Rings'] > 10).astype(int)

    # Discard incomplete rows and report how many were lost.
    rows_before = len(frame)
    frame = frame.dropna(how="any", axis=0)
    num_rows_dropped = rows_before - len(frame)
    if num_rows_dropped > 0:
        print(f"Warning - dropped {num_rows_dropped} rows with NA data.")

    labels = frame[target].values
    features = frame.drop(columns=[target])
    return features, labels
def train_log_reg_model(X: pd.core.frame.DataFrame,
                        y: np.ndarray,
                        seed: float = 0,
                        C: float = 1,
                        verbose: bool = False) -> Pipeline:
    """
    Standardize the features, fit a logistic regression on the scaled
    values and bundle both fitted steps into a single pipeline.

    :param X: Features for training
    :type X: pandas.core.frame.DataFrame
    :param y: Target variable
    :type y: numpy.ndarray
    :param seed: `random_state` for logistic regression model
    :type seed: float
    :param C: Inverse of regularization strength
    :type C: float
    :param verbose: Whether to print progress messages
    :type verbose: bool
    :return: Pipeline with steps `scaling` and `model`, both already fitted
    :rtype: sklearn.pipeline.Pipeline
    """
    if verbose:
        print('\nTRAIN\nScaling...')
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(X)

    if verbose:
        print('Fitting...')
        print('C:', C)
    classifier = LogisticRegression(random_state=seed, C=C, max_iter=4000)
    classifier.fit(scaled_features, y)

    # The steps are fitted individually above; the pipeline only chains the
    # fitted objects so callers can predict from unscaled features.
    if verbose:
        print('Chaining pipeline...')
    fitted_pipeline = Pipeline([('scaling', scaler), ('model', classifier)])

    if verbose:
        print('Done training.')
    return fitted_pipeline
def compute_log_metrics(pipe: Pipeline,
                        x_test: pd.core.frame.DataFrame,
                        y_test: np.ndarray,
                        y_test_onehot: np.ndarray):
    """
    Evaluate the pipeline on the test set, print the scores and send them
    to the Bedrock logger.

    :param pipe: Pipeline of transforms with a trained final estimator
    :type pipe: sklearn.pipeline.Pipeline
    :param x_test: Features for testing
    :type x_test: pandas.core.frame.DataFrame
    :param y_test: Target variable data for testing
    :type y_test: numpy.ndarray
    :param y_test_onehot: One hot encoded target variable data
    :type y_test_onehot: numpy.ndarray
    :return: Test predicted probability and predictions
    :rtype: tuple[numpy.ndarray, numpy.ndarray]
    """
    probabilities = pipe.predict_proba(x_test)
    predictions = pipe.predict(x_test)

    # Label-based scores use the predicted classes; the ranking-based
    # scores (ROC AUC, average precision) use the one hot labels together
    # with the predicted probabilities.
    accuracy = metrics.accuracy_score(y_test, predictions)
    macro_precision = metrics.precision_score(
        y_test, predictions, average='macro')
    macro_recall = metrics.recall_score(
        y_test, predictions, average='macro')
    macro_f1 = metrics.f1_score(y_test, predictions, average='macro')
    macro_roc_auc = metrics.roc_auc_score(
        y_test_onehot, probabilities, average='macro', multi_class='ovr')
    macro_avg_precision = metrics.average_precision_score(
        y_test_onehot, probabilities, average='macro')

    print("\nEVALUATION\n"
          f"\tAccuracy = {accuracy:.4f}\n"
          f"\tPrecision (macro) = {macro_precision:.4f}\n"
          f"\tRecall (macro) = {macro_recall:.4f}\n"
          f"\tF1 score (macro) = {macro_f1:.4f}\n"
          f"\tROC AUC (macro) = {macro_roc_auc:.4f}\n"
          f"\tAverage precision (macro) = {macro_avg_precision:.4f}")

    # Bedrock Logger: captures model metrics
    logged_scores = {
        "Accuracy": accuracy,
        "Precision (macro)": macro_precision,
        "Recall (macro)": macro_recall,
        "F1 Score (macro)": macro_f1,
        "ROC AUC (macro)": macro_roc_auc,
        "Avg precision (macro)": macro_avg_precision,
    }
    bdrk.log_metrics(logged_scores)

    # `log_binary_classifier_metrics` assumes binary classification.
    # For multiclass labels, flattening the one hot labels and the class
    # probabilities scores all classes jointly (a "micro-average"), which
    # lets the same method be used.
    # See https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html # noqa: E501
    flat_labels = y_test_onehot.ravel().astype(int).tolist()  # list of int
    flat_scores = probabilities.ravel().astype(float).tolist()  # list of float
    bdrk.log_binary_classifier_metrics(flat_labels, flat_scores)

    return probabilities, predictions
def main():
    """
    Run the training job end to end.

    Steps: load the train and test CSV files, one hot encode the `Type`
    target with an encoder fitted on the training labels, train the
    logistic regression pipeline, save the feature names, encoder and
    pipeline with joblib, compute and log test metrics, write the baseline
    feature/inference histograms, and run the model analyzer
    (explainability and fairness).
    """
    x_train, y_train = load_dataset(
        filepath=TRAIN_DATA_PATH,
        target='Type'
    )
    x_test, y_test = load_dataset(
        filepath=TEST_DATA_PATH,
        target='Type'
    )
    print('X (train)')
    print(x_train)

    # sklearn `roc_auc_score` and `average_precision_score` expects
    # binary label indicators with shape (n_samples, n_classes)
    # NOTE(review): newer scikit-learn releases renamed `sparse` to
    # `sparse_output`; confirm against the pinned scikit-learn version.
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
    y_train_onehot = enc.fit_transform(y_train.reshape(-1, 1))
    # Only *transform* the test labels: re-fitting here would let the test
    # set redefine the class-to-column mapping, and the encoder saved below
    # would no longer be the one the model was trained with.
    # NOTE(review): with handle_unknown='ignore' a test label unseen in
    # training encodes as an all-zero row, which `argmax` below maps to
    # class 0.
    y_test_onehot = enc.transform(y_test.reshape(-1, 1))

    print('\nCATEGORIES')
    for value, category in enumerate(enc.categories_[0]):
        print(f'{category} : {value}')

    # Convert target variable to numeric values
    # ModelMonitoringService.export_text expect both features
    # and inference to be numeric values
    y_train = np.argmax(y_train_onehot, axis=1)
    y_test = np.argmax(y_test_onehot, axis=1)

    pipe = train_log_reg_model(x_train,
                               y_train,
                               seed=0,
                               C=C,
                               verbose=True)

    # Save trained model
    feature_names = x_train.columns.tolist()
    print("\nSAMPLE FEATURES")
    # Positional access: `load_dataset` keeps the original index labels, so
    # label 0 may be missing when the first CSV row was dropped.
    print({
        feature_name: str(x_train[feature_name].iloc[0])
        for feature_name in feature_names
    })
    joblib.dump([feature_names, enc, pipe], OUTPUT_MODEL_PATH)
    print('\nSaved trained one hot encoder and logistic regression model.')

    _, test_pred = compute_log_metrics(pipe,
                                       x_test,
                                       y_test,
                                       y_test_onehot)

    # Save feature and inference distribution
    train_predicted = pipe.predict(x_train).flatten().tolist()
    collectors = [
        FeatureHistogramCollector(
            # `items()` yields (column name, Series) pairs like the older
            # `iteritems()`, which newer pandas releases no longer provide.
            data=x_train.items(),
            discrete={7, 8},  # Specify which column indices are discrete
        ),
        # Specify inference as discrete
        InferenceHistogramCollector(data=train_predicted,
                                    is_discrete=True),
    ]
    encoder = MetricEncoder(collectors=collectors)
    # NOTE(review): the file is opened in binary mode, so `as_text()` is
    # presumably returning bytes - confirm against the boxkite API.
    with open(BaselineMetricCollector.DEFAULT_HISTOGRAM_PATH, "wb") as f:
        f.write(encoder.as_text())
    print('Saved feature and inference distribution.')

    # Train Shap model and calculate xafai metrics
    # `pipe[1]` is the fitted logistic regression step of the pipeline.
    analyzer = (
        ModelAnalyzer(pipe[1],
                      model_name='logistic',
                      model_type=ModelTypes.LINEAR)
        .train_features(x_train)
        .test_features(x_test)
        .fairness_config(CONFIG_FAI)
        .test_labels(y_test)
        .test_inference(test_pred)
    )
    analyzer.analyze()
    print('Saved Shap model and fairness results.')
# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    # NOTE(review): presumably `init` sets up the Bedrock client and
    # `start_run` scopes the logged metrics to one run - confirm against
    # the bdrk documentation.
    bdrk.init()
    with bdrk.start_run():
        main()