-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.py
287 lines (240 loc) · 10 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
import argparse
import pandas as pd
from argparse import RawTextHelpFormatter
from enum import Enum
from classes.abstract_model import AbstractModel
from classes.baseline import Baseline
from classes.bert import Bert
from classes.ensemble import Ensemble
from classes.gru import Gru
from classes.preprocessing import Preprocessing
from constants import *
from time import strftime
class Models(Enum):
    """
    Enumeration of the methods a user is allowed to execute.

    The value of every member is the short string identifying the method.
    """
    bert = 'bert'
    gru = 'gru'
    ensemble = 'ensemble'
    mlp = 'mlp'
    knn = 'knn'
    nbc = 'nbc'
    rf = 'rf'
    lr = 'lr'
    svm = 'svm'

    def __str__(self):
        """
        Uses the member's value as its string representation

        :return: value of the member
        :rtype: str
        """
        return self.value

    def get_model_name(self):
        """
        Translates the member into what is needed to run the method:
        a class for bert and gru, a descriptive string for the other
        single models, and None for the ensemble

        :return: class, string or None matching the member
        :rtype: object
        """
        # The two deep models are represented by their class
        if self is Models.bert:
            return Bert
        if self is Models.gru:
            return Gru

        # The remaining single models are represented by a label
        labels = {
            Models.mlp: 'Neural-Network',
            Models.knn: 'KNN',
            Models.nbc: 'Naive-Bayes',
            Models.rf: 'Random-Forest',
            Models.lr: 'Logistic-Regression',
            Models.svm: 'SVM',
        }
        # Models.ensemble has no label, so the lookup yields None for it
        return labels.get(self)
def run_preprocessing(csr: AbstractModel, train_preprocessed_path,
                      test_preprocessed_path, full_data=True):
    """
    Applies the preprocessing steps requested by the chosen classifier
    to the train and test data, then writes both results as CSV files

    :param csr: chosen classifier (child of AbstractModel)
    :type csr: AbstractModel
    :param train_preprocessed_path: path where the preprocessed train data is written
    :type train_preprocessed_path: str
    :param test_preprocessed_path: path where the preprocessed test data is written
    :type test_preprocessed_path: str
    :param full_data: if False, the small dataset (200K rows) is used
    :type full_data: bool, optional
    """
    # Choose between the full and the small training files
    dataset_files = (
        [TRAIN_DATA_NEGATIVE_FULL, TRAIN_DATA_POSITIVE_FULL]
        if full_data
        else [TRAIN_DATA_NEGATIVE, TRAIN_DATA_POSITIVE]
    )

    train_preprocessing = Preprocessing(dataset_files, submission=False)
    test_preprocessing = Preprocessing([TEST_DATA], submission=True)

    # Train data first, then test data: each step is looked up by name
    # on the Preprocessing object and invoked without arguments
    for preprocessing, istest in ((train_preprocessing, False),
                                  (test_preprocessing, True)):
        for step_name in csr.get_preprocessing_methods(istest=istest):
            step = getattr(preprocessing, step_name)
            step()

    # sample(frac=1) keeps every row but in a random order
    shuffled_train = train_preprocessing.get().sample(frac=1)
    shuffled_train.to_csv(train_preprocessed_path, index=False)

    test_df = test_preprocessing.get()
    test_df.to_csv(test_preprocessed_path, index=False)
def execute(args, weights_path, train_preprocessed_path, test_preprocessed_path,
            submission_path, full_data=True, **kwargs):
    """
    Runs the method selected by the user and produces a submission file.
    Preprocessed data and/or weights are loaded when requested through
    the arguments; otherwise the data is preprocessed, the model is
    fitted and the predictions are made from scratch

    :param args: arguments chosen by the user
    :type args: argparse.Namespace
    :param weights_path: path to load/store the weights
    :type weights_path: str
    :param train_preprocessed_path: path to load/store the train preprocessed data
    :type train_preprocessed_path: str
    :param test_preprocessed_path: path to load/store the test preprocessed data
    :type test_preprocessed_path: str
    :param submission_path: prefix of the path where the submission file is saved
    :type submission_path: str
    :param full_data: if False, the small dataset (200K rows) is used
    :type full_data: bool, optional
    :param kwargs: additional arguments for classical methods (otherwise empty)
    :type kwargs: dict
    """
    # Any extra keyword argument means a classical method was requested
    is_classical = bool(kwargs)
    if is_classical:
        classifier = Baseline(weights_path)
    else:
        model_class = args.model.get_model_name()
        classifier = model_class(weights_path)

    # Preprocess from scratch unless the user asked to load preprocessed data
    if not args.lp:
        run_preprocessing(classifier,
                          train_preprocessed_path,
                          test_preprocessed_path,
                          full_data)

    # Columns to read; classical methods need the additional 'raw' column
    extra_columns = ['raw'] if is_classical else []
    usecols_train = ['text', 'label'] + extra_columns
    usecols_test = ['ids', 'text'] + extra_columns

    train_preprocessed = pd.read_csv(train_preprocessed_path,
                                     usecols=usecols_train)
    test_preprocessed = pd.read_csv(test_preprocessed_path,
                                    usecols=usecols_test)

    # Only the training data is cleaned from rows with null values
    train_preprocessed.dropna(inplace=True)

    if is_classical:
        # Classical methods build their features from the DataFrames
        X, Y = classifier.feature_extraction(train_preprocessed)
        X_test, test_ids = classifier.feature_extraction(test_preprocessed,
                                                         istest=True)
    else:
        # The other methods work directly on the column values
        X = train_preprocessed['text'].values
        Y = train_preprocessed['label'].values
        X_test = test_preprocessed['text'].values
        test_ids = test_preprocessed['ids'].values

    # The GRU classifier updates its vocabulary from the training data
    if args.model == Models.gru:
        classifier.update_vocabulary(X)

    # Timestamped name of the submission file, built right before its use
    submission_file = \
        f'{submission_path}submission-{strftime("%Y-%m-%d_%H:%M:%S")}.csv'

    if args.lt:
        # Weights are loaded: only make the predictions
        classifier.predict(test_ids, X_test, submission_file, **kwargs)
    else:
        # No weights to load: fit first, then make the predictions
        classifier.fit_predict(X, Y, test_ids, X_test, submission_file,
                               **kwargs)
if __name__ == '__main__':
    # For the classical ML methods, when the '-lt' (load trained) option
    # is not specified, a cross validation is automatically performed to
    # find the best hyperparameters for the model

    # Texts shown by the command line help
    script_description = (
        'This script performs a classification task to predict if '
        'a tweet message used to contain a positive :) or negative '
        ':( smiley,by considering only the remaining text.'
    )
    model_help = (
        'Specify the model you want to run.\nNote: for classical ML models '
        '(every model excluded Bert and GRU), if -lt is not specified, '
        'before the training phase we perform the hyperparameters tuning\n'
        ' bert: performs the classification with a Bert model (we suggest '
        'you to train this model on a cloud platform)\n gru: performs the '
        'classification with a GRU bidirectional model\n ensemble: '
        'performs the classification with an Ensemble out of all other '
        'models based on a weighted voting scheme proportional with cross '
        'validation score\n mlp: performs the classification with a '
        'multi-layer perceptron neural network \n knn: performs the '
        'classification with a K-nearest neighbors classifier\n nbc: '
        'performs the classification with a Naive Bayes classifier\n '
        'rf: performs the classification with a Random Forest classifier\n'
        ' lr: performs the classification with Logistic Regression \n'
        ' svm: performs the classification with linear SVM classifier'
    )

    parser = argparse.ArgumentParser(
        description=script_description,
        formatter_class=RawTextHelpFormatter)

    # Positional (required) argument: the model to run
    parser.add_argument('model',
                        type=Models,
                        choices=list(Models),
                        help=model_help)

    # Optional flags
    parser.add_argument('-lp',
                        action='store_true',
                        help='Load already preprocessed data for a specified model')
    parser.add_argument('-lt',
                        action='store_true',
                        help='Load an already trained model')

    args = parser.parse_args()

    if args.model == Models.ensemble:
        # Names of the models used for ensembling
        model_names = [
            'Gru',
            'Bert_no_prep',
            'Bert_with_prep',
            'Bert_large',
            'KNN',
            'Logistic_Regression',
            'Naive_Bayes',
            'Random_Forest',
            'Multilayer_Perceptron',
            'SVM',
        ]

        # Submission file of each of those models mapped to its
        # validation accuracy
        model_accuracies = {
            f'{SUBMISSION_PATH_GRU}submission-2020-12-10_12:06:04.csv': 0.853,
            f'{SUBMISSION_PATH_BERT}submission-2020-12-06_16:48:30.csv': 0.894,
            f'{SUBMISSION_PATH_BERT}submission-2020-12-03_20:24:31.csv': 0.888,
            f'{SUBMISSION_PATH_BERT}submission-2020-12-13_11:13:01.csv': 0.897,
            f'{SUBMISSION_PATH_CLASSICAL}submission-KNN-2020-12-08_23:37:01.csv': 0.674,
            f'{SUBMISSION_PATH_CLASSICAL}submission-Logistic Regression-2020-12-09_07:56:20.csv': 0.765,
            f'{SUBMISSION_PATH_CLASSICAL}submission-Naive Bayes-2020-12-08_20:28:39.csv': 0.642,
            f'{SUBMISSION_PATH_CLASSICAL}submission-Random Forest-2020-12-09_09:30:11.csv': 0.766,
            f'{SUBMISSION_PATH_CLASSICAL}submission-Neural Network-2020-12-09_04:42:17.csv': 0.776,
            f'{SUBMISSION_PATH_CLASSICAL}submission-SVM-2020-12-08_20:03:39.csv': 0.765,
        }

        # Build the ensemble first; the output name is stamped afterwards,
        # right before predicting
        voter = Ensemble(model_accuracies, model_names)
        ensemble_output = \
            f'{SUBMISSION_PATH_ENSEMBLE}submission-{strftime("%Y-%m-%d_%H:%M:%S")}.csv'
        voter.predict(ensemble_output)

    elif args.model == Models.bert:
        execute(
            args,
            weights_path=BERT_WEIGHTS_PATH,
            train_preprocessed_path=f'{PREPROCESSED_DATA_PATH_BERT}{PREPROCESSED_TRAIN_DATA_BERT}',
            test_preprocessed_path=f'{PREPROCESSED_DATA_PATH_BERT}{PREPROCESSED_TEST_DATA_BERT}',
            submission_path=SUBMISSION_PATH_BERT)

    elif args.model == Models.gru:
        execute(
            args,
            weights_path=GRU_WEIGHTS_PATH,
            train_preprocessed_path=f'{PREPROCESSED_DATA_PATH_GRU}{PREPROCESSED_TRAIN_DATA_GRU}',
            test_preprocessed_path=f'{PREPROCESSED_DATA_PATH_GRU}{PREPROCESSED_TEST_DATA_GRU}',
            submission_path=SUBMISSION_PATH_GRU)

    else:
        # Every remaining choice is a classical method: it runs on the
        # small dataset and receives the model name as extra argument
        execute(
            args,
            weights_path=CLASSICAL_WEIGHTS_PATH,
            train_preprocessed_path=f'{PREPROCESSED_DATA_PATH_CLASSICAL}{PREPROCESSED_TRAIN_DATA_CLASSICAL}',
            test_preprocessed_path=f'{PREPROCESSED_DATA_PATH_CLASSICAL}{PREPROCESSED_TEST_DATA_CLASSICAL}',
            submission_path=SUBMISSION_PATH_CLASSICAL,
            full_data=False,
            model_name=args.model.get_model_name())