-
Notifications
You must be signed in to change notification settings - Fork 0
/
trainingModel.py
118 lines (78 loc) · 4.76 KB
/
trainingModel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
This is the Entry point for Training the Machine Learning Model.
"""
# Doing the necessary imports
from sklearn.model_selection import train_test_split
from data_ingestion import data_loader
from data_preprocessing import preprocessing
from data_preprocessing import clustering
from best_model_finder import tuner
from file_operations import file_methods
from application_logging import logger
#Creating the common Logging object
class trainModel:
    """Entry point for training the machine learning model.

    Orchestrates the full pipeline: data ingestion, preprocessing,
    KMeans clustering, per-cluster best-model selection, and model
    persistence. Progress and failures are written to the training log.
    """

    def __init__(self):
        # Common logging object plus a dedicated append-mode log file
        # for this training run.
        self.log_writer = logger.App_Logger()
        self.file_object = open("Training_Logs/ModelTrainingLog.txt", 'a+')

    def trainingModel(self):
        """Run the end-to-end training pipeline.

        Steps: load data, drop EDA-identified columns, replace invalid
        values, encode categoricals, split features/label, impute missing
        values, rebalance classes, cluster the data, then find and save
        the best model per cluster.

        Raises:
            Exception: any failure is logged and then re-raised unchanged
                (bare ``raise``), so the caller sees the original error and
                traceback. (The previous code raised a new bare
                ``Exception``, discarding the real cause.)
        """
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object, self.log_writer)
            data = data_getter.get_data()

            # --- Data preprocessing ---
            preprocessor = preprocessing.Preprocessor(self.file_object, self.log_writer)

            # Removing unwanted columns as discussed in the EDA part in the ipynb file.
            data = preprocessor.dropUnnecessaryColumns(data, ['TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured', 'FTI_measured', 'TBG_measured', 'TBG', 'TSH'])

            # Replacing '?' values with np.nan as discussed in the EDA part.
            data = preprocessor.replaceInvalidValuesWithNull(data)

            # Get encoded values for categorical data.
            data = preprocessor.encodeCategoricalValues(data)

            # Create separate features and labels.
            X, Y = preprocessor.separate_label_feature(data, label_column_name='Class')

            # If missing values are present in the dataset, impute them.
            is_null_present = preprocessor.is_null_present(X)
            if is_null_present:
                X = preprocessor.impute_missing_values(X)  # missing value imputation

            # Rebalance the classes before clustering/training.
            X, Y = preprocessor.handleImbalanceDataset(X, Y)

            # --- Clustering approach ---
            kmeans = clustering.KMeansClustering(self.file_object, self.log_writer)  # object initialization
            # Using the elbow plot to find the optimum number of clusters.
            number_of_clusters = kmeans.elbow_plot(X)

            # Divide the data into clusters; adds a 'Cluster' column to X.
            X = kmeans.create_clusters(X, number_of_clusters)

            # Create a new column holding the corresponding label for each row.
            X['Labels'] = Y

            # Getting the unique clusters from our dataset.
            list_of_clusters = X['Cluster'].unique()

            # Parse all the clusters, looking for the best ML algorithm to
            # fit on each individual cluster.
            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] == i]  # filter the data for one cluster

                # Prepare the feature and label columns.
                cluster_features = cluster_data.drop(['Labels', 'Cluster'], axis=1)
                cluster_label = cluster_data['Labels']

                # Splitting the data into training and test sets for each cluster.
                x_train, x_test, y_train, y_test = train_test_split(cluster_features, cluster_label, test_size=1 / 3, random_state=355)

                model_finder = tuner.Model_Finder(self.file_object, self.log_writer)  # object initialization
                # Getting the best model for this cluster.
                best_model_name, best_model = model_finder.get_best_model(x_train, y_train, x_test, y_test)

                # Saving the best model to the models directory, suffixed
                # with the cluster number so each cluster keeps its own model.
                file_op = file_methods.File_Operation(self.file_object, self.log_writer)
                save_model = file_op.save_model(best_model, best_model_name + str(i))

            # Logging the successful Training.
            self.log_writer.log(self.file_object, 'Successful End of Training')
        except Exception:
            # Logging the unsuccessful Training, then re-raise the ORIGINAL
            # exception so the caller gets the real error and traceback.
            self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
            raise
        finally:
            # Single close point for both success and failure paths
            # (previously duplicated in each branch, which could leak the
            # handle if the failure-path logging itself raised).
            self.file_object.close()