FirstModelRunAutomate.py

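# Automates a first round of MAST-ML model runs: a random forest, a bagged
# k-nearest-neighbors ensemble, and a bagged Keras neural network are each
# evaluated on three materials datasets (diffusion, perovskite, superconductivity)
# with both repeated k-fold cross-validation and a full-fit (NoSplit) run.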
import mastml
from mastml.mastml import Mastml
from mastml.datasets import LocalDatasets
from mastml.preprocessing import SklearnPreprocessor
from mastml.models import SklearnModel, EnsembleModel
from mastml.data_splitters import SklearnDataSplitter, NoSplit
import os
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
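
# Baseline models: a random forest and a bagged ensemble of k-nearest-neighbors regressors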
model_rf = SklearnModel(model='RandomForestRegressor')
model_neighbor = EnsembleModel(model="KNeighborsRegressor", n_estimators=20, metric="euclidean", n_neighbors=5, weights="distance")

# Feed-forward Keras network wrapped for scikit-learn. Note that it reads the
# global feature frame X, which is assigned inside the dataset loop below
# before the model is ever built (build_fn is only called at fit time).
def keras_model():
    model = Sequential()
    model.add(Dense(2048, input_dim=len(X.keys()), kernel_initializer='normal', activation='relu'))
    model.add(Dense(2048, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

keras_regressor = KerasRegressor(build_fn=keras_model, epochs=100, batch_size=100, verbose=0)
model_keras = EnsembleModel(model=keras_regressor, n_estimators=20)
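
# Input datasets and the folder that collects all run output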
Diffusion = "datasets/diffusion_data_selectfeatures.xlsx"
Perovskite = "datasets/Perovskite_70_Selected_Features.xlsx"
Supercond = "datasets/Supercon_data_features_selected.xlsx"
save_folder = "/home/mse10/vidit-work/FirstModelRuns"
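
# Human-readable names used to build each run's save-folder name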
model_dict = {model_rf: 'RandomForestRegressor', model_neighbor: 'NearestNeighbor', model_keras: 'KerasNetwork'}
dataset_dict = {Diffusion: 'Diffusion', Perovskite: 'Perovskite', Supercond: 'Superconductivity'}

for datapath in [Diffusion, Perovskite, Supercond]:
    # Each dataset has its own target column and non-feature columns to exclude
    if datapath == Diffusion:
        target = 'E_regression'
        extra_columns = ['Material compositions 1', 'Material compositions 2']
    elif datapath == Perovskite:
        target = 'EnergyAboveHull'
        extra_columns = ['Unnamed: 0']
    elif datapath == Supercond:
        target = 'Tc'
        extra_columns = ['name', 'group', 'ln(Tc)']

    d = LocalDatasets(file_path=datapath,
                      target=target,
                      extra_columns=extra_columns,
                      testdata_columns=None,
                      as_frame=True)

    # Load the data with the load_data() method
    data_dict = d.load_data()

    # Let's assign each data object to its respective name
    X = data_dict['X']
    y = data_dict['y']
    X_extra = data_dict['X_extra']
    X_testdata = data_dict['X_testdata']

    preprocessor = SklearnPreprocessor(preprocessor='MinMaxScaler', as_frame=True)
    metrics = ['r2_score', 'mean_absolute_error', 'root_mean_squared_error', 'rmse_over_stdev']
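
    # Evaluate each model with both repeated k-fold cross-validation and a full-fit (NoSplit) run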
    for modelType in [model_rf, model_neighbor, model_keras]:
        for splitter_type in ['CV', 'NoSplit']:
            # One output folder per dataset/model/splitter combination
            SAVEPATH = save_folder + "/data_{}_model_{}_{}".format(dataset_dict[datapath], model_dict[modelType], splitter_type)
            mastml_run = Mastml(savepath=SAVEPATH)
            savepath = mastml_run.get_savepath
            if splitter_type == 'CV':
                splitter = SklearnDataSplitter(splitter='RepeatedKFold', n_repeats=1, n_splits=5)
            elif splitter_type == 'NoSplit':
                splitter = NoSplit()
            # Running this model according to Palmer specifications
            splitter.evaluate(X=X, y=y, models=[modelType], mastml=mastml_run, preprocessor=preprocessor, metrics=metrics,
                              plots=['Error', 'Scatter', 'Histogram'],
                              parallel_run=True,
                              savepath=savepath,
                              X_extra=X_extra,
                              Nested_CV=True,
                              error_method='stdev_weak_learners',
                              recalibrate_errors=True,
                              remove_outlier_learners=True,
                              verbosity=3)