-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbackward_search_analysis.py
138 lines (107 loc) · 6.33 KB
/
backward_search_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import numpy as np
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from datetime import datetime
# --- Decision Tree Classification and Random Forest Classification using backward search begins ---

# Column index of the "machine failed" label in both CSV files (the last column).
LABEL_COLUMN = 6


def count_errors(classifier, train_features, train_labels, test_features, test_labels):
    """Fit ``classifier`` on the training split and count test misclassifications.

    Args:
        classifier: Object exposing ``fit(X, y)`` and ``predict(X)``.
        train_features: 2-D array used for fitting.
        train_labels: 1-D array of training labels.
        test_features: 2-D array used for prediction.
        test_labels: 1-D array of expected test labels.

    Returns:
        Number of test rows whose predicted label differs from the expected one.
    """
    classifier.fit(train_features, train_labels)
    prediction = classifier.predict(test_features)
    return int(np.count_nonzero(prediction != test_labels))


def backward_search(classifier, train_features, train_labels, test_features, test_labels):
    """Run greedy backward feature elimination and record the error at each size.

    The search starts from all columns. At every step each remaining column is
    tentatively dropped, the classifier is refitted, and the column whose
    removal yields the fewest test errors is dropped for good. Ties are broken
    in favour of the lowest column index. The search stops when a single
    column is left, so the classifier is never fitted on zero features.

    Args:
        classifier: Object exposing ``fit(X, y)`` and ``predict(X)``; it is
            refitted many times.
        train_features: 2-D training array (rows x features); not modified.
        train_labels: 1-D array of training labels.
        test_features: 2-D test array with the same columns; not modified.
        test_labels: 1-D array of expected test labels.

    Returns:
        A tuple ``(errors, removed)``. ``errors[k]`` is the error count with
        ``n_features - k`` retained columns. ``removed[k]`` is the index of the
        column dropped to go from ``errors[k]`` to ``errors[k + 1]``, expressed
        relative to the columns still retained at that moment.

    Raises:
        ValueError: If the feature arrays have no columns.
    """
    if train_features.shape[1] == 0:
        raise ValueError("backward search needs at least one feature column")

    # Baseline: every dimension retained, nothing removed yet.
    errors = [count_errors(classifier, train_features, train_labels, test_features, test_labels)]
    removed = []

    while train_features.shape[1] > 1:
        # candidate_errors[c] is the error obtained when column c is dropped, so
        # the position of the minimum IS the column index to remove.
        candidate_errors = [
            count_errors(
                classifier,
                np.delete(train_features, column, axis=1),
                train_labels,
                np.delete(test_features, column, axis=1),
                test_labels,
            )
            for column in range(train_features.shape[1])
        ]
        best_column = candidate_errors.index(min(candidate_errors))
        errors.append(candidate_errors[best_column])
        removed.append(best_column)
        # np.delete returns a copy, so the caller's arrays stay untouched.
        train_features = np.delete(train_features, best_column, axis=1)
        test_features = np.delete(test_features, best_column, axis=1)

    return errors, removed


def format_elapsed(elapsed):
    """Return a ``timedelta`` as seconds with six zero-padded decimal places."""
    # total_seconds() keeps the fractional part correctly padded; joining
    # ``.seconds`` and ``.microseconds`` by hand would turn 5123 us into ".5123".
    return f"{elapsed.total_seconds():.6f}"


def print_search_report(errors, removed):
    """Print the error count for every retained-dimension count of one search."""
    total_dimensions = len(errors)
    print(f"{total_dimensions} retained dimensions: {errors[0]}")
    for step, (column, error) in enumerate(zip(removed, errors[1:]), start=1):
        print(f"Index of removed dimension from the retained dimensions: {column}")
        print(f"{total_dimensions - step} retained dimensions: {error}")


def main():
    """Load the data, run both backward searches, and plot the error curves."""
    # Record start time
    start_time = datetime.now()

    # Load training data, extract the machine failed labels (last column),
    # and delete that column from the feature matrix.
    # Forward slashes are accepted on Windows as well as on POSIX systems.
    train_data = np.loadtxt("dataset/ai4i2020_train_data.csv", delimiter=",")
    train_data = shuffle(train_data, random_state=42)  # Randomize the data
    train_data_labels = train_data[:, LABEL_COLUMN]
    train_data = np.delete(train_data, LABEL_COLUMN, axis=1)

    # Load testing data and split off the labels the same way.
    test_data = np.loadtxt("dataset/ai4i2020_test_data.csv", delimiter=",")
    test_data_labels = test_data[:, LABEL_COLUMN]
    test_data = np.delete(test_data, LABEL_COLUMN, axis=1)

    # Number of features left once the label column is gone.
    attributes = np.shape(train_data)[1]

    # Create instances of classifiers
    clf_dtc = DecisionTreeClassifier()
    clf_rfc = RandomForestClassifier()

    # Running backward search for Decision Tree Classification
    print("\nDecision Tree Classification errors after backward search dimensionality reduction:\n")
    dtc_classification_error, dtc_removed = backward_search(
        clf_dtc, train_data, train_data_labels, test_data, test_data_labels
    )
    print_search_report(dtc_classification_error, dtc_removed)

    # Running backward search for Random Forest Classification
    print("\nRandom Forest Classification errors after backward search dimensionality reduction:\n")
    rfc_classification_error, rfc_removed = backward_search(
        clf_rfc, train_data, train_data_labels, test_data, test_data_labels
    )
    print_search_report(rfc_classification_error, rfc_removed)

    # Plot the errors; tick k is labelled with the number of retained
    # dimensions, counting down from the full feature count.
    x_axis = list(range(attributes))
    backward_search_error_plot = plt.figure(1)
    plt.plot(x_axis, dtc_classification_error)
    plt.plot(x_axis, rfc_classification_error)
    plt.xlabel(f"Retained dimensions out of {attributes} total dimensions in the dataset")
    plt.ylabel("Number of errors")
    plt.legend(["Decision Tree Classification Errors", "Random Forest Classification Errors"])
    plt.xticks(ticks=x_axis, labels=list(range(attributes, 0, -1)))
    plt.grid()
    plt.title("Plot of Classification errors after dimensionality reduction using Backward Search")
    backward_search_error_plot.show()

    # Record end time and show total run time
    print(f"\nProgram completed in {format_elapsed(datetime.now() - start_time)} seconds.")
    plt.show()


if __name__ == "__main__":
    main()
# --- Decision Tree Classification and Random Forest Classification using backward search ends ---