## The "all_data.csv" file is required for the operation of this program.
## It must be located in the same directory as the program.
## The purpose of this program is to apply machine learning algorithms to the dataset and observe the performance of the algorithms.
## The algorithms used are: Naive Bayes, QDA, Random Forest, ID3, AdaBoost, MLP, Nearest Neighbors.
## The program's output includes: file name, machine learning algorithm name, accuracy, precision, recall, F1-score, and time.
## The program creates a CSV file containing the results and a folder containing graphics.
## Some of the code used for calculation and graphing is taken from the following site:
## http://scikit-learn.org
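## Example usage (assuming Python 3 with pandas, numpy, scikit-learn, and matplotlib installed):
##   python 05_2_machine_learning_implementation_with_18_feature.py
## Per-run metrics are written to ./results/results_2.csv and box plots to ./results/result_graph_2/.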
from sklearn import metrics
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
import matplotlib.pyplot as plt
import numpy as np
#%matplotlib inline
import os
import pandas as pd
import csv
import time
import warnings
import math
warnings.filterwarnings("ignore")
result = "./results/results_2.csv"  # path of the CSV file in which the results are saved
csv_files = ["all_data.csv"]  # names of the dataset (CSV) files
path = ""  # directory containing the dataset files
repetition = 10  # number of times each algorithm is run
def folder(f_name):  # this function creates the "results" and "results/result_graph_2" folders in the program directory
    try:
        if not os.path.exists(f_name):
            os.makedirs(f_name)
    except OSError:
        print("The folder could not be created!")

folder_name = "./results/"
folder(folder_name)
folder_name = "./results/result_graph_2/"
folder(folder_name)
# The machine learning algorithms to be used are defined in a dictionary (ml_list).
ml_list = {
    "Naive Bayes": GaussianNB(),
    "QDA": QDA(),
    "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "ID3": DecisionTreeClassifier(max_depth=5, criterion="entropy"),
    "AdaBoost": AdaBoostClassifier(),
    "MLP": MLPClassifier(hidden_layer_sizes=(13, 13, 13), max_iter=500),
    "Nearest Neighbors": KNeighborsClassifier(3)}
# The features to be used are defined in a dictionary (features).
# The top 4 features produced by "04_1_feature_selection_for_attack_files.py" for each attack file are used here.
### The feature set is built by combining, under a single roof, the 4 features with the highest importance weight obtained for each attack in approach 1.
### Thus, 4 features are taken from each of the 12 attack types, giving a pool of 48 attributes.
### After duplicates are removed, 18 features remain. These features are listed below:
features = {"all_data": ["Bwd Packet Length Max", "Bwd Packet Length Mean", "Bwd Packet Length Std", "Flow Bytes/s",
    "Flow Duration", "Flow IAT Max", "Flow IAT Mean", "Flow IAT Min", "Flow IAT Std", "Fwd IAT Total", "Fwd Packet Length Max",
    "Fwd Packet Length Mean", "Fwd Packet Length Min", "Fwd Packet Length Std", "Total Backward Packets", "Total Fwd Packets",
    "Total Length of Bwd Packets", "Total Length of Fwd Packets", "Label"]}
seconds = time.time()  # time stamp for the total processing time

with open(result, "w", newline="", encoding="utf-8") as f:  # a CSV file is created to save the results obtained
    wrt = csv.writer(f)
    wrt.writerow(["File", "ML algorithm", "accuracy", "Precision", "Recall", "F1-score", "Time"])
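
# Optional sanity check (added for illustration; not part of the original script): before the main
# loop starts, confirm that the feature list holds the 18 selected features plus the "Label" column.
assert len(features["all_data"]) == 19, "expected 18 features + the Label column"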
for j in csv_files:  # this loop runs over the list of dataset file names; the operations are repeated for every file
    print('%-17s %-17s %-15s %-15s %-15s %-15s %-15s' % ("File", "ML algorithm", "accuracy", "Precision", "Recall", "F1-score", "Time"))  # print the output header
    feature_list = list(features[j[0:-4]])
    df = pd.read_csv(path + j, usecols=feature_list)  # read the dataset file
    df = df.fillna(0)
    attack_or_not = []
    for i in df["Label"]:  # the "BENIGN" label is changed to "1" and attack labels to "0" for use in the machine learning algorithms
        if i == "BENIGN":
            attack_or_not.append(1)
        else:
            attack_or_not.append(0)
    df["Label"] = attack_or_not

    y = df["Label"]  # this section separates the labels and the data into two pieces: Label = y, Data = X
    del df["Label"]
    feature_list.remove('Label')
    X = df[feature_list]
    for ii in ml_list:  # this loop runs over the machine learning algorithm names; the operations are repeated for all 7 algorithms
        precision = []
        recall = []
        f1 = []
        accuracy = []
        t_time = []
        for i in range(repetition):  # the train/test split and the machine learning algorithm are repeated 10 times
            second = time.time()  # time stamp for the processing time of this run
            # train/test split: the data (X) and labels (y) are divided into two parts (80% train, 20% test),
            # giving four pieces: training data (X_train), training labels (y_train), test data (X_test) and test labels (y_test).
            # Note that random_state is fixed, so the same split is used in every repetition; only the algorithms' internal randomness varies between runs.
            X_train, X_test, y_train, y_test = train_test_split(X, y,
                test_size=0.20, random_state=repetition)

            # the machine learning algorithm is applied in this section
            clf = ml_list[ii]  # choose the algorithm from the ml_list dictionary
            clf.fit(X_train, y_train)
            predict = clf.predict(X_test)

            # compute the precision, recall, and F-measure values
            f_1 = f1_score(y_test, predict, average='macro')
            pr = precision_score(y_test, predict, average='macro')
            rc = recall_score(y_test, predict, average='macro')

            precision.append(float(pr))
            recall.append(float(rc))
            f1.append(float(f_1))
            accuracy.append(clf.score(X_test, y_test))
            t_time.append(float(time.time() - second))
        print('%-17s %-17s %-15s %-15s %-15s %-15s %-15s' % (j[0:-4], ii, str(round(np.mean(accuracy), 2)), str(round(np.mean(precision), 2)),
            str(round(np.mean(recall), 2)), str(round(np.mean(f1), 2)), str(round(np.mean(t_time), 4))))  # the mean over the ten repetitions is printed on the screen

        with open(result, "a", newline="", encoding="utf-8") as f:  # all the values found are saved in the opened file
            wrt = csv.writer(f)
            for i in range(0, len(t_time)):
                wrt.writerow([j[0:-4], ii, accuracy[i], precision[i], recall[i], f1[i], t_time[i]])  # file name, algorithm name, accuracy, precision, recall, F-measure and time are written to the CSV file
        # In this section, box plots of the F-measure results are created for each machine learning algorithm and saved in the result_graph_2 folder.
        plt.boxplot(f1)
        plt.title("All Dataset - " + str(ii))
        plt.ylabel('F-measure')
        plt.savefig(folder_name + j[0:-4] + str(ii) + ".pdf", bbox_inches='tight', format='pdf')  # the "papertype" and "orientation" arguments are omitted here, since recent Matplotlib versions no longer accept them
        # plt.show()  # remove the leading "#" if you want to see the graphics as they are created
        plt.clf()  # clear the current figure so the next algorithm's box plot starts from an empty canvas
print("mission accomplished!")
print("Total operation time: = ",time.time()- seconds ,"seconds")