-
Notifications
You must be signed in to change notification settings - Fork 0
/
decision_tree.py
138 lines (92 loc) · 3.6 KB
/
decision_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 14 10:23:35 2022
@author: andreas
"""
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
# Read the phishing dataset into a DataFrame.
phishing = pd.read_csv("phishingDataset.csv")

# Features (X) are every column except the row id and the label;
# the label (y) is the "Result" column.
X = phishing.drop(columns=["id", "Result"])
y = phishing["Result"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ENTROPY CRITERION
# Build a decision tree classifier that splits on entropy and train it
# (fit() returns the estimator itself, so the calls can be chained).
decisionTree = DecisionTreeClassifier(criterion="entropy").fit(X_train, y_train)

# Predict labels for the held-out test rows.
y_pred = decisionTree.predict(X_test)
# function that uses metrics to check accuracy and plot confusion matrix
def evaluateAccuracy(predictions, y_true=None):
    """Print the accuracy of ``predictions`` and show their confusion matrix.

    Parameters
    ----------
    predictions : array-like
        Predicted class labels, in the same order as the true labels.
    y_true : array-like, optional
        True class labels to score against. When omitted, the module-level
        ``y_test`` that is current at call time is used, which keeps the
        original one-argument call working unchanged.

    Returns
    -------
    None
        The accuracy is printed and the confusion matrix is shown in a
        matplotlib window.
    """
    if y_true is None:
        # Resolved at call time, so a later re-split that rebinds y_test
        # is picked up automatically.
        y_true = y_test

    # fraction of test labels that were predicted correctly
    accuracy = accuracy_score(y_true, predictions)
    # printed with three decimal places
    print('Accuracy: %.3f' % accuracy)

    # produce and display the confusion matrix
    cm = confusion_matrix(y_true, predictions)
    # NOTE(review): the matrix rows/columns follow the sorted label values,
    # so these names assume the phishing class has the smaller label
    # (e.g. -1) -- confirm against the dataset's "Result" encoding.
    display = ConfusionMatrixDisplay(
        confusion_matrix=cm,
        display_labels=["Phishing", "Non-Phishing"],
    )
    display.plot()
    plt.show()
# Report the results of the predictions currently held in y_pred.
evaluateAccuracy(y_pred)

# GINI CRITERION, BEST SPLITTER
# No splitter is passed, so the classifier's default splitter is used.
# Build and train the tree in one step (fit() returns the estimator).
decisionTree2 = DecisionTreeClassifier(criterion="gini").fit(X_train, y_train)

# Predict on the held-out test rows and report the results.
y_pred = decisionTree2.predict(X_test)
evaluateAccuracy(y_pred)
# GINI CRITERION, RANDOM SPLITTER
# Build and train the tree in one step (fit() returns the estimator).
decisionTree3 = DecisionTreeClassifier(
    criterion="gini",
    splitter="random",
).fit(X_train, y_train)

# Predict on the held-out test rows and report the results.
y_pred = decisionTree3.predict(X_test)
evaluateAccuracy(y_pred)
# ENTROPY CRITERION, RANDOM SPLITTER
# Build and train the tree in one step (fit() returns the estimator).
decisionTree4 = DecisionTreeClassifier(
    criterion="entropy",
    splitter="random",
).fit(X_train, y_train)

# Predict on the held-out test rows and report the results.
y_pred = decisionTree4.predict(X_test)
evaluateAccuracy(y_pred)
# ENTROPY CRITERION ON STANDARDISED FEATURES
decisionTree5 = DecisionTreeClassifier(criterion="entropy")

# Learn the scaling parameters from the training features only, then apply
# the same transformation to both the training and the testing features.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# Train on the scaled training features.
decisionTree5.fit(X_train_std, y_train)

# Predict on the scaled test features and report the results.
y_pred = decisionTree5.predict(X_test_std)
evaluateAccuracy(y_pred)
# WITH EDITED DATASET
# Copy of the dataset in which every 0 is replaced by -1. The replacement is
# applied to the whole frame, i.e. to every column (including "Result").
phishingEdited = phishing.replace([0], -1)

# Split into features (X) and targets (y); besides the id and the label,
# two feature columns are left out of this experiment.
X = phishingEdited.drop(
    ["id", "having_Sub_Domain", "double_slash_redirecting", "Result"],
    axis=1,
)
y = phishingEdited.Result

# Re-split into training and testing sets. This rebinds the module-level
# X_train / X_test / y_train / y_test, which the evaluation below relies on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fresh entropy-based tree for the edited dataset.
decisionTree6 = DecisionTreeClassifier(criterion='entropy')

# Train and predict with decisionTree6. The previous code refit and queried
# decisionTree4 (entropy + random splitter) here, leaving decisionTree6
# unused, so the reported numbers came from the wrong model.
decisionTree6.fit(X_train, y_train)
y_pred = decisionTree6.predict(X_test)
evaluateAccuracy(y_pred)