-
Notifications
You must be signed in to change notification settings - Fork 0
/
titanic.py
194 lines (161 loc) · 8.6 KB
/
titanic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import pandas as pd
import matplotlib
# Select the Tk backend before pyplot is imported; without it the graphs did not show.
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt
import math
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import sklearn.metrics as metrics
class Titanic():
    """Explore the Titanic passenger data and compare three survival classifiers.

    Creating an instance runs the whole pipeline: three CSV files are loaded
    and cleaned with :meth:`prep_data`, summary statistics are printed, two
    stacked bar charts are shown, a logistic-regression, a random-forest and a
    linear-kernel SVM model are trained, and each model is then scored against
    the labels in ``tested.csv``.

    Original author's conclusion (recorded from their run, not re-verified
    here): the SVM model with a linear kernel was the best model, since it
    predicted 99% correctly against the tested data.  The biggest factor in
    survival was Sex, followed by Class.

    Attributes set by ``__init__``:
        file: cleaned training/validation data (``traintest.csv``).
        raw_file: cleaned feature rows to predict on (``predict.csv``).
        tested: cleaned data whose ``Survived`` column is the ground truth
            for ``raw_file`` (``tested.csv``).
        models: tuple ``(logistic_regression, random_forest, svm)``.
        le: the ``LabelEncoder`` fitted by the most recent :meth:`encode` call.
    """

    # Value of the "Survived" column for each outcome.  Listing them in this
    # order pins the confusion-matrix layout: index 0 = dead, index 1 = survived.
    _OUTCOME_LABELS = [0, 1]

    def __init__(self, data_dir="./data_files/Titanic"):
        """Load the data, show the exploratory output, then train and score.

        Args:
            data_dir: Directory containing ``traintest.csv``, ``predict.csv``
                and ``tested.csv``.  The default is the location the script
                has always read from.
        """
        # Original author's note: there was no skewed data found in the data files.
        frames = [
            pd.read_csv(f"{data_dir}/{file_name}")
            for file_name in ("traintest.csv", "predict.csv", "tested.csv")
        ]
        self.file, self.raw_file, self.tested = (
            self.prep_data(frame) for frame in frames
        )

        self.display_info(self.file)
        self.display_barchart_sex()
        self.display_barchart_class()
        # Blocks until the chart windows are closed, then training starts.
        plt.show()

        print("\n---------------Training Models---------------\n")
        self.models = self.train_models()

        print("---------------Testing accuracy---------------\n")
        for model in self.models:
            self.test_accuracy(model)

    def encode(self, data, to_convert):
        """Replace a categorical column with integer label codes, in place.

        The column is dropped and re-added, so it ends up as the last column
        of ``data``.  The fitted encoder is stored on ``self.le`` and is
        overwritten by every call.

        Args:
            data: DataFrame to modify.
            to_convert: Name of the column to encode.

        Returns:
            The same DataFrame object that was passed in.
        """
        self.le = preprocessing.LabelEncoder()
        encoded_values = self.le.fit_transform(data[to_convert])
        data.drop(columns=[to_convert], inplace=True)
        data[to_convert] = encoded_values
        return data

    def display_info(self, data_file):
        """Print the head, the ``info()`` report and ``describe()`` of a DataFrame.

        Args:
            data_file: DataFrame to summarise.
        """
        print("---------------Head---------------\n")
        print(data_file.head())
        print("\n---------------Info---------------\n")
        # info() writes its report itself and returns None, so it must not be
        # wrapped in print() (that would emit a stray "None" line).
        data_file.info()
        print("\n---------------Describe---------------\n")
        print(data_file.describe())

    def test_accuracy(self, model):
        """Score a fitted model on ``raw_file`` against ``tested["Survived"]``.

        Prints the model, its accuracy and an explained confusion matrix.

        Args:
            model: A fitted estimator exposing ``predict``.
        """
        print(f'Model: {model}')
        predictions = model.predict(self.raw_file)
        self._report("Tested Accuracy", self.tested["Survived"], predictions)

    def train_data(self, resp, test_size=0.75):
        """Split ``self.file`` into train/test features and response.

        Args:
            resp: Name of the response column; every other column is used as
                a predictor.
            test_size: Share of the rows held out for testing.

        Returns:
            ``(x_train, x_test, y_train, y_test)`` as returned by
            ``train_test_split`` with a fixed ``random_state`` of 1111.
        """
        response = self.file[resp]
        predictors = self.file.drop(columns=[resp])
        return train_test_split(
            predictors, response, random_state=1111, test_size=test_size)

    def train_models(self):
        """Train the three classifiers and print their hold-out accuracy.

        Returns:
            Tuple ``(lg_model, rf_model, svm_model)`` of fitted estimators.
        """
        # Original author's note: for the logistic regression model, a test
        # size of 0.4 yielded the best results.  The SVM uses the same 0.4
        # split; because the split is seeded it is identical, so it is
        # computed once and shared.
        split_40 = self.train_data("Survived", 0.4)
        split_20 = self.train_data("Survived", 0.2)

        lg_model = self.train_lg_model(*split_40)
        rf_model = self.train_random_forest_model(*split_20)
        svm_model = self.train_support_vector_machines_model(*split_40)
        return lg_model, rf_model, svm_model

    def train_lg_model(self, x_train, x_test, y_train, y_test):
        """Fit a logistic-regression model and report its hold-out accuracy.

        Original author's note: the default hyper-parameters yielded the most
        accuracy.

        Returns:
            The fitted model.
        """
        return self._fit_and_report(
            LogisticRegression(), "Logistic Regression",
            x_train, x_test, y_train, y_test)

    def train_random_forest_model(self, x_train, x_test, y_train, y_test):
        """Fit a random-forest model and report its hold-out accuracy.

        Original author's note: ``n_estimators`` of 1000 yielded the most
        accuracy.

        Returns:
            The fitted model.
        """
        return self._fit_and_report(
            RandomForestClassifier(n_estimators=1000), "Random forest",
            x_train, x_test, y_train, y_test)

    def train_support_vector_machines_model(self, x_train, x_test, y_train, y_test):
        """Fit a linear-kernel SVM and report its hold-out accuracy.

        Original author's note: the linear kernel yielded the most accuracy;
        the model type was picked because it was easy to look up and appeared
        on the example slides.

        Returns:
            The fitted model.
        """
        return self._fit_and_report(
            svm.SVC(kernel="linear"), "SVM",
            x_train, x_test, y_train, y_test)

    def get_survived_and_dead(self):
        """Split ``self.file`` by outcome.

        Returns:
            Tuple ``(survived, dead)``: the rows with ``Survived == 1`` and
            the rows with ``Survived == 0``.
        """
        outcome = self.file['Survived']
        return self.file[outcome == 1], self.file[outcome == 0]

    def explain_confusion_matrix(self, matrix):
        """Print a confusion matrix followed by a plain-language breakdown.

        Args:
            matrix: 2x2 confusion matrix whose rows are the true outcome and
                whose columns are the predicted outcome, with index 0 meaning
                dead (``Survived == 0``) and index 1 meaning survived.  This
                is the layout ``sklearn.metrics.confusion_matrix`` produces
                for labels ``[0, 1]``.
        """
        print(f'Confusion matrix:\n {matrix}')
        print(
            "\n"
            "Confusion matrix explained:\n"
            "Dead:\n"
            f"- model got right: {matrix[0][0]}\n"
            f"- model got wrong: {matrix[0][1]}\n"
            "Survived:\n"
            f"- model got wrong: {matrix[1][0]}\n"
            f"- model got right: {matrix[1][1]}\n"
        )

    def prep_data(self, data):
        """Clean a raw passenger DataFrame in place so it is fully numeric.

        Args:
            data: DataFrame with at least the columns ``Ticket``, ``Name``,
                ``Cabin``, ``Embarked``, ``Age``, ``Fare`` and ``Sex``.

        Returns:
            The same DataFrame, with the four text columns removed, missing
            ``Age``/``Fare`` values filled and ``Sex`` label-encoded.
        """
        # Original author's reasons for dropping each column:
        #   Ticket   - the ticket number is not relevant.
        #   Name     - irrelevant and not a numerical value.
        #   Cabin    - not a strict numerical value, so it couldn't be computed.
        #   Embarked - where the person embarked is not relevant.
        data.drop(columns=['Ticket', 'Name', 'Cabin', 'Embarked'], inplace=True)

        # Missing Age and Fare values are filled with the column median,
        # because the models can't be trained with missing data.
        data.fillna(
            {"Age": data["Age"].median(), "Fare": data["Fare"].median()},
            inplace=True)

        # Sex is encoded to 0/1 because the models need numeric data.
        return self.encode(data, "Sex")

    def display_barchart_sex(self):
        """Draw a stacked alive/dead bar chart per sex (0 = women, 1 = men).

        Original author's reading of the chart: although there were
        significantly more men than women on the Titanic, a much higher
        percentage of men died.  Close to 75% of the men died, compared with
        around 20% of the women.  This is not surprising, since it was common
        to evacuate women and children first.
        """
        self._plot_survival_bars("Sex", (("Women", 0), ("Men", 1)))

    def display_barchart_class(self):
        """Draw a stacked alive/dead bar chart per passenger class.

        Original author's reading of the chart: the majority of passengers
        were 3rd class, and they have the largest number of deaths.  In 2nd
        class close to 50% died, and in 1st class less than 50% died, which
        shows that passenger class had a direct relationship with the
        likelihood of survival.
        """
        self._plot_survival_bars(
            "Pclass", (("1st Class", 1), ("2nd Class", 2), ("3rd Class", 3)))

    def _fit_and_report(self, estimator, label, x_train, x_test, y_train, y_test):
        """Fit ``estimator`` on the training split, report on the test split, return it."""
        model = estimator.fit(x_train, y_train)
        self._report(f'{label} accuracy', y_test, model.predict(x_test))
        return model

    def _report(self, heading, y_true, y_pred):
        """Print ``heading: <accuracy>`` and the explained confusion matrix."""
        print(f'{heading}: {metrics.accuracy_score(y_true, y_pred)}')
        # Passing the labels explicitly keeps the matrix 2x2 (and in
        # dead/survived order) even when a split contains only one outcome.
        matrix = metrics.confusion_matrix(
            y_true, y_pred, labels=self._OUTCOME_LABELS)
        self.explain_confusion_matrix(matrix)

    def _plot_survival_bars(self, column, groups):
        """Create a figure with one alive/dead stacked bar per ``(label, value)`` group."""
        survived, dead = self.get_survived_and_dead()
        tick_labels = [label for label, _ in groups]
        alive_counts = [int((survived[column] == value).sum()) for _, value in groups]
        dead_counts = [int((dead[column] == value).sum()) for _, value in groups]

        _, ax = plt.subplots()
        ax.bar(tick_labels, alive_counts)
        # Dead counts are stacked on top of the alive counts, in red.
        ax.bar(tick_labels, dead_counts, bottom=alive_counts, color='r')
        ax.legend(["Alive", "Dead"])
        ax.set_ylabel("Number of people")
if __name__ == "__main__":
    # Running the file as a script executes the whole analysis; the
    # constructor does all the work.
    titanic_analysis = Titanic()