-
Notifications
You must be signed in to change notification settings - Fork 0
/
Predictions_For_Kaggle.m
162 lines (142 loc) · 7.36 KB
/
Predictions_For_Kaggle.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
%Titanic Machine Learning from Disaster Competition from Kaggle
%Prepared by Özlem Körpe
%Github: https://github.com/ozlemkorpe/Titanic-Machine-Learning-from-Disaster-MATLAB
%Note1: Please check out the data path before running, fix if necessary.
%Note2: Change num_iterations below to average the hold-out accuracy over
%       several random partitions for a more stable estimate.
%Note3: The Kaggle submission file is written as 'resultedtable.csv' in the
%       current folder; adjust the path in the writetable call if needed.
clear %Clear the workspace
%-------------------------------- IMPORT DATA
titanic_train = readtable('titanic_train.csv');
titanic_test = readtable('titanic_test.csv');
%-------------------------------- HANDLE MISSING DATA
%Number of missing entries in each column (kept in workspace for inspection)
titanic_train_missing = sum(ismissing(titanic_train));
titanic_test_missing = sum(ismissing(titanic_test));
%New tables for storing filled data
filled_data = titanic_train;
test_filled_data = titanic_test;
%---------------- Training set
%Calculate mean values and cast double value to integer
%(uint8 cast rounds to the nearest whole number)
%------- Age
mean_age = cast(mean(titanic_train.Age, 'omitnan'),'uint8');
filled_data.Age = fillmissing(titanic_train.Age, 'constant', mean_age);
%------- Fare
mean_fare = cast(mean(titanic_train.Fare, 'omitnan'),'uint8');
filled_data.Fare = fillmissing(titanic_train.Fare, 'constant', mean_fare);
%---------------- Test set
%Impute the test set with TRAINING statistics so both sets share one scale
test_filled_data.Age = fillmissing(titanic_test.Age, 'constant', mean_age);
test_filled_data.Fare = fillmissing(titanic_test.Fare, 'constant', mean_fare);
%-------------------------------- HANDLE CATEGORICAL DATA
%Separate genders into one 0/1 indicator column per level (female/male),
%then drop the original Sex column.
%---------------- Training set
filled_data = categorical_data_to_dummy_variables(filled_data, filled_data.Sex);
filled_data.Sex = [];
%---------------- Test set
test_filled_data = categorical_data_to_dummy_variables(test_filled_data, test_filled_data.Sex);
test_filled_data.Sex = [];
%-------------------------------- HANDLE OUTLIERS
%plot(filled_data.Age) %Age varies between 0-80 which can be accepted as normal
%Remove training rows whose age is less than 1 or not a whole number
%(equivalent to deleting age < 1 first and then the fractional ages)
toDelete = filled_data.Age < 1 | mod(filled_data.Age,1) ~= 0;
filled_data(toDelete,:) = [];
%-------------------------------- FEATURE SCALING (Min-max normalization)
%Scale each numeric feature to [0,1]. The test set is scaled with the
%TRAINING min/max so train and test share the same feature scale
%(previously the test set used its own statistics, which mismatches the
%training-mean imputation above and skews the Kaggle predictions).
normalized_data = filled_data;
test_normalized_data = test_filled_data;
features_to_scale = {'Age','Fare','SibSp','Parch','Pclass'};
for k = 1:numel(features_to_scale)
    f = features_to_scale{k};
    lo = min(filled_data.(f));
    hi = max(filled_data.(f));
    normalized_data.(f) = (filled_data.(f) - lo) / (hi - lo);
    test_normalized_data.(f) = (test_filled_data.(f) - lo) / (hi - lo);
end
%-------------------------------- CLASSIFICATION (Decision Tree)
classification_model = fitctree(normalized_data, 'Survived~Age+Fare+Parch+SibSp+female+male+Pclass'); %Classification Model
%-------------------------------- LOOP FOR GENERAL/AVERAGE ACCURACY
num_iterations = 1; %Increase to average accuracy over several random hold-outs
general_accuracy = 0;
for a = 1:num_iterations
    %--------------------------------PARTITIONING TRAINING DATA
    cv = cvpartition(classification_model.NumObservations,'HoldOut', 0.03); %Hold out 3% of the rows for validation
    cross_validated_model = crossval(classification_model, 'cvpartition', cv); %Use training part of training set only to build the model
    %--------------------------------PREDICTION
    Predictions = predict(cross_validated_model.Trained{1}, normalized_data(test(cv),1:end-1));
    %--------------------------------ANALYZING THE RESULT
    %Confusion matrix: the main (\) diagonal holds the right predictions,
    %the off (/) diagonal the wrong ones.
    Results = confusionmat(cross_validated_model.Y(test(cv)),Predictions);
    % coloredResultsMatrix = confusionchart(Results,'DiagonalColor','green');
    right_results = Results(1,1) + Results(2,2);
    wrong_results = Results(1,2) + Results(2,1);
    truth_score = right_results /(right_results + wrong_results);
    %Accumulate so the average can be computed outside the loop
    general_accuracy = general_accuracy + truth_score;
end
%Average accuracy over the number of iterations (not the loop variable,
%which would be wrong if the loop body never ran)
general_accuracy = general_accuracy / num_iterations;
%Print general accuracy
disp('General accuracy is:');
disp(general_accuracy);
%-------------------------------- PERFORM PREDICTIONS ON TEST SET FOR KAGGLE
%Use the tree trained on the hold-out partition above to predict on the
%normalized test data.
test_predictions = predict(cross_validated_model.Trained{1}, test_normalized_data(1:end,1:end-1));
%---------------- Prepare Table for Kaggle
Resulttable = table(test_normalized_data.PassengerId ,int16(test_predictions));
%Set headers for columns
Resulttable.Properties.VariableNames = {'PassengerId','Survived'};
%Write table into file system
writetable(Resulttable, 'resultedtable.csv');
%-------------------------------- VISUALIZE THE RESULTS FOR TRAINING SET
view(cross_validated_model.Trained{1}, 'Mode', 'Graph');
%-------------------------------- FUNCTION TO HANDLE UNORDERED CATEGORICAL DATA
function data = categorical_data_to_dummy_variables(data,variable)
%CATEGORICAL_DATA_TO_DUMMY_VARIABLES One-hot encode a categorical column.
%   data = categorical_data_to_dummy_variables(data, variable) prepends one
%   0/1 indicator column to the table DATA for every unique value found in
%   VARIABLE (e.g. 'female' and 'male' for the Sex column). VARIABLE is
%   expected to be a cell array of character vectors whose values are valid
%   table variable names. The original column is NOT removed; the caller is
%   responsible for that.
unique_values = unique(variable);
n_levels = length(unique_values);
%Preallocate the indicator matrix instead of growing it inside the loop:
%one row per observation, one column per category level.
dummy_variable = zeros(length(variable), n_levels);
for i = 1:n_levels
    dummy_variable(:,i) = double(ismember(variable, unique_values{i}));
end
%Wrap the indicators in a table whose column names are the level names,
%then prepend them so the dummy columns appear first in the result.
T = array2table(dummy_variable, 'VariableNames', unique_values);
data = [T data];
end