function [net, CVErr, paramSeq] = svmcv(net, X, Y, range, step, nfold, Xv, Yv, dodisplay)
% SVMCV - Kernel parameter selection for SVM via cross validation
%
% NET = SVMCV(NET, X, Y, RANGE)
% Given an initialised Support Vector Machine structure NET, the best
% setting of the kernel parameters is computed via 10-fold cross
% validation. CV is done on the data points X (one point per row) with
% target values Y (+1 or -1). The kernel parameters that are tested lie
% between MIN(RANGE) and MAX(RANGE), starting with MIN(RANGE) for 'rbf'
% kernels and MAX(RANGE) for all other kernel functions.
% RANGE may also be a vector of length >2; in this case RANGE is taken
% as the explicit sequence of kernel parameters that are tested.
% SVMCV only works for kernel functions that require one parameter.
%
% [NET, CVERR, PARAMSEQ] = SVMCV(NET, X, Y, RANGE, STEP, NFOLD)
% The sequence of test parameters is generated by Param(t+1) =
% Param(t)*STEP for 'rbf' kernels, and Param(t+1) = Param(t)+STEP for
% all other kernels. Default value: SQRT(2) for 'rbf', -1 otherwise.
% The best parameter is determined via NFOLD-fold cross validation.
% If STEP==[], RANGE is again interpreted as the explicit sequence of
% kernel parameters.
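% (for example, SVMCV(NET, X, Y, [0.1 0.5 1 2], []) would test exactly
% these four parameter values)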
%
% The tested parameter sequence is returned in PARAMSEQ. For each entry
% PARAMSEQ(i), there is one line CVERR(i,:) that describes the
% estimated test set error. CVERR(i,1) is the mean, CVERR(i,2) is the
% standard deviation of the test set error over all NFOLD runs.
%
% [NET, CVERR, PARAMSEQ] = SVMCV(NET, X, Y, RANGE, STEP, 1, XV, YV)
% does parameter selection based on one fixed validation set XV and
% YV. CVERR(i,2)==0 for all tested parameter settings.
% NET = SVMCV(NET, X, Y, RANGE, STEP, 1, XV, YV, DODISPLAY) displays
% error information for all tested parameters. DODISPLAY==0 shows
% nothing, DODISPLAY==1 shows a final CV summary (default),
% DODISPLAY==2 also shows the test set error for each trained SVM,
% DODISPLAY==3 includes the output produced by SVMTRAIN.
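%
% Example
% A minimal sketch of a typical call; the constructor call is assumed
% to follow the toolbox's SVM(NIN, KERNEL, KERNELPAR, C) form, see SVM
% for the exact signature:
% net = svm(size(X, 2), 'rbf', 1, 10);
% [net, CVErr, paramSeq] = svmcv(net, X, Y, [0.1 10]);
% NET.KERNELPAR then holds the parameter with the smallest mean CV error.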
%
% See also
% SVM, SVMTRAIN
%
%
% Copyright (c) by Anton Schwaighofer (2001)
% $Revision: 1.6 $ $Date: 2001/06/05 19:20:00 $
% mailto:anton.schwaighofer@gmx.net
%
% This program is released under the GNU General Public License.
%
% Check arguments for consistency
errstring = consist(net, 'svm', X, Y);
if ~isempty(errstring),
error(errstring);
end
if nargin<9,
dodisplay = 1;
end
if nargin<8,
Yv = [];
end
if nargin<7,
Xv = [];
end
if nargin<6,
nfold = 10;
end
if (~isempty(Xv)) & (~isempty(Yv)),
errstring = consist(net, 'svm', Xv, Yv);
if ~isempty(errstring),
error(errstring);
end
if (nfold~=1),
error('Input parameters XV and YV may only be used with NFOLD==1');
end
end
if nargin<5,
step = 0;
end
range = range(:)';
N = size(X, 1);
if N<nfold,
error('At least NFOLD (default 10) training examples must be given');
end
if (length(range)>2) | isempty(step),
% If range parameter has more than only min/max entries: Use this as
% the sequence of parameters to test
paramSeq = range;
else
paramSeq = [];
switch net.kernel
case 'rbf'
if step==0,
step = sqrt(2);
end
% Multiplicative update, step size < 1 : start with max value
if abs(step)<1,
param = max(range);
while (param>=min(range)),
paramSeq = [paramSeq param];
param = param*abs(step);
end
else
% Multiplicative update, step size > 1 : start with min value
param = min(range);
while (param<=max(range)),
paramSeq = [paramSeq param];
param = param*abs(step);
end
end
otherwise
% Additive update for kernels other than 'rbf'
if step==0,
step = -1;
end
if step<0,
paramSeq = max(range):step:min(range);
else
paramSeq = min(range):step:max(range);
end
end
end
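% Example: an 'rbf' kernel with range = [0.5 4] and the default step
% sqrt(2) yields paramSeq = [0.5 0.71 1 1.41 2 2.83 4] (rounded)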
% Storing all validation set errors for each parameter choice
allErr = zeros(nfold, length(paramSeq));
% Storing the confusion matrices for each parameter choice
cm = cell(1, length(paramSeq));
for j = 1:length(paramSeq),
cm{j} = zeros(2);
end
% shuffle X and Y in the same way
perm = randperm(N);
X = X(perm,:);
Y = Y(perm,:);
% size of one test set
chsize = floor(N/nfold);
% the training set is not the whole data set minus the current test set,
% but the union of the remaining nfold-1 test sets. Thus only effsize
% examples of the data set are ever used
effsize = nfold*chsize;
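% (e.g. N = 105 with nfold = 10 gives chsize = 10 and effsize = 100, so
% the last 5 examples of the shuffled data are never used)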
% check whether leave-one-out CV (or something close to it) is requested
usePrev = (nfold>=(N/2));
prevInd = [];
for i = 1:nfold,
% currentX/Y is the current training set
if (nfold == 1),
currentX = X;
currentY = Y;
testX = Xv;
testY = Yv;
else
% start and end index of current test set
ind1 = 1+(i-1)*chsize;
ind2 = i*chsize;
currentInd = [1:(ind1-1), (ind2+1):effsize];
currentX = X(currentInd, :);
currentY = Y(currentInd, :);
testX = X(ind1:ind2,:);
testY = Y(ind1:ind2,:);
end;
% We start out with the most powerful kernel (smallest sigma for RBF
% kernel, highest degree for polynomial). We assume that all training
% examples will be support vectors due to overfitting; thus we start
% the optimization with a value of C/2 for each example.
if length(net.c(:))==1,
% The same upper bound C for all examples
alpha0 = repmat(net.c, [length(currentY) 1]);
elseif length(net.c(:))==2,
% Different upper bounds C for the positive and negative examples
alpha0 = zeros([length(currentY) 1]);
alpha0(currentY>0) = net.c(1);
alpha0(currentY<=0) = net.c(2);
else
% A different C for each example: permute the original C's along with
% the shuffled training data
cperm = net.c(perm);
alpha0 = cperm(currentInd);
end
% Start the optimization routine with alpha = C/2
alpha0 = alpha0/2;
% another little trick for leave-one-out CV: training sets will only
% slightly differ, thus use the alphas from the previous iteration,
% even if they resulted from a different parameter setting, as initial
% values alpha0
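% (for leave-one-out CV, successive training sets differ in only two
% examples, so nearly all initial values carry over)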
if usePrev & ~isempty(prevInd),
a = zeros(N, 1);
a(currentInd) = alpha0;
a(prevInd) = prevAlpha;
alpha0 = a(currentInd);
end
% Now loop over all parameter settings and train the SVM on currentX/Y
net2 = net;
if (dodisplay>0),
fprintf('Split %i of the training data:\n', i);
end
for j = 1:length(paramSeq),
param = paramSeq(j);
net2.kernelpar = param;
% Plug the current parameter settings into the SVM and train on the
% current training set
net2 = svmtrain(net2, currentX, currentY, alpha0, max(0,dodisplay-2));
% Evaluate on the non-training data
testPred = svmfwd(net2, testX);
allErr(i, j) = mean(testY ~= testPred);
% Compute the confusion matrix
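% c(k,l) counts test examples with true class k and predicted class l,
% where index 1 is the negative and index 2 the positive class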
for k = [1 2],
for l = [1 2],
c(k,l) = sum(((testPred>=0)==(l-1)).*((testY>=0)==(k-1)));
end
end
cm{j} = cm{j}+c;
% take out the computed coefficients alpha and use them as starting
% values for the next iteration (next parameter choice)
alpha0 = net2.alpha;
if (dodisplay>1),
fprintf('Split %i with parameter %g:\n', i, param);
fprintf(' Test set error = %2.3f%%\n', allErr(i, j)*100);
[fracSV, normW] = svmstat(net2, (dodisplay>2));
fprintf(' Norm of the separating hyperplane: %g\n', normW);
fprintf(' Fraction of support vectors: %2.3f%%\n', fracSV*100);
end
end
if usePrev,
prevAlpha = net2.alpha;
prevInd = currentInd;
end
end
% Compute mean and standard deviation over all nfold runs
meanErr = mean(allErr, 1);
stdErr = std(allErr, 0, 1);
CVErr = [meanErr; stdErr]';
% Find the parameter with minimum mean error and plug it into the output
% SVM structure. If several parameters attain the minimal error, select
% the one with minimal standard deviation
minima = find(meanErr==min(meanErr));
[dummy, bestInd] = min(stdErr(minima));
net.kernelpar = paramSeq(minima(bestInd));
if (dodisplay>0),
for j = 1:length(paramSeq),
fprintf('kernelpar=%g: Avg CV error %2.3f%% with stddev %1.4f\n', ...
paramSeq(j), meanErr(j)*100, stdErr(j)*100);
if any(cm{j}(:)~=0),
fprintf(' Confusion matrix, summed over all runs:\n');
fprintf(' Predicted class:\n');
fprintf(' %5i %5i\n', -1, +1);
c1 = cm{j}';
c2 = 100*c1./repmat(sum(c1), [2 1]);
c3 = [c1(:) c2(:)]';
fprintf([' True -1: %5i (%3.2f%%) %5i (%3.2f%%)\n True +1: %5i' ...
' (%3.2f%%) %5i (%3.2f%%)\n'], c3(:));
end
end
end