forked from claudioangione/PGA_and_C-EDGE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdist_corr_1.m
407 lines (378 loc) · 15.9 KB
/
pdist_corr_1.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
function Y = pdist(X,dist,varargin)
%PDIST Pairwise distance between observations.
% D = PDIST(X) returns a vector D containing the Euclidean distances
% between each pair of observations in the M-by-N data matrix X. Rows of
% X correspond to observations, columns correspond to variables. D is a
% 1-by-(M*(M-1)/2) row vector, corresponding to the M*(M-1)/2 pairs of
% observations in X.
%
% D = PDIST(X, DISTANCE) computes D using DISTANCE. Choices are:
%
% 'euclidean' - Euclidean distance (default)
% 'seuclidean' - Standardized Euclidean distance. Each coordinate
% difference between rows in X is scaled by dividing
% by the corresponding element of the standard
% deviation S=NANSTD(X). To specify another value for
% S, use D=PDIST(X,'seuclidean',S).
% 'cityblock' - City Block distance
% 'minkowski' - Minkowski distance. The default exponent is 2. To
% specify a different exponent, use
% D = PDIST(X,'minkowski',P), where the exponent P is
% a scalar positive value.
% 'chebychev' - Chebychev distance (maximum coordinate difference)
% 'mahalanobis' - Mahalanobis distance, using the sample covariance
% of X as computed by NANCOV. To compute the distance
% with a different covariance, use
% D = PDIST(X,'mahalanobis',C), where the matrix C
% is symmetric and positive definite.
% 'cosine' - One minus the cosine of the included angle
% between observations (treated as vectors)
% 'correlation' - One minus the sample linear correlation between
% observations (treated as sequences of values).
% 'spearman' - One minus the sample Spearman's rank correlation
% between observations (treated as sequences of values).
% 'hamming' - Hamming distance, percentage of coordinates
% that differ
% 'jaccard' - One minus the Jaccard coefficient, the
% percentage of nonzero coordinates that differ
% function - A distance function specified using @, for
% example @DISTFUN.
%
% A distance function must be of the form
%
% function D2 = DISTFUN(XI, XJ),
%
% taking as arguments a 1-by-N vector XI containing a single row of X, an
% M2-by-N matrix XJ containing multiple rows of X, and returning an
% M2-by-1 vector of distances D2, whose Jth element is the distance
% between the observations XI and XJ(J,:).
%
% The output D is arranged in the order of ((2,1),(3,1),..., (M,1),
% (3,2),...(M,2),.....(M,M-1)), i.e. the lower left triangle of the full
% M-by-M distance matrix in column order. To get the distance between
% the Ith and Jth observations (I < J), either use the formula
% D((I-1)*(M-I/2)+J-I), or use the helper function Z = SQUAREFORM(D),
% which returns an M-by-M square symmetric matrix, with the (I,J) entry
% equal to distance between observation I and observation J.
%
% Example:
% % Compute the ordinary Euclidean distance
% X = randn(100, 5); % some random points
% D = pdist(X, 'euclidean'); % euclidean distance
%
% % Compute the Euclidean distance with each coordinate difference
% % scaled by the standard deviation
% Dstd = pdist(X,'seuclidean');
%
% % Use a function handle to compute a distance that weights each
% % coordinate contribution differently
% Wgts = [.1 .3 .3 .2 .1]; % coordinate weights
% weuc = @(XI,XJ,W)(sqrt(bsxfun(@minus,XI,XJ).^2 * W'));
% Dwgt = pdist(X, @(Xi,Xj) weuc(Xi,Xj,Wgts));
%
% See also SQUAREFORM, LINKAGE, SILHOUETTE, PDIST2.
% An example of distance for data with missing elements:
%
% X = randn(100, 5); % some random points
% X(unidrnd(prod(size(X)),1,20)) = NaN; % scatter in some NaNs
% D = pdist(X, @naneucdist);
%
% function d = naneucdist(XI, XJ) % euclidean distance, ignoring NaNs
% [m,p] = size(XJ);
% sqdx = bsxfun(@minus,XI,XJ).^2;
% pstar = sum(~isnan(sqdx),2); % correction for missing coords
% pstar(pstar == 0) = NaN;
% d = sqrt(nansum(sqdx,2) .* p ./ pstar);
%
%
% For a large number of observations, it is sometimes faster to compute
% the distances by looping over coordinates of the data (though the code
% is more complicated):
%
% function d = nanhamdist(XI, XJ) % hamming distance, ignoring NaNs
% [m,p] = size(XJ);
% nesum = zeros(m,1);
% pstar = zeros(m,1);
% for q = 1:p
% notnan = ~(isnan((XI(q)) | isnan(XJ(:,q)));
% nesum = nesum + (XI(q) ~= XJ(:,q)) & notnan;
% pstar = pstar + notnan;
% end
% nesum(any() | nans((i+1):n)) = NaN;
% d(k:(k+n-i-1)) = nesum ./ pstar;
% Copyright 1993-2011 The MathWorks, Inc.
if nargin < 2
dist = 'euc';
else
if ischar(dist)
methods = {'euclidean'; 'seuclidean'; 'cityblock'; 'chebychev'; ...
'mahalanobis'; 'minkowski'; 'cosine'; 'correlation'; ...
'spearman'; 'hamming'; 'jaccard'};
i = find(strncmpi(dist,methods,length(dist)));
if length(i) > 1
error(message('stats:pdist:AmbiguousDistance', dist));
elseif isempty(i)
% Assume an unrecognized string is a user-supplied distance
% function name, change it to a handle.
distfun = str2func(dist);
distargs = varargin;
dist = 'usr';
else
dist = lower(methods{i}(1:3));
end
elseif isa(dist, 'function_handle') || isa(dist, 'inline')
distfun = dist;
distargs = varargin;
dist = 'usr';
else
error(message('stats:pdist:BadDistance'));
end
end
% Integer/logical/char/anything data may be handled by a caller-defined
% distance function, otherwise it is converted to double. Complex floating
% point data must also be handled by a caller-defined distance function.
if ~strcmp(dist,'usr')
if ~isfloat(X)
warning(message('stats:pdist:DataConversion', class( X )));
X = double(X);
elseif any(imag(X(:)))
error(message('stats:pdist:InvalidData'));
end
end
[n,p] = size(X);
% Degenerate case, just return an empty of the proper size.
if n < 2
if ~strcmp(dist,'usr')
Y = zeros(1,0,class(X)); % X was single/double, or cast to double
elseif isa(X,'single')
Y = zeros(1,0,'single');
else
Y = zeros(1,0);
end
return;
end
additionalArg = [];
switch dist
case 'seu' % Standardized Euclidean weights by coordinate variance
if nargin < 3
additionalArg = nanvar(X);
if any(additionalArg == 0)
warning(message('stats:pdist:ConstantColumns'));
end
additionalArg = 1./ additionalArg;
else
additionalArg = varargin{1};
if ~(isvector(additionalArg) && length(additionalArg) == p...
&& all(additionalArg >= 0))
error(message('stats:pdist:InvalidWeights'));
end
if any(additionalArg == 0)
warning(message('stats:pdist:ZeroInverseWeights'));
end
%We will apply the inverse weight on each coordinate in the sum
%of squares.
additionalArg = 1./ (additionalArg .^2);
end
case 'mah' % Mahalanobis
if nargin < 3
additionalArg = nancov(X);
[T,flag] = chol(additionalArg);
else
additionalArg = varargin{1};
if ~isequal(size(additionalArg),[p,p])
error(message('stats:pdist:InvalidCov'));
end
%use cholcov because we also need to check whether the matrix is symmetric
[T,flag] = cholcov(additionalArg,0);
end
if flag ~= 0
error(message('stats:pdist:SingularCov'));
end
if ~issparse(X)
additionalArg = T \ eye(p); %inv(T)
end
case 'min' % Minkowski distance needs a third argument
if nargin < 3 % use default value for exponent
additionalArg = 2;
elseif ( isscalar(varargin{1}) && varargin{1} > 0)
additionalArg = varargin{1}; % get exponent from input args
else
error(message('stats:pdist:InvalidExponent'));
end
case 'cos' % Cosine
[X,flag] = normalizeX(X);
if flag
warning(message('stats:pdist:ZeroPoints'));
end
case 'cor' % Correlation
% X = bsxfun(@minus,X,mean(X,2));
X = bsxfun(@minus,X,ones(size(X,1),1));
[X, flag] = normalizeX(X);
if flag
warning(message('stats:pdist:ConstantPoints'));
end
case 'spe' %Spearman
X = tiedrank(X')'; % treat rows as a series
X = X - (p+1)/2; % subtract off the (constant) mean
[X,flag] = normalizeX(X);
if flag
warning(message('stats:pdist:TiedPoints'));
end
end
% Note that if there is any code for case 'che','euc' or 'cit' in the
% above switch dist block, the some code need to be repeated in the
% corresponding block below.
if strcmp(dist,'min') % Minkowski distance
if isinf(additionalArg) %the exponent is inf
dist = 'che';
additionalArg = [];
elseif additionalArg == 2 %the exponent is 2
dist = 'euc';
additionalArg = [];
elseif additionalArg == 1 %the exponent is 1
dist = 'cit';
additionalArg = [];
end
end
% Call a mex file to compute distances for the standard distance measures
% and full real double or single data.
% % % if ~strcmp(dist,'usr') && (isfloat(X) && ~issparse(X)) % ~usr => ~complex
% % % additionalArg = cast(additionalArg,class(X));
% % % Y = pdistmex(X',dist,additionalArg);
% % %
% % % % This M equivalent assumes real single or double. It is currently only
% % % % called for sparse inputs, but it may also be useful as a template for
% % % % customization.
% % % elseif ~strcmp(dist,'usr') && isfloat(X) % ~usr => ~complex
if any(strcmpi(dist, {'ham' 'jac' 'che'}))
nans = any(isnan(X),2);
end
outClass = class(X);
Y = zeros(1,n*(n-1)./2, outClass);
k = 1;
for i = 1:n-1
switch dist
case 'euc' % Euclidean
dsq = zeros(n-i,1,outClass);
for q = 1:p
dsq = dsq + (X(i,q) - X((i+1):n,q)).^2;
end
Y(k:(k+n-i-1)) = sqrt(dsq);
case 'seu' % Standardized Euclidean
wgts = additionalArg;
dsq = zeros(n-i,1,outClass);
for q = 1:p
dsq = dsq + wgts(q) .* (X(i,q) - X((i+1):n,q)).^2;
end
Y(k:(k+n-i-1)) = sqrt(dsq);
case 'cit' % City Block
d = zeros(n-i,1,outClass);
for q = 1:p
d = d + abs(X(i,q) - X((i+1):n,q));
end
Y(k:(k+n-i-1)) = d;
case 'mah' % Mahalanobis
del = bsxfun(@minus, X(i,:), X((i+1):n,:));
dsq = sum((del/T) .^ 2, 2);
Y(k:(k+n-i-1)) = sqrt(dsq);
case 'min' % Minkowski
expon = additionalArg;
dpow = zeros(n-i,1,outClass);
for q = 1:p
dpow = dpow + abs(X(i,q) - X((i+1):n,q)).^expon;
end
Y(k:(k+n-i-1)) = dpow .^ (1./expon);
case {'cos' 'cor' 'spe'} % Cosine, Correlation, Rank Correlation
% This assumes that data have been appropriately preprocessed
d = zeros(n-i,1,outClass);
for q = 1:p
d = d + (X(i,q).*X((i+1):n,q));
end
d(d>1) = 1; % protect against round-off, don't overwrite NaNs
Y(k:(k+n-i-1)) = 1 - d;
case 'ham' % Hamming
nesum = zeros(n-i,1,outClass);
for q = 1:p
nesum = nesum + (X(i,q) ~= X((i+1):n,q));
end
nesum(nans(i) | nans((i+1):n)) = NaN;
Y(k:(k+n-i-1)) = nesum ./ p;
case 'jac' % Jaccard
nzsum = zeros(n-i,1,outClass);
nesum = zeros(n-i,1,outClass);
for q = 1:p
nz = (X(i,q) ~= 0 | X((i+1):n,q) ~= 0);
ne = (X(i,q) ~= X((i+1):n,q));
nzsum = nzsum + nz;
nesum = nesum + (nz & ne);
end
nesum(nans(i) | nans((i+1):n)) = NaN;
Y(k:(k+n-i-1)) = nesum ./ nzsum;
case 'che' % Chebychev
dmax = zeros(n-i,1,outClass);
for q = 1:p
dmax = max(dmax, abs(X(i,q) - X((i+1):n,q)));
end
dmax(nans(i) | nans((i+1):n)) = NaN;
Y(k:(k+n-i-1)) = dmax;
end
k = k + (n-i);
end
% % % % Compute distances for a caller-defined distance function.
% % % else % if strcmp(dist,'usr')
% % % try
% % % Y = feval(distfun,X(1,:),X(2,:),distargs{:})';
% % % catch ME
% % % if strcmp('MATLAB:UndefinedFunction', ME.identifier) ...
% % % && ~isempty(strfind(ME.message, func2str(distfun)))
% % % error(message('stats:pdist:DistanceFunctionNotFound', func2str( distfun )));
% % % end
% % % % Otherwise, let the catch block below generate the error message
% % % Y = [];
% % % end
% % %
% % % % Make the return have whichever numeric type the distance function
% % % % returns, or logical.
% % % if islogical(Y)
% % % Y = false(1,n*(n-1)./2);
% % % else % isnumeric
% % % Y = zeros(1,n*(n-1)./2, class(Y));
% % % end
% % %
% % % k = 1;
% % % for i = 1:n-1
% % % try
% % % Y(k:(k+n-i-1)) = feval(distfun,X(i,:),X((i+1):n,:),distargs{:})';
% % % catch ME
% % % if isa(distfun, 'inline')
% % % m = message('stats:pdist:InlineFunctionError');
% % % throw(addCause(MException(m.Identifier,'%s',getString(m)),ME));
% % % else
% % % m = message('stats:pdist:DistanceFunctionError',func2str(distfun));
% % % throw(addCause(MException(m.Identifier,'%s',getString(m)),ME));
% % % end
% % % end
% % % k = k + (n-i);
% % % end
% % % end
%---------------------------------------------
% Normalize the rows in X to have unit norm.
function [X,flag] = normalizeX(X)
% Rescale each row by largest element to prevent over/underflow, and
% compute the 2-norm.
Xmax = max(abs(X),[],2);
X2 = bsxfun(@rdivide,X,Xmax);
Xnorm = sqrt(sum(X2.^2, 2));
% The norm will be NaN for rows that are all zeros, fix that for the test
% below.
Xnorm(Xmax==0) = 0;
% The norm will be NaN for rows of X that have any +/-Inf. Those should be
% Inf, but leave them as is so those rows will not affect the test below.
% The points can't be normalized, so any distances from them will be NaN
% anyway.
% Find points that are effectively zero relative to the point with largest norm.
flag = any(Xnorm <= eps(max(Xnorm)));
% Back out the rescaling, and normalize rows of X to have unit 2-norm.
% Rows can't be normalized become all NaN.
Xnorm = Xnorm .* Xmax;
X = bsxfun(@rdivide,X,Xnorm);