-
Notifications
You must be signed in to change notification settings - Fork 97
/
count_unique.m
155 lines (132 loc) · 4.49 KB
/
count_unique.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
function [uniques,numUnique] = count_unique(x,option)
%COUNT_UNIQUE Sorted unique values of an array and how often each occurs
% [uniques,numUnique] = count_unique(x)
% [uniques,numUnique] = count_unique(x,'float')
%
% Two strategies are available. The first bins the values with the MATLAB
% builtin accumarray, which is intended to beat MATLAB's unique function on
% intermediate to large arrays of integer values. The second sorts the
% input and looks for the positions where neighbouring values change; it
% is used for decimal (float) values and for cell arrays of strings.
%
% The accumarray strategy is always attempted first unless 'float' is
% requested. If it raises any error, the sort-based strategy is used
% instead. Unlike 'unique', this function cannot test rows for uniqueness.
%
% Descriptions of Input Variables:
% x: Input vector or matrix, N-D. Numeric, logical, char, scalar, or a
% cell array of strings. The input is always treated as one long
% column, x(:).
% option: Optional. The only recognised value is 'float' (case is
% ignored); any other value has no effect. With 'float' the
% accumarray strategy is skipped and x is handled as if it held
% decimal values, whatever its actual array type.
%
% Descriptions of Output Variables:
% uniques: sorted unique values
% numUnique: number of instances of each unique value
%
% Note: the sort-based strategy discards NaN and Inf entries of numeric
% input before the unique values are determined.
%
% Example(s):
% >> [uniques] = count_unique(largeArray);
% >> [uniques,numUnique] = count_unique(largeArray);
%
% See also: unique, accumarray
% Author: Anthony Kendall
% Contact: anthony [dot] kendall [at] gmail [dot] com
% Created: 2009-03-17

%The caller may force the sort-based strategy
forceSort = (nargin == 2) && strcmpi(option,'float');

%The helpers only build the counts when two outputs were requested
nOut = nargout;
wantCounts = ~(nOut < 2);

if ~forceSort
    try %this will fail if the array is float or cell
        if wantCounts
            [uniques,numUnique] = int_log_unique(x,nOut);
        else
            uniques = int_log_unique(x,nOut);
        end
        return
    catch %#ok<CTCH> fall through to the standard approach below
    end
end

%Standard (sort-based) approach
if wantCounts
    [uniques,numUnique] = float_cell_unique(x,nOut);
else
    uniques = float_cell_unique(x,nOut);
end
end
function [uniques,numUnique] = int_log_unique(x,nOut)
%INT_LOG_UNIQUE Unique values and counts of integer-valued data via accumarray
% [uniques,numUnique] = int_log_unique(x,nOut)
%
% x: real numeric, logical, or char array holding only finite,
% integer values. It is processed as a single column, x(:).
% nOut: number of outputs the caller wants. The counts are trimmed to
% the values that actually occur only when nOut == 2.
%
% uniques: sorted unique values, as a double column vector
% numUnique: number of instances of each unique value (double column)
%
% An error is raised whenever this approach is not applicable: unsupported
% type, NaN/Inf or non-integer values, or values spread so thinly that the
% bin array would be far larger than the data. Callers are expected to
% catch the error and switch to a sort-based approach.

%Only real numeric, logical, and char data can be binned
if ~(isnumeric(x) || islogical(x) || ischar(x)) || ~isreal(x)
    error('Accumarray approach requires real numeric, logical, or char input')
end

%Work in double: integer classes saturate, so computing the offset in the
%native class could map distinct values onto the same bin
vals = double(x(:));

%Nothing to count
if isempty(vals)
    uniques = zeros(0,1);
    numUnique = zeros(0,1);
    return
end

%Bin indices must be finite whole numbers
if any(~isfinite(vals)) || any(vals ~= fix(vals))
    error('Accumarray approach requires finite integer values')
end

%Check to see if accumarray is appropriate for this data. The number of
%bins is set by the spread of the values, not by the largest value, so the
%test also protects against strongly negative data
minVal = min(vals);
numBins = max(vals) - minVal + 1;
if numBins / numel(vals) > 1000
    error('Accumarray is inefficient for arrays when ind values are >> than the number of elements')
end

%Offset so that the smallest value lands in bin 1, then count per bin
numUnique = accumarray(vals - minVal + 1,1);

%Find numUnique > 0
test = (numUnique > 0);

%The position of an occupied bin directly gives the value it represents
uniques = find(test) + (minVal - 1);

if nOut == 2
    %Trim the numUnique array
    numUnique = numUnique(test);
end
end
function [uniques,numUnique] = float_cell_unique(x,nOut)
%FLOAT_CELL_UNIQUE Sort-based unique values and counts
% [uniques,numUnique] = float_cell_unique(x,nOut)
%
% x: numeric, logical, or char array, or a cell array of strings.
% It is processed as a single column, x(:).
% nOut: number of outputs the caller wants. The counts are only
% computed when nOut == 2.
%
% uniques: sorted unique values as a column. Non-cell input is returned
% as double; cell input is returned as a cell column.
% numUnique: number of instances of each unique value
%
% NaN and Inf entries of non-cell input are discarded. Empty input, or
% input that is empty once NaN/Inf are discarded, yields empty outputs.

x = x(:);
isCellInput = iscell(x);

if ~isCellInput
    %First, sort the input vector
    x = sort(x);
    %Work in double so the differences below are well defined for every
    %supported array type
    if ~isa(x,'double')
        x = double(x);
    end
    %sort places NaNs and Infs at the beginning or end of the array, so
    %looking at the two ends is enough to know whether any are present
    if ~isempty(x) && ~(isfinite(x(1)) && isfinite(x(end)))
        %Remove all of these from the array
        x = x(isfinite(x));
    end
elseif ~isempty(x)
    %Sort the strings so that duplicates become neighbours
    x = sort(x);
end

%Nothing left to compare: return empties instead of indexing past the end
if isempty(x)
    uniques = x;
    numUnique = zeros(0,1);
    return
end

%Determine break locations of unique values: the first element always
%starts a run, every later element starts one if it differs from its
%predecessor
if isCellInput
    uniqueLocs = [true;~strcmp(x(1:end-1),x(2:end))];
else
    uniqueLocs = [true;diff(x) ~= 0];
end

%Determine the unique values
uniques = x(uniqueLocs);

%Count the number of duplicate values as the distance between run starts
if nOut == 2
    numUnique = diff([find(uniqueLocs);numel(x)+1]);
end
end