1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457
| function [label, center, bCon, sumD, D] = litekmeans(X, k, varargin)
%LITEKMEANS K-means clustering, accelerated by matlab matrix operations.
%
% label = LITEKMEANS(X, K) partitions the points in the N-by-P data matrix
% X into K clusters. This partition minimizes the sum, over all
% clusters, of the within-cluster sums of point-to-cluster-centroid
% distances. Rows of X correspond to points, columns correspond to
% variables. KMEANS returns an N-by-1 vector label containing the
% cluster indices of each point.
%
% [label, center] = LITEKMEANS(X, K) returns the K cluster centroid
% locations in the K-by-P matrix center.
%
% [label, center, bCon] = LITEKMEANS(X, K) returns the bool value bCon to
% indicate whether the iteration is converged.
%
% [label, center, bCon, SUMD] = LITEKMEANS(X, K) returns the
% within-cluster sums of point-to-centroid distances in the 1-by-K vector
% sumD.
%
% [label, center, bCon, SUMD, D] = LITEKMEANS(X, K) returns
% distances from each point to every centroid in the N-by-K matrix D.
%
% [ ... ] = LITEKMEANS(..., 'PARAM1',val1, 'PARAM2',val2, ...) specifies
% optional parameter name/value pairs to control the iterative algorithm
% used by KMEANS. Parameters are:
%
% 'Distance' - Distance measure, in P-dimensional space, that KMEANS
% should minimize with respect to. Choices are:
% {'sqEuclidean'} - Squared Euclidean distance (the default)
% 'cosine' - One minus the cosine of the included angle
% between points (treated as vectors). Each
% row of X SHOULD be normalized to unit. If
% the intial center matrix is provided, it
% SHOULD also be normalized.
%
% 'Start' - Method used to choose initial cluster centroid positions,
% sometimes known as "seeds". Choices are:
% {'sample'} - Select K observations from X at random (the default)
% 'cluster' - Perform preliminary clustering phase on random 10%
% subsample of X. This preliminary phase is itself
% initialized using 'sample'. An additional parameter
% clusterMaxIter can be used to control the maximum
% number of iterations in each preliminary clustering
% problem.
% matrix - A K-by-P matrix of starting locations; or a K-by-1
% indicate vector indicating which K points in X
% should be used as the initial center. In this case,
% you can pass in [] for K, and KMEANS infers K from
% the first dimension of the matrix.
%
% 'MaxIter' - Maximum number of iterations allowed. Default is 100.
%
% 'Replicates' - Number of times to repeat the clustering, each with a
% new set of initial centroids. Default is 1. If the
% initial centroids are provided, the replicate will be
% automatically set to be 1.
%
% 'clusterMaxIter' - Only useful when 'Start' is 'cluster'. Maximum number
% of iterations of the preliminary clustering phase.
% Default is 10.
%
%
% Examples:
%
% fea = rand(500,10);
% [label, center] = litekmeans(fea, 5, 'MaxIter', 50);
%
% fea = rand(500,10);
% [label, center] = litekmeans(fea, 5, 'MaxIter', 50, 'Replicates', 10);
%
% fea = rand(500,10);
% [label, center, bCon, sumD, D] = litekmeans(fea, 5, 'MaxIter', 50);
% TSD = sum(sumD);
%
% fea = rand(500,10);
% initcenter = rand(5,10);
% [label, center] = litekmeans(fea, 5, 'MaxIter', 50, 'Start', initcenter);
%
% fea = rand(500,10);
% idx=randperm(500);
% [label, center] = litekmeans(fea, 5, 'MaxIter', 50, 'Start', idx(1:5));
%
%
% See also KMEANS
%
% [Cite] Deng Cai, "Litekmeans: the fastest matlab implementation of
% kmeans," Available at:
% http://www.zjucadcg.cn/dengcai/Data/Clustering.html, 2011.
%
% version 2.0 --December/2011
% version 1.0 --November/2011
%
% Written by Deng Cai (dengcai AT gmail.com)
if nargin < 2
error('litekmeans:TooFewInputs','At least two input arguments required.');
end
[n, p] = size(X);
pnames = { 'distance' 'start' 'maxiter' 'replicates' 'onlinephase' 'clustermaxiter'};
dflts = {'sqeuclidean' 'sample' [] [] 'off' [] };
[eid,errmsg,distance,start,maxit,reps,online,clustermaxit] = getargs(pnames, dflts, varargin{:});
if ~isempty(eid)
error(sprintf('litekmeans:%s',eid),errmsg);
end
if ischar(distance)
distNames = {'sqeuclidean','cosine'};
j = strcmpi(distance, distNames);
j = find(j);
if length(j) > 1
error('litekmeans:AmbiguousDistance', ...
'Ambiguous ''Distance'' parameter value: %s.', distance);
elseif isempty(j)
error('litekmeans:UnknownDistance', ...
'Unknown ''Distance'' parameter value: %s.', distance);
end
distance = distNames{j};
else
error('litekmeans:InvalidDistance', ...
'The ''Distance'' parameter value must be a string.');
end
center = [];
if ischar(start)
startNames = {'sample','cluster'};
j = find(strncmpi(start,startNames,length(start)));
if length(j) > 1
error(message('litekmeans:AmbiguousStart', start));
elseif isempty(j)
error(message('litekmeans:UnknownStart', start));
elseif isempty(k)
error('litekmeans:MissingK', ...
'You must specify the number of clusters, K.');
end
if j == 2
if floor(.1*n) < 5*k
j = 1;
end
end
start = startNames{j};
elseif isnumeric(start)
if size(start,2) == p
center = start;
elseif (size(start,2) == 1 || size(start,1) == 1)
center = X(start,:);
else
error('litekmeans:MisshapedStart', ...
'The ''Start'' matrix must have the same number of columns as X.');
end
if isempty(k)
k = size(center,1);
elseif (k ~= size(center,1))
error('litekmeans:MisshapedStart', ...
'The ''Start'' matrix must have K rows.');
end
start = 'numeric';
else
error('litekmeans:InvalidStart', ...
'The ''Start'' parameter value must be a string or a numeric matrix or array.');
end
% The maximum iteration number is default 100
if isempty(maxit)
maxit = 100;
end
% The maximum iteration number for preliminary clustering phase on random
% 10% subsamples is default 10
if isempty(clustermaxit)
clustermaxit = 10;
end
% Assume one replicate
if isempty(reps) || ~isempty(center)
reps = 1;
end
if ~(isscalar(k) && isnumeric(k) && isreal(k) && k > 0 && (round(k)==k))
error('litekmeans:InvalidK', ...
'X must be a positive integer value.');
elseif n < k
error('litekmeans:TooManyClusters', ...
'X must have more rows than the number of clusters.');
end
bestlabel = [];
sumD = zeros(1,k);
bCon = false;
for t=1:reps
switch start
case 'sample'
center = X(randsample(n,k),:);
case 'cluster'
Xsubset = X(randsample(n,floor(.1*n)),:);
[dump, center] = litekmeans(Xsubset, k, varargin{:}, 'start','sample', 'replicates',1 ,'MaxIter',clustermaxit);
case 'numeric'
end
last = 0;label=1;
it=0;
switch distance
case 'sqeuclidean'
while any(label ~= last) && it<maxit
last = label;
bb = full(sum(center.*center,2)');
ab = full(X*center');
D = bb(ones(1,n),:) - 2*ab;
[val,label] = min(D,[],2); % assign samples to the nearest centers
ll = unique(label);
if length(ll) < k
%disp([num2str(k-length(ll)),' clusters dropped at iter ',num2str(it)]);
missCluster = 1:k;
missCluster(ll) = [];
missNum = length(missCluster);
aa = sum(X.*X,2);
val = aa + val;
[dump,idx] = sort(val,1,'descend');
label(idx(1:missNum)) = missCluster;
end
E = sparse(1:n,label,1,n,k,n); % transform label into indicator matrix
center = full((E*spdiags(1./sum(E,1)',0,k,k))'*X); % compute center of each cluster
it=it+1;
end
if it<maxit
bCon = true;
end
if isempty(bestlabel)
bestlabel = label;
bestcenter = center;
if reps>1
if it>=maxit
aa = full(sum(X.*X,2));
bb = full(sum(center.*center,2));
ab = full(X*center');
D = bsxfun(@plus,aa,bb') - 2*ab;
D(D<0) = 0;
else
aa = full(sum(X.*X,2));
D = aa(:,ones(1,k)) + D;
D(D<0) = 0;
end
D = sqrt(D);
for j = 1:k
sumD(j) = sum(D(label==j,j));
end
bestsumD = sumD;
bestD = D;
end
else
if it>=maxit
aa = full(sum(X.*X,2));
bb = full(sum(center.*center,2));
ab = full(X*center');
D = bsxfun(@plus,aa,bb') - 2*ab;
D(D<0) = 0;
else
aa = full(sum(X.*X,2));
D = aa(:,ones(1,k)) + D;
D(D<0) = 0;
end
D = sqrt(D);
for j = 1:k
sumD(j) = sum(D(label==j,j));
end
if sum(sumD) < sum(bestsumD)
bestlabel = label;
bestcenter = center;
bestsumD = sumD;
bestD = D;
end
end
case 'cosine'
while any(label ~= last) && it<maxit
last = label;
W=full(X*center');
[val,label] = max(W,[],2); % assign samples to the nearest centers
ll = unique(label);
if length(ll) < k
missCluster = 1:k;
missCluster(ll) = [];
missNum = length(missCluster);
[dump,idx] = sort(val);
label(idx(1:missNum)) = missCluster;
end
E = sparse(1:n,label,1,n,k,n); % transform label into indicator matrix
center = full((E*spdiags(1./sum(E,1)',0,k,k))'*X); % compute center of each cluster
centernorm = sqrt(sum(center.^2, 2));
center = center ./ centernorm(:,ones(1,p));
it=it+1;
end
if it<maxit
bCon = true;
end
if isempty(bestlabel)
bestlabel = label;
bestcenter = center;
if reps>1
if any(label ~= last)
W=full(X*center');
end
D = 1-W;
for j = 1:k
sumD(j) = sum(D(label==j,j));
end
bestsumD = sumD;
bestD = D;
end
else
if any(label ~= last)
W=full(X*center');
end
D = 1-W;
for j = 1:k
sumD(j) = sum(D(label==j,j));
end
if sum(sumD) < sum(bestsumD)
bestlabel = label;
bestcenter = center;
bestsumD = sumD;
bestD = D;
end
end
end
end
label = bestlabel;
center = bestcenter;
if reps>1
sumD = bestsumD;
D = bestD;
elseif nargout > 3
switch distance
case 'sqeuclidean'
if it>=maxit
aa = full(sum(X.*X,2));
bb = full(sum(center.*center,2));
ab = full(X*center');
D = bsxfun(@plus,aa,bb') - 2*ab;
D(D<0) = 0;
else
aa = full(sum(X.*X,2));
D = aa(:,ones(1,k)) + D;
D(D<0) = 0;
end
D = sqrt(D);
case 'cosine'
if it>=maxit
W=full(X*center');
end
D = 1-W;
end
for j = 1:k
sumD(j) = sum(D(label==j,j));
end
end
function [eid,emsg,varargout]=getargs(pnames,dflts,varargin)
%GETARGS Process parameter name/value pairs
% [EID,EMSG,A,B,...]=GETARGS(PNAMES,DFLTS,'NAME1',VAL1,'NAME2',VAL2,...)
% accepts a cell array PNAMES of valid parameter names, a cell array
% DFLTS of default values for the parameters named in PNAMES, and
% additional parameter name/value pairs. Returns parameter values A,B,...
% in the same order as the names in PNAMES. Outputs corresponding to
% entries in PNAMES that are not specified in the name/value pairs are
% set to the corresponding value from DFLTS. If nargout is equal to
% length(PNAMES)+1, then unrecognized name/value pairs are an error. If
% nargout is equal to length(PNAMES)+2, then all unrecognized name/value
% pairs are returned in a single cell array following any other outputs.
%
% EID and EMSG are empty if the arguments are valid. If an error occurs,
% EMSG is the text of an error message and EID is the final component
% of an error message id. GETARGS does not actually throw any errors,
% but rather returns EID and EMSG so that the caller may throw the error.
% Outputs will be partially processed after an error occurs.
%
% This utility can be used for processing name/value pair arguments.
%
% Example:
% pnames = {'color' 'linestyle', 'linewidth'}
% dflts = { 'r' '_' '1'}
% varargin = {{'linew' 2 'nonesuch' [1 2 3] 'linestyle' ':'}
% [eid,emsg,c,ls,lw] = statgetargs(pnames,dflts,varargin{:}) % error
% [eid,emsg,c,ls,lw,ur] = statgetargs(pnames,dflts,varargin{:}) % ok
% We always create (nparams+2) outputs:
% one each for emsg and eid
% nparams varargs for values corresponding to names in pnames
% If they ask for one more (nargout == nparams+3), it's for unrecognized
% names/values
% Original Copyright 1993-2008 The MathWorks, Inc.
% Modified by Deng Cai (dengcai@gmail.com) 2011.11.27
% Initialize some variables
emsg = '';
eid = '';
nparams = length(pnames);
varargout = dflts;
unrecog = {};
nargs = length(varargin);
% Must have name/value pairs
if mod(nargs,2)~=0
eid = 'WrongNumberArgs';
emsg = 'Wrong number of arguments.';
else
% Process name/value pairs
for j=1:2:nargs
pname = varargin{j};
if ~ischar(pname)
eid = 'BadParamName';
emsg = 'Parameter name must be text.';
break;
end
i = strcmpi(pname,pnames);
i = find(i);
if isempty(i)
% if they've asked to get back unrecognized names/values, add this
% one to the list
if nargout > nparams+2
unrecog((end+1):(end+2)) = {varargin{j} varargin{j+1}};
% otherwise, it's an error
else
eid = 'BadParamName';
emsg = sprintf('Invalid parameter name: %s.',pname);
break;
end
elseif length(i)>1
eid = 'BadParamName';
emsg = sprintf('Ambiguous parameter name: %s.',pname);
break;
else
varargout{i} = varargin{j+1};
end
end
end
varargout{nparams+1} = unrecog; |
Partager