# v_kmeans

## PURPOSE

V_KMEANS Vector quantisation using K-means algorithm [X,ESQ,J]=(D,K,X0,L)

## SYNOPSIS

function [x,g,j,gg] = v_kmeans(d,k,x0,l)

## DESCRIPTION

```V_KMEANS Vector quantisation using K-means algorithm [X,ESQ,J]=(D,K,X0,L)

Inputs:

D(N,P)  contains N data vectors of dimension P
K       is number of centres required
X0(K,P) are the initial centres (optional)

or alternatively

X0      gives the initialization method
'f'   pick K random elements of D as the initial centres [default]
'p'   randomly divide D into K sets and choose the centroids
L       gives max number of iterations (use 0 if you just want to calculate G and J)

Outputs:

X(K,P)  is output row vectors (omitted if L=0)
G       is mean square error
J(N)    indicates which centre each data vector belongs to
GG(L)   gives the mean square error at the start of each iteration (omitted if L=0)

It is often a good idea to scale the input data so that it has equal variance in each
dimension before calling V_KMEANS.```

## CROSS-REFERENCE INFORMATION

This function calls:
This function is called by:
• v_gaussmix V_GAUSSMIX fits a gaussian mixture pdf to a set of data observations [m,v,w,g,f]=(x,c,l,m0,v0,w0,wx)
• v_kmeanlbg V_KMEANLBG Vector quantisation using the Linde-Buzo-Gray algorithm [X,ESQ,J]=(D,K)

## SOURCE CODE

```0001 function [x,g,j,gg] = v_kmeans(d,k,x0,l)
0002 %V_KMEANS Vector quantisation using K-means algorithm [X,ESQ,J]=(D,K,X0,L)
0003 %
0004 %  Inputs:
0005 %
0006 %    D(N,P)  contains N data vectors of dimension P
0007 %    K       is number of centres required
0008 %    X0(K,P) are the initial centres (optional)
0009 %
0010 %      or alternatively
0011 %
0012 %    X0      gives the initialization method
0013 %            'f'   pick K random elements of D as the initial centres [default]
0014 %            'p'   randomly divide D into K sets and choose the centroids
0015 %    L       gives max number of iterations (use 0 if you just want to calculate G and J)
0016 %
0017 %  Outputs:
0018 %
0019 %    X(K,P)  is output row vectors (omitted if L=0)
0020 %    G       is mean square error
0021 %    J(N)    indicates which centre each data vector belongs to
0022 %    GG(L)   gives the mean square error at the start of each iteration (omitted if L=0)
0023 %
0024 % It is often a good idea to scale the input data so that it has equal variance in each
0025 % dimension before calling V_KMEANS.
0026
0027 %  Originally based on a routine by Chuck Anderson, anderson@cs.colostate.edu, 1996
0028
0029
0030 %      Copyright (C) Mike Brookes 1998
0031 %      Version: \$Id: v_kmeans.m 4497 2014-04-23 10:28:55Z dmb \$
0032 %
0033 %   VOICEBOX is a MATLAB toolbox for speech processing.
0035 %
0036 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0037 %   This program is free software; you can redistribute it and/or modify
0039 %   the Free Software Foundation; either version 2 of the License, or
0040 %   (at your option) any later version.
0041 %
0042 %   This program is distributed in the hope that it will be useful,
0043 %   but WITHOUT ANY WARRANTY; without even the implied warranty of
0044 %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0045 %   GNU General Public License for more details.
0046 %
0047 %   You can obtain a copy of the GNU General Public License from
0048 %   http://www.gnu.org/copyleft/gpl.html or by writing to
0049 %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
0050 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0051
0052 memsize=voicebox('memsize');
0053 [n,p] = size(d);
0054 nb=min(n,max(1,floor(memsize/(8*p*k))));    % block size for testing data points
0055 nl=ceil(n/nb);                  % number of blocks
0056 if nargin<4
0057     l=300;                  % very large max iteration count
0058     if nargin<3
0059         x0='f';             % use 'f' initialization mode
0060     end
0061 end
0062 if ischar(x0)
0063     if k<n
0064         if any(x0)=='p'                  % Initialize using a random partition
0065             ix=ceil(rand(1,n)*k);       % allocate to random clusters
0066             ix(rnsubset(k,n))=1:k;      % but force at least one point per cluster
0067             x=zeros(k,p);
0068             for i=1:k
0069                 x(i,:)=mean(d(ix==i,:),1);
0070             end
0071         else                                % Forgy initialization: choose k random points [default]
0072             x=d(rnsubset(k,n),:);         % sample k centres without replacement
0073         end
0074     else
0075         x=d(mod((1:k)-1,n)+1,:);    % just include all points several times
0076     end
0077 else
0078     x=x0;
0079 end
0080 m=zeros(n,1);           % minimum distance to a centre
0081 j=zeros(n,1);           % index of closest centre
0082 gg=zeros(l,1);
0083 wp=ones(1,p);
0084 kk=1:p;
0085 kk=kk(ones(n,1),:);
0086 kk=kk(:);
0087
0088 if l>0
0089     for ll=1:l                 % loop until x==y causes a break
0090
0091         % find closest centre to each data point [m(:),j(:)] = distance, index
0092
0093         ix=1;
0094         jx=n-nl*nb;
0095         for il=1:nl
0096             jx=jx+nb;        % increment upper limit
0097             ii=ix:jx;
0098             z = disteusq(d(ii,:),x,'x');
0099             [m(ii),j(ii)] = min(z,[],2);
0100             ix=jx+1;
0101         end
0102         y = x;              % save old centre list
0103
0104         % calculate new centres as the mean of their assigned data values (or zero for unused centres)
0105
0106         nd=full(sparse(j,1,1,k,1));         % number of points allocated to each centre
0107         md=max(nd,1);                       % remove zeros
0108         jj=j(:,wp);
0109         x=full(sparse(jj(:),kk,d(:),k,p))./md(:,wp);    % calculate the new means
0110         fx=find(nd==0);
0111
0112         % if any centres are unused, assign them to data values that are not exactly on centres
0113         % choose randomly if there are more such points than needed
0114
0115         if ~isempty(fx)
0116             q=find(m~=0);
0117             if length(q)<=length(fx)
0118                 x(fx(1:length(q)),:)=d(q,:);
0119             else
0120                 if length(fx)>1
0121                     [rr,ri]=sort(rand(length(q),1));
0122                     x(fx,:)=d(q(ri(1:length(fx))),:);
0123                 else
0124                     x(fx,:) = d(q(ceil(rand(1)*length(q))),:);
0125                 end
0126             end
0127         end
0128
0129         % quit if the centres are unchanged
0130
0131         gg(ll)=sum(m,1);
0132         if x==y
0133             break
0134         end
0135     end
0136     gg=gg(1:ll)/n;
0137 %     ll % *** DEBUG ***
0138 %     gg' % *** DEBUG ***
0139     g=gg(end);
0140 else            % if l==0 then just calculate G and J (but rename as X and G)
0141     ix=1;
0142     jx=n-nl*nb;
0143     for il=1:nl
0144         jx=jx+nb;        % increment upper limit
0145         ii=ix:jx;
0146         z = disteusq(d(ii,:),x,'x');
0147         [m(ii),j(ii)] = min(z,[],2);
0148         ix=jx+1;
0149     end
0150     x=sum(m,1)/n;
0151     g=j;
0152 end```