Home > voicebox > fxrapt.m

fxrapt

PURPOSE ^

FXRAPT RAPT pitch tracker [FX,VUV]=(S,FS,M,Q)

SYNOPSIS ^

function [fx,tt]=fxrapt(s,fs,mode,q)

DESCRIPTION ^

FXRAPT RAPT pitch tracker [FX,VUV]=(S,FS,M,Q)

 Input:   s(ns)      Speech signal
          fs         Sample frequency (Hz)
          mode       'g' will plot a graph [default if no output arguments]
                     'u' will include unvoiced fames (with fx=NaN)
          q          stucture with parameter values (e.g. q.f0min=40); see below for a list

 Outputs: fx(nframe)     Larynx frequency for each fram,e (or NaN for silent/unvoiced)
          tt(nframe,3)  Start and end samples of each frame. tt(*,3)=1 at the start of each talk spurt

 Plots a graph if no outputs are specified showing lag candidates and selected path

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SUBFUNCTIONS ^

SOURCE CODE ^

0001 function [fx,tt]=fxrapt(s,fs,mode,q)
0002 %FXRAPT RAPT pitch tracker [FX,VUV]=(S,FS,M,Q)
0003 %
0004 % Input:   s(ns)      Speech signal
0005 %          fs         Sample frequency (Hz)
0006 %          mode       'g' will plot a graph [default if no output arguments]
0007 %                     'u' will include unvoiced fames (with fx=NaN)
0008 %          q          stucture with parameter values (e.g. q.f0min=40); see below for a list
0009 %
0010 % Outputs: fx(nframe)     Larynx frequency for each fram,e (or NaN for silent/unvoiced)
0011 %          tt(nframe,3)  Start and end samples of each frame. tt(*,3)=1 at the start of each talk spurt
0012 %
0013 % Plots a graph if no outputs are specified showing lag candidates and selected path
0014 %
0015 
0016 % Bugs/Suggestions:
0017 %   (1) Include backward DP pass and output the true cost for each candidate.
0018 %   (2) Add an extra state to distinguish between voiceless and silent
0019 %   (3) N-best DP to allow longer term penalties (e.g. for frequent pitch doubling/halving)
0020 
0021 % The algorithm is taken from [1] with the following differences:
0022 %
0023 %      (a)  the factor AFACT which in the Talkin algorithm corresponds roughly
0024 %           to the absolute level of harmonic noise in the correlation window. This value
0025 %           is here calculated as the maximum of three figures:
0026 %                   (i) an absolute floor set by p.absnoise
0027 %                  (ii) a multiple of the peak signal set by p.signoise
0028 %                 (iii) a multiple of the noise floor set by p.relnoise
0029 %      (b) The LPC used in calculating the Itakura distance uses a Hamming window rather than
0030 %          a Hanning window.
0031 %
0032 % A C implementation of this algorithm by Derek Lin and David Talkin is included as  "get_f0.c"
0033 % in the esps.zip package available from http://www.speech.kth.se/esps/esps.zip under the BSD
0034 % license.
0035 %
0036 % Refs:
0037 %      [1]   D. Talkin, "A Robust Algorithm for Pitch Tracking (RAPT)"
0038 %            in "Speech Coding & Synthesis", W B Kleijn, K K Paliwal eds,
0039 %            Elsevier ISBN 0444821694, 1995
0040 
0041 %      Copyright (C) Mike Brookes 2006-2013
0042 %      Version: $Id: fxrapt.m 9312 2017-01-19 13:19:13Z dmb $
0043 %
0044 %   VOICEBOX is a MATLAB toolbox for speech processing.
0045 %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
0046 %
0047 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0048 %   This program is free software; you can redistribute it and/or modify
0049 %   it under the terms of the GNU General Public License as published by
0050 %   the Free Software Foundation; either version 2 of the License, or
0051 %   (at your option) any later version.
0052 %
0053 %   This program is distributed in the hope that it will be useful,
0054 %   but WITHOUT ANY WARRANTY; without even the implied warranty of
0055 %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0056 %   GNU General Public License for more details.
0057 %
0058 %   You can obtain a copy of the GNU General Public License from
0059 %   http://www.gnu.org/copyleft/gpl.html or by writing to
0060 %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
0061 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0062 
0063 s=s(:); % force s to be a column
0064 if nargin<4
0065     q=[];
0066     if nargin<3
0067         mode=' ';
0068     end
0069 end
0070 doback=0;   % don't do backwards DP for now
0071 
0072 % set default parameters
0073 
0074 p0.f0min=50;           % Min F0 (Hz)
0075 p0.f0max=500;          % Max F0 (Hz)
0076 p0.tframe=0.01;        % frame size (s)
0077 p0.tlpw=0.005;         % low pass filter window size (s)
0078 p0.tcorw=0.0075;       % correlation window size (s)
0079 p0.candtr=0.3;         % minimum peak in NCCF
0080 p0.lagwt=0.3;          % linear lag taper factor
0081 p0.freqwt=0.02;        % cost factor for F0 change
0082 p0.vtranc=0.005;       % fixed voice-state transition cost
0083 p0.vtrac=0.5;          % delta amplitude modulated transition cost
0084 p0.vtrsc=0.5;          % delta spectrum modulated transition cost
0085 p0.vobias=0.0;         % bias to encourage voiced hypotheses
0086 p0.doublec=0.35;       % cost of exact doubling or halving
0087 p0.absnoise=0;         % absolute rms noise level
0088 p0.relnoise=2;         % rms noise level relative to noise floor
0089 p0.signoise=0.001;     % ratio of peak signal rms to noise floor (0.001 = 60dB)
0090 p0.ncands=20;          % max hypotheses at each frame
0091 p0.trms=0.03;                      % window length for rms measurement
0092 p0.dtrms=0.02;                     % window spacing for rms measurement
0093 p0.preemph=-7000;                  % s-plane position of preemphasis zero
0094 p0.nfullag=7;                      % number of full lags to try (must be odd)
0095 p=paramsetch(p0,q);
0096 
0097 % redefine commonly used parameters
0098 
0099 candtr=p.candtr;          % minimum peak in NCCF                      [0.3]
0100 vtranc=p.vtranc;          % fixed voice-state transition cost         [0.005]
0101 vtrac=p.vtrac;            % delta amplitude modulated transition cost [0.5]
0102 vtrsc=p.vtrsc;            % delta spectrum modulated transition cost  [0.5]
0103 vobias=p.vobias;          % bias to encourage voiced hypotheses       [0.0]
0104 doublec=p.doublec;        % cost of exact doubling or halving         [0.35]
0105 ncands=p.ncands;          % max hypotheses at each frame              [20]
0106 nfullag=p.nfullag;        % number of full lags to try (must be odd)  [7]
0107 
0108 % derived parameters (mostly dependent on sample rate fs)
0109 
0110 krms=round(p.trms*fs);            % window length for rms measurement
0111 kdrms=round(p.dtrms*fs);          % window spacing for rms measurement
0112 rmswin=hanning(krms).^2;
0113 kdsmp=round(0.25*fs/p.f0max);
0114 hlpw=round(p.tlpw*fs/2);          % force window to be an odd length
0115 blp=sinc((-hlpw:hlpw)/kdsmp).*hamming(2*hlpw+1).';
0116 fsd=fs/kdsmp;
0117 kframed=round(fsd*p.tframe);      % downsampled frame length
0118 kframe=kframed*kdsmp;           % frame increment at full rate
0119 rmsix=(1:krms)+floor((kdrms-kframe)/2); % rms index according to Talkin; better=(1:krms)+floor((kdrms-krms+1)/2)
0120 minlag=ceil(fsd/p.f0max);
0121 maxlag=round(fsd/p.f0min);        % use round() only because that is what Talkin does
0122 kcorwd=round(fsd*p.tcorw);        % downsampled correlation window
0123 kcorw=kcorwd*kdsmp;             % full rate correlation window
0124 spoff=max(hlpw-floor(kdsmp/2),1+kdrms-rmsix(1)-kframe);  % offset for first speech frame at full rate
0125 sfoff=spoff-hlpw+floor(kdsmp/2); % offset for downsampling filter
0126 sfi=1:kcorwd;                   % initial decimated correlation window index array
0127 sfhi=1:kcorw;                   % initial correlation window index array
0128 sfj=1:kcorwd+maxlag;
0129 sfmi=repmat((minlag:maxlag)',1,kcorwd)+repmat(sfi,maxlag-minlag+1,1);
0130 lagoff=(minlag-1)*kdsmp;        % lag offset when converting to high sample rate
0131 beta=p.lagwt*p.f0min/fs;            % bias towards low lags
0132 log2=log(2);
0133 lpcord=2+round(fs/1000);        % lpc order for itakura distance
0134 hnfullag=floor(nfullag/2);
0135 jumprat=exp((doublec+log2)/2);  % lag ratio at which octave jump cost is lowest
0136 ssq=s.^2;
0137 csssq=cumsum(ssq);
0138 sqrt(min(csssq(kcorw+1:end)-csssq(1:end-kcorw))/kcorw);
0139 afact=max([p.absnoise^2,max(ssq)*p.signoise^2,min(csssq(kcorw+1:end)-csssq(1:end-kcorw))*(p.relnoise/kcorw)^2])^2*kcorw^2;
0140 
0141 % downsample signal to approx 2 kHz to speed up autocorrelation calculation
0142 % kdsmp is the downsample factor
0143 
0144 sf=filter(blp/sum(blp),1,s(sfoff+1:end));
0145 sp=filter([1 exp(p.preemph/fs)],1,s); % preemphasised speech for LPC calculation
0146 sf(1:length(blp)-1)=[];         % remove startup transient
0147 sf=sf(1:kdsmp:end);             % downsample to =~2kHz
0148 nsf=length(sf);                 % length of downsampled speech
0149 ns=length(s);                   % length of full rate speech
0150 
0151 % Calculate the frame limit to ensure we don't run off the end of the speech or decimated speech:
0152 %   (a) For decimated autocorrelation when calculating sff():  (nframe-1)*kframed+kcorwd+maxlag <= nsf
0153 %   (b) For full rate autocorrelation when calculating sfh():  max(fho)+kcorw+maxlag*kdsamp+hnfllag <= ns
0154 %   (c) For rms ratio window when calculating rr            :  max(fho)+rmsix(end) <= ns
0155 % where max(fho) = (nframe-1)*kframe + spoff
0156 
0157 nframe=floor(1+min((nsf-kcorwd-maxlag)/kframed,(ns-spoff-max(kcorw-maxlag*kdsmp-hnfullag,rmsix(end)))/kframe));
0158 
0159 % now search for autocorrelation peaks in the downsampled signal
0160 
0161 cost=zeros(nframe,ncands);      % cumulative cost
0162 prev=zeros(nframe,ncands);      % traceback pointer
0163 mcands=zeros(nframe,1);         % number of actual candidates excluding voiceless
0164 lagval=repmat(NaN,nframe,ncands-1);    % lag of each voiced candidate
0165 tv=zeros(nframe,3);             % diagnostics: 1=voiceless cost, 2=min voiced cost, 3:cumulative voiceless-min voiced
0166 if doback
0167     costms=cell(nframe,1);
0168 end
0169 
0170 % Main processing loop for each 10 ms frame
0171 
0172 for iframe=1:nframe       % loop for each frame (~10 ms)
0173     
0174     % Find peaks in the normalized autocorrelation of subsampled (2Khz) speech
0175     % only keep peaks that are > 30% of highest peak
0176     
0177     fho=(iframe-1)*kframe+spoff;
0178     sff=sf((iframe-1)*kframed+sfj);
0179     sffdc=mean(sff(sfi));       % mean of initial correlation window length
0180     sff=sff-sffdc;              % subtract off the mean
0181     nccfd=normxcor(sff(1:kcorwd),sff(minlag+1:end));
0182     [ipkd,vpkd]=v_findpeaks(nccfd,'q');
0183     
0184     % Debugging: execute the line below to plot the autocorrelation peaks.
0185     % v_findpeaks(nccfd,'q'); xlabel(sprintf('Lag = (x+%d)*%g ms',minlag-1,1000*kdsmp/fs)); ylabel('Normalized Cross Correlation'); title (sprintf('Frame %d/%d',iframe,nframe));
0186     
0187     vipkd=[vpkd ipkd];
0188     vipkd(vpkd<max(vpkd)*candtr,:)=[];          % eliminate peaks that are small
0189     if size(vipkd,1)
0190         if size(vipkd,1)>ncands-1
0191             vipkd=sortrows(vipkd);
0192             vipkd(1:size(vipkd,1)-ncands+1,:)=[];   % eliminate lowest to leave only ncands-1
0193         end
0194         lagcan=round(vipkd(:,2)*kdsmp+lagoff);        % convert the lag candidate values to the full sample rate
0195         nlcan=length(lagcan);
0196     else
0197         nlcan=0;
0198     end
0199     
0200     % If there are any candidate lag values (nlcan>0) then refine their accuracy at the full sample rate
0201     
0202     if nlcan
0203         laglist=reshape(repmat(lagcan(:)',nfullag,1)+repmat((-hnfullag:hnfullag)',1,nlcan),nfullag*nlcan,1);
0204         sfh=s(fho+(1:kcorw+max(lagcan)+hnfullag));
0205         sfhdc=mean(sfh(sfhi));
0206         sfh=sfh-sfhdc;
0207         e0=sum(sfh(sfhi).^2);                     % energy of initial correlation window (only needed to store in tv(:,6)
0208         lagl2=repmat(lagcan(:)',nfullag+kcorw-1,1)+repmat((1-hnfullag:hnfullag+kcorw)',1,nlcan);
0209         nccf=normxcor(sfh(1:kcorw),sfh(lagl2),afact);
0210         
0211         [maxcc,maxcci]=max(nccf,[],1);
0212         vipk=[maxcc(:) lagcan(:)+maxcci(:)-hnfullag-1];
0213         vipk=vipk(:,[1 2 2]);
0214         maxccj=maxcci(:)'+nfullag*(0:nlcan-1);    % vector index into nccf array
0215         msk=mod(maxcci,nfullag-1)~=1 & 2*nccf(maxccj)-nccf(mod(maxccj-2,nfullag*nlcan)+1)-nccf(mod(maxccj,nfullag*nlcan)+1)>0;  % don't do quadratic interpolation for the end ones
0216         if any(msk)
0217             maxccj=maxccj(msk);
0218             vipk(msk,3)=vipk(msk,3)+(nccf(maxccj+1)-nccf(maxccj-1))'./(2*(2*nccf(maxccj)-nccf(maxccj-1)-nccf(maxccj+1)))';
0219         end
0220         vipk(maxcc<max(maxcc)*candtr,:)=[];          % eliminate peaks that are small
0221         if size(vipk,1)>ncands-1
0222             vipk=sortrows(vipk);
0223             vipk(1:size(vipk,1)-ncands+1,:)=[];   % eliminate lowest to leave only ncands-1
0224         end
0225         
0226         % vipk(:,1) has NCCF value, vipk(:,2) has integer peak position, vipk(:,3) has refined peak position
0227         
0228         mc=size(vipk,1);
0229     else
0230         mc=0;
0231     end
0232     
0233     % We now have mc lag candidates at the full sample rate
0234     
0235     mc1=mc+1;               % total number of candidates including "unvoiced" possibility
0236     mcands(iframe)=mc;      % save number of lag candidates (needed for pitch consistency cost calculation)
0237     if mc
0238         lagval(iframe,1:mc)=vipk(:,3)';
0239         cost(iframe,1)=vobias+max(vipk(:,1));   % voiceless cost
0240         cost(iframe,2:mc1)=1-vipk(:,1)'.*(1-beta*vipk(:,3)');   % local voiced costs
0241         tv(iframe,2)=min(cost(iframe,2:mc1));
0242     else
0243         cost(iframe,1)=vobias;          % if no lag candidates (mc=0), then the voiceless case is the only possibility
0244     end
0245     tv(iframe,1)=cost(iframe,1);
0246     if iframe>1                         % if it is not the first frame, then calculate pitch consistency and v/uv transition costs
0247         mcp=mcands(iframe-1);
0248         costm=zeros(mcp+1,mc1);         % cost matrix: rows and cols correspond to candidates in previous and current frames (incl voiceless)
0249         
0250         % if both frames have at least one lag candidate, then calculate a pitch consistency cost
0251         
0252         if mc*mcp
0253             lrat=abs(log(repmat(lagval(iframe,1:mc),mcp,1)./repmat(lagval(iframe-1,1:mcp)',1,mc)));
0254             costm(2:end,2:end)=p.freqwt*min(lrat,doublec+abs(lrat-log2));  % allow pitch doubling/halving
0255         end
0256         
0257         % if either frame has a lag candidate, then calculate the cost of voiced/voiceless transition and vice versa
0258         
0259         if mc+mcp
0260             rr=sqrt((rmswin'*s(fho+rmsix).^2)/(rmswin'*s(fho+rmsix-kdrms).^2)); % amplitude "gradient"
0261             ss=0.2/(distitar(lpcauto(sp(fho+rmsix),lpcord),lpcauto(sp(fho+rmsix-kdrms),lpcord),'e')-0.8);   % Spectral stationarity: note: Talkin uses Hanning instead of Hamming windows for LPC
0262             costm(1,2:end)= vtranc+vtrsc*ss+vtrac/rr;   % voiceless -> voiced cost
0263             costm(2:end,1)= vtranc+vtrsc*ss+vtrac*rr;
0264             tv(iframe,4:5)=[costm(1,mc1) costm(mcp+1,1)];
0265         end
0266         costm=costm+repmat(cost(iframe-1,1:mcp+1)',1,mc1);  % add in cumulative costs
0267         [costi,previ]=min(costm,[],1);
0268         cost(iframe,1:mc1)=cost(iframe,1:mc1)+costi;
0269         prev(iframe,1:mc1)=previ;
0270     else                            % first ever frame
0271         costm=zeros(1,mc1); % create a cost matrix in case doing a backward recursion
0272     end
0273     if mc
0274         tv(iframe,3)=cost(iframe,1)-min(cost(iframe,2:mc1));
0275         tv(iframe,6)=5*log10(e0*e0/afact);
0276     end
0277     if doback
0278         costms{iframe}=costm; % need to add repmatted cost into this
0279     end
0280 end
0281 
0282 % now do traceback
0283 
0284 best=zeros(nframe,1);
0285 [cbest,best(nframe)]=min(cost(nframe,1:mcands(nframe)+1));
0286 for i=nframe:-1:2
0287     best(i-1)=prev(i,best(i));
0288 end
0289 vix=find(best>1);
0290 fx=repmat(NaN,nframe,1);                        % unvoiced frames will be NaN
0291 fx(vix)=fs*lagval(vix+nframe*(best(vix)-2)).^(-1); % leave as NaN if unvoiced
0292 tt=zeros(nframe,3);
0293 tt(:,1)=(1:nframe)'*kframe+spoff;       % find frame times
0294 tt(:,2)=tt(:,1)+kframe-1;
0295 jratm=(jumprat+1/jumprat)/2;
0296 tt(2:end,3)=abs(fx(2:end)./fx(1:end-1)-jratm)>jumprat-jratm;    % new spurt if frequency ratio is outside (1/jumprat,jumprat)
0297 tt(1,3)=1;           % first frame always starts a spurt
0298 tt(1+find(isnan(fx(1:end-1))),3)=1; % NaN always forces a new spurt
0299 
0300 % plot results if there are no output arguments of if the 'g' mode option is specified
0301 
0302 if ~nargout || any(mode=='g')
0303     tf=spoff+(0:nframe-1)'*kframe;      % one sample before start of each frame
0304     blag=repmat(NaN,nframe,1);                        % unvoiced frames will be NaN
0305     blag(vix)=lagval(vix+nframe*(best(vix)-2)); % leave as NaN if unvoiced
0306     ts=(1:ns)/fs;                       % time scale for speech samples
0307     tsa=[1:tf(1) tf(end)+kframe+1:ns];  % indexes for unprocessed speech [-1 term is an error methinks]
0308     sup=repmat(NaN,ns,1);               % unprocessed speech - plot in black
0309     sup(tsa)=s(tsa);
0310     sv=reshape(s(tf(1)+1:tf(end)+kframe),kframe,nframe);               % processed speech
0311     su=sv;
0312     su(:,best>1)=NaN;                   % delete all voiced samples
0313     sv(:,best==1)=NaN;                  % delete all unvoiced samples
0314     tsuv=(tf(1)+1:tf(end)+kframe)/fs;
0315     su=su(:);
0316     sv=sv(:);
0317     ax=zeros(2,1);
0318     ax(1)=subplot(211);
0319     plot(ts,sup,'-k',tsuv,su,'r-',tsuv,sv,'b-');
0320     title('Speech');
0321     ax(2)=subplot(212);
0322     plot((tf+(kframe+1)/2)/fs,lagval*1000/fs,'xr',(tf+(kframe+1)/2)/fs,blag*1000/fs,'-b')
0323     xlabel('Time (s)');
0324     ylabel('Period (ms)');
0325     title('Lag Candidates');
0326     linkaxes(ax,'x');
0327 end
0328 if ~any(mode=='u')
0329     tt(isnan(fx),:)=[];    % remove NaN spurts
0330     fx(isnan(fx),:)=[];
0331 end
0332 
0333 
0334 
0335 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0336 
0337 function v=normxcor(x,y,d)
0338 % Calculate the normalized cross correlation of column vectors x and y
0339 % we can calculate this in two ways but fft is much faster even for nx small
0340 % We must have nx<=ny and the output length is ny-nx+1
0341 % note that this routine does not do mean subtraction even though this is normally a good idea
0342 % if y is a matrix, we correlate with each column
0343 % d is a constant added onto the normalization factor
0344 % v(j)=x'*yj/sqrt(d + x'*x * yj'*yj) where yj=y(j:j+nx-1) for j=1:ny-nx+1
0345 
0346 if nargin<3
0347     d=0;
0348 end
0349 nx=length(x);
0350 [ny,my]=size(y);
0351 nv=1+ny-nx;
0352 if nx>ny
0353     error('second argument is shorter than the first');
0354 end
0355 
0356 nf=pow2(nextpow2(ny));
0357 w=irfft(repmat(conj(rfft(x,nf,1)),1,my).*rfft(y,nf,1));
0358 s=zeros(ny+1,my);
0359 s(2:end,:)=cumsum(y.^2,1);
0360 v=w(1:nv,:)./sqrt(d+(x'*x).*(s(nx+1:end,:)-s(1:end-nx,:)));

Generated on Tue 19-Sep-2017 12:07:31 by m2html © 2003