Home > voicebox > fxrapt.m

fxrapt

PURPOSE ^

FXRAPT RAPT pitch tracker [FX,VUV]=(S,FS,M,Q)

SYNOPSIS ^

function [fx,tt]=fxrapt(s,fs,mode,q)

DESCRIPTION ^

FXRAPT RAPT pitch tracker [FX,VUV]=(S,FS,M,Q)

 Input:   s(ns)      Speech signal
          fs         Sample frequency (Hz)
          mode       'g' will plot a graph [default if no output arguments]
                     'u' will include unvoiced fames (with fx=NaN)
          q          stucture with parameter values (e.g. q.f0min=40); see below for a list

 Outputs: fx(nframe)     Larynx frequency for each fram,e (or NaN for silent/unvoiced)
          tt(nframe,3)  Start and end samples of each frame. tt(*,3)=1 at the start of each talk spurt

 Plots a graph if no outputs are specified showing lag candidates and selected path

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SUBFUNCTIONS ^

SOURCE CODE ^

0001 function [fx,tt]=fxrapt(s,fs,mode,q)
0002 %FXRAPT RAPT pitch tracker [FX,VUV]=(S,FS,M,Q)
0003 %
0004 % Input:   s(ns)      Speech signal
0005 %          fs         Sample frequency (Hz)
0006 %          mode       'g' will plot a graph [default if no output arguments]
0007 %                     'u' will include unvoiced fames (with fx=NaN)
0008 %          q          stucture with parameter values (e.g. q.f0min=40); see below for a list
0009 %
0010 % Outputs: fx(nframe)     Larynx frequency for each fram,e (or NaN for silent/unvoiced)
0011 %          tt(nframe,3)  Start and end samples of each frame. tt(*,3)=1 at the start of each talk spurt
0012 %
0013 % Plots a graph if no outputs are specified showing lag candidates and selected path
0014 %
0015 
0016 % Bugs/Suggestions:
0017 %   (1) Include backward DP pass and output the true cost for each candidate.
0018 %   (2) Add an extra state to distinguish between voiceless and silent
0019 %   (3) N-best DP to allow longer term penalties (e.g. for frequent pitch doubling/halving)
0020 
0021 % The algorithm is taken from [1] with the following differences:
0022 %
0023 %      (a)  the factor AFACT which in the Talkin algorithm corresponds roughly
0024 %           to the absolute level of harmonic noise in the correlation window. This value
0025 %           is here calculated as the maximum of three figures:
0026 %                   (i) an absolute floor set by p.absnoise
0027 %                  (ii) a multiple of the peak signal set by p.signoise
0028 %                 (iii) a multiple of the noise floor set by p.relnoise
0029 %      (b) The LPC used in calculating the Itakura distance uses a Hamming window rather than
0030 %          a Hanning window.
0031 %
0032 % A C implementation of this algorithm by Derek Lin and David Talkin is included as  "get_f0.c"
0033 % in the esps.zip package available from http://www.speech.kth.se/esps/esps.zip under the BSD
0034 % license.
0035 %
0036 % Refs:
0037 %      [1]   D. Talkin, "A Robust Algorithm for Pitch Tracking (RAPT)"
0038 %            in "Speech Coding & Synthesis", W B Kleijn, K K Paliwal eds,
0039 %            Elsevier ISBN 0444821694, 1995
0040 
0041 %      Copyright (C) Mike Brookes 2006-2013
0042 %      Version: $Id: fxrapt.m 10185 2017-10-04 08:20:32Z dmb $
0043 %
0044 %   VOICEBOX is a MATLAB toolbox for speech processing.
0045 %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
0046 %
0047 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0048 %   This program is free software; you can redistribute it and/or modify
0049 %   it under the terms of the GNU General Public License as published by
0050 %   the Free Software Foundation; either version 2 of the License, or
0051 %   (at your option) any later version.
0052 %
0053 %   This program is distributed in the hope that it will be useful,
0054 %   but WITHOUT ANY WARRANTY; without even the implied warranty of
0055 %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0056 %   GNU General Public License for more details.
0057 %
0058 %   You can obtain a copy of the GNU General Public License from
0059 %   http://www.gnu.org/copyleft/gpl.html or by writing to
0060 %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
0061 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0062 
0063 s=s(:); % force s to be a column
0064 if nargin<4
0065     q=[];
0066     if nargin<3
0067         mode=' ';
0068     end
0069 end
0070 doback=0;   % don't do backwards DP for now
0071 
0072 % set default parameters
0073 
0074 p0.f0min=50;           % Min F0 (Hz)
0075 p0.f0max=500;          % Max F0 (Hz)
0076 p0.tframe=0.01;        % frame size (s)
0077 p0.tlpw=0.005;         % low pass filter window size (s)
0078 p0.tcorw=0.0075;       % correlation window size (s)
0079 p0.candtr=0.3;         % minimum peak in NCCF
0080 p0.lagwt=0.3;          % linear lag taper factor
0081 p0.freqwt=0.02;        % cost factor for F0 change
0082 p0.vtranc=0.005;       % fixed voice-state transition cost
0083 p0.vtrac=0.5;          % delta amplitude modulated transition cost
0084 p0.vtrsc=0.5;          % delta spectrum modulated transition cost
0085 p0.vobias=0.0;         % bias to encourage voiced hypotheses
0086 p0.doublec=0.35;       % cost of exact doubling or halving
0087 p0.absnoise=0;         % absolute rms noise level
0088 p0.relnoise=2;         % rms noise level relative to noise floor
0089 p0.signoise=0.001;     % ratio of peak signal rms to noise floor (0.001 = 60dB)
0090 p0.ncands=20;          % max hypotheses at each frame
0091 p0.trms=0.03;                      % window length for rms measurement
0092 p0.dtrms=0.02;                     % window spacing for rms measurement
0093 p0.preemph=-7000;                  % s-plane position of preemphasis zero
0094 p0.nfullag=7;                      % number of full lags to try (must be odd)
0095 p=paramsetch(p0,q);
0096 
0097 % redefine commonly used parameters
0098 
0099 candtr=p.candtr;          % minimum peak in NCCF                      [0.3]
0100 vtranc=p.vtranc;          % fixed voice-state transition cost         [0.005]
0101 vtrac=p.vtrac;            % delta amplitude modulated transition cost [0.5]
0102 vtrsc=p.vtrsc;            % delta spectrum modulated transition cost  [0.5]
0103 vobias=p.vobias;          % bias to encourage voiced hypotheses       [0.0]
0104 doublec=p.doublec;        % cost of exact doubling or halving         [0.35]
0105 ncands=p.ncands;          % max hypotheses at each frame              [20]
0106 nfullag=p.nfullag;        % number of full lags to try (must be odd)  [7]
0107 
0108 % derived parameters (mostly dependent on sample rate fs)
0109 
0110 krms=round(p.trms*fs);            % window length for rms measurement
0111 kdrms=round(p.dtrms*fs);          % window spacing for rms measurement
0112 % rmswin=hanning(krms).^2;
0113 rmswin=(0.5-0.5*cos(2*pi*(1:krms)'/(krms+1))).^2; % squared Hanning window
0114 kdsmp=round(0.25*fs/p.f0max);
0115 hlpw=round(p.tlpw*fs/2);          % force window to be an odd length
0116 blp=sinc((-hlpw:hlpw)/kdsmp).*hamming(2*hlpw+1).';
0117 fsd=fs/kdsmp;
0118 kframed=round(fsd*p.tframe);      % downsampled frame length
0119 kframe=kframed*kdsmp;           % frame increment at full rate
0120 rmsix=(1:krms)+floor((kdrms-kframe)/2); % rms index according to Talkin; better=(1:krms)+floor((kdrms-krms+1)/2)
0121 minlag=ceil(fsd/p.f0max);
0122 maxlag=round(fsd/p.f0min);        % use round() only because that is what Talkin does
0123 kcorwd=round(fsd*p.tcorw);        % downsampled correlation window
0124 kcorw=kcorwd*kdsmp;             % full rate correlation window
0125 spoff=max(hlpw-floor(kdsmp/2),1+kdrms-rmsix(1)-kframe);  % offset for first speech frame at full rate
0126 sfoff=spoff-hlpw+floor(kdsmp/2); % offset for downsampling filter
0127 sfi=1:kcorwd;                   % initial decimated correlation window index array
0128 sfhi=1:kcorw;                   % initial correlation window index array
0129 sfj=1:kcorwd+maxlag;
0130 sfmi=repmat((minlag:maxlag)',1,kcorwd)+repmat(sfi,maxlag-minlag+1,1);
0131 lagoff=(minlag-1)*kdsmp;        % lag offset when converting to high sample rate
0132 beta=p.lagwt*p.f0min/fs;            % bias towards low lags
0133 log2=log(2);
0134 lpcord=2+round(fs/1000);        % lpc order for itakura distance
0135 hnfullag=floor(nfullag/2);
0136 jumprat=exp((doublec+log2)/2);  % lag ratio at which octave jump cost is lowest
0137 ssq=s.^2;
0138 csssq=cumsum(ssq);
0139 sqrt(min(csssq(kcorw+1:end)-csssq(1:end-kcorw))/kcorw);
0140 afact=max([p.absnoise^2,max(ssq)*p.signoise^2,min(csssq(kcorw+1:end)-csssq(1:end-kcorw))*(p.relnoise/kcorw)^2])^2*kcorw^2;
0141 
0142 % downsample signal to approx 2 kHz to speed up autocorrelation calculation
0143 % kdsmp is the downsample factor
0144 
0145 sf=filter(blp/sum(blp),1,s(sfoff+1:end));
0146 sp=filter([1 exp(p.preemph/fs)],1,s); % preemphasised speech for LPC calculation
0147 sf(1:length(blp)-1)=[];         % remove startup transient
0148 sf=sf(1:kdsmp:end);             % downsample to =~2kHz
0149 nsf=length(sf);                 % length of downsampled speech
0150 ns=length(s);                   % length of full rate speech
0151 
0152 % Calculate the frame limit to ensure we don't run off the end of the speech or decimated speech:
0153 %   (a) For decimated autocorrelation when calculating sff():  (nframe-1)*kframed+kcorwd+maxlag <= nsf
0154 %   (b) For full rate autocorrelation when calculating sfh():  max(fho)+kcorw+maxlag*kdsamp+hnfllag <= ns
0155 %   (c) For rms ratio window when calculating rr            :  max(fho)+rmsix(end) <= ns
0156 % where max(fho) = (nframe-1)*kframe + spoff
0157 
0158 nframe=floor(1+min((nsf-kcorwd-maxlag)/kframed,(ns-spoff-max(kcorw-maxlag*kdsmp-hnfullag,rmsix(end)))/kframe));
0159 
0160 % now search for autocorrelation peaks in the downsampled signal
0161 
0162 cost=zeros(nframe,ncands);      % cumulative cost
0163 prev=zeros(nframe,ncands);      % traceback pointer
0164 mcands=zeros(nframe,1);         % number of actual candidates excluding voiceless
0165 lagval=repmat(NaN,nframe,ncands-1);    % lag of each voiced candidate
0166 tv=zeros(nframe,3);             % diagnostics: 1=voiceless cost, 2=min voiced cost, 3:cumulative voiceless-min voiced
0167 if doback
0168     costms=cell(nframe,1);
0169 end
0170 
0171 % Main processing loop for each 10 ms frame
0172 
0173 for iframe=1:nframe       % loop for each frame (~10 ms)
0174     
0175     % Find peaks in the normalized autocorrelation of subsampled (2Khz) speech
0176     % only keep peaks that are > 30% of highest peak
0177     
0178     fho=(iframe-1)*kframe+spoff;
0179     sff=sf((iframe-1)*kframed+sfj);
0180     sffdc=mean(sff(sfi));       % mean of initial correlation window length
0181     sff=sff-sffdc;              % subtract off the mean
0182     nccfd=normxcor(sff(1:kcorwd),sff(minlag+1:end));
0183     [ipkd,vpkd]=v_findpeaks(nccfd,'q');
0184     
0185     % Debugging: execute the line below to plot the autocorrelation peaks.
0186     % v_findpeaks(nccfd,'q'); xlabel(sprintf('Lag = (x+%d)*%g ms',minlag-1,1000*kdsmp/fs)); ylabel('Normalized Cross Correlation'); title (sprintf('Frame %d/%d',iframe,nframe));
0187     
0188     vipkd=[vpkd ipkd];
0189     vipkd(vpkd<max(vpkd)*candtr,:)=[];          % eliminate peaks that are small
0190     if size(vipkd,1)
0191         if size(vipkd,1)>ncands-1
0192             vipkd=sortrows(vipkd);
0193             vipkd(1:size(vipkd,1)-ncands+1,:)=[];   % eliminate lowest to leave only ncands-1
0194         end
0195         lagcan=round(vipkd(:,2)*kdsmp+lagoff);        % convert the lag candidate values to the full sample rate
0196         nlcan=length(lagcan);
0197     else
0198         nlcan=0;
0199     end
0200     
0201     % If there are any candidate lag values (nlcan>0) then refine their accuracy at the full sample rate
0202     
0203     if nlcan
0204         laglist=reshape(repmat(lagcan(:)',nfullag,1)+repmat((-hnfullag:hnfullag)',1,nlcan),nfullag*nlcan,1);
0205         sfh=s(fho+(1:kcorw+max(lagcan)+hnfullag));
0206         sfhdc=mean(sfh(sfhi));
0207         sfh=sfh-sfhdc;
0208         e0=sum(sfh(sfhi).^2);                     % energy of initial correlation window (only needed to store in tv(:,6)
0209         lagl2=repmat(lagcan(:)',nfullag+kcorw-1,1)+repmat((1-hnfullag:hnfullag+kcorw)',1,nlcan);
0210         nccf=normxcor(sfh(1:kcorw),sfh(lagl2),afact);
0211         
0212         [maxcc,maxcci]=max(nccf,[],1);
0213         vipk=[maxcc(:) lagcan(:)+maxcci(:)-hnfullag-1];
0214         vipk=vipk(:,[1 2 2]);
0215         maxccj=maxcci(:)'+nfullag*(0:nlcan-1);    % vector index into nccf array
0216         msk=mod(maxcci,nfullag-1)~=1 & 2*nccf(maxccj)-nccf(mod(maxccj-2,nfullag*nlcan)+1)-nccf(mod(maxccj,nfullag*nlcan)+1)>0;  % don't do quadratic interpolation for the end ones
0217         if any(msk)
0218             maxccj=maxccj(msk);
0219             vipk(msk,3)=vipk(msk,3)+(nccf(maxccj+1)-nccf(maxccj-1))'./(2*(2*nccf(maxccj)-nccf(maxccj-1)-nccf(maxccj+1)))';
0220         end
0221         vipk(maxcc<max(maxcc)*candtr,:)=[];          % eliminate peaks that are small
0222         if size(vipk,1)>ncands-1
0223             vipk=sortrows(vipk);
0224             vipk(1:size(vipk,1)-ncands+1,:)=[];   % eliminate lowest to leave only ncands-1
0225         end
0226         
0227         % vipk(:,1) has NCCF value, vipk(:,2) has integer peak position, vipk(:,3) has refined peak position
0228         
0229         mc=size(vipk,1);
0230     else
0231         mc=0;
0232     end
0233     
0234     % We now have mc lag candidates at the full sample rate
0235     
0236     mc1=mc+1;               % total number of candidates including "unvoiced" possibility
0237     mcands(iframe)=mc;      % save number of lag candidates (needed for pitch consistency cost calculation)
0238     if mc
0239         lagval(iframe,1:mc)=vipk(:,3)';
0240         cost(iframe,1)=vobias+max(vipk(:,1));   % voiceless cost
0241         cost(iframe,2:mc1)=1-vipk(:,1)'.*(1-beta*vipk(:,3)');   % local voiced costs
0242         tv(iframe,2)=min(cost(iframe,2:mc1));
0243     else
0244         cost(iframe,1)=vobias;          % if no lag candidates (mc=0), then the voiceless case is the only possibility
0245     end
0246     tv(iframe,1)=cost(iframe,1);
0247     if iframe>1                         % if it is not the first frame, then calculate pitch consistency and v/uv transition costs
0248         mcp=mcands(iframe-1);
0249         costm=zeros(mcp+1,mc1);         % cost matrix: rows and cols correspond to candidates in previous and current frames (incl voiceless)
0250         
0251         % if both frames have at least one lag candidate, then calculate a pitch consistency cost
0252         
0253         if mc*mcp
0254             lrat=abs(log(repmat(lagval(iframe,1:mc),mcp,1)./repmat(lagval(iframe-1,1:mcp)',1,mc)));
0255             costm(2:end,2:end)=p.freqwt*min(lrat,doublec+abs(lrat-log2));  % allow pitch doubling/halving
0256         end
0257         
0258         % if either frame has a lag candidate, then calculate the cost of voiced/voiceless transition and vice versa
0259         
0260         if mc+mcp
0261             rr=sqrt((rmswin'*s(fho+rmsix).^2)/(rmswin'*s(fho+rmsix-kdrms).^2)); % amplitude "gradient"
0262             ss=0.2/(distitar(lpcauto(sp(fho+rmsix),lpcord),lpcauto(sp(fho+rmsix-kdrms),lpcord),'e')-0.8);   % Spectral stationarity: note: Talkin uses Hanning instead of Hamming windows for LPC
0263             costm(1,2:end)= vtranc+vtrsc*ss+vtrac/rr;   % voiceless -> voiced cost
0264             costm(2:end,1)= vtranc+vtrsc*ss+vtrac*rr;
0265             tv(iframe,4:5)=[costm(1,mc1) costm(mcp+1,1)];
0266         end
0267         costm=costm+repmat(cost(iframe-1,1:mcp+1)',1,mc1);  % add in cumulative costs
0268         [costi,previ]=min(costm,[],1);
0269         cost(iframe,1:mc1)=cost(iframe,1:mc1)+costi;
0270         prev(iframe,1:mc1)=previ;
0271     else                            % first ever frame
0272         costm=zeros(1,mc1); % create a cost matrix in case doing a backward recursion
0273     end
0274     if mc
0275         tv(iframe,3)=cost(iframe,1)-min(cost(iframe,2:mc1));
0276         tv(iframe,6)=5*log10(e0*e0/afact);
0277     end
0278     if doback
0279         costms{iframe}=costm; % need to add repmatted cost into this
0280     end
0281 end
0282 
0283 % now do traceback
0284 
0285 best=zeros(nframe,1);
0286 [cbest,best(nframe)]=min(cost(nframe,1:mcands(nframe)+1));
0287 for i=nframe:-1:2
0288     best(i-1)=prev(i,best(i));
0289 end
0290 vix=find(best>1);
0291 fx=repmat(NaN,nframe,1);                        % unvoiced frames will be NaN
0292 fx(vix)=fs*lagval(vix+nframe*(best(vix)-2)).^(-1); % leave as NaN if unvoiced
0293 tt=zeros(nframe,3);
0294 tt(:,1)=(1:nframe)'*kframe+spoff;       % find frame times
0295 tt(:,2)=tt(:,1)+kframe-1;
0296 jratm=(jumprat+1/jumprat)/2;
0297 tt(2:end,3)=abs(fx(2:end)./fx(1:end-1)-jratm)>jumprat-jratm;    % new spurt if frequency ratio is outside (1/jumprat,jumprat)
0298 tt(1,3)=1;           % first frame always starts a spurt
0299 tt(1+find(isnan(fx(1:end-1))),3)=1; % NaN always forces a new spurt
0300 
0301 % plot results if there are no output arguments of if the 'g' mode option is specified
0302 
0303 if ~nargout || any(mode=='g')
0304     tf=spoff+(0:nframe-1)'*kframe;      % one sample before start of each frame
0305     blag=repmat(NaN,nframe,1);                        % unvoiced frames will be NaN
0306     blag(vix)=lagval(vix+nframe*(best(vix)-2)); % leave as NaN if unvoiced
0307     ts=(1:ns)/fs;                       % time scale for speech samples
0308     tsa=[1:tf(1) tf(end)+kframe+1:ns];  % indexes for unprocessed speech [-1 term is an error methinks]
0309     sup=repmat(NaN,ns,1);               % unprocessed speech - plot in black
0310     sup(tsa)=s(tsa);
0311     sv=reshape(s(tf(1)+1:tf(end)+kframe),kframe,nframe);               % processed speech
0312     su=sv;
0313     su(:,best>1)=NaN;                   % delete all voiced samples
0314     sv(:,best==1)=NaN;                  % delete all unvoiced samples
0315     tsuv=(tf(1)+1:tf(end)+kframe)/fs;
0316     su=su(:);
0317     sv=sv(:);
0318     ax=zeros(2,1);
0319     ax(1)=subplot(211);
0320     plot(ts,sup,'-k',tsuv,su,'r-',tsuv,sv,'b-');
0321     title('Speech');
0322     ax(2)=subplot(212);
0323     plot((tf+(kframe+1)/2)/fs,lagval*1000/fs,'xr',(tf+(kframe+1)/2)/fs,blag*1000/fs,'-b')
0324     xlabel('Time (s)');
0325     ylabel('Period (ms)');
0326     title('Lag Candidates');
0327     linkaxes(ax,'x');
0328 end
0329 if ~any(mode=='u')
0330     tt(isnan(fx),:)=[];    % remove NaN spurts
0331     fx(isnan(fx),:)=[];
0332 end
0333 
0334 
0335 
0336 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0337 
0338 function v=normxcor(x,y,d)
0339 % Calculate the normalized cross correlation of column vectors x and y
0340 % we can calculate this in two ways but fft is much faster even for nx small
0341 % We must have nx<=ny and the output length is ny-nx+1
0342 % note that this routine does not do mean subtraction even though this is normally a good idea
0343 % if y is a matrix, we correlate with each column
0344 % d is a constant added onto the normalization factor
0345 % v(j)=x'*yj/sqrt(d + x'*x * yj'*yj) where yj=y(j:j+nx-1) for j=1:ny-nx+1
0346 
0347 if nargin<3
0348     d=0;
0349 end
0350 nx=length(x);
0351 [ny,my]=size(y);
0352 nv=1+ny-nx;
0353 if nx>ny
0354     error('second argument is shorter than the first');
0355 end
0356 
0357 nf=pow2(nextpow2(ny));
0358 w=irfft(repmat(conj(rfft(x,nf,1)),1,my).*rfft(y,nf,1));
0359 s=zeros(ny+1,my);
0360 s(2:end,:)=cumsum(y.^2,1);
0361 v=w(1:nv,:)./sqrt(d+(x'*x).*(s(nx+1:end,:)-s(1:end-nx,:)));

Generated on Tue 10-Oct-2017 08:30:10 by m2html © 2003