Home > voicebox > fxpefac.m

fxpefac

PURPOSE ^

FXPEFAC PEFAC pitch tracker [FX,TT,PV,FV]=(S,FS,TINC,M,PP)

SYNOPSIS ^

function [fx,tx,pv,fv]=fxpefac(s,fs,tinc,m,pp)

DESCRIPTION ^

FXPEFAC PEFAC pitch tracker [FX,TT,PV,FV]=(S,FS,TINC,M,PP)

 Input:   s(ns)      Speech signal
          fs         Sample frequency (Hz)
          tinc       Time increment between frames (s) [0.01]
                     or [start increment end]
          m          mode
                     'g' plot graph showing waveform and pitch
                     'G' plot spectrogram with superimposed pitch
                     'x' use external files for algorithm parameter
                         initialization: fxpefac_g and fxpefac_w
          pp         structure containing algorithm parameters

 Outputs: fx(nframe)     Estimated pitch (Hz)
          tx(nframe)     Time at the centre of each frame (seconds).
          pv(nframe)     Probability of the frame of being voiced
          fv             structure containing feature vectors
                           fv.vuvfea(nframe,2) = voiced/unvoiced GMM features

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SUBFUNCTIONS ^

SOURCE CODE ^

0001 function [fx,tx,pv,fv]=fxpefac(s,fs,tinc,m,pp)
0002 %FXPEFAC PEFAC pitch tracker [FX,TT,PV,FV]=(S,FS,TINC,M,PP)
0003 %
0004 % Input:   s(ns)      Speech signal
0005 %          fs         Sample frequency (Hz)
0006 %          tinc       Time increment between frames (s) [0.01]
0007 %                     or [start increment end]
0008 %          m          mode
0009 %                     'g' plot graph showing waveform and pitch
0010 %                     'G' plot spectrogram with superimposed pitch
0011 %                     'x' use external files for algorithm parameter
0012 %                         initialization: fxpefac_g and fxpefac_w
0013 %          pp         structure containing algorithm parameters
0014 %
0015 % Outputs: fx(nframe)     Estimated pitch (Hz)
0016 %          tx(nframe)     Time at the centre of each frame (seconds).
0017 %          pv(nframe)     Probability of the frame of being voiced
0018 %          fv             structure containing feature vectors
0019 %                           fv.vuvfea(nframe,2) = voiced/unvoiced GMM features
0020 
0021 % References
0022 %  [1]  S.Gonzalez and M. Brookes,
0023 %       A pitch estimation filter robust to high levels of noise (PEFAC), Proc EUSIPCO,Aug 2011.
0024 
0025 % Bugs/Suggestions
0026 % (1) do long files in chunks
0027 % (2) option of n-best DP
0028 
0029 %       Copyright (C) Sira Gonzalez and Mike Brookes 2011
0030 %      Version: $Id: fxpefac.m 713 2011-10-16 14:45:43Z dmb $
0031 %
0032 %   VOICEBOX is a MATLAB toolbox for speech processing.
0033 %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
0034 %
0035 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0036 %   This program is free software; you can redistribute it and/or modify
0037 %   it under the terms of the GNU General Public License as published by
0038 %   the Free Software Foundation; either version 2 of the License, or
0039 %   (at your option) any later version.
0040 %
0041 %   This program is distributed in the hope that it will be useful,
0042 %   but WITHOUT ANY WARRANTY; without even the implied warranty of
0043 %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0044 %   GNU General Public License for more details.
0045 %
0046 %   You can obtain a copy of the GNU General Public License from
0047 %   http://www.gnu.org/copyleft/gpl.html or by writing to
0048 %   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
0049 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0050 
0051 persistent w_u m_u v_u w_v m_v v_v dpwtdef
0052 % initialize persistent variables
0053 if ~numel(w_u)
0054 
0055     % voiced/unvoiced decision based on 2-element feature vector
0056     % (a) mean power of the frame's log-freq spectrum (normalized so its short-term average is LTASS)
0057     % (b) sum of the power in the first three peaks
0058     %===== VUV
0059     if nargin>3 && any(m=='x')
0060         fxpefac_g;    % read in GMM parameters
0061         fxpefac_w;     % read in Weights parameters
0062     else
0063         w_u=[0.2123723 0.207788 0.2701817 0.1293616 0.04741722 0.1328791 ]';
0064 
0065         m_u=[0.2220388 0.4067706 ;
0066             0.04567656 0.4016914 ;
0067             0.8415278 0.3192158 ;
0068             0.2194808 0.1910079 ;
0069             1.6347 0.5819833 ;
0070             1.181519 0.6996485 ];
0071 
0072         v_u=reshape([0.01413822 0.003357913 0.003357913 0.01786169 ;
0073             0.0009377269 0.0006220489 0.0006220489 0.03422057 ;
0074             0.1233703 0.004299293 0.004299293 0.007660504 ;
0075             0.01779449 0.002078821 0.002078821 0.001605052 ;
0076             1.110173 0.00718649 0.00718649 0.005734435 ;
0077             0.5477135 -0.00182316 -0.00182316 0.05659796 ]',[2 2 6]);
0078 
0079         w_v=[0.07758689 0.2109879 0.1856225 0.06853158 0.2701563 0.1871148 ]';
0080 
0081         m_v=[1.208656 0.3365564 ;
0082             1.216643 0.5971916 ;
0083             4.08585 1.240948 ;
0084             8.322102 1.349939 ;
0085             1.734108 1.168643 ;
0086             0.5107205 0.940308 ];
0087 
0088         v_v=reshape([0.06181574 0.002950501 0.002950501 0.004528442 ;
0089             0.2946077 0.01433284 0.01433284 0.02684239 ;
0090             2.508473 -0.03310555 -0.03310555 0.1098579 ;
0091             14.17252 -0.09009174 -0.09009174 0.07989255 ;
0092             0.5834894 -0.07854027 -0.07854027 0.1108958 ;
0093             0.05978017 0.005528601 0.005528601 0.1309329 ]',[2 2 6]);
0094     end
0095     %===== PDP
0096     %     dfm = -0.4238; % df mean
0097     %     dfv = 3.8968; % df variance (although treated as std dev here)
0098     %     delta = 0.15;
0099     %     dflpso=[dfm 0.5/(log(10)*dfv^2) -log(2*delta/(dfv*sqrt(2*pi)))/log(10)]; % scale factor & offset for df pdf
0100     %     dpwtdef=[1.0000, 0.8250, 1.3064, 1.9863]; % default DP weights
0101     dpwtdef=[1.0000, 0.8250, 0.01868, 0.006773, 98.9, -0.4238]; % default DP weights
0102     %===== END
0103 
0104 end
0105 
0106 
0107 % Algorithm parameter defaults
0108 
0109 p.fstep=5;              % frequency resolution of initial spectrogram (Hz)
0110 p.fmax=4000;            % maximum frequency of initial spectrogram (Hz)
0111 p.fres = 20;            % bandwidth of initial spectrogram (Hz)
0112 p.fbanklo = 40;         % low frequency limit of log filterbank (Hz)
0113 p.mpsmooth = 201;       % width of smoothing filter for mean power
0114 % p.maxtranf = 1000;      % maximum value of tranf cost term
0115 p.shortut = 7;          % max utterance length to average power of entire utterance
0116 p.pefact = 1.5;         % shape factor in PEFAC filter
0117 p.numopt = 3;           % number of possible frequencies per frame
0118 p.flim = [60 400];      % range of feasible fundamental frequencies (Hz)
0119 p.w = dpwtdef;          % DP weights
0120 % p.rampk = 1.1;          % constant for relative-amplitude cost term
0121 % p.rampcz = 100;         % relative amplitude cost for missing peak
0122 p.tmf = 2;              % median frequency smoothing interval (s)
0123 p.tinc = 0.01;          % default frame increment (s)
0124 
0125 % update parameters from pp argument
0126 
0127 if nargin>=5 && isstruct(pp)
0128     fnq=fieldnames(pp);
0129     for i=1:length(fnq)
0130         if isfield(p,fnq{i})
0131             p.(fnq{i})=pp.(fnq{i});
0132         end
0133     end
0134 end
0135 
0136 % Sort out input arguments
0137 if nargin>=3  && numel(tinc)>0
0138     p.tinc = tinc;   % 0.01 s between consecutive time frames
0139 end
0140 if nargin<4
0141     m='';
0142 end
0143 
0144 % Spectrogram of the mixture
0145 fmin = 0; fstep = p.fstep; fmax = p.fmax;
0146 fres = p.fres;  % Frequency resolution (Hz)
0147 [tx,f,MIX]=spgrambw(s,fs,fres,[fmin fstep fmax],[],p.tinc);
0148 nframes=length(tx);
0149 txinc=tx(2)-tx(1);  % actual frame increment
0150 %  ==== we could combine spgrambw and filtbankm into a single call to spgrambw or use fft directly ====
0151 % Log-frequency scale
0152 [trans,cf]=filtbankm(length(f),2*length(f)-1,2*f(end),p.fbanklo,f(end),'usl');
0153 O = MIX*trans'; % Original spectrum in Log-frequency scale
0154 
0155 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0156 % Amplitude Compression
0157 
0158 % Calculate alpha based on LTASS ratios
0159 ltass = stdspectrum(6,'p',cf);
0160 auxf = [cf(1),(cf(1:end-1)+cf(2:end))./2,cf(end)];
0161 ltass = ltass.*diff(auxf);                  % weight by bin width
0162 
0163 % estimated ltass
0164 O = O.*repmat(diff(auxf),nframes,1);     % weight spectrum by bin width
0165 
0166 if tx(end)<p.shortut                        % if it is a short utterance
0167     eltass = mean(O,1);                     % mean power per each frequency band
0168     eltass = smooth(eltass,p.mpsmooth);     % smooth in log frequency
0169     eltass= eltass(:).';                    % force a row vector
0170 
0171     % Same mean power per frame as ltass
0172     cte = mean(ltass)/mean(eltass);
0173     eltass = eltass.*cte;                   % normalize to have the same mean as LTASS
0174     O = O.*cte;
0175 
0176     % Linear AC
0177     alpha = (ltass)./(eltass);
0178     alpha = alpha(:).';
0179     alpha = repmat(alpha,nframes,1);
0180     O = O.*alpha;                           % force O to have an average LTASS spectrum
0181     % ==== should perhaps exclude the silent portions ***
0182 else                                        % long utterance
0183     tsmo = 2; % time smoothing over 1 sec
0184     stt = round(tsmo/txinc);
0185     filttime = [ones(stt,1); zeros(stt-1,1)];
0186     filtfreq = ones(1,p.mpsmooth);
0187     eltass = imfilter(O,filttime);
0188     eltass = imfilter(eltass,filtfreq);     % filter in time and log frequency
0189 
0190     % Same mean power per frame than ltass
0191     cte = repmat(mean(ltass),nframes,1)./mean(eltass,2);
0192     eltass = eltass.*repmat(cte,1,length(cf));
0193     O = O.*repmat(cte,1,length(cf));
0194 
0195     % Linear AC
0196     alpha = repmat(ltass,nframes,1)./(eltass);
0197     O = O.*alpha;
0198 
0199 end
0200 
0201 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0202 % Create the filter to detect the harmonics
0203 ini = find(cf>2*cf(1));
0204 sca = cf/cf(ini(1)); % normalize bin frequencies to start at approximately 0.5
0205 sca = sca(sca<10.5 & sca>0.5);  % restrict to 0.5 - 10.5 times fundamental
0206 filh = -log10(p.pefact-cos(2*pi*sca));
0207 filh = filh-mean(filh);  % force filter to be zero mean
0208 posit = find(sca>=1);  % ==== this should just equal ini(1) ====
0209 if ~mod(length(posit),2)
0210     filh = [filh 0];  % force to be an odd length after central tap
0211 end
0212 negat = find(sca<1);
0213 numz = length(posit)-1-length(negat);
0214 filh = filh./max(filh);
0215 filh = [zeros(1,numz) filh];
0216 
0217 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0218 % Filter the log-frequency scaled spectrogram
0219 B = imfilter(O,filh);  % ==== no good reason to use imfilter here ====
0220 
0221 % Feasible frequency range
0222 numopt = p.numopt; % Number of possible fundamental frequencies per frame
0223 flim = p.flim;
0224 pfreq = find(cf>flim(1) & cf<flim(2));
0225 ff = zeros(nframes,numopt);
0226 amp = zeros(nframes,numopt);
0227 for i=1:nframes
0228     [pos,peak]=findpeaks(B(i,pfreq),[],5/(cf(pfreq(2))-cf(pfreq(1)))); % ==== calculate some out of loop ====
0229     if numel(pos)
0230         [peak,ind]=sort(peak,'descend');
0231         pos = pos(ind);                     % indices of peaks in the B array
0232         posff = cf(pfreq(pos));             % frequencies of peaks
0233         fin = min(numopt,length(posff));
0234         ff(i,1:fin)=posff(1:fin);           % save both frequency and amplitudes
0235         amp(i,1:fin)=peak(1:fin);
0236         %     else
0237         %         ff(i,:)=0;          % ==== unnecessary since they start as zeros ====
0238         %         amp(i,:)=0;
0239     end
0240 end
0241 
0242 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0243 % Probabilitly of the frame of being voiced
0244 
0245 % voiced/unvoiced decision based on 2-element feature vector
0246 % (a) mean power of the frame's log-freq spectrum (normalized so its short-term average is LTASS)
0247 % (b) sum of the power in the first three peaks
0248 
0249 pow = mean(O,2)*1e-6;
0250 vuvfea = [pow sum(amp,2)./(pow*1e9)];
0251 
0252 pru=gaussmixp(vuvfea,m_u,v_u,w_u);  % Probability of being unvoiced
0253 prv=gaussmixp(vuvfea,m_v,v_v,w_v);  % Probability of being voiced
0254 
0255 % pru = exp(pru);
0256 % prv=exp(prv);    % Linear probability
0257 % pv = prv./(prv+pru); % ==== better to write pv=(1+exp(pru-prv)).^(-1) ====
0258 pv=(1+exp(pru-prv)).^(-1);
0259 
0260 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0261 % Dynamic programming
0262 
0263 % w(1): relative amp, voiced local cost
0264 % w(2): median pitch deviation cost
0265 % w(3): df cost weight
0266 % w(4): max df cost
0267 % w(5): relative amp cost for missing peaks (very high)
0268 % w(6): df mean
0269 
0270 w = p.w;
0271 
0272 % Relative amplitude
0273 camp = -amp./repmat(max(amp,[],2),1,numopt);  % relative amplitude used as cost
0274 camp(amp==0)=w(5); % If no frequency found
0275 
0276 % Time interval for the median frequency
0277 tmf = p.tmf; % in sec
0278 inmf = round(tmf/txinc);
0279 
0280 %--------------------------------------------------------------------------
0281 % FORWARDS
0282 % Initialize values
0283 cost = zeros(nframes,numopt);
0284 prev = zeros(nframes,numopt);
0285 medfx = zeros(nframes,1);
0286 dffact=2/txinc;
0287 
0288 % First time frame
0289 % cost(1,:) = w(1)*ramp(1,:);
0290 cost(1,:) = w(1)*camp(1,:);  % only one cost term for first frame
0291 fpos = ff(1:min(inmf,end),1);
0292 mf=median(fpos(pv(1:min(inmf,end))>0.6));   % calculate median frequency of first 2 seconds
0293 if isnan(mf)
0294     mf=median(fpos(pv(1:min(inmf,end))>0.5));
0295     if isnan(mf)
0296         mf=median(fpos(pv(1:min(inmf,end))>0.4));
0297         if isnan(mf)
0298             mf=median(fpos(pv(1:min(inmf,end))>0.3)); % ==== clumsy way of ensuring that we take the best frames ====
0299             if isnan(mf)
0300                 mf=0;
0301             end
0302         end
0303     end
0304 end
0305 medfx(1)=mf;
0306 
0307 for i=2:nframes              % main dynamic programming loop
0308     if i>inmf
0309         fpos = ff(i-inmf:i,1);  % fpos is the highest peak in each frame
0310         mf=median(fpos(pv(1:inmf)>0.6));  % find median frequency over past 2 seconds
0311         if isnan(mf)
0312             mf=median(fpos(pv(1:inmf)>0.5));
0313             if isnan(mf)
0314                 mf=median(fpos(pv(1:inmf)>0.4));
0315                 if isnan(mf)
0316                     mf=median(fpos(pv(1:inmf)>0.3));% ==== clumsy way of ensuring that we take the best frames ====
0317                     if isnan(mf)
0318                         mf=0;
0319                     end
0320                 end
0321             end
0322         end
0323     end
0324     medfx(i)=mf;
0325     % Frequency difference between candidates and cost
0326     df = dffact*(repmat(ff(i,:).',1,numopt) - repmat(ff(i-1,:),numopt,1))./(repmat(ff(i,:).',1,numopt) + repmat(ff(i-1,:),numopt,1));
0327     costdf=w(3)*min((df-w(6)).^2,w(4));
0328 
0329     % Cost related to the median pitch
0330     if mf==0                                   % this test was inverted in the original version
0331         costf = zeros(1,numopt);
0332     else
0333         costf = abs(ff(i,:) - mf)./mf;
0334     end
0335     [cost(i,:),prev(i,:)]=min(costdf + repmat(cost(i-1,:),numopt,1),[],2); % ==== should we allow the possibility of skipping frames ? ====
0336     cost(i,:)=cost(i,:)+w(2)*costf + w(1)*camp(i,:);  % add on costs that are independent of previous path
0337 
0338 end
0339 
0340 % Traceback
0341 
0342 fx=zeros(nframes,1);
0343 best = zeros(nframes,1);
0344 
0345 nose=find(cost(end,:)==min(cost(end,:))); % ==== bad method (dangerous) ===
0346 best(end)=nose(1);
0347 % ff = [ff zeros(nframes,1)];  % not clear why this was here
0348 fx(end)=ff(end,best(end));
0349 for i=nframes:-1:2
0350     best(i-1)=prev(i,best(i));
0351     fx(i-1)=ff(i-1,best(i-1));
0352 end
0353 
0354 if nargout>=4
0355     fv.vuvfea=vuvfea;  % voiced-unvoiced features
0356     fv.best=best;  % selected path
0357     fv.ff=ff;  % pitch candidates
0358     fv.amp=amp;  % pitch candidate amplitudes
0359     fv.medfx=medfx;  % median pitch
0360     fv.w=w;  % DP weights
0361     fv.dffact=dffact;  % df scale factor
0362 end
0363 
0364 if ~nargout || any(m=='g') || any(m=='G')
0365     nax=0;  % number of axes sets to link
0366     msk=pv>0.5; % find voiced frames as a mask
0367     fxg=fx;
0368     fxg(~msk)=NaN; % allow only good frames
0369     fxb=fx;
0370     fxb(msk)=NaN; % allow only bad frames
0371     if any(m=='G') || ~nargout && ~any(m=='g')
0372         clf;
0373         spgrambw(s,fs,'ilcwpf'); % draw spectrogram with log axes
0374         hold on
0375         plot(tx,log10(fxg),'-b',tx,log10(fxb),'-r'); % fx track
0376         yy=get(gca,'ylim');
0377         plot(tx,yy(1)+yy*[-1;1]*(0.02+0.05*pv),'-k'); % P(V) track
0378         hold off
0379         nax=nax+1;
0380         axh(nax)=gca;
0381         if any(m=='g')
0382             figure;   % need a new figure if plotting two graphs
0383         end
0384     end
0385     if any(m=='g')
0386         ns=length(s);
0387         [tsr,ix]=sort([(1:ns)/fs 0.5*(tx(1:end-1)+tx(2:end))']); % intermingle speech and frame boundaries
0388         jx(ix)=1:length(ix); % create inverse index
0389         sp2fr=jx(1:ns)-(0:ns-1);  % speech sample to frame number
0390         spmsk=msk(sp2fr);   % speech sample voiced mask
0391         sg=s;
0392         sg(~spmsk)=NaN;   % good speech samples only
0393         sb=s;
0394         sb(spmsk)=NaN;    % bad speech samples only
0395         clf;
0396         subplot(5,1,1);
0397         plot(tx,pv,'-b',(1:ns)/fs,0.5*mod(cumsum(fx(sp2fr)/fs),1)-0.6,'-b');
0398         nax=nax+1;
0399         axh(nax)=gca;
0400         ylabel('\phi(t), P(V)');
0401         set(gca,'ylim',[-0.65 1.05]);
0402         subplot(5,1,2:3);
0403         plot((1:ns)/fs,sg,'-b',(1:ns)/fs,sb,'-r');
0404         nax=nax+1;
0405         axh(nax)=gca;
0406         subplot(5,1,4:5);
0407         plot(tx,fxg,'-b',tx,fxb,'-r');
0408         ylabel('Pitch (Hz)');
0409         %         semilogy(tx,fxg,'-b',tx,fxb,'-r');
0410         %         ylabel(['Pitch (' yticksi 'Hz)']);
0411         set(gca,'ylim',[min(fxg)-30 max(fxg)+30]);
0412         nax=nax+1;
0413         axh(nax)=gca;
0414     end
0415     if nax>1
0416         linkaxes(axh,'x');
0417     end
0418 end
0419 
0420 function y=smooth(x,n)
0421 nx=length(x);
0422 c=cumsum(x);
0423 y=[c(1:2:n)./(1:2:n) (c(n+1:end)-c(1:end-n))/n (c(end)-c(end-n+2:2:end-1))./(n-2:-2:1)];

Generated on Thu 23-Feb-2012 09:25:48 by m2html © 2003