Description of v

0001 function [fx,tx,pv,fv]=v_fxpefac(s,fs,tinc,m,pp)
0002 %V_FXPEFAC PEFAC pitch tracker [FX,TT,PV,FV]=(S,FS,TINC,M,PP)
0003 %
0004 % Input:   s(ns)      Speech signal
0005 %          fs         Sample frequency (Hz)
0006 %          tinc       Time increment between frames (s) [0.01]
0007 %                     or [start increment end]
0008 %          m          mode
0009 %                     'g' plot graph showing waveform and pitch
0010 %                     'G' plot spectrogram with superimposed pitch using
0011 %                         options pp.sopt [default: 'ilcwpf']
0012 %                     'x' use external files for algorithm parameter
0013 %                         initialization: fxpefac_g and fxpefac_w
0014 %          pp         structure containing algorithm parameters
0015 %
0016 % Outputs: fx(nframe)     Estimated pitch (Hz)
0017 %          tx(nframe)     Time at the centre of each frame (seconds).
0018 %          pv(nframe)     Probability of the frame of being voiced
0019 %          fv             structure containing feature vectors
0020 %                           fv.vuvfea(nframe,2) = voiced/unvoiced GMM features
0021 
0022 % References
0023 %  [1]  S. Gonzalez and M. Brookes. PEFAC - a pitch estimation algorithm robust to high levels of noise.
0024 %       IEEE Trans. Audio, Speech, Language Processing, 22 (2): 518-530, Feb. 2014.
0025 %       doi: 10.1109/TASLP.2013.2295918.
0026 %  [2]  S.Gonzalez and M. Brookes,
0027 %       A pitch estimation filter robust to high levels of noise (PEFAC), Proc EUSIPCO,Aug 2011.
0028 
0029 % Bugs/Suggestions
0030 % (1) do long files in chunks
0031 % (2) option of n-best DP
0032 
0033 %       Copyright (C) Sira Gonzalez and Mike Brookes 2011
0034 %      Version: $Id: v_fxpefac.m 10865 2018-09-21 17:22:45Z dmb $
0035 %
0036 %   VOICEBOX is a MATLAB toolbox for speech processing.
0037 %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
0038 %
0039 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0040 %   This program is free software; you can redistribute it and/or modify
0041 %   it under the terms of the GNU Lesser General Public License as published by
0042 %   the Free Software Foundation; either version 3 of the License, or
0043 %   (at your option) any later version.
0044 %
0045 %   This program is distributed in the hope that it will be useful,
0046 %   but WITHOUT ANY WARRANTY; without even the implied warranty of
0047 %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0048 %   GNU Lesser General Public License for more details.
0049 %
0050 %   You can obtain a copy of the GNU Lesser General Public License from
0051 %   https://www.gnu.org/licenses/ .
0052 %    See files gpl-3.0.txt and lgpl-3.0.txt included in this distribution.
0053 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0054 
0055 persistent w_u m_u v_u w_v m_v v_v dpwtdef
0056 % initialize persistent variables
0057 if ~numel(w_u)
0058 
0059     % voiced/unvoiced decision based on 2-element feature vector
0060     % (a) mean power of the frame's log-freq spectrum (normalized so its short-term average is LTASS)
0061     % (b) sum of the power in the first three peaks
0062     %===== VUV
0063     if nargin>3 && any(m=='x')
0064         fxpefac_g;     % read in GMM parameters
0065         fxpefac_w;     % read in Weights parameters
0066     else
0067         w_u=[0.1461799 0.3269458 0.2632178 0.02331986 0.06360947 0.1767271 ]';
0068 
0069         m_u=[13.38533 0.4199435 ;
0070              12.23505 0.1496836 ;
0071              12.76646 0.2581733 ;
0072              13.69822 0.6893078 ;
0073              9.804372 0.02786567 ;
0074              11.03848 0.07711229 ];
0075 
0076         v_u=reshape([0.4575519 0.002619074 0.002619074 0.01262138 ;
0077              0.7547719 0.008568089 0.008568089 0.001933864 ;
0078              0.5770533 0.003561592 0.003561592 0.00527957 ;
0079              0.3576287 0.01388739 0.01388739 0.04742106 ;
0080              0.9049906 0.01033191 0.01033191 0.0001887114 ;
0081              0.637969 0.009936445 0.009936445 0.0007082946 ]',[2 2 6]);
0082 
0083         w_v=[0.1391365 0.221577 0.2214025 0.1375109 0.1995124 0.08086066 ]';
0084 
0085         m_v=[15.36667 0.8961554 ;
0086              13.52718 0.4809653 ;
0087              13.95531 0.8901121 ;
0088              14.56318 0.6767258 ;
0089              14.59449 1.190709 ;
0090              13.11096 0.2861982 ];
0091 
0092         v_v=reshape([0.196497 -0.002605404 -0.002605404 0.05495016 ;
0093              0.6054919 0.007776652 0.007776652 0.01899244 ;
0094              0.5944617 0.0485788 0.0485788 0.03511229 ;
0095              0.3871268 0.0292966 0.0292966 0.02046839 ;
0096              0.3377683 0.02839657 0.02839657 0.04756354 ;
0097              1.00439 0.03595795 0.03595795 0.006737475 ]',[2 2 6]);
0098     end
0099     %===== PDP
0100     %     dfm = -0.4238; % df mean
0101     %     dfv = 3.8968; % df variance (although treated as std dev here)
0102     %     delta = 0.15;
0103     %     dflpso=[dfm 0.5/(log(10)*dfv^2) -log(2*delta/(dfv*sqrt(2*pi)))/log(10)]; % scale factor & offset for df pdf
0104     %     dpwtdef=[1.0000, 0.8250, 1.3064, 1.9863]; % default DP weights
0105     dpwtdef=[1.0000, 0.8250, 0.01868, 0.006773, 98.9, -0.4238]; % default DP weights
0106     %===== END
0107 
0108 end
0109 % Algorithm parameter defaults
0110 
0111 p.fstep=5;              % frequency resolution of initial spectrogram (Hz)
0112 p.fmax=4000;            % maximum frequency of initial spectrogram (Hz)
0113 p.fres = 20;            % bandwidth of initial spectrogram (Hz)
0114 p.fbanklo = 10;         % low frequency limit of log v_filterbank (Hz)
0115 p.mpsmooth = 21;       % width of smoothing filter for mean power
0116 % p.maxtranf = 1000;      % maximum value of tranf cost term
0117 p.shortut = 7;          % max utterance length to average power of entire utterance
0118 p.pefact = 1.8;         % shape factor in PEFAC filter
0119 p.numopt = 3;           % number of possible frequencies per frame
0120 p.flim = [60 400];      % range of feasible fundamental frequencies (Hz)
0121 p.w = dpwtdef;          % DP weights
0122 % p.rampk = 1.1;          % constant for relative-amplitude cost term
0123 % p.rampcz = 100;         % relative amplitude cost for missing peak
0124 p.tmf = 2;              % median frequency smoothing interval (s)
0125 p.tinc = 0.01;          % default frame increment (s)
0126 p.sopt = 'ilcwpf';      % spectrogram options
0127 
0128 % update parameters from pp argument
0129 
0130 if nargin>=5 && isstruct(pp)
0131     fnq=fieldnames(pp);
0132     for i=1:length(fnq)
0133         if isfield(p,fnq{i})
0134             p.(fnq{i})=pp.(fnq{i});
0135         end
0136     end
0137 end
0138 
0139 % Sort out input arguments
0140 if nargin>=3  && numel(tinc)>0
0141     p.tinc = tinc;   % 0.01 s between consecutive time frames
0142 end
0143 if nargin<4
0144     m='';
0145 end
0146 
0147 % Spectrogram of the mixture
0148 fmin = 0; fstep = p.fstep; fmax = p.fmax;
0149 fres = p.fres;  % Frequency resolution (Hz)
0150 [tx,f,MIX]=v_spgrambw(s,fs,fres,[fmin fstep fmax],[],p.tinc);
0151 nframes=length(tx);
0152 txinc=tx(2)-tx(1);  % actual frame increment
0153 %  ==== we could combine v_spgrambw and v_filtbankm into a single call to v_spgrambw or use fft directly ====
0154 % Log-frequency scale
0155 [trans,cf]=v_filtbankm(length(f),2*length(f)-1,2*f(end),p.fbanklo,f(end),'usl');
0156 O = MIX*trans'; % Original spectrum in Log-frequency scale
0157 
0158 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0159 % Amplitude Compression
0160 
0161 % Calculate alpha based on LTASS ratios
0162 ltass = v_stdspectrum(6,'p',cf); % uses an old version of the LTASS spectrum but we will need to recalculate the GMM if we update it
0163 auxf = [cf(1),(cf(1:end-1)+cf(2:end))./2,cf(end)];
0164 ltass = ltass.*diff(auxf);                  % weight by bin width
0165 
0166 % estimated ltass
0167 O = O.*repmat(diff(auxf),nframes,1);     % weight spectrum by bin width
0168 O1 = O;
0169 
0170 if tx(end)<p.shortut                        % if it is a short utterance
0171     eltass = mean(O,1);                     % mean power per each frequency band
0172     eltass = smooth(eltass,p.mpsmooth);     % smooth in log frequency
0173     eltass= eltass(:).';                    % force a row vector
0174 
0175     % Linear AC
0176     alpha = (ltass)./(eltass);
0177     alpha = alpha(:).';
0178     alpha = repmat(alpha,nframes,1);
0179     O = O.*alpha;                           % force O to have an average LTASS spectrum
0180 
0181     % ==== should perhaps exclude the silent portions ***
0182 else                                        % long utterance
0183 
0184     tsmo = 3; % time smoothing over 3 sec
0185     stt = round(tsmo/txinc);
0186     eltass = timesm(O,stt);
0187     eltass = smooth(eltass,p.mpsmooth);     % filter in time and log frequency
0188 
0189     % Linear AC
0190     alpha = repmat(ltass,nframes,1)./(eltass);
0191     O = O.*alpha;
0192 
0193 end
0194 
0195 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0196 % Create the filter to detect the harmonics
0197 ini = find(cf>3*cf(1));
0198 sca = cf/cf(ini(1)); % bin frequencies start at approximately 0.33 with sca(ini(1))=1 exactly
0199 
0200 % Middle
0201 sca = sca(sca<10.5 & sca>0.5);  % restrict to 0.5 - 10.5 times fundamental
0202 
0203 sca1 = sca;
0204 filh = 1./(p.pefact-cos(2*pi*sca1));
0205 filh = filh - sum(filh(1:end).*diff([sca1(1),(sca1(1:end-1)+sca1(2:end))./2,sca1(end)]))/sum(diff([sca1(1),(sca1(1:end-1)+sca1(2:end))./2,sca1(end)]));
0206 
0207 posit = find(sca>=1);  % ==== this should just equal ini(1) ====
0208 negat = find(sca<1);
0209 numz = length(posit)-1-length(negat);
0210 filh = filh./max(filh);
0211 filh = [zeros(1,numz) filh]; % length is always odd with central value = 1
0212 
0213 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0214 % Filter the log-frequency scaled spectrogram
0215 B = imfilter(O,filh);  % does a convolution with zero lag at centre of filh
0216 
0217 % Feasible frequency range
0218 numopt = p.numopt; % Number of possible fundamental frequencies per frame
0219 flim = p.flim;
0220 pfreq = find(cf>flim(1) & cf<flim(2)); % flim = permitted fx range = [60 400]
0221 ff = zeros(nframes,numopt);
0222 amp = zeros(nframes,numopt);
0223 for i=1:nframes
0224     [pos,peak]=v_findpeaks(B(i,pfreq),[],5/(cf(pfreq(2))-cf(pfreq(1)))); % min separation = 5Hz @ fx=flim(1) (could pre-calculate) ====
0225     if numel(pos)
0226         [peak,ind]=sort(peak,'descend');
0227         pos = pos(ind);                     % indices of peaks in the B array
0228         posff = cf(pfreq(pos));             % frequencies of peaks
0229         fin = min(numopt,length(posff));
0230         ff(i,1:fin)=posff(1:fin);           % save both frequency and amplitudes
0231         amp(i,1:fin)=peak(1:fin);
0232     end
0233 end
0234 
0235 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0236 % Probabilitly of the frame of being voiced
0237 
0238 % voiced/unvoiced decision based on 2-element feature vector
0239 % (a) mean power of the frame's log-freq spectrum (normalized so its short-term average is LTASS)
0240 % (b) sum of the power in the first three peaks
0241 
0242 pow = mean(O,2);
0243 
0244 vuvfea = [log(pow) 1e-3*sum(amp,2)./(pow+1.75*1e5)];
0245 
0246 % %%%%%%%%%%%%%%%%%%%%%
0247 
0248 pru=v_gaussmixp(vuvfea,m_u,v_u,w_u);  % Log probability of being unvoiced
0249 prv=v_gaussmixp(vuvfea,m_v,v_v,w_v);  % Log probability of being voiced
0250 
0251 pv=(1+exp(pru-prv)).^(-1);
0252 
0253 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0254 % Dynamic programming
0255 
0256 % w(1): relative amp, voiced local cost
0257 % w(2): median pitch deviation cost
0258 % w(3): df cost weight
0259 % w(4): max df cost
0260 % w(5): relative amp cost for missing peaks (very high)
0261 % w(6): df mean
0262 
0263 w = p.w;
0264 
0265 % Relative amplitude
0266 camp = -amp./repmat(max(amp,[],2),1,numopt);  % relative amplitude used as cost
0267 camp(amp==0)=w(5); % If no frequency found
0268 
0269 % Time interval for the median frequency
0270 tmf = p.tmf; % in sec
0271 inmf = round(tmf/txinc);
0272 
0273 %--------------------------------------------------------------------------
0274 % FORWARDS
0275 % Initialize values
0276 cost = zeros(nframes,numopt);
0277 prev = zeros(nframes,numopt);
0278 medfx = zeros(nframes,1);
0279 dffact=2/txinc;
0280 
0281 % First time frame
0282 % cost(1,:) = w(1)*ramp(1,:);
0283 cost(1,:) = w(1)*camp(1,:);  % only one cost term for first frame
0284 fpos = ff(1:min(inmf,end),1);
0285 mf=median(fpos(pv(1:min(inmf,end))>0.6));   % calculate median frequency of first 2 seconds
0286 if isnan(mf)
0287     mf=median(fpos(pv(1:min(inmf,end))>0.5));
0288     if isnan(mf)
0289         mf=median(fpos(pv(1:min(inmf,end))>0.4));
0290         if isnan(mf)
0291             mf=median(fpos(pv(1:min(inmf,end))>0.3)); % ==== clumsy way of ensuring that we take the best frames ====
0292             if isnan(mf)
0293                 mf=0;
0294             end
0295         end
0296     end
0297 end
0298 medfx(1)=mf;
0299 
0300 for i=2:nframes              % main dynamic programming loop
0301     if i>inmf
0302         fpos = ff(i-inmf:i,1);  % fpos is the highest peak in each frame
0303         mf=median(fpos(pv(1:inmf)>0.6));  % find median frequency over past 2 seconds
0304         if isnan(mf)
0305             mf=median(fpos(pv(1:inmf)>0.5));
0306             if isnan(mf)
0307                 mf=median(fpos(pv(1:inmf)>0.4));
0308                 if isnan(mf)
0309                     mf=median(fpos(pv(1:inmf)>0.3));% ==== clumsy way of ensuring that we take the best frames ====
0310                     if isnan(mf)
0311                         mf=0;
0312                     end
0313                 end
0314             end
0315         end
0316     end
0317     medfx(i)=mf;
0318     % Frequency difference between candidates and cost
0319     df = dffact*(repmat(ff(i,:).',1,numopt) - repmat(ff(i-1,:),numopt,1))./(repmat(ff(i,:).',1,numopt) + repmat(ff(i-1,:),numopt,1));
0320     costdf=w(3)*min((df-w(6)).^2,w(4));
0321 
0322     % Cost related to the median pitch
0323     if mf==0                                   % this test was inverted in the original version
0324         costf = zeros(1,numopt);
0325     else
0326         costf = abs(ff(i,:) - mf)./mf;
0327     end
0328     [cost(i,:),prev(i,:)]=min(costdf + repmat(cost(i-1,:),numopt,1),[],2); % ==== should we allow the possibility of skipping frames ? ====
0329     cost(i,:)=cost(i,:)+w(2)*costf + w(1)*camp(i,:);  % add on costs that are independent of previous path
0330 
0331 end
0332 
0333 % Traceback
0334 
0335 fx=zeros(nframes,1);
0336 ax=zeros(nframes,1);
0337 best = zeros(nframes,1);
0338 
0339 nose=find(cost(end,:)==min(cost(end,:))); % ==== bad method (dangerous) ===
0340 best(end)=nose(1);
0341 fx(end)=ff(end,best(end));
0342 ax(end)=amp(end,best(end));
0343 for i=nframes:-1:2
0344     best(i-1)=prev(i,best(i));
0345     fx(i-1)=ff(i-1,best(i-1));
0346     ax(i-1)=amp(i-1,best(i-1));
0347 end
0348 
0349 if nargout>=4
0350     fv.vuvfea=vuvfea;  % voiced-unvoiced features
0351     fv.best=best;  % selected path
0352     fv.ff=ff;  % pitch candidates
0353     fv.amp=amp;  % pitch candidate amplitudes
0354     fv.medfx=medfx;  % median pitch
0355     fv.w=w;  % DP weights
0356     fv.dffact=dffact;  % df scale factor
0357     fv.hist = [log(mean(O,2)) sum(amp,2)./((mean(O,2)))];
0358 end
0359 
0360 if ~nargout || any(m=='g') || any(m=='G')
0361     nax=0;  % number of axes sets to link
0362     msk=pv>0.5; % find voiced frames as a mask
0363     fxg=fx;
0364     fxg(~msk)=NaN; % allow only good frames
0365     fxb=fx;
0366     fxb(msk)=NaN; % allow only bad frames
0367     if any(m=='G') || ~nargout && ~any(m=='g')
0368         clf;
0369         v_spgrambw(s,fs,p.sopt); % draw spectrogram with log axes
0370         hold on
0371         plot(tx,log10(fxg),'-b',tx,log10(fxb),'-r'); % fx track
0372         yy=get(gca,'ylim');
0373         plot(tx,yy(1)+yy*[-1;1]*(0.02+0.05*pv),'-k'); % P(V) track
0374         hold off
0375         nax=nax+1;
0376         axh(nax)=gca;
0377         if any(m=='g')
0378             figure;   % need a new figure if plotting two graphs
0379         end
0380     end
0381     if any(m=='g')
0382         ns=length(s);
0383         [tsr,ix]=sort([(1:ns)/fs 0.5*(tx(1:end-1)+tx(2:end))']); % intermingle speech and frame boundaries
0384         jx(ix)=1:length(ix); % create inverse index
0385         sp2fr=jx(1:ns)-(0:ns-1);  % speech sample to frame number
0386         spmsk=msk(sp2fr);   % speech sample voiced mask
0387         sg=s;
0388         sg(~spmsk)=NaN;   % good speech samples only
0389         sb=s;
0390         sb(spmsk)=NaN;    % bad speech samples only
0391         clf;
0392         subplot(5,1,1);
0393         plot(tx,pv,'-b',(1:ns)/fs,0.5*mod(cumsum(fx(sp2fr)/fs),1)-0.6,'-b');
0394         nax=nax+1;
0395         axh(nax)=gca;
0396         ylabel('\phi(t), P(V)');
0397         set(gca,'ylim',[-0.65 1.05]);
0398         subplot(5,1,2:3);
0399         plot((1:ns)/fs,sg,'-b',(1:ns)/fs,sb,'-r');
0400         nax=nax+1;
0401         axh(nax)=gca;
0402         subplot(5,1,4:5);
0403         plot(tx,fxg,'-b',tx,fxb,'-r');
0404         ylabel('Pitch (Hz)');
0405         %         semilogy(tx,fxg,'-b',tx,fxb,'-r');
0406         %         ylabel(['Pitch (' v_yticksi 'Hz)']);
0407         set(gca,'ylim',[min(fxg)-30 max(fxg)+30]);
0408         nax=nax+1;
0409         axh(nax)=gca;
0410     end
0411     if nax>1
0412         linkaxes(axh,'x');
0413     end
0414 end
0415 
0416 function y=smooth(x,n)
0417 nx=size(x,2);
0418 nf=size(x,1);
0419 c=cumsum(x,2);
0420 y=[c(:,1:2:n)./repmat(1:2:n,nf,1) (c(:,n+1:end)-c(:,1:end-n))/n (repmat(c(:,end),1,floor(n/2))-c(:,end-n+2:2:end-1))./repmat(n-2:-2:1,nf,1)];
0421 
0422 function y=timesm(x,n)
0423 if ~mod(n,2)
0424     n = n+1;
0425 end
0426 nx=size(x,2);
0427 nf=size(x,1);
0428 c=cumsum(x,1);
0429 mid = round(n/2);
0430 y=[c(mid:n,:)./repmat((mid:n).',1,nx); ...
0431     (c(n+1:end,:)-c(1:end-n,:))/n; ...
0432     (repmat(c(end,:),mid-1,1) - c(end-n+1:end-mid,:))./repmat((n-1:-1:mid).',1,nx)];
v_fxpefac

PURPOSE

SYNOPSIS

DESCRIPTION

CROSS-REFERENCE INFORMATION

SUBFUNCTIONS

SOURCE CODE