Description of v

0001 function [lev,af,fso,vad]=v_activlev(sp,fs,mode)
0002 %V_ACTIVLEV Measure active speech level as in ITU-T P.56 [LEV,AF,FSO]=(sp,FS,MODE)
0003 %
0004 %Usage: (1) lev=v_activlev(s,fs);           % speech level in units of power
0005 %       (2) db=v_activlev(s,fs,'d');        % speech level in dB
0006 %       (3) s=v_activlev(s,fs,'n');         % normalize active level to 0 dB
0007 %       (4) inc=10000;                      % you can process a long signal in chunks of length inc
0008 %           fso=fs;                         % initialize fso to the sample frequency
0009 %           for i=1:inc:ns                  % loop for each chunk
0010 %               [lev,af,fso]=v_activlev(s(i:min(i+inc-1,length(s))),fso,'dz'); % include the 'z' option
0011 %           end
0012 %           lev=v_activlev([],fso);         % finally omit the 'z' option to obtain the active level
0013 %                                           % all other options, e.g. 'd', are preserved from the very first call
0014 %
0015 % Inputs:
0016 %        SP     is the speech signal (with better than 20dB SNR)
0017 %        FS     is the sample frequency in Hz (see also FSO below)
0018 %        MODE   is a string containing a combination of the following:
0019 %               0 - omit high pass filter completely (i.e. include DC)
0020 %               3 - high pass filter at 30 Hz instead of 200 Hz (but allows mains hum to pass)
0021 %               4 - high pass filter at 40 Hz instead of 200 Hz (but allows mains hum to pass)
0022 %               1 - use cheybyshev 1 filter
0023 %               2 - use chebyshev 2 filter (default)
0024 %               e - use elliptic filter
0025 %               h - omit low pass filter at 5.5, 12 or 18 kHz
0026 %               w - use wideband filter frequencies: 70 Hz to 12 kHz
0027 %               W - use ultra wideband filter frequencies: 30 Hz to 18 kHz
0028 %               d - give outputs in dB rather than power
0029 %               n - output a normalized speech signal as the first argument
0030 %               N - output a normalized filtered speech signal as the first argument
0031 %               l - give both active and long-term power levels
0032 %               a - include A-weighting filter
0033 %               i - include ITU-R-BS.468/ITU-T-J.16 weighting filter
0034 %               z - do NOT zero-pad the signal by 0.35 s
0035 %
0036 % Outputs:
0037 %    If the "n" option is specified, a speech signal normalized to 0dB will be given as
0038 %    the first output followed by the other outputs.
0039 %        LEV    gives the speech level in units of power (or dB if mode='d')
0040 %               if mode='l' is specified, LEV is a row vector with the "long term
0041 %               level" as its second element (this is just the mean power)
0042 %        AF     is the activity factor (or duty cycle) in the range 0 to 1
0043 %        FSO    is a structure of intermediate information that allows
0044 %               you to process a speech signal in chunks (see usage example 4 above).
0045 %               Processing a signal in chunks is slower and may not give identical results
0046 %               because it will use slightly different thresholds. Note that you must use
0047 %               the 'z' option for all calls except the last. When the FS input is a structure, all
0048 %               options other than 'z' are taken from the FS structure rather than from the MODE input.
0049 %        VAD    is a boolean vector the same length as sp that acts as an approximate voice activity detector
0050 
0051 % For completeness we list here the contents of the FSO structure:
0052 %
0053 %   ffs : sample frequency
0054 %   fmd : mode string
0055 %    nh : hangover time in samples
0056 %    ae : smoothing filter coefs
0057 %    abl: HP filter numerator and denominator coefficient
0058 %    bh : LP filter numerator coefficient
0059 %    ah : LP filter denominator coefficients
0060 %    ze : smoothing filter state
0061 %    zl : HP filter state
0062 %    zh : LP filter state
0063 %    zx : hangover max filter state
0064 %  emax : maximum envelope exponent + 1
0065 %   ssq : signal sum of squares
0066 %    ns : number of signal samples
0067 %    ss : sum of speech samples (not actually used here)
0068 %    kc : cumulative occupancy counts
0069 %    aw : weighting filter denominator
0070 %    bw : weighting filter numerator
0071 %    zw : weighting filter state
0072 %
0073 % This routine implements "Method B" from [1],[2] to calculate the active
0074 % speech level which is defined to be the speech energy divided by the
0075 % duration of speech activity. Speech is designated as "active" based on an
0076 % adaptive threshold applied to the smoothed rectified speech signal. A
0077 % bandpass filter is first applied to the input speech whose -0.25 dB points
0078 % are at 200 Hz & 5.5 kHz by default but this can be changed to 70 Hz & 5.5 kHz
0079 % or to 30 Hz & 18 kHz by specifying the 'w' or 'W' options; these
0080 % correspond respectively to Annexes B and C in [2].
0081 %
0082 % References:
0083 % [1]    ITU-T. Objective measurement of active speech level. Recommendation P.56, Mar. 1993.
0084 % [2]    ITU-T. Objective measurement of active speech level. Recommendation P.56, Dec. 2011.
0085 %
0086 % Revision History
0087 %
0088 % 2011-10-16   713 Initial version
0089 % 2012-11-02  2471 Correctd behaviour when there are zero or multiple solutions to A(l)=C(l)+M
0090 % 2012-11-19  2516 Modified comments
0091 % 2014-03-17  4346 Modified comments
0092 % 2014-07-09  4795 Improved plotting when no output arguments are specified
0093 % 2016-01-06  7336 Include wideband options from 2011 standard
0094 % 2017-02-07  9407 Added 'z' option and corrected output for an all-zero input signal
0095 % 2018-03-22 10436 Modified comments
0096 % 2018-09-21 10863 Renamed to start with "v_"
0097 % 2018-11-07 10988 Changed EOL style to native so checkout works on all machines
0098 % 2019-11-19 11190 Fixed error in calculating the activity factor; it now excludes the
0099 %                  zero-padding samples from the calculation. [thanks to Joe Begin]
0100 % 2022-01-18       Using the 'n' option now gives a signal with exactly 0dB active level (thanks
0101 %                  to Alastair Moore and Becky Vos). Fixed anomalous behavior when FS is a
0102 %                  structure and MODE input is omitted.
0103 % 2022-02-21       Fixed error in plot of power histogram that meant that the threshold and active
0104 %                  level lines were in the wrong position
0105 % 2022-02-28       Tidied up the labelling on the power histogram plot
0106 
0107 %      Copyright (C) Mike Brookes 2008-2019
0108 %      Version: $Id: v_activlev.m 11190 2019-11-19 12:53:58Z dmb $
0109 %
0110 %   VOICEBOX is a MATLAB toolbox for speech processing.
0111 %   Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
0112 %
0113 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0114 %   This program is free software; you can redistribute it and/or modify
0115 %   it under the terms of the GNU Lesser General Public License as published by
0116 %   the Free Software Foundation; either version 3 of the License, or
0117 %   (at your option) any later version.
0118 %
0119 %   This program is distributed in the hope that it will be useful,
0120 %   but WITHOUT ANY WARRANTY; without even the implied warranty of
0121 %   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0122 %   GNU Lesser General Public License for more details.
0123 %
0124 %   You can obtain a copy of the GNU Lesser General Public License from
0125 %   https://www.gnu.org/licenses/ .
0126 %    See files gpl-3.0.txt and lgpl-3.0.txt included in this distribution.
0127 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0128 
0129 persistent nbin thresh c25zp c15zp e5zp
0130 if isempty(nbin)
0131     nbin=20;    % 60 dB range at 3dB per bin
0132     thresh=15.9;    % threshold in dB
0133     % High pass s-domain zeros and poles of filters with passband ripple<0.25dB, stopband<-50dB, w0=1
0134     %    w0=fzero(@ch2,0.5); [c2z,c2p,k]=cheby2(5,50,w0,'high','s');
0135     %    function v=ch2(w); [c2z,c2p,k]=cheby2(5,50,w,'high','s'); v= 20*log10(prod(abs(1i-c2z))/prod(abs(1i-c2p)))+0.25;
0136     c25zp=[0.37843443673309i 0.23388534441447i; -0.20640255179496+0.73942185906851i -0.54036889596392+0.45698784092898i];
0137     c25zp=[[0; -0.66793268833792] c25zp conj(c25zp)];
0138     %       [c1z,c1p,c1k] = cheby1(5,0.25,1,'high','s');
0139     c15zp=[-0.659002835294875+1.195798636925079i -0.123261821596263+0.947463030958881i];
0140     c15zp=[zeros(1,5); -2.288586431066945 c15zp conj(c15zp)];
0141     %      [ez,ep,ek] = ellip(5,0.25,50,1,'high','s')
0142     e5zp=[0.406667680649209i 0.613849362744881i; -0.538736390607201+1.130245082677107i -0.092723126159100+0.958193646330194i];
0143     e5zp=[[0; -1.964538608244084]  e5zp conj(e5zp)];
0144     %    w=linspace(0.2,2,100);
0145     %    figure(1); plot(w,20*log10(abs(freqs(real(poly(c15zp(1,:))),real(poly(c15zp(2,:))),w)))); title('Chebyshev 1');
0146     %    figure(2); plot(w,20*log10(abs(freqs(real(poly(c25zp(1,:))),real(poly(c25zp(2,:))),w)))); title('Chebyshev 2');
0147     %    figure(3); plot(w,20*log10(abs(freqs(real(poly(e5zp(1,:))),real(poly(e5zp(2,:))),w)))); title('Elliptic');
0148 end
0149 if nargin<3
0150     mode=' ';
0151 end
0152 if ~isstruct(fs)                        % no state vector given
0153     fso.ffs=fs;                           % sample frequency
0154     ti=1/fs;
0155     g=exp(-ti/0.03);                    % pole position for envelope filter
0156     fso.ae=[1 -2*g g^2]/(1-g)^2;        % envelope filter coefficients (DC gain = 1)
0157     fso.ze=zeros(2,1);
0158     fso.nh=ceil(0.2/ti)+1;              % hangover time in samples
0159     fso.zx=-Inf;                        % initial value for v_maxfilt()
0160     fso.emax=-Inf;                      % maximum exponent
0161     fso.ns=0;
0162     fso.ssq=0;
0163     fso.ss=0;
0164     fso.sf=1;                           % scale factor when forming energy histogram
0165     fso.sfdb=0;                         % scale factor in dB
0166     fso.kc=zeros(nbin,1);               % cumulative occupancy counts
0167     % s-plane zeros and poles of high pass 5'th order filter -0.25dB at w=1 and -50dB stopband
0168     if any(mode=='1')
0169         szp=c15zp;              % Chebyshev 1
0170     elseif any(mode=='e')
0171         szp=e5zp;               % Elliptic
0172     else
0173         szp=c25zp;              % Chebyshev 2
0174     end
0175     flh=[200 5500];             % default frequency range +- 0.25 dB
0176     if any(mode=='w')
0177         flh=[70 12000];         % super-wideband (Annex B of [2])
0178     elseif any(mode=='W')
0179         flh=[30 18000];         % full band (Annex C of [2])
0180     end
0181     if any(mode=='3')
0182         flh(1)=30;              % force a 30 Hz HPF cutoff
0183     end
0184     if any(mode=='4')
0185         flh(1)=40;              % force a 40 Hz HPF cutoff
0186     end
0187     if any(mode=='r')              % included for backward compatibility
0188         mode=['0h' mode];        % abolish both filters
0189     elseif fs<flh(2)*2.2
0190         mode=['h' mode];           % abolish lowpass filter at low sample rates
0191     end
0192     fso.fmd=mode;                % save mode flags
0193     if all(mode~='0')           % implement the HPF as biquads to avoid rounding errors
0194         zl=2./(1-szp*tan(flh(1)*pi/fs))-1;      % Transform s-domain poles/zeros with bilinear transform
0195         abl=[ones(2,1) -zl(:,1) -2*real(zl(:,2:3))  abs(zl(:,2:3)).^2];     % biquad coefficients
0196         hfg=(abl*[1 -1 0 0 0 0]').*(abl*[1 0 -1 0 1 0]').*(abl*[1 0 0 -1 0 1]');
0197         abl=abl(:,[1 2 1 3 5 1 4 6]);               % reorder into biquads
0198         abl(1,1:2)= abl(1,1:2)*hfg(2)/hfg(1);       % force Nyquist gain to equal 1
0199         fso.abl=abl;
0200         fso.zl=zeros(5,1);                          % space for HPF filter state
0201     end
0202     if all(mode~='h')
0203         zh=2./(szp/tan(flh(2)*pi/fs)-1)+1;     % Transform s-domain poles/zeros with bilinear transform
0204         ah=real(poly(zh(2,:)));
0205         bh=real(poly(zh(1,:)));
0206         fso.bh=bh*sum(ah)/sum(bh);
0207         fso.ah=ah;
0208         fso.zh=zeros(5,1);
0209     end
0210     if any(mode=='a')
0211         [fso.bw,fso.aw]=v_stdspectrum(2,'z',fs);
0212         fso.zw=zeros(max(length(fso.bw),length(fso.aw))-1,1);
0213     elseif any(mode=='i')
0214         [fso.bw,fso.aw]=v_stdspectrum(8,'z',fs);
0215         fso.zw=zeros(max(length(fso.bw),length(fso.aw))-1,1);
0216     end
0217 else
0218     fso=fs;             % use existing structure
0219 end
0220 md=fso.fmd;             % md is used to determine all options except 'z' which uses mode
0221 nsp=length(sp);         % original length of speech
0222 if all(mode~='z')
0223     nz=ceil(0.35*fso.ffs); % number of zeros to append
0224     sp=[sp(:);zeros(nz,1)];
0225 else
0226     nz=0;
0227 end
0228 ns=length(sp);
0229 if ns                       % process this speech chunk
0230     % apply the input filters to the speech
0231     if all(md~='0')         % implement the HPF as biquads to avoid rounding errors
0232         [sq,fso.zl(1)]=filter(fso.abl(1,1:2),fso.abl(2,1:2),sp(:),fso.zl(1));       % highpass filter: real pole/zero
0233         [sq,fso.zl(2:3)]=filter(fso.abl(1,3:5),fso.abl(2,3:5),sq(:),fso.zl(2:3));      % highpass filter: biquad 1
0234         [sq,fso.zl(4:5)]=filter(fso.abl(1,6:8),fso.abl(2,6:8),sq(:),fso.zl(4:5));      % highpass filter: biquad 2
0235     else
0236         sq=sp(:);
0237     end
0238     if all(md~='h')
0239         [sq,fso.zh]=filter(fso.bh,fso.ah,sq(:),fso.zh);     % lowpass filter
0240     end
0241     if any(md=='a') || any(md=='i')
0242         [sq,fso.zw]=filter(fso.bw,fso.aw,sq(:),fso.zw);     % weighting filter
0243     end
0244     fso.ns=fso.ns+ns;                               % count the number of speech samples
0245     fso.ss=fso.ss+sum(sq);                          % sum of speech samples (not used internally but available in fso output)
0246     ssq=sum(sq.*sq);                                % sum of squared new speech samples
0247     if ssq>0 && fso.ssq==0                          % if these are the first non-zero speech samples
0248         fso.sf=ns/ssq;                              % scale factor to normalize the mean power of this chunk to 1
0249         fso.sfdb=10*log10(fso.sf);                  % scale factor in dB
0250     end
0251     fso.ssq=fso.ssq+ssq;                            % sum of all squared speech samples
0252     [s,fso.ze]=filter(1,fso.ae,abs(sq(:)),fso.ze);     % envelope filter
0253     % we use the scale factor fso.sf in the following line to ensure that
0254     % the histogram binning is unchanged when sp is multiplied by a constant
0255     [qf,qe]=log2(fso.sf*s.^2);                      % take efficient log2 function, 2^qe is upper limit of bin
0256     qe(qf==0)=-Inf;                                 % fix zero values
0257     [qe,qk,fso.zx]=v_maxfilt(qe,1,fso.nh,1,fso.zx);    % apply the 0.2 second hangover
0258     oemax=fso.emax;                                 % previous max value of qe+1
0259     fso.emax=max(oemax,max(qe)+1);                  % update the max value of qe+1
0260     if fso.emax==-Inf                               % if all samples so far are zero
0261         fso.kc(1)=fso.kc(1)+ns;
0262     else                                            % we have had some non-zero samples
0263         qe=min(fso.emax-qe,nbin);                   % force in the range 1:nbin. Bin k has 2^(emax-k-1) <= s^2 < 2^(emax-k)
0264         wqe=ones(length(qe),1);
0265         % below: could use kc=cumsum(accumarray(qe,wqe,nbin)) but unsure about backwards compatibility
0266         kc=cumsum(full(sparse(qe,wqe,wqe,nbin,1)));     % cumulative occupancy counts
0267         esh=fso.emax-oemax;                             % amount to shift down previous bin counts
0268         if esh<nbin-1                                   % if any of the previous bins are worth keeping
0269             kc(esh+1:nbin-1)=kc(esh+1:nbin-1)+fso.kc(1:nbin-esh-1);
0270             kc(nbin)=kc(nbin)+sum(fso.kc(nbin-esh:nbin));
0271         else
0272             kc(nbin)=kc(nbin)+sum(fso.kc);              % otherwise just add all old counts into the last (lowest) bin
0273         end
0274         fso.kc=kc;
0275     end
0276 end
0277 if fso.ns>nz                       % now calculate the output values
0278     if fso.ssq>0
0279         aj=10*log10(fso.ssq*(fso.kc).^(-1)); % aj(jj) = active level if all power is in first jj bins
0280         % following line is equivalent to cj=20*log10(sqrt(2).^(fso.emax-(1:nbin)-1));
0281         cj=10*log10(2)*(fso.emax-(1:nbin)-1)-fso.sfdb;      % lower limit of bin j in dB correcting for scale factor
0282         mj=aj'-cj-thresh;
0283         jj=find(mj(1:end-1)<0 &  mj(2:end)>=0,1);           % find +ve transition through threshold
0284         if isempty(jj)                                      % if we never cross the threshold
0285             if mj(end)<=0                                   % if we end up below if
0286                 jj=length(mj)-1;            % take the threshold to be the bottom of the last (lowest) bin
0287                 jf=1;
0288             else                            % if we are always above it
0289                 jj=1;                       % take the threshold to be the bottom of the first (highest) bin
0290                 jf=0;
0291             end
0292         else
0293             jf=1/(1-mj(jj+1)/mj(jj));       % fractional part of j using linear interpolation
0294         end
0295         lev=aj(jj)+jf*(aj(jj+1)-aj(jj));    % active level in decibels
0296         lp=10.^(lev/10);                    % active level in power
0297         if any(md=='d')                     % 'd' option -> output in dB
0298             lev=[lev 10*log10(fso.ssq/fso.ns)];
0299         else                                % ~'d' option -> output in power
0300             lev=[lp fso.ssq/fso.ns];
0301         end
0302         af=fso.ssq/((fso.ns-nz)*lp);
0303     else                        % if all samples are equal to zero
0304         af=0;
0305         if any(md=='d')         % 'd' option -> output in dB
0306             lev=[-Inf -Inf];    % active level is 0 dB
0307         else                    % ~'d' option -> output in power
0308             lev=[0 0];          % active level is 0 power
0309         end
0310     end
0311     if all(md~='l')
0312         lev=lev(1);             % only output the first element of lev unless 'l' option
0313     end
0314 end
0315 if nargout>3
0316     vad=v_maxfilt(s(1:nsp),1,fso.nh,1);
0317     vad=vad>(sqrt(lp)/10^(thresh/20));
0318 end
0319 if ~nargout
0320     vad=v_maxfilt(s,1,fso.nh,1);
0321     vad=vad>(sqrt(lp)/10^(thresh/20));
0322     levdb=10*log10(lp);
0323     clf;
0324     subplot(2,2,[1 2]);
0325     tax=(1:ns)/fso.ffs;
0326     plot(tax,sp,'-y',tax,s,'-r',tax,(vad>0)*sqrt(lp),'-b');
0327     xlabel('Time (s)');
0328     title(sprintf('Active Level = %.1f dB, Activity = %.0f%% (ITU-T P.56)',levdb,100*af));
0329     v_axisenlarge([-1 -1 -1.4 -1.05]);
0330     if nz>0
0331         hold on
0332         ylim=get(gca,'ylim');
0333         plot(tax(end-nz)*[1 1],ylim,':k');
0334         hold off
0335         legend('Signal','Smoothed envelope','VAD * Active-Level','Zero-padding','Location','SouthEast');
0336     else
0337         legend('Signal','Smoothed envelope','VAD * Active-Level','Location','SouthEast');
0338     end
0339     ylabel('Amplitude');
0340     % lower right plot: diagram of threshold and active level calculation
0341     subplot(2,2,4);
0342     plot(cj,repmat(levdb,nbin,1),'k:',cj,aj(:),'-b',cj,cj,'-r',levdb-thresh*ones(1,2),[levdb-thresh levdb],'-r');
0343     xlabel('Threshold (dB)');
0344     ylabel('Active Level (dB)');
0345     legend('Active Level (AL)','Speech>Thresh','Threshold','Location','NorthWest');
0346     v_texthvc(levdb-thresh,levdb-0.5*thresh,sprintf('%.1f dB ',thresh),'rmr');
0347     v_axisenlarge([-1 -1.05]);
0348     ylim=get(gca,'ylim');
0349     set(gca,'ylim',[levdb-1.2*thresh max(ylim(2),levdb+1.9*thresh)]);
0350     kch=filter([1 -1],1,kc);
0351     % lower left plot: power histogram
0352     subplot(2,2,3);
0353     bar(5*log10(2)+cj(end:-1:1),kch(end:-1:1)*100/kc(end));
0354     set(gca,'xlim',[cj(end) cj(1)+10*log10(2)]);
0355     ylim=get(gca,'ylim');
0356     hold on
0357     plot(levdb([1 1]),ylim,'k:',levdb([1 1])-thresh,ylim,'r:');
0358     hold off
0359     v_texthvc(levdb(1),ylim(2),'AL','rtk');
0360     v_texthvc(levdb(1)-thresh,ylim(2),sprintf('Threshold'),'rtr');
0361     xlabel('Frame power (dB)')
0362     ylabel('% frames');
0363 elseif any(md=='n') || any(md=='N')             % output normalized speech waveform
0364     fsx=fso;                                    % shift along other outputs
0365     fso=af;
0366     af=lev;
0367     if any(md=='n')
0368         sq=sp;                                  % 'n' -> use unfiltered speech
0369     end
0370     if fsx.ns>0 && fsx.ssq>0                    % if there has been any non-zero speech
0371         lev=sq(1:nsp)/sqrt(lp);
0372     else
0373         lev=sq(1:nsp);
0374     end
0375 end
v_activlev

PURPOSE

SYNOPSIS

DESCRIPTION

CROSS-REFERENCE INFORMATION

SOURCE CODE