Description of v

0001 function [y,so,tax,fax,gd]=v_stftw(x,nw,m,ov,nt)
0002 %V_STFTW converts a time-domain signal into the time-frequency domain with the Short-time Fourier Transform [Y,SO,T,F]=(X,NW,M,OV)
0003 %  Usage:  (1) %%%%%% square-root-Hamming window with 50% overlap [Default options] %%%%%%
0004 %              [y,so]=v_stftw(x,nw);                % default parameters: m='rM', ov=2
0005 %              ...                                  % time-frequency domain processing goes here
0006 %              ...                                  % y only contains positive frequencies, so size(y,2)=floor(1+nw/2)
0007 %              z=v_istftw(y,so);                    % z is the same as x except for the first and last half-frames
0008 %
0009 %          (2) %%%%%% Hann window with 75% overlap, zero-pad start/end of signal %%%%%%
0010 %              [y,so]=v_stftw(x,nw,'en',4);         % Hanning window with ov=4
0011 %              ...                                  % time-frequency domain processing goes here
0012 %              z=v_istftw(y,so);                    % z is the same as x everywhere due to 'e' option
0013 %
0014 %          (3) %%%%%% Plot Spectrogram if called without outputs %%%%%%
0015 %              v_stftw(x,nw);                       % Plot spectrogram of first channel
0016 %
0017 %          (4) %%%%%% Real-time STFT-domain processing %%%%%%
0018 %              [y,so]=v_stftw([],nw,'pn',4);          % initialise v_stftw state variable, so
0019 %              io=[];                               % initialise v_istftw state variable, io
0020 %              while true                           % loop forever
0021 %                % ... [acquire input chunk x]         % acquire new chunk of input samples
0022 %                [y,so]=v_stftw(x,so,'p');           % 'p' option means more chunks to come
0023 %                % ... [process in T-F domain]      % process signal in time-frequency domain
0024 %                [z,io]=v_istftw(y,so,io);             % convert back to time domain
0025 %                % ... [output chunk z];            % output chunk of processed samples
0026 %              end
0027 
0028 %
0029 %          (5) %%%%%% Optionally process a long signal in chunks %%%%%%
0030 %              [y1,so]=v_stftw(x(1:1000),nw,'ep');      % convert chunk 1 of 3
0031 %              [y2,so]=v_stftw(x(1001:2000),so,'p'); % Options other than 'p' will be remembered
0032 %              [y3,so]=v_stftw(x(2001:end),so);      % Omit 'p' for final chunk (which can have x=[] if desired)
0033 %              y=[y1; y2; y3];                       % Concatenate to get complete T-F data
0034 %              [z1,io]=v_istftw(y(1:5,:),so);          % convert back to time domain
0035 %              [z2,io]=v_istftw(y(6:10,:),so,io);    % convert back to time domain
0036 %              z3=v_istftw(y(11:end,:),so,io);         % omit 'io' output on final chunk
0037 %              z=[z1; z2; z3];                       % z is the same as x everywhere due to 'e' option
0038 %
0039 %  Inputs: x(t,...)    input signal (one signal per column)
0040 %          nw       Window length (will be rounded up to a multiple of ov)
0041 %                       alternatively, the so output from the previous chunk's call to v_stftw
0042 %          m        mode string including window code
0043 %          ov       integer overlap factor; the frame hop is nw/ov. [2]
0044 %          nt       Optional DFT length (normally >=nw). [Default is nw as modified by 'i','I','u' options]
0045 %
0046 % Outputs: y(tf,k,...)  output STFT (frame=tf, freq=k). Number of frequencies is normally floor(1+nw/2) from DC to Nyquist
0047 %          so           structure (see below for details) used in the inverse transformation, v_istftw(), and also as the
0048 %                       nw argument in a subsequent call to v_stftw() when processing a long signal in multiple chunks.
0049 %          tax          sample numbers corresponding to the centre of each frame; divide by fs to get times
0050 %          fax          normalized centre frequencies of each bin (multiply by fs for actual frequencies)
0051 %          gd(tf,k,...) group delay in samples. A impulse at x(i,n+1) will give gd(i,:)=n. Will be infinite wherever y()=0.
0052 %
0053 % The mode string, m, is a character string containing a combination of the following options and a window-code (see below):
0054 %
0055 %             'p'   x is a partial signal which will be followed by another chunk (prevents paddding of the final frame)
0056 %
0057 %             'r'   pad final frame with reflected data if necessary [default]
0058 %             'z'   pad final frame with zeros if necessary
0059 %             't'   truncate data to an exact number of frames
0060 %             'e'   pad with samples at beginning and end of signal to include all frames with at least one signal sample
0061 %
0062 %             'i'   zero-pad by a factor of 2 to increase apparent frequency resolution
0063 %             'I'   zero-pad by a factor of 4 to increase apparent frequency resolution
0064 %             'u'   zero-pad to round transform length up to a power of 2 to speed computation
0065 %
0066 %             's'   plot spectrogram of first channel (default if no outputs)
0067 %             'S'   plot mean spectra and mean power waveforms of all channels
0068 %
0069 %            window-code   Window   mode-string  overlaps   Sidelobe  3dB-BW  6dB-BW  Falloff    Comment
0070 %
0071 %                'c'      cos(1)        s         2,3,5      -23dB     1.2     1.6  -12dB/oct   used in MP3
0072 %                'k'      kaiser(5)     boq         2        -23dB     1.2     1.7   -6dB/oct   used in AAC
0073 %                'm'      hamming       s         3,4,5      -43dB     1.3     1.8   -6dB/oct   low sidelobes [default if ov>2]
0074 %                'M'      hamming       sq        2,3,5      -24dB     1.1     1.5   -6dB/oct   sqrt Hamming  [default if ov=2]
0075 %                'n'      hann          s         3,4,5      -31dB     1.4     2.0  -18dB/oct   rapid falloff
0076 %                'R'      rectangle                 1        -13dB     0.9     1.2   -6dB/oct   narrow bandwidth [default if ov=1]
0077 %                'v'      vorbis        s           2        -21dB     1.3     1.8  -18dB/oct   used in Vorbis
0078 %                'V'      rsqvorbis     sq          2        -26dB     1.1     1.5   -6dB/oct   sqrt of raised and squared Vorbis
0079 %
0080 %                 where mode-string is the mode input to the v_windows call. For perfect reconstruction,
0081 %                 the ov input must be a multiple of one of the numbers listed in the overlaps column.
0082 %                 To see more details of a specific window, execute:
0083 %                      v_windows(window-code,55440,mode-string,param);   % Note: use window-code='w' for rsqvorbis
0084 %
0085 % The contents of the so output structure are:
0086 %
0087 %           so.mx           cumulative number of input samples
0088 %           so.mf           cumulative number of full frames output so far (excluding 'e' option prefix frames)
0089 %           so.nt           length of Fourier transform
0090 %           so.nh           frame hop in samples
0091 %           so.wa(1,nw)     analysis window (e.g. Rectangular or Hamming)
0092 %           so.po           index of final-frame padding option: 1='z', 2='r', 3='t'
0093 %           so.me           true iff 'e' option given i.e. signal is zero-padded at start and end
0094 %           so.ff           true iff no frames have yet been output
0095 %           so.mp           true iff 'p' option given i.e. there are additional chunks to follow
0096 %
0097 % Notes:
0098 %
0099 %  (1) The scaling preserves power in a 2-sided transform so that
0100 %
0101 %        p = mean(x.^2) ~=~ mean(sum(abs(yy).^2,2)) where yy=[y conj(y(:,nw-size(y,2)+1:-1:2))]
0102 %
0103 %      where yy is the double-sided spectrum. This equivalence is stochastic rather than exact.
0104 %  (2) For perfect reconstruction (see [3]), the overlap factor, ov, must be an integer multiple
0105 %      of one of the values listed above in the ov column for the selected window.
0106 %  (3) Using the default parameters, perfect reconstruction will not be obtained for the first and
0107 %      the last frames of the signal. Using the 'e' option will add padding frames to the start and
0108 %      end of the signal so that the entire signal will be reconstructed perfectly.
0109 %  (4) By default, padding at the start and end of the signal will avoid introducing a discontinuity
0110 %      by using a time-reversed reflection of the input signal. Alternatively, the 'z' option will
0111 %      pad with zeros and the 't' option will truncate the signal to fit into an exact number of frames.
0112 %  (5) To avoid temporal aliasing in the time-frequency domain , an overlap factor of ov>=4 is required for
0113 %      most windows. Using an overlap factor of ov=2 (the default) halves the number of frames to process
0114 %      but may result in aliasing for some signals although this is normally slight (see [1]).
0115 %
0116 % Refs: [1] J. B. Allen. Short term spectral analysis, synthesis, and modification by discrete Fourier transform.
0117 %           IEEE Trans. Acoustics, Speech and Signal Processing, 25 (3): 235–238, June 1977.
0118 %       [2]    R. Crochiere. A weighted overlap-add method of short-time fourier analysis/synthesis.
0119 %           IEEE Trans. Acoustics, Speech and Signal Processing, 28 (1): 99–102, 1980.
0120 %       [3]    J. Princen and A. Bradley. Analysis/synthesis filter bank design based on time domain aliasing cancellation.
0121 %           IEEE Trans. Acoustics, Speech and Signal Processing, 34 (5): 1153–1161, 1986. doi: 10.1109/TASSP.1986.1164954.
0122 %
0123 % Revision History
0124 %    2022-04-10 First version
0125 persistent nw0 nt0 wi0 w0 wt wm wp
0126 if isempty(nw0)
0127     nw0=0;
0128     wt={'rectangle','hamming','hamming','cos','kaiser','hann','rsqvorbis','vorbis'};
0129     wm={'','sq','s','s','boq','s','sq','s'};
0130     wp=[0 0 0 1 5 0 0 0];
0131 end
0132 sx=size(x);
0133 if sx(1)==1 && length(sx)==2 % transpose x if it is a row vector
0134     x=x.';
0135     sx=size(x);
0136 end
0137 nx=sx(1);                   % number of samples
0138 nc=prod(sx(2:end));         % total number of channels
0139 x=reshape(x,nx,nc);         % make input data 2-dimensional
0140 if nargin<3 || ~numel(m)    % no mode options specified
0141     m='';                   % use default modes
0142 end
0143 if isstruct(nw)             % nw is structure from previous call to v_stftw
0144     so=nw;                  % replicate into so since most fields will be identical
0145     wa=nw.wa;               % analysis window
0146     nh=nw.nh;               % frame hop (samples)
0147     po=nw.po;               % padding option
0148     nt=nw.nt;               % overwrite nt with length of transform
0149     me=nw.me;               % 'e' option given
0150     ff=nw.ff;               % first frame flag
0151     mf=nw.mf;
0152     if ~isempty(nw.xx)        % there is a data tail from a previous call
0153         x=cat(1,nw.xx,x);     % pre-append saved data
0154     end
0155     so.mx=so.mx+nx;          % cumulative signal length (using original value of nx)
0156     sx=size(x);             % recalculate sx (especially necessary if input argument x=[]
0157     nx=sx(1);               % recalculate the number of samples
0158     nw=length(wa);          % overwrite nw with the window length
0159 else
0160     if nargin<4 || ~numel(ov)
0161         ov=2;               % default overlap factor
0162     else
0163         ov=round(ov);         % force integer overlap factor
0164     end
0165     nh=ceil(nw/ov);         % force integer hop length
0166     nw=ov*nh;               % recalculate DFT length
0167     wi=find(sum(repmat(m(:),1,8)==repmat('RMmcknVv',length(m),1),1),1); % check if m specifies the window
0168     if isempty(wi)          % if no window specified
0169         wi=1+(ov>1)+(ov>2); % use 'R', 'M' or 'm'
0170     end
0171     % sort out zero-padding
0172     if nargin<5             % DFT length is not explicitly specified
0173         nt=nw;                  % default transform length
0174         if any(m=='i')          % 'i' = multiply nt by 2 for increased frequency resolution
0175             nt=2*nt;
0176         end
0177         if any(m=='I')          % 'I' = multiply nt by 4 for increased frequency resolution
0178             nt=4*nt;
0179         end
0180         if any(m=='u')          % 'u' = round up nt to a power of 2 for increased frequency resolution
0181             [fw,pw]=log2(nt);
0182             nt=pow2(pw-(fw==0.5));
0183         end
0184     end
0185     me=any(m=='e');
0186     if nw==nw0 && wi==wi0 && nt==nt0    % check if window already cached
0187         wa=w0;                          % use cached version if available
0188     else
0189         wa = v_windows(wt{wi},nw,wm{wi},wp(wi)); % only need the parameter for 'c' and 'k' windows
0190         wa=wa/sqrt(sum(wa.^2)*nt);      % scale to preserve power
0191         nw0=nw;                         % save parameters of cached window
0192         nt0=nt;
0193         wi0=wi;
0194         w0=wa;                          % save window in cache
0195     end
0196     % sort out enframing details
0197     ff=1;                               % flag to indicate the first frame
0198     po=find(sum(repmat(m(:),1,3)==repmat('zrt',length(m),1),1),1); % find padding option
0199     if isempty(po)
0200         po=2;                           % default to po=2 ('r')
0201     end
0202     % create so structure
0203     mf=0;
0204     so.wa=wa;                       % save analysis window
0205     so.po=po;                       % save padding option for final frame
0206     so.nh=nh;                       % save frame hop
0207     so.nt=nt;                       % length of transform
0208     so.me=me;                       % 'e' option given
0209     so.mx=nx;                       % cumulative signal length
0210     so.mf=0;                        % zero cumulative frames output so far (excluding prefix frames)
0211 end
0212 % now do the STFT
0213 mp=any(m=='p');                     % 'p' option given: this is not the last chunk
0214 f0=(1-nw/nh)*(me && ff);            % initial frame: either 0 or (1-ov)
0215 if nx>0                             % there are some samples
0216     if me && ~mp                    % include all frames that contain at least one sample
0217         f1=ceil(nx/nh)-1;           % last frame that include at least one sample
0218     elseif po<3 && ~mp              % ensure all samples are in at least one frame
0219         f1=max(ceil((nx-nw)/nh),0);
0220     else                            % only include full frames
0221         f1=floor((nx-nw)/nh);               % last full frame (might be negative and/or <f0)
0222     end
0223 else                                        % input signal is empty
0224     f1=-1;
0225 end
0226 nf=max((f1-f0+1)*(f1>=0),0);                % number of frames to output
0227 if nf>0                                     % if there are frames to output
0228     ff=0;                                   % turn off first frame flag
0229     indx=repmat(nh*(f0:f1).',nw,1)+reshape(repmat(0:nw-1,nf,1),nf*nw,1);    % index into x
0230     mk=indx<0 | indx>nx-1;                  % mask for values outside 0:nx-1
0231     indx=nx+0.5-abs(mod(indx,2*nx)+0.5-nx); % reflect values outside the range
0232     v=x(indx,:);                            % create output
0233     if po==1                                % padding option was 'z'
0234         v(mk,:)=0;                          % set padding values to zero
0235     end
0236     y=v_rfft(reshape(v,[nf,nw,sx(2:end)]).*repmat(wa,[nf,1,sx(2:end)]),nt,2);  % perform windowed DFT
0237 else
0238     y=zeros([0,nw,sx(2:end)]);
0239 end
0240 if nargout>1                                % need additional outputs
0241     if nf>0
0242         tax=(f0+mf:f1+mf)*nh+0.5*(1+nw);    % frame centre times (where input samples are 1,2,...)
0243     else
0244         tax=[];
0245     end
0246     fax=(0:size(y,2)-1)/nt;                 % bin centre frequencies (normalized by sample frequency)
0247     if nf>0
0248         so.xx=x((f1+1)*nh+1:end,:);            % save data tail for next chunk (starts at sample nh+1 of the last frame in y)
0249     else
0250         so.xx=x;                            % if nf==0 then data tail is entire signal (including any previous tail)
0251     end
0252     so.ff=ff;                               % first frame flag
0253     so.mp=mp;                               % flag indicating more chunks to follow
0254     so.mf=mf+nf+f0;                         % add number of additional non-prefix frames output
0255     if nargout>4                            % need group delay output
0256         if nf>0                             % if there are frames to output
0257             gd=real(v_rfft(reshape(v,[nf,nw,sx(2:end)]).*repmat(wa.*(0:nt-1),[nf,1,sx(2:end)]),nt,2)./y);  % calculate group delay
0258         else
0259             gd=zeros([0,nw,sx(2:end)]);
0260         end
0261     end
0262 end
0263 if nf>0 && (~nargout || any(lower(m)=='s')) % no outputs or 'sS' options so plot graphs
0264     ta=(f0+mf:f1+mf)*nh+0.5*(1+nw);
0265     nv=size(y,2);
0266     fa=(0:nv-1)/nt;
0267     clf;
0268     if any(m=='S')
0269         subplot(2,1,1);
0270         yp=abs(reshape(y,[nf,nv,nc])).^2;   % calculate power spectrum
0271         plot(ta,10*log10(reshape(sum(cat(2,yp,yp(:,2:floor((nt+1)/2),:)),2),nf,nc))); % add in power from negative frequencies
0272         v_axisenlarge([-1 -1.05]);
0273         xlabel('Sample Number');
0274         ylabel('Frame power (dB)');
0275         subplot(2,1,2);
0276         plot(fa,10*log10(reshape(mean(yp,1),nv,nc)));
0277         v_axisenlarge([-1 -1.05]);
0278         xlabel('Normalized Frequency');
0279         ylabel('Mean Power (dB)');
0280     else
0281         imagesc(ta,fa,20*log10(abs(y(:,:,1)))'); % plot only the first channel
0282         colorbar;
0283         axis 'xy';
0284         ylabel('Normalized Frequency');
0285         xlabel('Sample Number');
0286         v_cblabel('Energy (dB)');
0287         hold on;
0288         clim=get(gca,'clim');
0289         set(gca,'clim',[max(clim(1),clim(2)-40) clim(2)]);
0290         ylim=get(gca,'ylim');
0291         plot([0.5;0.5],ylim',':k');             % indicate start of signal
0292         if ~mp
0293             plot([1;1]*(so.mx+0.5),ylim',':k'); % indicate end of signal
0294         end
0295         hold off
0296     end
0297 end
0298
v_stftw

PURPOSE

SYNOPSIS

DESCRIPTION

CROSS-REFERENCE INFORMATION

SOURCE CODE