Home | History | Annotate | Download | only in vad
      1 /*
      2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"
     12 
     13 #include <math.h>
     14 #include <stdio.h>
     15 
     16 #include "webrtc/common_audio/fft4g.h"
     17 #include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h"
     18 #include "webrtc/modules/audio_processing/vad/pitch_internal.h"
     19 #include "webrtc/modules/audio_processing/vad/pole_zero_filter.h"
     20 extern "C" {
     21 #include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h"
     22 #include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h"
     23 #include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"
     24 #include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h"
     25 }
     26 #include "webrtc/modules/include/module_common_types.h"
     27 
     28 namespace webrtc {
     29 
     30 // The following structures are declared anonymous in iSAC's structs.h. To
     31 // forward declare them, we use this derived class trick.
     32 struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};
     33 struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};
     34 
     35 static const float kFrequencyResolution =
     36     kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize);
     37 static const int kSilenceRms = 5;
     38 
     39 // TODO(turajs): Make a Create or Init for VadAudioProc.
     40 VadAudioProc::VadAudioProc()
     41     : audio_buffer_(),
     42       num_buffer_samples_(kNumPastSignalSamples),
     43       log_old_gain_(-2),
     44       old_lag_(50),  // Arbitrary but valid as pitch-lag (in samples).
     45       pitch_analysis_handle_(new PitchAnalysisStruct),
     46       pre_filter_handle_(new PreFiltBankstr),
     47       high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator,
     48                                                kFilterOrder,
     49                                                kCoeffDenominator,
     50                                                kFilterOrder)) {
     51   static_assert(kNumPastSignalSamples + kNumSubframeSamples ==
     52                     sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),
     53                 "lpc analysis window incorrect size");
     54   static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),
     55                 "correlation weight incorrect size");
     56 
     57   // TODO(turajs): Are we doing too much in the constructor?
     58   float data[kDftSize];
     59   // Make FFT to initialize.
     60   ip_[0] = 0;
     61   WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
     62   // TODO(turajs): Need to initialize high-pass filter.
     63 
     64   // Initialize iSAC components.
     65   WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());
     66   WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());
     67 }
     68 
     69 VadAudioProc::~VadAudioProc() {
     70 }
     71 
     72 void VadAudioProc::ResetBuffer() {
     73   memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],
     74          sizeof(audio_buffer_[0]) * kNumPastSignalSamples);
     75   num_buffer_samples_ = kNumPastSignalSamples;
     76 }
     77 
     78 int VadAudioProc::ExtractFeatures(const int16_t* frame,
     79                                   size_t length,
     80                                   AudioFeatures* features) {
     81   features->num_frames = 0;
     82   if (length != kNumSubframeSamples) {
     83     return -1;
     84   }
     85 
     86   // High-pass filter to remove the DC component and very low frequency content.
     87   // We have experienced that this high-pass filtering improves voice/non-voiced
     88   // classification.
     89   if (high_pass_filter_->Filter(frame, kNumSubframeSamples,
     90                                 &audio_buffer_[num_buffer_samples_]) != 0) {
     91     return -1;
     92   }
     93 
     94   num_buffer_samples_ += kNumSubframeSamples;
     95   if (num_buffer_samples_ < kBufferLength) {
     96     return 0;
     97   }
     98   assert(num_buffer_samples_ == kBufferLength);
     99   features->num_frames = kNum10msSubframes;
    100   features->silence = false;
    101 
    102   Rms(features->rms, kMaxNumFrames);
    103   for (size_t i = 0; i < kNum10msSubframes; ++i) {
    104     if (features->rms[i] < kSilenceRms) {
    105       // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.
    106       // Bail out here instead.
    107       features->silence = true;
    108       ResetBuffer();
    109       return 0;
    110     }
    111   }
    112 
    113   PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,
    114                 kMaxNumFrames);
    115   FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);
    116   ResetBuffer();
    117   return 0;
    118 }
    119 
    120 // Computes |kLpcOrder + 1| correlation coefficients.
    121 void VadAudioProc::SubframeCorrelation(double* corr,
    122                                        size_t length_corr,
    123                                        size_t subframe_index) {
    124   assert(length_corr >= kLpcOrder + 1);
    125   double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];
    126   size_t buffer_index = subframe_index * kNumSubframeSamples;
    127 
    128   for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)
    129     windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];
    130 
    131   WebRtcIsac_AutoCorr(corr, windowed_audio,
    132                       kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder);
    133 }
    134 
    135 // Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input.
    136 // The analysis window is 15 ms long and it is centered on the first half of
    137 // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the
    138 // first half of each 10 ms subframe.
    139 void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) {
    140   assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1));
    141   double corr[kLpcOrder + 1];
    142   double reflec_coeff[kLpcOrder];
    143   for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes;
    144        i++, offset_lpc += kLpcOrder + 1) {
    145     SubframeCorrelation(corr, kLpcOrder + 1, i);
    146     corr[0] *= 1.0001;
    147     // This makes Lev-Durb a bit more stable.
    148     for (size_t k = 0; k < kLpcOrder + 1; k++) {
    149       corr[k] *= kCorrWeight[k];
    150     }
    151     WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);
    152   }
    153 }
    154 
    155 // Fit a second order curve to these 3 points and find the location of the
    156 // extremum. The points are inverted before curve fitting.
    157 static float QuadraticInterpolation(float prev_val,
    158                                     float curr_val,
    159                                     float next_val) {
    160   // Doing the interpolation in |1 / A(z)|^2.
    161   float fractional_index = 0;
    162   next_val = 1.0f / next_val;
    163   prev_val = 1.0f / prev_val;
    164   curr_val = 1.0f / curr_val;
    165 
    166   fractional_index =
    167       -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val);
    168   assert(fabs(fractional_index) < 1);
    169   return fractional_index;
    170 }
    171 
    172 // 1 / A(z), where A(z) is defined by |lpc| is a model of the spectral envelope
    173 // of the input signal. The local maximum of the spectral envelope corresponds
    174 // with the local minimum of A(z). It saves complexity, as we save one
    175 // inversion. Furthermore, we find the first local maximum of magnitude squared,
    176 // to save on one square root.
    177 void VadAudioProc::FindFirstSpectralPeaks(double* f_peak,
    178                                           size_t length_f_peak) {
    179   assert(length_f_peak >= kNum10msSubframes);
    180   double lpc[kNum10msSubframes * (kLpcOrder + 1)];
    181   // For all sub-frames.
    182   GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));
    183 
    184   const size_t kNumDftCoefficients = kDftSize / 2 + 1;
    185   float data[kDftSize];
    186 
    187   for (size_t i = 0; i < kNum10msSubframes; i++) {
    188     // Convert to float with zero pad.
    189     memset(data, 0, sizeof(data));
    190     for (size_t n = 0; n < kLpcOrder + 1; n++) {
    191       data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);
    192     }
    193     // Transform to frequency domain.
    194     WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
    195 
    196     size_t index_peak = 0;
    197     float prev_magn_sqr = data[0] * data[0];
    198     float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];
    199     float next_magn_sqr;
    200     bool found_peak = false;
    201     for (size_t n = 2; n < kNumDftCoefficients - 1; n++) {
    202       next_magn_sqr =
    203           data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];
    204       if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
    205         found_peak = true;
    206         index_peak = n - 1;
    207         break;
    208       }
    209       prev_magn_sqr = curr_magn_sqr;
    210       curr_magn_sqr = next_magn_sqr;
    211     }
    212     float fractional_index = 0;
    213     if (!found_peak) {
    214       // Checking if |kNumDftCoefficients - 1| is the local minimum.
    215       next_magn_sqr = data[1] * data[1];
    216       if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
    217         index_peak = kNumDftCoefficients - 1;
    218       }
    219     } else {
    220       // A peak is found, do a simple quadratic interpolation to get a more
    221       // accurate estimate of the peak location.
    222       fractional_index =
    223           QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);
    224     }
    225     f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;
    226   }
    227 }
    228 
    229 // Using iSAC functions to estimate pitch gains & lags.
    230 void VadAudioProc::PitchAnalysis(double* log_pitch_gains,
    231                                  double* pitch_lags_hz,
    232                                  size_t length) {
    233   // TODO(turajs): This can be "imported" from iSAC & and the next two
    234   // constants.
    235   assert(length >= kNum10msSubframes);
    236   const int kNumPitchSubframes = 4;
    237   double gains[kNumPitchSubframes];
    238   double lags[kNumPitchSubframes];
    239 
    240   const int kNumSubbandFrameSamples = 240;
    241   const int kNumLookaheadSamples = 24;
    242 
    243   float lower[kNumSubbandFrameSamples];
    244   float upper[kNumSubbandFrameSamples];
    245   double lower_lookahead[kNumSubbandFrameSamples];
    246   double upper_lookahead[kNumSubbandFrameSamples];
    247   double lower_lookahead_pre_filter[kNumSubbandFrameSamples +
    248                                     kNumLookaheadSamples];
    249 
    250   // Split signal to lower and upper bands
    251   WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower,
    252                                  upper, lower_lookahead, upper_lookahead,
    253                                  pre_filter_handle_.get());
    254   WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,
    255                            pitch_analysis_handle_.get(), lags, gains);
    256 
    257   // Lags are computed on lower-band signal with sampling rate half of the
    258   // input signal.
    259   GetSubframesPitchParameters(
    260       kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,
    261       &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);
    262 }
    263 
    264 void VadAudioProc::Rms(double* rms, size_t length_rms) {
    265   assert(length_rms >= kNum10msSubframes);
    266   size_t offset = kNumPastSignalSamples;
    267   for (size_t i = 0; i < kNum10msSubframes; i++) {
    268     rms[i] = 0;
    269     for (size_t n = 0; n < kNumSubframeSamples; n++, offset++)
    270       rms[i] += audio_buffer_[offset] * audio_buffer_[offset];
    271     rms[i] = sqrt(rms[i] / kNumSubframeSamples);
    272   }
    273 }
    274 
    275 }  // namespace webrtc
    276