Home | History | Annotate | Download | only in vad
      1 /*
      2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "webrtc/modules/audio_processing/vad/pitch_based_vad.h"
     12 
     13 #include <assert.h>
     14 #include <math.h>
     15 #include <string.h>
     16 
     17 #include "webrtc/modules/audio_processing/vad/vad_circular_buffer.h"
     18 #include "webrtc/modules/audio_processing/vad/common.h"
     19 #include "webrtc/modules/audio_processing/vad/noise_gmm_tables.h"
     20 #include "webrtc/modules/audio_processing/vad/voice_gmm_tables.h"
     21 #include "webrtc/modules/include/module_common_types.h"
     22 
     23 namespace webrtc {
     24 
     25 static_assert(kNoiseGmmDim == kVoiceGmmDim,
     26               "noise and voice gmm dimension not equal");
     27 
     28 // These values should match MATLAB counterparts for unit-tests to pass.
     29 static const int kPosteriorHistorySize = 500;  // 5 sec of 10 ms frames.
     30 static const double kInitialPriorProbability = 0.3;
     31 static const int kTransientWidthThreshold = 7;
     32 static const double kLowProbabilityThreshold = 0.2;
     33 
     34 static double LimitProbability(double p) {
     35   const double kLimHigh = 0.99;
     36   const double kLimLow = 0.01;
     37 
     38   if (p > kLimHigh)
     39     p = kLimHigh;
     40   else if (p < kLimLow)
     41     p = kLimLow;
     42   return p;
     43 }
     44 
     45 PitchBasedVad::PitchBasedVad()
     46     : p_prior_(kInitialPriorProbability),
     47       circular_buffer_(VadCircularBuffer::Create(kPosteriorHistorySize)) {
     48   // Setup noise GMM.
     49   noise_gmm_.dimension = kNoiseGmmDim;
     50   noise_gmm_.num_mixtures = kNoiseGmmNumMixtures;
     51   noise_gmm_.weight = kNoiseGmmWeights;
     52   noise_gmm_.mean = &kNoiseGmmMean[0][0];
     53   noise_gmm_.covar_inverse = &kNoiseGmmCovarInverse[0][0][0];
     54 
     55   // Setup voice GMM.
     56   voice_gmm_.dimension = kVoiceGmmDim;
     57   voice_gmm_.num_mixtures = kVoiceGmmNumMixtures;
     58   voice_gmm_.weight = kVoiceGmmWeights;
     59   voice_gmm_.mean = &kVoiceGmmMean[0][0];
     60   voice_gmm_.covar_inverse = &kVoiceGmmCovarInverse[0][0][0];
     61 }
     62 
     63 PitchBasedVad::~PitchBasedVad() {
     64 }
     65 
     66 int PitchBasedVad::VoicingProbability(const AudioFeatures& features,
     67                                       double* p_combined) {
     68   double p;
     69   double gmm_features[3];
     70   double pdf_features_given_voice;
     71   double pdf_features_given_noise;
     72   // These limits are the same in matlab implementation 'VoicingProbGMM().'
     73   const double kLimLowLogPitchGain = -2.0;
     74   const double kLimHighLogPitchGain = -0.9;
     75   const double kLimLowSpectralPeak = 200;
     76   const double kLimHighSpectralPeak = 2000;
     77   const double kEps = 1e-12;
     78   for (size_t n = 0; n < features.num_frames; n++) {
     79     gmm_features[0] = features.log_pitch_gain[n];
     80     gmm_features[1] = features.spectral_peak[n];
     81     gmm_features[2] = features.pitch_lag_hz[n];
     82 
     83     pdf_features_given_voice = EvaluateGmm(gmm_features, voice_gmm_);
     84     pdf_features_given_noise = EvaluateGmm(gmm_features, noise_gmm_);
     85 
     86     if (features.spectral_peak[n] < kLimLowSpectralPeak ||
     87         features.spectral_peak[n] > kLimHighSpectralPeak ||
     88         features.log_pitch_gain[n] < kLimLowLogPitchGain) {
     89       pdf_features_given_voice = kEps * pdf_features_given_noise;
     90     } else if (features.log_pitch_gain[n] > kLimHighLogPitchGain) {
     91       pdf_features_given_noise = kEps * pdf_features_given_voice;
     92     }
     93 
     94     p = p_prior_ * pdf_features_given_voice /
     95         (pdf_features_given_voice * p_prior_ +
     96          pdf_features_given_noise * (1 - p_prior_));
     97 
     98     p = LimitProbability(p);
     99 
    100     // Combine pitch-based probability with standalone probability, before
    101     // updating prior probabilities.
    102     double prod_active = p * p_combined[n];
    103     double prod_inactive = (1 - p) * (1 - p_combined[n]);
    104     p_combined[n] = prod_active / (prod_active + prod_inactive);
    105 
    106     if (UpdatePrior(p_combined[n]) < 0)
    107       return -1;
    108     // Limit prior probability. With a zero prior probability the posterior
    109     // probability is always zero.
    110     p_prior_ = LimitProbability(p_prior_);
    111   }
    112   return 0;
    113 }
    114 
    115 int PitchBasedVad::UpdatePrior(double p) {
    116   circular_buffer_->Insert(p);
    117   if (circular_buffer_->RemoveTransient(kTransientWidthThreshold,
    118                                         kLowProbabilityThreshold) < 0)
    119     return -1;
    120   p_prior_ = circular_buffer_->Mean();
    121   return 0;
    122 }
    123 
    124 }  // namespace webrtc
    125