/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "webrtc/common_audio/vad/vad_core.h"

#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
#include "webrtc/common_audio/vad/vad_filterbank.h"
#include "webrtc/common_audio/vad/vad_gmm.h"
#include "webrtc/common_audio/vad/vad_sp.h"
#include "webrtc/typedefs.h"

// Spectrum Weighting
static const int16_t kSpectrumWeight[kNumChannels] = { 6, 8, 10, 12, 14, 16 };
static const int16_t kNoiseUpdateConst = 655; // Q15
static const int16_t kSpeechUpdateConst = 6554; // Q15
static const int16_t kBackEta = 154; // Q8
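// (In the Qn notation used throughout this file, a stored value c
// represents the real value c / 2^n; e.g., 655 in Q15 is ~0.02, 6554 in Q15
// is ~0.2 and 154 in Q8 is ~0.6.)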
// Minimum difference between the two models, Q5
static const int16_t kMinimumDifference[kNumChannels] = {
    544, 544, 576, 576, 576, 576 };
// Upper limit of mean value for speech model, Q7
static const int16_t kMaximumSpeech[kNumChannels] = {
    11392, 11392, 11520, 11520, 11520, 11520 };
// Minimum value for mean value
static const int16_t kMinimumMean[kNumGaussians] = { 640, 768 };
// Upper limit of mean value for noise model, Q7
static const int16_t kMaximumNoise[kNumChannels] = {
    9216, 9088, 8960, 8832, 8704, 8576 };
// Start values for the Gaussian models, Q7
// Weights for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataWeights[kTableSize] = {
    34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
// Weights for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataWeights[kTableSize] = {
    48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
// Means for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataMeans[kTableSize] = {
    6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
// Means for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataMeans[kTableSize] = {
    8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
};
// Stds for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataStds[kTableSize] = {
    378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
// Stds for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataStds[kTableSize] = {
    555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };

// Constants used in GmmProbability().
//
// Maximum number of counted speech (VAD = 1) frames in a row.
static const int16_t kMaxSpeechFrames = 6;
// Minimum standard deviation for both speech and noise.
static const int16_t kMinStd = 384;

// Constants in WebRtcVad_InitCore().
// Default aggressiveness mode.
static const short kDefaultMode = 0;
static const int kInitCheck = 42;

// Constants used in WebRtcVad_set_mode_core().
//
// Thresholds for different frame lengths (10 ms, 20 ms and 30 ms).
//
// Mode 0, Quality.
static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 };
static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 };
static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 };
static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 };
// Mode 1, Low bitrate.
static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 };
static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 };
static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 };
static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 };
// Mode 2, Aggressive.
static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 };
static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 };
static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 };
static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 };
// Mode 3, Very aggressive.
static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 };
static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 };
static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };
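// Note that higher local/global thresholds make it harder for a frame to be
// classified as speech, which is what makes the aggressive modes more
// aggressive.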

// Calculates the weighted average w.r.t. number of Gaussians. The |data| is
// updated with an |offset| before averaging.
//
// - data     [i/o] : Data to average.
// - offset   [i]   : An offset added to |data|.
// - weights  [i]   : Weights used for averaging.
//
// returns          : The weighted average.
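//
// For example, with |kNumGaussians| == 2, passing &means[channel] averages
// means[channel] and means[channel + kNumChannels] (the two Gaussians of
// that channel), each weighted by the matching Q7 weight, so a Q7 input
// gives a Q14 (= Q7 * Q7) result.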
static int32_t WeightedAverage(int16_t* data, int16_t offset,
                               const int16_t* weights) {
  int k;
  int32_t weighted_average = 0;

  for (k = 0; k < kNumGaussians; k++) {
    data[k * kNumChannels] += offset;
    weighted_average += data[k * kNumChannels] * weights[k * kNumChannels];
  }
  return weighted_average;
}

// Calculates the probabilities for both speech and background noise using
// Gaussian Mixture Models (GMM). A hypothesis test is performed to decide
// which type of signal is most probable.
//
// - self           [i/o] : Pointer to VAD instance
// - features       [i]   : Feature vector of length |kNumChannels|
//                          = log10(energy in frequency band)
// - total_power    [i]   : Total power in audio frame.
// - frame_length   [i]   : Number of input samples
//
// - returns              : the VAD decision (0 - noise, > 0 - speech).
static int16_t GmmProbability(VadInstT* self, int16_t* features,
                              int16_t total_power, size_t frame_length) {
  int channel, k;
  int16_t feature_minimum;
  int16_t h0, h1;
  int16_t log_likelihood_ratio;
  int16_t vadflag = 0;
  int16_t shifts_h0, shifts_h1;
  int16_t tmp_s16, tmp1_s16, tmp2_s16;
  int16_t diff;
  int gaussian;
  int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
  int16_t delt, ndelt;
  int16_t maxspe, maxmu;
  int16_t deltaN[kTableSize], deltaS[kTableSize];
  int16_t ngprvec[kTableSize] = { 0 };  // Conditional probability = 0.
  int16_t sgprvec[kTableSize] = { 0 };  // Conditional probability = 0.
  int32_t h0_test, h1_test;
  int32_t tmp1_s32, tmp2_s32;
  int32_t sum_log_likelihood_ratios = 0;
  int32_t noise_global_mean, speech_global_mean;
  int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
  int16_t overhead1, overhead2, individualTest, totalTest;

  // Set various thresholds based on frame lengths (80, 160 or 240 samples).
  if (frame_length == 80) {
    overhead1 = self->over_hang_max_1[0];
    overhead2 = self->over_hang_max_2[0];
    individualTest = self->individual[0];
    totalTest = self->total[0];
  } else if (frame_length == 160) {
    overhead1 = self->over_hang_max_1[1];
    overhead2 = self->over_hang_max_2[1];
    individualTest = self->individual[1];
    totalTest = self->total[1];
  } else {
    overhead1 = self->over_hang_max_1[2];
    overhead2 = self->over_hang_max_2[2];
    individualTest = self->individual[2];
    totalTest = self->total[2];
  }

  if (total_power > kMinEnergy) {
    // The signal power of the current frame is large enough for processing.
    // The processing consists of two parts:
    // 1) Calculating the likelihood of speech and thereby a VAD decision.
    // 2) Updating the underlying model w.r.t. the decision made.

    // The detection scheme is an LRT with hypothesis
    // H0: Noise
    // H1: Speech
    //
    // We combine a global LRT with local tests, for each frequency sub-band,
    // here defined as |channel|.
    for (channel = 0; channel < kNumChannels; channel++) {
      // For each channel we model the probability with a GMM consisting of
      // |kNumGaussians|, with different means and standard deviations
      // depending on H0 or H1.
      h0_test = 0;
      h1_test = 0;
      for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;
        // Probability under H0, that is, probability of frame being noise.
        // Value given in Q27 = Q7 * Q20.
        tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
                                                 self->noise_means[gaussian],
                                                 self->noise_stds[gaussian],
                                                 &deltaN[gaussian]);
        noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32;
        h0_test += noise_probability[k];  // Q27

        // Probability under H1, that is, probability of frame being speech.
        // Value given in Q27 = Q7 * Q20.
        tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
                                                 self->speech_means[gaussian],
                                                 self->speech_stds[gaussian],
                                                 &deltaS[gaussian]);
        speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32;
        h1_test += speech_probability[k];  // Q27
      }

      // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H0}).
      // Approximation:
      // log2(Pr{X|H1} / Pr{X|H0}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H0}*2^Q)
      //                           = log2(h1_test) - log2(h0_test)
      //                           = log2(2^(31-shifts_h1)*(1+b1))
      //                             - log2(2^(31-shifts_h0)*(1+b0))
      //                           = shifts_h0 - shifts_h1
      //                             + log2(1+b1) - log2(1+b0)
      //                          ~= shifts_h0 - shifts_h1
      //
      // Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1.
      // Further, b0 and b1 are independent and on the average the two terms
      // cancel.
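      //
      // For example, if h1_test ~= 2^20 and h0_test ~= 2^14, then
      // shifts_h1 = 10 and shifts_h0 = 16, so the ratio becomes
      // 16 - 10 = 6 ~= log2(2^20 / 2^14).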
      shifts_h0 = WebRtcSpl_NormW32(h0_test);
      shifts_h1 = WebRtcSpl_NormW32(h1_test);
      if (h0_test == 0) {
        shifts_h0 = 31;
      }
      if (h1_test == 0) {
        shifts_h1 = 31;
      }
      log_likelihood_ratio = shifts_h0 - shifts_h1;

      // Update |sum_log_likelihood_ratios| with spectrum weighting. This is
      // used for the global VAD decision.
      sum_log_likelihood_ratios +=
          (int32_t) (log_likelihood_ratio * kSpectrumWeight[channel]);

      // Local VAD decision.
      if ((log_likelihood_ratio << 2) > individualTest) {
        vadflag = 1;
      }

      // TODO(bjornv): The conditional probabilities below assume the number
      // of Gaussians is hard coded to two. Find a way to generalize.
      // Calculate local noise probabilities used later when updating the GMM.
      h0 = (int16_t) (h0_test >> 12);  // Q15
      if (h0 > 0) {
        // High probability of noise. Assign conditional probabilities for each
        // Gaussian in the GMM.
        tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2;  // Q29
        ngprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h0);  // Q14
        ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel];
      } else {
        // Low noise probability. Assign conditional probability 1 to the first
        // Gaussian and 0 to the rest (which is already set at initialization).
        ngprvec[channel] = 16384;
      }

      // Calculate local speech probabilities used later when updating the GMM.
      h1 = (int16_t) (h1_test >> 12);  // Q15
      if (h1 > 0) {
        // High probability of speech. Assign conditional probabilities for each
        // Gaussian in the GMM. Otherwise use the initialized values, i.e., 0.
        tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2;  // Q29
        sgprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h1);  // Q14
        sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel];
      }
    }

    // Make a global VAD decision.
    vadflag |= (sum_log_likelihood_ratios >= totalTest);

    // Update the model parameters.
    maxspe = 12800;
    for (channel = 0; channel < kNumChannels; channel++) {

      // Get the minimum value in the past, used for long term correction, in Q4.
      feature_minimum = WebRtcVad_FindMinimum(self, features[channel], channel);

      // Compute the "global" mean, that is, the weighted sum of the two means.
      noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                          &kNoiseDataWeights[channel]);
      tmp1_s16 = (int16_t) (noise_global_mean >> 6);  // Q8

      for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;

        nmk = self->noise_means[gaussian];
        smk = self->speech_means[gaussian];
        nsk = self->noise_stds[gaussian];
        ssk = self->speech_stds[gaussian];

        // Update noise mean vector if the frame consists of noise only.
        nmk2 = nmk;
        if (!vadflag) {
          // deltaN = (x-mu)/sigma^2
          // ngprvec[k] = |noise_probability[k]| /
          //   (|noise_probability[0]| + |noise_probability[1]|)

          // (Q14 * Q11 >> 11) = Q14.
          delt = (int16_t)((ngprvec[gaussian] * deltaN[gaussian]) >> 11);
          // Q7 + (Q14 * Q15 >> 22) = Q7.
          nmk2 = nmk + (int16_t)((delt * kNoiseUpdateConst) >> 22);
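          // With kNoiseUpdateConst = 655 in Q15 (~0.02), this moves the
          // noise mean toward the observation by roughly 2 % of the
          // probability-weighted normalized distance per noise-only frame.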
        }

        // Long term correction of the noise mean.
        // Q8 - Q8 = Q8.
        ndelt = (feature_minimum << 4) - tmp1_s16;
        // Q7 + (Q8 * Q8) >> 9 = Q7.
        nmk3 = nmk2 + (int16_t)((ndelt * kBackEta) >> 9);

        // Control that the noise mean does not drift too much.
        tmp_s16 = (int16_t) ((k + 5) << 7);
        if (nmk3 < tmp_s16) {
          nmk3 = tmp_s16;
        }
        tmp_s16 = (int16_t) ((72 + k - channel) << 7);
        if (nmk3 > tmp_s16) {
          nmk3 = tmp_s16;
        }
        self->noise_means[gaussian] = nmk3;

        if (vadflag) {
          // Update speech mean vector:
          // |deltaS| = (x-mu)/sigma^2
          // sgprvec[k] = |speech_probability[k]| /
          //   (|speech_probability[0]| + |speech_probability[1]|)

          // (Q14 * Q11) >> 11 = Q14.
          delt = (int16_t)((sgprvec[gaussian] * deltaS[gaussian]) >> 11);
          // Q14 * Q15 >> 21 = Q8.
          tmp_s16 = (int16_t)((delt * kSpeechUpdateConst) >> 21);
          // Q7 + (Q8 >> 1) = Q7. With rounding.
          smk2 = smk + ((tmp_s16 + 1) >> 1);

          // Control that the speech mean does not drift too much.
          maxmu = maxspe + 640;
          if (smk2 < kMinimumMean[k]) {
            smk2 = kMinimumMean[k];
          }
          if (smk2 > maxmu) {
            smk2 = maxmu;
          }
          self->speech_means[gaussian] = smk2;  // Q7.

          // (Q7 >> 3) = Q4. With rounding.
          tmp_s16 = ((smk + 4) >> 3);

          tmp_s16 = features[channel] - tmp_s16;  // Q4
          // (Q11 * Q4 >> 3) = Q12.
          tmp1_s32 = (deltaS[gaussian] * tmp_s16) >> 3;
          tmp2_s32 = tmp1_s32 - 4096;
          tmp_s16 = sgprvec[gaussian] >> 2;
          // (Q14 >> 2) * Q12 = Q24.
          tmp1_s32 = tmp_s16 * tmp2_s32;

          tmp2_s32 = tmp1_s32 >> 4;  // Q20

          // 0.1 * Q20 / Q7 = Q13.
          if (tmp2_s32 > 0) {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp2_s32, ssk * 10);
          } else {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp2_s32, ssk * 10);
            tmp_s16 = -tmp_s16;
          }
          // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
          // Note that division by 4 equals shift by 2, hence,
          // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
          tmp_s16 += 128;  // Rounding.
          ssk += (tmp_s16 >> 8);
          if (ssk < kMinStd) {
            ssk = kMinStd;
          }
          self->speech_stds[gaussian] = ssk;
        } else {
          // Update GMM variance vectors.
          // deltaN * (features[channel] - nmk) - 1
          // Q4 - (Q7 >> 3) = Q4.
          tmp_s16 = features[channel] - (nmk >> 3);
          // (Q11 * Q4 >> 3) = Q12.
          tmp1_s32 = (deltaN[gaussian] * tmp_s16) >> 3;
          tmp1_s32 -= 4096;

          // (Q14 >> 2) * Q12 = Q24.
          tmp_s16 = (ngprvec[gaussian] + 2) >> 2;
          tmp2_s32 = tmp_s16 * tmp1_s32;
          // Q20  * approx 0.001 (2^-10=0.0009766), hence,
          // (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
          tmp1_s32 = tmp2_s32 >> 14;

          // Q20 / Q7 = Q13.
          if (tmp1_s32 > 0) {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, nsk);
          } else {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp1_s32, nsk);
            tmp_s16 = -tmp_s16;
          }
          tmp_s16 += 32;  // Rounding
          nsk += tmp_s16 >> 6;  // Q13 >> 6 = Q7.
          if (nsk < kMinStd) {
            nsk = kMinStd;
          }
          self->noise_stds[gaussian] = nsk;
        }
      }

      // Separate models if they are too close.
      // |noise_global_mean| in Q14 (= Q7 * Q7).
      noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                          &kNoiseDataWeights[channel]);

      // |speech_global_mean| in Q14 (= Q7 * Q7).
      speech_global_mean = WeightedAverage(&self->speech_means[channel], 0,
                                           &kSpeechDataWeights[channel]);

      // |diff| = "global" speech mean - "global" noise mean.
      // (Q14 >> 9) - (Q14 >> 9) = Q5.
      diff = (int16_t) (speech_global_mean >> 9) -
          (int16_t) (noise_global_mean >> 9);
      if (diff < kMinimumDifference[channel]) {
        tmp_s16 = kMinimumDifference[channel] - diff;

        // |tmp1_s16| = ~0.8 * (kMinimumDifference - diff) in Q7.
        // |tmp2_s16| = ~0.2 * (kMinimumDifference - diff) in Q7.
        tmp1_s16 = (int16_t)((13 * tmp_s16) >> 2);
        tmp2_s16 = (int16_t)((3 * tmp_s16) >> 2);
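        // (With the Q5 input rescaled to Q7, (13 * x) >> 2 equals
        // 13/16 = 0.8125 of x, and (3 * x) >> 2 equals 3/16 = 0.1875 of x.)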

        // Move Gaussian means for speech model by |tmp1_s16| and update
        // |speech_global_mean|. Note that |self->speech_means[channel]| is
        // changed after the call.
        speech_global_mean = WeightedAverage(&self->speech_means[channel],
                                             tmp1_s16,
                                             &kSpeechDataWeights[channel]);

        // Move Gaussian means for noise model by -|tmp2_s16| and update
        // |noise_global_mean|. Note that |self->noise_means[channel]| is
        // changed after the call.
        noise_global_mean = WeightedAverage(&self->noise_means[channel],
                                            -tmp2_s16,
                                            &kNoiseDataWeights[channel]);
      }

      // Control that the speech & noise means do not drift too much.
      maxspe = kMaximumSpeech[channel];
      tmp2_s16 = (int16_t) (speech_global_mean >> 7);
      if (tmp2_s16 > maxspe) {
        // Upper limit of speech model.
        tmp2_s16 -= maxspe;

        for (k = 0; k < kNumGaussians; k++) {
          self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
        }
      }

      tmp2_s16 = (int16_t) (noise_global_mean >> 7);
      if (tmp2_s16 > kMaximumNoise[channel]) {
        tmp2_s16 -= kMaximumNoise[channel];

        for (k = 0; k < kNumGaussians; k++) {
          self->noise_means[channel + k * kNumChannels] -= tmp2_s16;
        }
      }
    }
    self->frame_counter++;
  }

  // Smooth with respect to transition hysteresis.
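  //
  // If the previous frames were speech, a noise-classified frame is still
  // reported as speech while |over_hang| counts down. Note that the value
  // returned in that case is 2 + the remaining hangover count, so callers
  // should treat any value > 0 as speech.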
  if (!vadflag) {
    if (self->over_hang > 0) {
      vadflag = 2 + self->over_hang;
      self->over_hang--;
    }
    self->num_of_speech = 0;
  } else {
    self->num_of_speech++;
    if (self->num_of_speech > kMaxSpeechFrames) {
      self->num_of_speech = kMaxSpeechFrames;
      self->over_hang = overhead2;
    } else {
      self->over_hang = overhead1;
    }
  }
  return vadflag;
}

// Initialize the VAD. Set aggressiveness mode to default value.
int WebRtcVad_InitCore(VadInstT* self) {
  int i;

  if (self == NULL) {
    return -1;
  }

  // Initialization of general struct variables.
  self->vad = 1;  // Speech active (=1).
  self->frame_counter = 0;
  self->over_hang = 0;
  self->num_of_speech = 0;

  // Initialization of downsampling filter state.
  memset(self->downsampling_filter_states, 0,
         sizeof(self->downsampling_filter_states));

  // Initialization of 48 to 8 kHz downsampling.
  WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);

  // Read initial PDF parameters.
  for (i = 0; i < kTableSize; i++) {
    self->noise_means[i] = kNoiseDataMeans[i];
    self->speech_means[i] = kSpeechDataMeans[i];
    self->noise_stds[i] = kNoiseDataStds[i];
    self->speech_stds[i] = kSpeechDataStds[i];
  }

  // Initialize Index and Minimum value vectors.
  for (i = 0; i < 16 * kNumChannels; i++) {
    self->low_value_vector[i] = 10000;
    self->index_vector[i] = 0;
  }

  // Initialize splitting filter states.
  memset(self->upper_state, 0, sizeof(self->upper_state));
  memset(self->lower_state, 0, sizeof(self->lower_state));

  // Initialize high pass filter states.
  memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));

  // Initialize mean value memory, for WebRtcVad_FindMinimum().
  for (i = 0; i < kNumChannels; i++) {
    self->mean_value[i] = 1600;
  }

  // Set aggressiveness mode to default (=|kDefaultMode|).
  if (WebRtcVad_set_mode_core(self, kDefaultMode) != 0) {
    return -1;
  }

  self->init_flag = kInitCheck;

  return 0;
}

// Set aggressiveness mode.
int WebRtcVad_set_mode_core(VadInstT* self, int mode) {
  int return_value = 0;

  switch (mode) {
    case 0:
      // Quality mode.
      memcpy(self->over_hang_max_1, kOverHangMax1Q,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2Q,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdQ,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdQ,
             sizeof(self->total));
      break;
    case 1:
      // Low bitrate mode.
      memcpy(self->over_hang_max_1, kOverHangMax1LBR,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2LBR,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdLBR,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdLBR,
             sizeof(self->total));
      break;
    case 2:
      // Aggressive mode.
      memcpy(self->over_hang_max_1, kOverHangMax1AGG,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2AGG,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdAGG,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdAGG,
             sizeof(self->total));
      break;
    case 3:
      // Very aggressive mode.
      memcpy(self->over_hang_max_1, kOverHangMax1VAG,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2VAG,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdVAG,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdVAG,
             sizeof(self->total));
      break;
    default:
      return_value = -1;
      break;
  }

  return return_value;
}

// Calculate the VAD decision by first extracting feature values and then
// calculating the probability for both speech and background noise.

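// A minimal calling sketch (hypothetical caller code, assuming one 30 ms
// frame at 48 kHz, i.e., 1440 samples):
//
//   VadInstT inst;
//   int16_t frame[1440];  // Filled with audio samples.
//   if (WebRtcVad_InitCore(&inst) == 0) {
//     int vad = WebRtcVad_CalcVad48khz(&inst, frame, 1440);
//     // vad == 0: noise, vad > 0: speech.
//   }
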
int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length) {
  int vad;
  size_t i;
  int16_t speech_nb[240];  // 30 ms in 8 kHz.
  // |tmp_mem| is temporary memory used by the resample function; its length
  // is the frame length in 10 ms (480 samples) + 256 extra.
  int32_t tmp_mem[480 + 256] = { 0 };
  const size_t kFrameLen10ms48khz = 480;
  const size_t kFrameLen10ms8khz = 80;
  size_t num_10ms_frames = frame_length / kFrameLen10ms48khz;

  // Downsample each 10 ms block of the input from 48 kHz to 8 kHz.
  for (i = 0; i < num_10ms_frames; i++) {
    WebRtcSpl_Resample48khzTo8khz(&speech_frame[i * kFrameLen10ms48khz],
                                  &speech_nb[i * kFrameLen10ms8khz],
                                  &inst->state_48_to_8,
                                  tmp_mem);
  }

  // Do VAD on an 8 kHz signal.
  vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6);

  return vad;
}

int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length)
{
    size_t len;
    int vad;
    int16_t speechWB[480]; // Downsampled speech frame: 480 samples
                           // (30 ms in WB), from a 960-sample SWB frame.
    int16_t speechNB[240]; // Downsampled speech frame: 240 samples
                           // (30 ms in NB).

    // Downsample signal 32->16->8 kHz before doing VAD.
    WebRtcVad_Downsampling(speech_frame, speechWB,
                           &(inst->downsampling_filter_states[2]),
                           frame_length);
    len = frame_length / 2;

    WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states,
                           len);
    len /= 2;

    // Do VAD on an 8 kHz signal.
    vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

    return vad;
}

int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length)
{
    size_t len;
    int vad;
    int16_t speechNB[240]; // Downsampled speech frame: 240 samples
                           // (30 ms in NB), from a 480-sample WB frame.

    // Wideband: Downsample signal before doing VAD.
    WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states,
                           frame_length);

    len = frame_length / 2;
    vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

    return vad;
}

int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame,
                          size_t frame_length)
{
    int16_t feature_vector[kNumChannels], total_power;

    // Get the power in the frequency bands.
    total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
                                              feature_vector);

    // Make a VAD decision.
    inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length);

    return inst->vad;
}
    677