/*
 *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


/*
 * This file contains the implementation of the core VAD functionality.
 * For function descriptions, see vad_core.h.
 */

#include "vad_core.h"

#include "signal_processing_library.h"
#include "typedefs.h"
#include "vad_defines.h"
#include "vad_filterbank.h"
#include "vad_gmm.h"
#include "vad_sp.h"

// Spectrum weighting
static const WebRtc_Word16 kSpectrumWeight[6] = { 6, 8, 10, 12, 14, 16 };
static const WebRtc_Word16 kNoiseUpdateConst = 655; // Q15
static const WebRtc_Word16 kSpeechUpdateConst = 6554; // Q15
static const WebRtc_Word16 kBackEta = 154; // Q8
// Minimum difference between the two models, Q5
static const WebRtc_Word16 kMinimumDifference[6] = {
    544, 544, 576, 576, 576, 576 };
// Upper limit of mean value for speech model, Q7
static const WebRtc_Word16 kMaximumSpeech[6] = {
    11392, 11392, 11520, 11520, 11520, 11520 };
// Minimum mean value for the speech model, Q7
static const WebRtc_Word16 kMinimumMean[2] = { 640, 768 };
// Upper limit of mean value for noise model, Q7
static const WebRtc_Word16 kMaximumNoise[6] = {
    9216, 9088, 8960, 8832, 8704, 8576 };
// Start values for the Gaussian models, Q7
// Weights for the two Gaussians for the six channels (noise)
static const WebRtc_Word16 kNoiseDataWeights[12] = {
    34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
// Weights for the two Gaussians for the six channels (speech)
static const WebRtc_Word16 kSpeechDataWeights[12] = {
    48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
// Means for the two Gaussians for the six channels (noise)
static const WebRtc_Word16 kNoiseDataMeans[12] = {
    6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
// Means for the two Gaussians for the six channels (speech)
static const WebRtc_Word16 kSpeechDataMeans[12] = {
    8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
};
// Stds for the two Gaussians for the six channels (noise)
static const WebRtc_Word16 kNoiseDataStds[12] = {
    378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
// Stds for the two Gaussians for the six channels (speech)
static const WebRtc_Word16 kSpeechDataStds[12] = {
    555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };

static const int kInitCheck = 42;
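
// Layout note: each 12-entry table above stores two Gaussians per channel;
// index n holds the first Gaussian of channel n and index n + NUM_CHANNELS
// the second. As a minimal illustration of the Q-format annotations (not part
// of the original sources), a Qm value v represents v / 2^m:
//
//   double q_to_double(WebRtc_Word16 v, int m) { return v / (double)(1 << m); }
//
// e.g. kNoiseUpdateConst = 655 in Q15 is 655 / 32768 ~= 0.02, the step size
// used below when adapting the noise means.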

// Initialize VAD
int WebRtcVad_InitCore(VadInstT *inst, short mode)
{
    int i;

    // Initialization of struct
    inst->vad = 1;
    inst->frame_counter = 0;
    inst->over_hang = 0;
    inst->num_of_speech = 0;

    // Initialization of downsampling filter state
    inst->downsampling_filter_states[0] = 0;
    inst->downsampling_filter_states[1] = 0;
    inst->downsampling_filter_states[2] = 0;
    inst->downsampling_filter_states[3] = 0;

    // Read initial PDF parameters
    for (i = 0; i < NUM_TABLE_VALUES; i++)
    {
        inst->noise_means[i] = kNoiseDataMeans[i];
        inst->speech_means[i] = kSpeechDataMeans[i];
        inst->noise_stds[i] = kNoiseDataStds[i];
        inst->speech_stds[i] = kSpeechDataStds[i];
    }

    // Initialize the index and minimum value vectors
    for (i = 0; i < 16 * NUM_CHANNELS; i++)
    {
        inst->low_value_vector[i] = 10000;
        inst->index_vector[i] = 0;
    }

    for (i = 0; i < 5; i++)
    {
        inst->upper_state[i] = 0;
        inst->lower_state[i] = 0;
    }

    for (i = 0; i < 4; i++)
    {
        inst->hp_filter_state[i] = 0;
    }

    // Initialize mean value memory, used by the WebRtcVad_FindMinimum() function
    inst->mean_value[0] = 1600;
    inst->mean_value[1] = 1600;
    inst->mean_value[2] = 1600;
    inst->mean_value[3] = 1600;
    inst->mean_value[4] = 1600;
    inst->mean_value[5] = 1600;

    if (mode == 0)
    {
        // Quality mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_Q;
        inst->individual[1] = INDIVIDUAL_20MS_Q;
        inst->individual[2] = INDIVIDUAL_30MS_Q;

        inst->total[0] = TOTAL_10MS_Q;
        inst->total[1] = TOTAL_20MS_Q;
        inst->total[2] = TOTAL_30MS_Q;
    } else if (mode == 1)
    {
        // Low bitrate mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_LBR;
        inst->individual[1] = INDIVIDUAL_20MS_LBR;
        inst->individual[2] = INDIVIDUAL_30MS_LBR;

        inst->total[0] = TOTAL_10MS_LBR;
        inst->total[1] = TOTAL_20MS_LBR;
        inst->total[2] = TOTAL_30MS_LBR;
    } else if (mode == 2)
    {
        // Aggressive mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_AGG;
        inst->individual[1] = INDIVIDUAL_20MS_AGG;
        inst->individual[2] = INDIVIDUAL_30MS_AGG;

        inst->total[0] = TOTAL_10MS_AGG;
        inst->total[1] = TOTAL_20MS_AGG;
        inst->total[2] = TOTAL_30MS_AGG;
    } else
    {
        // Very aggressive mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_VAG;
        inst->individual[1] = INDIVIDUAL_20MS_VAG;
        inst->individual[2] = INDIVIDUAL_30MS_VAG;

        inst->total[0] = TOTAL_10MS_VAG;
        inst->total[1] = TOTAL_20MS_VAG;
        inst->total[2] = TOTAL_30MS_VAG;
    }

    inst->init_flag = kInitCheck;

    return 0;
}

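// A minimal usage sketch for the two functions above (illustrative only, not
// part of the original sources; in practice the public WebRtcVad_* wrapper API
// drives these core calls):
//
//   VadInstT inst;
//   memset(&inst, 0, sizeof(inst));              // needs <string.h>
//   WebRtcVad_InitCore(&inst, 0);                // start in quality mode
//   if (WebRtcVad_set_mode_core(&inst, 2) != 0)
//   {
//       // Modes 0-3 are accepted; -1 signals an invalid mode.
//   }
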
// Set aggressiveness mode
int WebRtcVad_set_mode_core(VadInstT *inst, short mode)
{
    if (mode == 0)
    {
        // Quality mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_Q;
        inst->individual[1] = INDIVIDUAL_20MS_Q;
        inst->individual[2] = INDIVIDUAL_30MS_Q;

        inst->total[0] = TOTAL_10MS_Q;
        inst->total[1] = TOTAL_20MS_Q;
        inst->total[2] = TOTAL_30MS_Q;
    } else if (mode == 1)
    {
        // Low bitrate mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_LBR;
        inst->individual[1] = INDIVIDUAL_20MS_LBR;
        inst->individual[2] = INDIVIDUAL_30MS_LBR;

        inst->total[0] = TOTAL_10MS_LBR;
        inst->total[1] = TOTAL_20MS_LBR;
        inst->total[2] = TOTAL_30MS_LBR;
    } else if (mode == 2)
    {
        // Aggressive mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_AGG;
        inst->individual[1] = INDIVIDUAL_20MS_AGG;
        inst->individual[2] = INDIVIDUAL_30MS_AGG;

        inst->total[0] = TOTAL_10MS_AGG;
        inst->total[1] = TOTAL_20MS_AGG;
        inst->total[2] = TOTAL_30MS_AGG;
    } else if (mode == 3)
    {
        // Very aggressive mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_VAG;
        inst->individual[1] = INDIVIDUAL_20MS_VAG;
        inst->individual[2] = INDIVIDUAL_30MS_VAG;

        inst->total[0] = TOTAL_10MS_VAG;
        inst->total[1] = TOTAL_20MS_VAG;
        inst->total[2] = TOTAL_30MS_VAG;
    } else
    {
        return -1;
    }

    return 0;
}

// Calculate the VAD decision by first extracting feature values and then
// calculating the probability for both speech and background noise.

WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                     int frame_length)
{
    WebRtc_Word16 len, vad;
    WebRtc_Word16 speechWB[480]; // Holds a 30 ms SWB frame (960 samples) downsampled to 16 kHz
    WebRtc_Word16 speechNB[240]; // Holds a 30 ms WB frame (480 samples) downsampled to 8 kHz

    // Downsample signal 32->16->8 before doing VAD
    WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]),
                           frame_length);
    len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);

    WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len);
    len = WEBRTC_SPL_RSHIFT_W16(len, 1);

    // Do VAD on an 8 kHz signal
    vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

    return vad;
}
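
// Frame-length bookkeeping for the chain above: at 32 kHz, 10/20/30 ms frames
// hold 320/640/960 samples. Each WebRtcVad_Downsampling() call halves the
// sample count, so WebRtcVad_CalcVad8khz() receives 80/160/240 samples --
// exactly the frame lengths WebRtcVad_GmmProbability() selects thresholds for.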

WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                     int frame_length)
{
    WebRtc_Word16 len, vad;
    WebRtc_Word16 speechNB[240]; // Holds a 30 ms WB frame (480 samples) downsampled to 8 kHz

    // Wideband: Downsample signal before doing VAD
    WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states,
                           frame_length);

    len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);
    vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

    return vad;
}

WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                    int frame_length)
{
    WebRtc_Word16 feature_vector[NUM_CHANNELS], total_power;

    // Get power in the bands
    total_power = WebRtcVad_get_features(inst, speech_frame, frame_length, feature_vector);

    // Make the VAD decision
    inst->vad = WebRtcVad_GmmProbability(inst, feature_vector, total_power, frame_length);

    return inst->vad;
}

// Calculate the probability for both speech and background noise, and perform
// a hypothesis test.
WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
                                       WebRtc_Word16 total_power, int frame_length)
{
    int n, k;
    WebRtc_Word16 backval;
    WebRtc_Word16 h0, h1;
    WebRtc_Word16 ratvec, xval;
    WebRtc_Word16 vadflag;
    WebRtc_Word16 shifts0, shifts1;
    WebRtc_Word16 tmp16, tmp16_1, tmp16_2;
    WebRtc_Word16 diff, nr, pos;
    WebRtc_Word16 nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
    WebRtc_Word16 delt, ndelt;
    WebRtc_Word16 maxspe, maxmu;
    WebRtc_Word16 deltaN[NUM_TABLE_VALUES], deltaS[NUM_TABLE_VALUES];
    WebRtc_Word16 ngprvec[NUM_TABLE_VALUES], sgprvec[NUM_TABLE_VALUES];
    WebRtc_Word32 h0test, h1test;
    WebRtc_Word32 tmp32_1, tmp32_2;
    WebRtc_Word32 dotVal;
    WebRtc_Word32 nmid, smid;
    WebRtc_Word32 probn[NUM_MODELS], probs[NUM_MODELS];
    WebRtc_Word16 *nmean1ptr, *nmean2ptr, *smean1ptr, *smean2ptr, *nstd1ptr, *nstd2ptr,
            *sstd1ptr, *sstd2ptr;
    WebRtc_Word16 overhead1, overhead2, individualTest, totalTest;

    // Set the thresholds to different values based on frame length
    if (frame_length == 80)
    {
        // 80 input samples (10 ms at 8 kHz)
        overhead1 = inst->over_hang_max_1[0];
        overhead2 = inst->over_hang_max_2[0];
        individualTest = inst->individual[0];
        totalTest = inst->total[0];
    } else if (frame_length == 160)
    {
        // 160 input samples (20 ms at 8 kHz)
        overhead1 = inst->over_hang_max_1[1];
        overhead2 = inst->over_hang_max_2[1];
        individualTest = inst->individual[1];
        totalTest = inst->total[1];
    } else
    {
        // 240 input samples (30 ms at 8 kHz)
        overhead1 = inst->over_hang_max_1[2];
        overhead2 = inst->over_hang_max_2[2];
        individualTest = inst->individual[2];
        totalTest = inst->total[2];
    }

    if (total_power > MIN_ENERGY)
    { // If signal present at all

        // Set pointers to the gaussian parameters
        nmean1ptr = &inst->noise_means[0];
        nmean2ptr = &inst->noise_means[NUM_CHANNELS];
        smean1ptr = &inst->speech_means[0];
        smean2ptr = &inst->speech_means[NUM_CHANNELS];
        nstd1ptr = &inst->noise_stds[0];
        nstd2ptr = &inst->noise_stds[NUM_CHANNELS];
        sstd1ptr = &inst->speech_stds[0];
        sstd2ptr = &inst->speech_stds[NUM_CHANNELS];

        vadflag = 0;
        dotVal = 0;
        for (n = 0; n < NUM_CHANNELS; n++)
        { // For all channels

            pos = WEBRTC_SPL_LSHIFT_W16(n, 1);
            xval = feature_vector[n];

            // Probability for Noise, Q7 * Q20 = Q27
            tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean1ptr++, *nstd1ptr++,
                                                    &deltaN[pos]);
            probn[0] = (WebRtc_Word32)(kNoiseDataWeights[n] * tmp32_1);
            tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean2ptr++, *nstd2ptr++,
                                                    &deltaN[pos + 1]);
            probn[1] = (WebRtc_Word32)(kNoiseDataWeights[n + NUM_CHANNELS] * tmp32_1);
            h0test = probn[0] + probn[1]; // Q27
            h0 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h0test, 12); // Q15

            // Probability for Speech
            tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean1ptr++, *sstd1ptr++,
                                                    &deltaS[pos]);
            probs[0] = (WebRtc_Word32)(kSpeechDataWeights[n] * tmp32_1);
            tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean2ptr++, *sstd2ptr++,
                                                    &deltaS[pos + 1]);
            probs[1] = (WebRtc_Word32)(kSpeechDataWeights[n + NUM_CHANNELS] * tmp32_1);
            h1test = probs[0] + probs[1]; // Q27
            h1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h1test, 12); // Q15

            // Get likelihood ratio. Approximate log2(H1/H0) with shifts0 - shifts1
            shifts0 = WebRtcSpl_NormW32(h0test);
            shifts1 = WebRtcSpl_NormW32(h1test);

            if ((h0test > 0) && (h1test > 0))
            {
                ratvec = shifts0 - shifts1;
            } else if (h1test > 0)
            {
                ratvec = 31 - shifts1;
            } else if (h0test > 0)
            {
                ratvec = shifts0 - 31;
            } else
            {
                ratvec = 0;
            }
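
            // Note (clarifying comment, not in the original sources): for
            // x > 0, WebRtcSpl_NormW32(x) returns the number of left shifts
            // needed to normalize x, roughly 30 - log2(x). The difference
            // shifts0 - shifts1 therefore approximates log2(h1test / h0test),
            // a coarse integer log-likelihood ratio.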

            // VAD decision with spectrum weighting
            dotVal += WEBRTC_SPL_MUL_16_16(ratvec, kSpectrumWeight[n]);

            // Individual channel test
            if ((ratvec << 2) > individualTest)
            {
                vadflag = 1;
            }

            // Probabilities used when updating model
            if (h0 > 0)
            {
                tmp32_1 = probn[0] & 0xFFFFF000; // Q27
                tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2); // Q29
                ngprvec[pos] = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, h0);
                ngprvec[pos + 1] = 16384 - ngprvec[pos];
            } else
            {
                ngprvec[pos] = 16384;
                ngprvec[pos + 1] = 0;
            }

            // Probabilities used when updating model
            if (h1 > 0)
            {
                tmp32_1 = probs[0] & 0xFFFFF000;
                tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2);
                sgprvec[pos] = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, h1);
                sgprvec[pos + 1] = 16384 - sgprvec[pos];
            } else
            {
                sgprvec[pos] = 0;
                sgprvec[pos + 1] = 0;
            }
        }

        // Overall test
        if (dotVal >= totalTest)
        {
            vadflag |= 1;
        }

        // Set pointers to the means and standard deviations.
        nmean1ptr = &inst->noise_means[0];
        smean1ptr = &inst->speech_means[0];
        nstd1ptr = &inst->noise_stds[0];
        sstd1ptr = &inst->speech_stds[0];

        maxspe = 12800;

        // Update the model's parameters
        for (n = 0; n < NUM_CHANNELS; n++)
        {

            pos = WEBRTC_SPL_LSHIFT_W16(n, 1);

            // Get min value in past which is used for long term correction
            backval = WebRtcVad_FindMinimum(inst, feature_vector[n], n); // Q4

            // Compute the "global" mean, that is the sum of the two means weighted
            nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr); // Q7 * Q7
            nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS],
                    *(nmean1ptr+NUM_CHANNELS));
            tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 6); // Q8

            for (k = 0; k < NUM_MODELS; k++)
            {

                nr = pos + k;

                nmean2ptr = nmean1ptr + k * NUM_CHANNELS;
                smean2ptr = smean1ptr + k * NUM_CHANNELS;
                nstd2ptr = nstd1ptr + k * NUM_CHANNELS;
                sstd2ptr = sstd1ptr + k * NUM_CHANNELS;
                nmk = *nmean2ptr;
                smk = *smean2ptr;
                nsk = *nstd2ptr;
                ssk = *sstd2ptr;

                // Update noise mean vector if the frame consists of noise only
                nmk2 = nmk;
                if (!vadflag)
                {
                    // deltaN = (x-mu)/sigma^2
                    // ngprvec[k] = probn[k]/(probn[0] + probn[1])

                    delt = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(ngprvec[nr],
                            deltaN[nr], 11); // Q14*Q11
                    nmk2 = nmk + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delt,
                            kNoiseUpdateConst,
                            22); // Q7+(Q14*Q15>>22)
                }
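
                // In floating point, the update above is the familiar
                // EM-style step (a clarifying note, not in the original):
                //   mu_new = mu + eta * P(gauss | x) * (x - mu) / sigma^2,
                // with eta = kNoiseUpdateConst / 2^15 ~= 0.02, P(gauss | x)
                // taken from ngprvec and (x - mu) / sigma^2 from deltaN.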

                // Long term correction of the noise mean
                ndelt = WEBRTC_SPL_LSHIFT_W16(backval, 4);
                ndelt -= tmp16_1; // Q8 - Q8
                nmk3 = nmk2 + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(ndelt,
                        kBackEta,
                        9); // Q7+(Q8*Q8)>>9

                // Control that the noise mean does not drift too much
                tmp16 = WEBRTC_SPL_LSHIFT_W16(k+5, 7);
                if (nmk3 < tmp16)
                    nmk3 = tmp16;
                tmp16 = WEBRTC_SPL_LSHIFT_W16(72+k-n, 7);
                if (nmk3 > tmp16)
                    nmk3 = tmp16;
                *nmean2ptr = nmk3;

                if (vadflag)
                {
                    // Update speech mean vector:
                    // deltaS = (x-mu)/sigma^2
                    // sgprvec[k] = probs[k]/(probs[0] + probs[1])

                    delt = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(sgprvec[nr],
                            deltaS[nr],
                            11); // (Q14*Q11)>>11=Q14
                    tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delt,
                            kSpeechUpdateConst,
                            21) + 1;
                    smk2 = smk + (tmp16 >> 1); // Q7 + (Q14 * Q15 >> 22)

                    // Control that the speech mean does not drift too much
                    maxmu = maxspe + 640;
                    if (smk2 < kMinimumMean[k])
                        smk2 = kMinimumMean[k];
                    if (smk2 > maxmu)
                        smk2 = maxmu;

                    *smean2ptr = smk2;

                    // (Q7>>3) = Q4
                    tmp16 = WEBRTC_SPL_RSHIFT_W16((smk + 4), 3);

                    tmp16 = feature_vector[n] - tmp16; // Q4
                    tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaS[nr], tmp16, 3);
                    tmp32_2 = tmp32_1 - (WebRtc_Word32)4096; // Q12
                    tmp16 = WEBRTC_SPL_RSHIFT_W16((sgprvec[nr]), 2);
                    tmp32_1 = (WebRtc_Word32)(tmp16 * tmp32_2);// (Q15>>3)*(Q14>>2)=Q12*Q12=Q24

                    tmp32_2 = WEBRTC_SPL_RSHIFT_W32(tmp32_1, 4); // Q20

                    // 0.1 * Q20 / Q7 = Q13
                    if (tmp32_2 > 0)
                        tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, ssk * 10);
                    else
                    {
                        tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(-tmp32_2, ssk * 10);
                        tmp16 = -tmp16;
                    }
                    // Divide by 4, giving an update factor of 0.025 (= 0.1 / 4)
                    tmp16 += 128; // Rounding
                    // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7, matching ssk
                    ssk += WEBRTC_SPL_RSHIFT_W16(tmp16, 8);
                    if (ssk < MIN_STD)
                        ssk = MIN_STD;
                    *sstd2ptr = ssk;
                } else
                {
                    // Update GMM variance vectors
                    // deltaN * (feature_vector[n] - nmk) - 1, Q11 * Q4
                    tmp16 = feature_vector[n] - WEBRTC_SPL_RSHIFT_W16(nmk, 3);

                    // (Q15>>3) * (Q14>>2) = Q12 * Q12 = Q24
                    tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaN[nr], tmp16, 3) - 4096;
                    tmp16 = WEBRTC_SPL_RSHIFT_W16((ngprvec[nr]+2), 2);
                    tmp32_2 = (WebRtc_Word32)(tmp16 * tmp32_1);
                    tmp32_1 = WEBRTC_SPL_RSHIFT_W32(tmp32_2, 14);
                    // Q20 * approx 0.001 (2^-10 = 0.0009766)

                    // Q20 / Q7 = Q13
                    if (tmp32_1 > 0)
                        tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_1, nsk);
                    else
                    {
                        tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(-tmp32_1, nsk);
                        tmp16 = -tmp16;
                    }
                    tmp16 += 32; // Rounding
                    nsk += WEBRTC_SPL_RSHIFT_W16(tmp16, 6);

                    if (nsk < MIN_STD)
                        nsk = MIN_STD;

                    *nstd2ptr = nsk;
                }
            }

            // Separate models if they are too close - nmid in Q14
            nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr);
            nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS], *nmean2ptr);

            // smid in Q14
            smid = WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n], *smean1ptr);
            smid += WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n+NUM_CHANNELS], *smean2ptr);

            // diff = "global" speech mean - "global" noise mean
            diff = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(smid, 9);
            tmp16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 9);
            diff -= tmp16;

            if (diff < kMinimumDifference[n])
            {

                tmp16 = kMinimumDifference[n] - diff; // Q5

                // tmp16_1 = ~0.8 * (kMinimumDifference - diff) in Q7
                // tmp16_2 = ~0.2 * (kMinimumDifference - diff) in Q7
                tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(13, tmp16, 2);
                tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(3, tmp16, 2);

                // First Gauss, speech model
                tmp16 = tmp16_1 + *smean1ptr;
                *smean1ptr = tmp16;
                smid = WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n]);

                // Second Gauss, speech model
                tmp16 = tmp16_1 + *smean2ptr;
                *smean2ptr = tmp16;
                smid += WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n+NUM_CHANNELS]);

                // First Gauss, noise model
                tmp16 = *nmean1ptr - tmp16_2;
                *nmean1ptr = tmp16;

                nmid = WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n]);

                // Second Gauss, noise model
                tmp16 = *nmean2ptr - tmp16_2;
                *nmean2ptr = tmp16;
                nmid += WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n+NUM_CHANNELS]);
            }

            // Control that the speech & noise means do not drift too much
            maxspe = kMaximumSpeech[n];
            tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(smid, 7);
            if (tmp16_2 > maxspe)
            { // Upper limit of speech model
                tmp16_2 -= maxspe;

                *smean1ptr -= tmp16_2;
                *smean2ptr -= tmp16_2;
            }

            tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 7);
            if (tmp16_2 > kMaximumNoise[n])
            {
                tmp16_2 -= kMaximumNoise[n];

                *nmean1ptr -= tmp16_2;
                *nmean2ptr -= tmp16_2;
            }

            nmean1ptr++;
            smean1ptr++;
            nstd1ptr++;
            sstd1ptr++;
        }
        inst->frame_counter++;
    } else
    {
        vadflag = 0;
    }

    // Hangover smoothing: keep reporting speech for a few frames after a
    // burst ends; return values of 2 + remaining hangover mark such frames
    if (!vadflag)
    {
        if (inst->over_hang > 0)
        {
            vadflag = 2 + inst->over_hang;
            inst->over_hang = inst->over_hang - 1;
        }
        inst->num_of_speech = 0;
    } else
    {
        inst->num_of_speech = inst->num_of_speech + 1;
        if (inst->num_of_speech > NSP_MAX)
        {
            inst->num_of_speech = NSP_MAX;
            inst->over_hang = overhead2;
        } else
            inst->over_hang = overhead1;
    }
    return vadflag;
}
    724