/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "webrtc/common_audio/vad/vad_core.h"

#include <string.h>  // memcpy, memset

#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
#include "webrtc/common_audio/vad/vad_filterbank.h"
#include "webrtc/common_audio/vad/vad_gmm.h"
#include "webrtc/common_audio/vad/vad_sp.h"
#include "webrtc/typedefs.h"

// Spectrum Weighting
static const int16_t kSpectrumWeight[kNumChannels] = { 6, 8, 10, 12, 14, 16 };
static const int16_t kNoiseUpdateConst = 655;  // Q15
static const int16_t kSpeechUpdateConst = 6554;  // Q15
static const int16_t kBackEta = 154;  // Q8
// Minimum difference between the two models, Q5
static const int16_t kMinimumDifference[kNumChannels] = {
    544, 544, 576, 576, 576, 576 };
// Upper limit of mean value for speech model, Q7
static const int16_t kMaximumSpeech[kNumChannels] = {
    11392, 11392, 11520, 11520, 11520, 11520 };
// Minimum value for mean value
static const int16_t kMinimumMean[kNumGaussians] = { 640, 768 };
// Upper limit of mean value for noise model, Q7
static const int16_t kMaximumNoise[kNumChannels] = {
    9216, 9088, 8960, 8832, 8704, 8576 };
// Start values for the Gaussian models, Q7
// Weights for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataWeights[kTableSize] = {
    34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
// Weights for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataWeights[kTableSize] = {
    48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
// Means for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataMeans[kTableSize] = {
    6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
// Means for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataMeans[kTableSize] = {
    8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
};
// Stds for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataStds[kTableSize] = {
    378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
// Stds for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataStds[kTableSize] = {
    555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };
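
// Note on table layout: the |kTableSize| (= kNumGaussians * kNumChannels
// = 2 * 6 = 12) tables above are stored with the channel as the fastest
// running index, so the k:th Gaussian of frequency channel |n| is found at
//
//   table[n + k * kNumChannels]
//
// For example, the second (k = 1) noise Gaussian of channel 2 has weight
// kNoiseDataWeights[2 + 1 * 6] = kNoiseDataWeights[8] = 56. This is the same
// indexing as |gaussian| in GmmProbability() below.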

// Constants used in GmmProbability().
//
// Maximum number of counted speech (VAD = 1) frames in a row.
static const int16_t kMaxSpeechFrames = 6;
// Minimum standard deviation for both speech and noise.
static const int16_t kMinStd = 384;

// Constants in WebRtcVad_InitCore().
// Default aggressiveness mode.
static const short kDefaultMode = 0;
static const int kInitCheck = 42;

// Constants used in WebRtcVad_set_mode_core().
//
// Thresholds for different frame lengths (10 ms, 20 ms and 30 ms).
//
// Mode 0, Quality.
static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 };
static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 };
static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 };
static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 };
// Mode 1, Low bitrate.
static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 };
static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 };
static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 };
static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 };
// Mode 2, Aggressive.
static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 };
static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 };
static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 };
static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 };
// Mode 3, Very aggressive.
static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 };
static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 };
static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };

// Calculates the weighted average w.r.t. number of Gaussians. The |data| are
// updated with an |offset| before averaging.
//
// - data     [i/o] : Data to average.
// - offset   [i]   : An offset added to |data|.
// - weights  [i]   : Weights used for averaging.
//
// returns          : The weighted average.
static int32_t WeightedAverage(int16_t* data, int16_t offset,
                               const int16_t* weights) {
  int k;
  int32_t weighted_average = 0;

  for (k = 0; k < kNumGaussians; k++) {
    data[k * kNumChannels] += offset;
    weighted_average += data[k * kNumChannels] * weights[k * kNumChannels];
  }
  return weighted_average;
}
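
// For example, with kNumGaussians == 2 a call
//
//   WeightedAverage(&self->noise_means[channel], 0,
//                   &kNoiseDataWeights[channel]);
//
// returns w0 * m0 + w1 * m1, where wk = kNoiseDataWeights[channel + k * 6]
// (Q7) and mk = noise_means[channel + k * 6] (Q7). Since the two weights of
// each channel sum to 128 (1.0 in Q7), the result is the channel's "global"
// mean in Q14 (= Q7 * Q7).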

// Calculates the probabilities for both speech and background noise using
// Gaussian Mixture Models (GMM). A hypothesis test is performed to decide
// which type of signal is most probable.
//
// - self           [i/o] : Pointer to VAD instance
// - features       [i]   : Feature vector of length |kNumChannels|
//                          = log10(energy in frequency band)
// - total_power    [i]   : Total power in audio frame.
// - frame_length   [i]   : Number of input samples
//
// - returns              : the VAD decision (0 - noise, 1 - speech).
static int16_t GmmProbability(VadInstT* self, int16_t* features,
                              int16_t total_power, int frame_length) {
  int channel, k;
  int16_t feature_minimum;
  int16_t h0, h1;
  int16_t log_likelihood_ratio;
  int16_t vadflag = 0;
  int16_t shifts_h0, shifts_h1;
  int16_t tmp_s16, tmp1_s16, tmp2_s16;
  int16_t diff;
  int gaussian;
  int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
  int16_t delt, ndelt;
  int16_t maxspe, maxmu;
  int16_t deltaN[kTableSize], deltaS[kTableSize];
  int16_t ngprvec[kTableSize] = { 0 };  // Conditional probability = 0.
  int16_t sgprvec[kTableSize] = { 0 };  // Conditional probability = 0.
  int32_t h0_test, h1_test;
  int32_t tmp1_s32, tmp2_s32;
  int32_t sum_log_likelihood_ratios = 0;
  int32_t noise_global_mean, speech_global_mean;
  int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
  int16_t overhead1, overhead2, individualTest, totalTest;

  // Set various thresholds based on frame lengths (80, 160 or 240 samples).
  if (frame_length == 80) {
    overhead1 = self->over_hang_max_1[0];
    overhead2 = self->over_hang_max_2[0];
    individualTest = self->individual[0];
    totalTest = self->total[0];
  } else if (frame_length == 160) {
    overhead1 = self->over_hang_max_1[1];
    overhead2 = self->over_hang_max_2[1];
    individualTest = self->individual[1];
    totalTest = self->total[1];
  } else {
    overhead1 = self->over_hang_max_1[2];
    overhead2 = self->over_hang_max_2[2];
    individualTest = self->individual[2];
    totalTest = self->total[2];
  }

  if (total_power > kMinEnergy) {
    // The signal power of the current frame is large enough for processing.
    // The processing consists of two parts:
    // 1) Calculating the likelihood of speech and thereby a VAD decision.
    // 2) Updating the underlying model, w.r.t., the decision made.

    // The detection scheme is an LRT with hypothesis
    // H0: Noise
    // H1: Speech
    //
    // We combine a global LRT with local tests, for each frequency sub-band,
    // here defined as |channel|.
    for (channel = 0; channel < kNumChannels; channel++) {
      // For each channel we model the probability with a GMM consisting of
      // |kNumGaussians|, with different means and standard deviations
      // depending on H0 or H1.
      h0_test = 0;
      h1_test = 0;
      for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;
        // Probability under H0, that is, probability of frame being noise.
        // Value given in Q27 = Q7 * Q20.
        tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
                                                 self->noise_means[gaussian],
                                                 self->noise_stds[gaussian],
                                                 &deltaN[gaussian]);
        noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32;
        h0_test += noise_probability[k];  // Q27

        // Probability under H1, that is, probability of frame being speech.
        // Value given in Q27 = Q7 * Q20.
        tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
                                                 self->speech_means[gaussian],
                                                 self->speech_stds[gaussian],
                                                 &deltaS[gaussian]);
        speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32;
        h1_test += speech_probability[k];  // Q27
      }

      // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H0}).
      // Approximation:
      // log2(Pr{X|H1} / Pr{X|H0}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H0}*2^Q)
      //                           = log2(h1_test) - log2(h0_test)
      //                           = log2(2^(31-shifts_h1)*(1+b1))
      //                             - log2(2^(31-shifts_h0)*(1+b0))
      //                           = shifts_h0 - shifts_h1
      //                             + log2(1+b1) - log2(1+b0)
      //                          ~= shifts_h0 - shifts_h1
      //
      // Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0)
      // < 1. Further, b0 and b1 are independent and on the average the two
      // terms cancel.
      shifts_h0 = WebRtcSpl_NormW32(h0_test);
      shifts_h1 = WebRtcSpl_NormW32(h1_test);
      if (h0_test == 0) {
        shifts_h0 = 31;
      }
      if (h1_test == 0) {
        shifts_h1 = 31;
      }
      log_likelihood_ratio = shifts_h0 - shifts_h1;
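
      // Worked example of the approximation: if h1_test = 2^20 and
      // h0_test = 2^15, WebRtcSpl_NormW32() yields shifts_h1 = 10 and
      // shifts_h0 = 15, so log_likelihood_ratio = 15 - 10 = 5
      // = log2(2^20 / 2^15); the ratio is captured in whole powers of two.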

      // Update |sum_log_likelihood_ratios| with spectrum weighting. This is
      // used for the global VAD decision.
      sum_log_likelihood_ratios +=
          (int32_t) (log_likelihood_ratio * kSpectrumWeight[channel]);

      // Local VAD decision.
      if ((log_likelihood_ratio << 2) > individualTest) {
        vadflag = 1;
      }

      // TODO(bjornv): The conditional probabilities below assume the number
      // of Gaussians is hard coded to two. Find a way to generalize.
      // Calculate local noise probabilities used later when updating the GMM.
      h0 = (int16_t) (h0_test >> 12);  // Q15
      if (h0 > 0) {
        // High probability of noise. Assign conditional probabilities for
        // each Gaussian in the GMM.
        tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2;  // Q29
        ngprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h0);  // Q14
        ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel];
      } else {
        // Low noise probability. Assign conditional probability 1 to the
        // first Gaussian and 0 to the rest (which is already set at
        // initialization).
        ngprvec[channel] = 16384;
      }

      // Calculate local speech probabilities used later when updating the
      // GMM.
      h1 = (int16_t) (h1_test >> 12);  // Q15
      if (h1 > 0) {
        // High probability of speech. Assign conditional probabilities for
        // each Gaussian in the GMM. Otherwise use the initialized values,
        // i.e., 0.
        tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2;  // Q29
        sgprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h1);  // Q14
        sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel];
      }
    }

    // Make a global VAD decision.
    vadflag |= (sum_log_likelihood_ratios >= totalTest);
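
    // The mean updates below have a gradient-like form: each mean is moved
    // by (update constant) * (conditional Gaussian probability) * delta,
    // where delta = (x - mu) / sigma^2 was returned in |deltaN| / |deltaS| by
    // WebRtcVad_GaussianProbability() above. Only the model matching the
    // decision is adapted (the noise model if |vadflag| == 0, the speech
    // model otherwise), except that the noise mean always receives a slow
    // long term correction towards the tracked feature minimum.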

    // Update the model parameters.
    maxspe = 12800;
    for (channel = 0; channel < kNumChannels; channel++) {

      // Get minimum value in past, which is used for long term correction in
      // Q4.
      feature_minimum = WebRtcVad_FindMinimum(self, features[channel],
                                              channel);

      // Compute the "global" mean, that is the sum of the two means weighted.
      noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                          &kNoiseDataWeights[channel]);
      tmp1_s16 = (int16_t) (noise_global_mean >> 6);  // Q8

      for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;

        nmk = self->noise_means[gaussian];
        smk = self->speech_means[gaussian];
        nsk = self->noise_stds[gaussian];
        ssk = self->speech_stds[gaussian];

        // Update noise mean vector if the frame consists of noise only.
        nmk2 = nmk;
        if (!vadflag) {
          // deltaN = (x-mu)/sigma^2
          // ngprvec[k] = |noise_probability[k]| /
          //   (|noise_probability[0]| + |noise_probability[1]|)

          // (Q14 * Q11 >> 11) = Q14.
          delt = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(ngprvec[gaussian],
                                                     deltaN[gaussian],
                                                     11);
          // Q7 + (Q14 * Q15 >> 22) = Q7.
          nmk2 = nmk + (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(delt,
                                                           kNoiseUpdateConst,
                                                           22);
        }

        // Long term correction of the noise mean.
        // Q8 - Q8 = Q8.
        ndelt = (feature_minimum << 4) - tmp1_s16;
        // Q7 + (Q8 * Q8) >> 9 = Q7.
        nmk3 = nmk2 + (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(ndelt, kBackEta, 9);

        // Ensure that the noise mean does not drift too much.
        tmp_s16 = (int16_t) ((k + 5) << 7);
        if (nmk3 < tmp_s16) {
          nmk3 = tmp_s16;
        }
        tmp_s16 = (int16_t) ((72 + k - channel) << 7);
        if (nmk3 > tmp_s16) {
          nmk3 = tmp_s16;
        }
        self->noise_means[gaussian] = nmk3;

        if (vadflag) {
          // Update speech mean vector:
          // |deltaS| = (x-mu)/sigma^2
          // sgprvec[k] = |speech_probability[k]| /
          //   (|speech_probability[0]| + |speech_probability[1]|)

          // (Q14 * Q11) >> 11 = Q14.
          delt = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(sgprvec[gaussian],
                                                     deltaS[gaussian],
                                                     11);
          // Q14 * Q15 >> 21 = Q8.
          tmp_s16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(delt,
                                                        kSpeechUpdateConst,
                                                        21);
          // Q7 + (Q8 >> 1) = Q7. With rounding.
          smk2 = smk + ((tmp_s16 + 1) >> 1);

          // Ensure that the speech mean does not drift too much.
          maxmu = maxspe + 640;
          if (smk2 < kMinimumMean[k]) {
            smk2 = kMinimumMean[k];
          }
          if (smk2 > maxmu) {
            smk2 = maxmu;
          }
          self->speech_means[gaussian] = smk2;  // Q7.

          // (Q7 >> 3) = Q4. With rounding.
          tmp_s16 = ((smk + 4) >> 3);

          tmp_s16 = features[channel] - tmp_s16;  // Q4
          // (Q11 * Q4 >> 3) = Q12.
          tmp1_s32 = WEBRTC_SPL_MUL_16_16_RSFT(deltaS[gaussian], tmp_s16, 3);
          tmp2_s32 = tmp1_s32 - 4096;
          tmp_s16 = sgprvec[gaussian] >> 2;
          // (Q14 >> 2) * Q12 = Q24.
          tmp1_s32 = tmp_s16 * tmp2_s32;

          tmp2_s32 = tmp1_s32 >> 4;  // Q20

          // 0.1 * Q20 / Q7 = Q13.
          if (tmp2_s32 > 0) {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp2_s32, ssk * 10);
          } else {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp2_s32, ssk * 10);
            tmp_s16 = -tmp_s16;
          }
          // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
          // Note that division by 4 equals shift by 2, hence,
          // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
          tmp_s16 += 128;  // Rounding.
          ssk += (tmp_s16 >> 8);
          if (ssk < kMinStd) {
            ssk = kMinStd;
          }
          self->speech_stds[gaussian] = ssk;
        } else {
          // Update GMM variance vectors.
          // deltaN * (features[channel] - nmk) - 1
          // Q4 - (Q7 >> 3) = Q4.
          tmp_s16 = features[channel] - (nmk >> 3);
          // (Q11 * Q4 >> 3) = Q12.
          tmp1_s32 = WEBRTC_SPL_MUL_16_16_RSFT(deltaN[gaussian], tmp_s16, 3);
          tmp1_s32 -= 4096;

          // (Q14 >> 2) * Q12 = Q24.
          tmp_s16 = (ngprvec[gaussian] + 2) >> 2;
          tmp2_s32 = tmp_s16 * tmp1_s32;
          // Q20 * approx 0.001 (2^-10 = 0.0009766), hence,
          // (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
          tmp1_s32 = tmp2_s32 >> 14;

          // Q20 / Q7 = Q13.
          if (tmp1_s32 > 0) {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, nsk);
          } else {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp1_s32, nsk);
            tmp_s16 = -tmp_s16;
          }
          tmp_s16 += 32;  // Rounding.
          nsk += tmp_s16 >> 6;  // Q13 >> 6 = Q7.
          if (nsk < kMinStd) {
            nsk = kMinStd;
          }
          self->noise_stds[gaussian] = nsk;
        }
      }

      // Separate models if they are too close.
      // |noise_global_mean| in Q14 (= Q7 * Q7).
      noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                          &kNoiseDataWeights[channel]);

      // |speech_global_mean| in Q14 (= Q7 * Q7).
      speech_global_mean = WeightedAverage(&self->speech_means[channel], 0,
                                           &kSpeechDataWeights[channel]);

      // |diff| = "global" speech mean - "global" noise mean.
      // (Q14 >> 9) - (Q14 >> 9) = Q5.
      diff = (int16_t) (speech_global_mean >> 9) -
             (int16_t) (noise_global_mean >> 9);
      if (diff < kMinimumDifference[channel]) {
        tmp_s16 = kMinimumDifference[channel] - diff;

        // |tmp1_s16| = ~0.8 * (kMinimumDifference - diff) in Q7.
        // |tmp2_s16| = ~0.2 * (kMinimumDifference - diff) in Q7.
        tmp1_s16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(13, tmp_s16, 2);
        tmp2_s16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(3, tmp_s16, 2);

        // Move Gaussian means for speech model by |tmp1_s16| and update
        // |speech_global_mean|. Note that |self->speech_means[channel]| is
        // changed after the call.
        speech_global_mean = WeightedAverage(&self->speech_means[channel],
                                             tmp1_s16,
                                             &kSpeechDataWeights[channel]);

        // Move Gaussian means for noise model by -|tmp2_s16| and update
        // |noise_global_mean|. Note that |self->noise_means[channel]| is
        // changed after the call.
        noise_global_mean = WeightedAverage(&self->noise_means[channel],
                                            -tmp2_s16,
                                            &kNoiseDataWeights[channel]);
      }

      // Ensure that the speech & noise means do not drift too much.
      maxspe = kMaximumSpeech[channel];
      tmp2_s16 = (int16_t) (speech_global_mean >> 7);
      if (tmp2_s16 > maxspe) {
        // Upper limit of speech model.
        tmp2_s16 -= maxspe;

        for (k = 0; k < kNumGaussians; k++) {
          self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
        }
      }

      tmp2_s16 = (int16_t) (noise_global_mean >> 7);
      if (tmp2_s16 > kMaximumNoise[channel]) {
        tmp2_s16 -= kMaximumNoise[channel];

        for (k = 0; k < kNumGaussians; k++) {
          self->noise_means[channel + k * kNumChannels] -= tmp2_s16;
        }
      }
    }
    self->frame_counter++;
  }

  // Smooth with respect to transition hysteresis.
  if (!vadflag) {
    if (self->over_hang > 0) {
      vadflag = 2 + self->over_hang;
      self->over_hang--;
    }
    self->num_of_speech = 0;
  } else {
    self->num_of_speech++;
    if (self->num_of_speech > kMaxSpeechFrames) {
      self->num_of_speech = kMaxSpeechFrames;
      self->over_hang = overhead2;
    } else {
      self->over_hang = overhead1;
    }
  }
  return vadflag;
}
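
// Note that GmmProbability() reports more than a binary decision: during
// hang-over frames the returned value is 2 + over_hang rather than 1, so
// callers are expected to treat any nonzero return value as speech and only
// 0 as noise.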

// Initialize the VAD. Set aggressiveness mode to default value.
int WebRtcVad_InitCore(VadInstT* self) {
  int i;

  if (self == NULL) {
    return -1;
  }

  // Initialization of general struct variables.
  self->vad = 1;  // Speech active (=1).
  self->frame_counter = 0;
  self->over_hang = 0;
  self->num_of_speech = 0;

  // Initialization of downsampling filter state.
  memset(self->downsampling_filter_states, 0,
         sizeof(self->downsampling_filter_states));

  // Initialization of 48 to 8 kHz downsampling.
  WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);

  // Read initial PDF parameters.
  for (i = 0; i < kTableSize; i++) {
    self->noise_means[i] = kNoiseDataMeans[i];
    self->speech_means[i] = kSpeechDataMeans[i];
    self->noise_stds[i] = kNoiseDataStds[i];
    self->speech_stds[i] = kSpeechDataStds[i];
  }

  // Initialize Index and Minimum value vectors.
  for (i = 0; i < 16 * kNumChannels; i++) {
    self->low_value_vector[i] = 10000;
    self->index_vector[i] = 0;
  }

  // Initialize splitting filter states.
  memset(self->upper_state, 0, sizeof(self->upper_state));
  memset(self->lower_state, 0, sizeof(self->lower_state));

  // Initialize high pass filter states.
  memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));

  // Initialize mean value memory, for WebRtcVad_FindMinimum().
  for (i = 0; i < kNumChannels; i++) {
    self->mean_value[i] = 1600;
  }

  // Set aggressiveness mode to default (=|kDefaultMode|).
  if (WebRtcVad_set_mode_core(self, kDefaultMode) != 0) {
    return -1;
  }

  self->init_flag = kInitCheck;

  return 0;
}

// Set aggressiveness mode.
int WebRtcVad_set_mode_core(VadInstT* self, int mode) {
  int return_value = 0;

  switch (mode) {
    case 0:
      // Quality mode.
      memcpy(self->over_hang_max_1, kOverHangMax1Q,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2Q,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdQ,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdQ,
             sizeof(self->total));
      break;
    case 1:
      // Low bitrate mode.
      memcpy(self->over_hang_max_1, kOverHangMax1LBR,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2LBR,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdLBR,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdLBR,
             sizeof(self->total));
      break;
    case 2:
      // Aggressive mode.
      memcpy(self->over_hang_max_1, kOverHangMax1AGG,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2AGG,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdAGG,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdAGG,
             sizeof(self->total));
      break;
    case 3:
      // Very aggressive mode.
      memcpy(self->over_hang_max_1, kOverHangMax1VAG,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2VAG,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdVAG,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdVAG,
             sizeof(self->total));
      break;
    default:
      return_value = -1;
      break;
  }

  return return_value;
}
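
// A minimal usage sketch of the core functions in this file (the public API
// in webrtc_vad.h wraps them; |frame_8khz| is assumed to be a caller-provided
// buffer of 80 int16_t samples, i.e. 10 ms at 8 kHz):
//
//   VadInstT vad_inst;
//   if (WebRtcVad_InitCore(&vad_inst) == 0 &&
//       WebRtcVad_set_mode_core(&vad_inst, 2) == 0) {  // Aggressive mode.
//     int decision = WebRtcVad_CalcVad8khz(&vad_inst, frame_8khz, 80);
//     // decision == 0 means noise; any nonzero value means speech.
//   }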

// Calculate VAD decision by first extracting feature values and then
// calculating the probability for both speech and background noise.

int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame,
                           int frame_length) {
  int vad;
  int i;
  int16_t speech_nb[240];  // 30 ms in 8 kHz.
  // |tmp_mem| is a temporary memory used by the resample function, length is
  // frame length in 10 ms (480 samples) + 256 extra.
  int32_t tmp_mem[480 + 256] = { 0 };
  const int kFrameLen10ms48khz = 480;
  const int kFrameLen10ms8khz = 80;
  int num_10ms_frames = frame_length / kFrameLen10ms48khz;

  for (i = 0; i < num_10ms_frames; i++) {
    // Resample each 10 ms block in turn, advancing the input pointer per
    // iteration.
    WebRtcSpl_Resample48khzTo8khz(&speech_frame[i * kFrameLen10ms48khz],
                                  &speech_nb[i * kFrameLen10ms8khz],
                                  &inst->state_48_to_8,
                                  tmp_mem);
  }

  // Do VAD on an 8 kHz signal.
  vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6);

  return vad;
}

int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame,
                           int frame_length) {
  int len, vad;
  int16_t speechWB[480];  // Downsampled from a 960-sample frame (30 ms in
                          // SWB).
  int16_t speechNB[240];  // Downsampled from a 480-sample frame (30 ms in
                          // WB).

  // Downsample signal 32->16->8 kHz before doing VAD.
  WebRtcVad_Downsampling(speech_frame, speechWB,
                         &(inst->downsampling_filter_states[2]),
                         frame_length);
  len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);

  WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states,
                         len);
  len = WEBRTC_SPL_RSHIFT_W16(len, 1);

  // Do VAD on an 8 kHz signal.
  vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

  return vad;
}
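
// Note on the downsampling chain: each WebRtcVad_Downsampling() call halves
// the sampling rate, keeping its filter memory in the state pointer it is
// handed. The 32 kHz path above uses |downsampling_filter_states[2]| onwards
// for the 32->16 kHz stage and the start of the array for the 16->8 kHz
// stage; the 16 kHz path below shares that second stage state.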

int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame,
                           int frame_length) {
  int len, vad;
  int16_t speechNB[240];  // Downsampled from a 480-sample frame (30 ms in
                          // WB).

  // Wideband: Downsample signal before doing VAD.
  WebRtcVad_Downsampling(speech_frame, speechNB,
                         inst->downsampling_filter_states, frame_length);

  len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);
  vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

  return vad;
}

int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame,
                          int frame_length) {
  int16_t feature_vector[kNumChannels], total_power;

  // Get power in the frequency bands.
  total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
                                            feature_vector);

  // Make a VAD decision.
  inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length);

  return inst->vad;
}
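
// Example of the length bookkeeping above: a 30 ms frame at 32 kHz enters
// WebRtcVad_CalcVad32khz() as 960 samples, is halved twice to 480 and then
// 240 samples, and GmmProbability() selects its 30 ms thresholds from the
// resulting 240-sample (8 kHz) frame length.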