/*
 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "webrtc/common_audio/vad/vad_core.h"

#include <string.h>  // memcpy() and memset().

#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
#include "webrtc/common_audio/vad/vad_filterbank.h"
#include "webrtc/common_audio/vad/vad_gmm.h"
#include "webrtc/common_audio/vad/vad_sp.h"
#include "webrtc/typedefs.h"

// Spectrum Weighting
static const int16_t kSpectrumWeight[kNumChannels] = { 6, 8, 10, 12, 14, 16 };
static const int16_t kNoiseUpdateConst = 655;  // Q15
static const int16_t kSpeechUpdateConst = 6554;  // Q15
static const int16_t kBackEta = 154;  // Q8
// Minimum difference between the two models, Q5
static const int16_t kMinimumDifference[kNumChannels] = {
    544, 544, 576, 576, 576, 576 };
// Upper limit of mean value for speech model, Q7
static const int16_t kMaximumSpeech[kNumChannels] = {
    11392, 11392, 11520, 11520, 11520, 11520 };
// Minimum value for mean value
static const int16_t kMinimumMean[kNumGaussians] = { 640, 768 };
// Upper limit of mean value for noise model, Q7
static const int16_t kMaximumNoise[kNumChannels] = {
    9216, 9088, 8960, 8832, 8704, 8576 };
// Start values for the Gaussian models, Q7
// Weights for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataWeights[kTableSize] = {
    34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
// Weights for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataWeights[kTableSize] = {
    48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
// Means for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataMeans[kTableSize] = {
    6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
// Means for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataMeans[kTableSize] = {
    8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
};
// Stds for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataStds[kTableSize] = {
    378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
// Stds for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataStds[kTableSize] = {
    555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };

// Constants used in GmmProbability().
//
// Maximum number of counted speech (VAD = 1) frames in a row.
static const int16_t kMaxSpeechFrames = 6;
// Minimum standard deviation for both speech and noise.
static const int16_t kMinStd = 384;

// Constants in WebRtcVad_InitCore().
// Default aggressiveness mode.
static const short kDefaultMode = 0;
static const int kInitCheck = 42;
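// A note on the QN annotations used for the constants above and throughout
// this file: a value v in QN represents the real number v / 2^N. For example,
// kNoiseUpdateConst = 655 in Q15 is 655 / 32768 ~= 0.02, and kBackEta = 154
// in Q8 is 154 / 256 ~= 0.6.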
// Constants used in WebRtcVad_set_mode_core().
//
// Thresholds for different frame lengths (10 ms, 20 ms and 30 ms).
//
// Mode 0, Quality.
static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 };
static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 };
static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 };
static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 };
// Mode 1, Low bitrate.
static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 };
static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 };
static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 };
static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 };
// Mode 2, Aggressive.
static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 };
static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 };
static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 };
static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 };
// Mode 3, Very aggressive.
static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 };
static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 };
static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };

// Calculates the weighted average w.r.t. number of Gaussians. The |data| are
// updated with an |offset| before averaging.
//
// - data     [i/o] : Data to average.
// - offset   [i]   : An offset added to |data|.
// - weights  [i]   : Weights used for averaging.
//
// returns          : The weighted average.
static int32_t WeightedAverage(int16_t* data, int16_t offset,
                               const int16_t* weights) {
  int k;
  int32_t weighted_average = 0;

  for (k = 0; k < kNumGaussians; k++) {
    data[k * kNumChannels] += offset;
    weighted_average += data[k * kNumChannels] * weights[k * kNumChannels];
  }
  return weighted_average;
}
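// Example: the two Gaussians of a channel are stored kNumChannels apart, so
// for channel 0 of the noise model,
//   WeightedAverage(&self->noise_means[0], 0, &kNoiseDataWeights[0])
// returns kNoiseDataWeights[0] * noise_means[0] +
// kNoiseDataWeights[6] * noise_means[6], in Q14 (= Q7 * Q7).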
// Calculates the probabilities for both speech and background noise using
// Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide
// which type of signal is most probable.
//
// - self           [i/o] : Pointer to VAD instance
// - features       [i]   : Feature vector of length |kNumChannels|
//                          = log10(energy in frequency band)
// - total_power    [i]   : Total power in audio frame.
// - frame_length   [i]   : Number of input samples
//
// - returns              : the VAD decision (0 - noise, 1 - speech).
static int16_t GmmProbability(VadInstT* self, int16_t* features,
                              int16_t total_power, size_t frame_length) {
  int channel, k;
  int16_t feature_minimum;
  int16_t h0, h1;
  int16_t log_likelihood_ratio;
  int16_t vadflag = 0;
  int16_t shifts_h0, shifts_h1;
  int16_t tmp_s16, tmp1_s16, tmp2_s16;
  int16_t diff;
  int gaussian;
  int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
  int16_t delt, ndelt;
  int16_t maxspe, maxmu;
  int16_t deltaN[kTableSize], deltaS[kTableSize];
  int16_t ngprvec[kTableSize] = { 0 };  // Conditional probability = 0.
  int16_t sgprvec[kTableSize] = { 0 };  // Conditional probability = 0.
  int32_t h0_test, h1_test;
  int32_t tmp1_s32, tmp2_s32;
  int32_t sum_log_likelihood_ratios = 0;
  int32_t noise_global_mean, speech_global_mean;
  int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
  int16_t overhead1, overhead2, individualTest, totalTest;

  // Set various thresholds based on frame lengths (80, 160 or 240 samples).
  if (frame_length == 80) {
    overhead1 = self->over_hang_max_1[0];
    overhead2 = self->over_hang_max_2[0];
    individualTest = self->individual[0];
    totalTest = self->total[0];
  } else if (frame_length == 160) {
    overhead1 = self->over_hang_max_1[1];
    overhead2 = self->over_hang_max_2[1];
    individualTest = self->individual[1];
    totalTest = self->total[1];
  } else {
    overhead1 = self->over_hang_max_1[2];
    overhead2 = self->over_hang_max_2[2];
    individualTest = self->individual[2];
    totalTest = self->total[2];
  }

  if (total_power > kMinEnergy) {
    // The signal power of the current frame is large enough for processing.
    // The processing consists of two parts:
    // 1) Calculating the likelihood of speech and thereby a VAD decision.
    // 2) Updating the underlying model, w.r.t., the decision made.

    // The detection scheme is an LRT with hypothesis
    // H0: Noise
    // H1: Speech
    //
    // We combine a global LRT with local tests, for each frequency sub-band,
    // here defined as |channel|.
    for (channel = 0; channel < kNumChannels; channel++) {
      // For each channel we model the probability with a GMM consisting of
      // |kNumGaussians|, with different means and standard deviations
      // depending on H0 or H1.
      h0_test = 0;
      h1_test = 0;
      for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;
        // Probability under H0, that is, probability of frame being noise.
        // Value given in Q27 = Q7 * Q20.
        tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
                                                 self->noise_means[gaussian],
                                                 self->noise_stds[gaussian],
                                                 &deltaN[gaussian]);
        noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32;
        h0_test += noise_probability[k];  // Q27

        // Probability under H1, that is, probability of frame being speech.
        // Value given in Q27 = Q7 * Q20.
        tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
                                                 self->speech_means[gaussian],
                                                 self->speech_stds[gaussian],
                                                 &deltaS[gaussian]);
        speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32;
        h1_test += speech_probability[k];  // Q27
      }

      // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H0}).
      // Approximation:
      // log2(Pr{X|H1} / Pr{X|H0}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H0}*2^Q)
      //                           = log2(h1_test) - log2(h0_test)
      //                           = log2(2^(31-shifts_h1)*(1+b1))
      //                             - log2(2^(31-shifts_h0)*(1+b0))
      //                           = shifts_h0 - shifts_h1
      //                             + log2(1+b1) - log2(1+b0)
      //                          ~= shifts_h0 - shifts_h1
      //
      // Note that b0 and b1 are values less than 1, hence,
      // 0 <= log2(1+b0) < 1. Further, b0 and b1 are independent and on the
      // average the two terms cancel.
      shifts_h0 = WebRtcSpl_NormW32(h0_test);
      shifts_h1 = WebRtcSpl_NormW32(h1_test);
      if (h0_test == 0) {
        shifts_h0 = 31;
      }
      if (h1_test == 0) {
        shifts_h1 = 31;
      }
      log_likelihood_ratio = shifts_h0 - shifts_h1;

      // Update |sum_log_likelihood_ratios| with spectrum weighting. This is
      // used for the global VAD decision.
      sum_log_likelihood_ratios +=
          (int32_t) (log_likelihood_ratio * kSpectrumWeight[channel]);

      // Local VAD decision.
      if ((log_likelihood_ratio << 2) > individualTest) {
        vadflag = 1;
      }
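      // Worked example of the local test above: in quality mode with 10 ms
      // frames, individualTest = kLocalThresholdQ[0] = 24, so the channel
      // votes for speech when log_likelihood_ratio > 6, i.e. when the
      // likelihood ratio Pr{X|H1} / Pr{X|H0} exceeds roughly 2^6 = 64.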
      // TODO(bjornv): The conditional probabilities below are applied on the
      // hard coded number of Gaussians set to two. Find a way to generalize.
      // Calculate local noise probabilities used later when updating the GMM.
      h0 = (int16_t) (h0_test >> 12);  // Q15
      if (h0 > 0) {
        // High probability of noise. Assign conditional probabilities for
        // each Gaussian in the GMM.
        tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2;  // Q29
        ngprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h0);  // Q14
        ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel];
      } else {
        // Low noise probability. Assign conditional probability 1 to the
        // first Gaussian and 0 to the rest (which is already set at
        // initialization).
        ngprvec[channel] = 16384;
      }

      // Calculate local speech probabilities used later when updating the
      // GMM.
      h1 = (int16_t) (h1_test >> 12);  // Q15
      if (h1 > 0) {
        // High probability of speech. Assign conditional probabilities for
        // each Gaussian in the GMM. Otherwise use the initialized values,
        // i.e., 0.
        tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2;  // Q29
        sgprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h1);  // Q14
        sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel];
      }
    }

    // Make a global VAD decision.
    vadflag |= (sum_log_likelihood_ratios >= totalTest);

    // Update the model parameters.
    maxspe = 12800;
    for (channel = 0; channel < kNumChannels; channel++) {

      // Get minimum value in past which is used for long term correction in
      // Q4.
      feature_minimum = WebRtcVad_FindMinimum(self, features[channel], channel);

      // Compute the "global" mean, that is the sum of the two means weighted.
      noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                          &kNoiseDataWeights[channel]);
      tmp1_s16 = (int16_t) (noise_global_mean >> 6);  // Q8

      for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;

        nmk = self->noise_means[gaussian];
        smk = self->speech_means[gaussian];
        nsk = self->noise_stds[gaussian];
        ssk = self->speech_stds[gaussian];

        // Update noise mean vector if the frame consists of noise only.
        nmk2 = nmk;
        if (!vadflag) {
          // deltaN = (x-mu)/sigma^2
          // ngprvec[k] = |noise_probability[k]| /
          //   (|noise_probability[0]| + |noise_probability[1]|)

          // (Q14 * Q11 >> 11) = Q14.
          delt = (int16_t)((ngprvec[gaussian] * deltaN[gaussian]) >> 11);
          // Q7 + (Q14 * Q15 >> 22) = Q7.
          nmk2 = nmk + (int16_t)((delt * kNoiseUpdateConst) >> 22);
        }

        // Long term correction of the noise mean.
        // Q8 - Q8 = Q8.
        ndelt = (feature_minimum << 4) - tmp1_s16;
        // Q7 + (Q8 * Q8) >> 9 = Q7.
        nmk3 = nmk2 + (int16_t)((ndelt * kBackEta) >> 9);

        // Control that the noise mean does not drift too much.
        tmp_s16 = (int16_t) ((k + 5) << 7);
        if (nmk3 < tmp_s16) {
          nmk3 = tmp_s16;
        }
        tmp_s16 = (int16_t) ((72 + k - channel) << 7);
        if (nmk3 > tmp_s16) {
          nmk3 = tmp_s16;
        }
        self->noise_means[gaussian] = nmk3;

        if (vadflag) {
          // Update speech mean vector:
          // |deltaS| = (x-mu)/sigma^2
          // sgprvec[k] = |speech_probability[k]| /
          //   (|speech_probability[0]| + |speech_probability[1]|)

          // (Q14 * Q11) >> 11 = Q14.
          delt = (int16_t)((sgprvec[gaussian] * deltaS[gaussian]) >> 11);
          // Q14 * Q15 >> 21 = Q8.
          tmp_s16 = (int16_t)((delt * kSpeechUpdateConst) >> 21);
          // Q7 + (Q8 >> 1) = Q7. With rounding.
          smk2 = smk + ((tmp_s16 + 1) >> 1);

          // Control that the speech mean does not drift too much.
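          // (640 in Q7 corresponds to 5.0, so the clamp below keeps the
          // updated mean within [kMinimumMean[k], maxspe + 5.0 in Q7].)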
          maxmu = maxspe + 640;
          if (smk2 < kMinimumMean[k]) {
            smk2 = kMinimumMean[k];
          }
          if (smk2 > maxmu) {
            smk2 = maxmu;
          }
          self->speech_means[gaussian] = smk2;  // Q7.

          // (Q7 >> 3) = Q4. With rounding.
          tmp_s16 = ((smk + 4) >> 3);

          tmp_s16 = features[channel] - tmp_s16;  // Q4
          // (Q11 * Q4 >> 3) = Q12.
          tmp1_s32 = (deltaS[gaussian] * tmp_s16) >> 3;
          tmp2_s32 = tmp1_s32 - 4096;
          tmp_s16 = sgprvec[gaussian] >> 2;
          // (Q14 >> 2) * Q12 = Q24.
          tmp1_s32 = tmp_s16 * tmp2_s32;

          tmp2_s32 = tmp1_s32 >> 4;  // Q20

          // 0.1 * Q20 / Q7 = Q13.
          if (tmp2_s32 > 0) {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp2_s32, ssk * 10);
          } else {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp2_s32, ssk * 10);
            tmp_s16 = -tmp_s16;
          }
          // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
          // Note that division by 4 equals shift by 2, hence,
          // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
          tmp_s16 += 128;  // Rounding.
          ssk += (tmp_s16 >> 8);
          if (ssk < kMinStd) {
            ssk = kMinStd;
          }
          self->speech_stds[gaussian] = ssk;
        } else {
          // Update GMM variance vectors.
          // deltaN * (features[channel] - nmk) - 1
          // Q4 - (Q7 >> 3) = Q4.
          tmp_s16 = features[channel] - (nmk >> 3);
          // (Q11 * Q4 >> 3) = Q12.
          tmp1_s32 = (deltaN[gaussian] * tmp_s16) >> 3;
          tmp1_s32 -= 4096;

          // (Q14 >> 2) * Q12 = Q24.
          tmp_s16 = (ngprvec[gaussian] + 2) >> 2;
          tmp2_s32 = tmp_s16 * tmp1_s32;
          // Q20 * approx 0.001 (2^-10 = 0.0009766), hence,
          // (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
          tmp1_s32 = tmp2_s32 >> 14;

          // Q20 / Q7 = Q13.
          if (tmp1_s32 > 0) {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, nsk);
          } else {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp1_s32, nsk);
            tmp_s16 = -tmp_s16;
          }
          tmp_s16 += 32;  // Rounding
          nsk += tmp_s16 >> 6;  // Q13 >> 6 = Q7.
          if (nsk < kMinStd) {
            nsk = kMinStd;
          }
          self->noise_stds[gaussian] = nsk;
        }
      }

      // Separate models if they are too close.
      // |noise_global_mean| in Q14 (= Q7 * Q7).
      noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                          &kNoiseDataWeights[channel]);

      // |speech_global_mean| in Q14 (= Q7 * Q7).
      speech_global_mean = WeightedAverage(&self->speech_means[channel], 0,
                                           &kSpeechDataWeights[channel]);

      // |diff| = "global" speech mean - "global" noise mean.
      // (Q14 >> 9) - (Q14 >> 9) = Q5.
      diff = (int16_t) (speech_global_mean >> 9) -
             (int16_t) (noise_global_mean >> 9);
      if (diff < kMinimumDifference[channel]) {
        tmp_s16 = kMinimumDifference[channel] - diff;

        // |tmp1_s16| = ~0.8 * (kMinimumDifference - diff) in Q7.
        // |tmp2_s16| = ~0.2 * (kMinimumDifference - diff) in Q7.
        tmp1_s16 = (int16_t)((13 * tmp_s16) >> 2);
        tmp2_s16 = (int16_t)((3 * tmp_s16) >> 2);
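        // Example: a separation deficit of 32 in Q5 (= 1.0) gives
        // tmp1_s16 = (13 * 32) >> 2 = 104 in Q7 (~0.81) and
        // tmp2_s16 = (3 * 32) >> 2 = 24 in Q7 (~0.19); together the two
        // moves below restore the full missing separation.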
        // Move Gaussian means for speech model by |tmp1_s16| and update
        // |speech_global_mean|. Note that |self->speech_means[channel]| is
        // changed after the call.
        speech_global_mean = WeightedAverage(&self->speech_means[channel],
                                             tmp1_s16,
                                             &kSpeechDataWeights[channel]);

        // Move Gaussian means for noise model by -|tmp2_s16| and update
        // |noise_global_mean|. Note that |self->noise_means[channel]| is
        // changed after the call.
        noise_global_mean = WeightedAverage(&self->noise_means[channel],
                                            -tmp2_s16,
                                            &kNoiseDataWeights[channel]);
      }

      // Control that the speech & noise means do not drift too much.
      maxspe = kMaximumSpeech[channel];
      tmp2_s16 = (int16_t) (speech_global_mean >> 7);
      if (tmp2_s16 > maxspe) {
        // Upper limit of speech model.
        tmp2_s16 -= maxspe;

        for (k = 0; k < kNumGaussians; k++) {
          self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
        }
      }

      tmp2_s16 = (int16_t) (noise_global_mean >> 7);
      if (tmp2_s16 > kMaximumNoise[channel]) {
        tmp2_s16 -= kMaximumNoise[channel];

        for (k = 0; k < kNumGaussians; k++) {
          self->noise_means[channel + k * kNumChannels] -= tmp2_s16;
        }
      }
    }
    self->frame_counter++;
  }

  // Smooth with respect to transition hysteresis.
  if (!vadflag) {
    if (self->over_hang > 0) {
      vadflag = 2 + self->over_hang;
      self->over_hang--;
    }
    self->num_of_speech = 0;
  } else {
    self->num_of_speech++;
    if (self->num_of_speech > kMaxSpeechFrames) {
      self->num_of_speech = kMaxSpeechFrames;
      self->over_hang = overhead2;
    } else {
      self->over_hang = overhead1;
    }
  }
  return vadflag;
}

// Initialize the VAD. Set aggressiveness mode to default value.
int WebRtcVad_InitCore(VadInstT* self) {
  int i;

  if (self == NULL) {
    return -1;
  }

  // Initialization of general struct variables.
  self->vad = 1;  // Speech active (=1).
  self->frame_counter = 0;
  self->over_hang = 0;
  self->num_of_speech = 0;

  // Initialization of downsampling filter state.
  memset(self->downsampling_filter_states, 0,
         sizeof(self->downsampling_filter_states));

  // Initialization of 48 to 8 kHz downsampling.
  WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);

  // Read initial PDF parameters.
  for (i = 0; i < kTableSize; i++) {
    self->noise_means[i] = kNoiseDataMeans[i];
    self->speech_means[i] = kSpeechDataMeans[i];
    self->noise_stds[i] = kNoiseDataStds[i];
    self->speech_stds[i] = kSpeechDataStds[i];
  }

  // Initialize Index and Minimum value vectors.
  for (i = 0; i < 16 * kNumChannels; i++) {
    self->low_value_vector[i] = 10000;
    self->index_vector[i] = 0;
  }

  // Initialize splitting filter states.
  memset(self->upper_state, 0, sizeof(self->upper_state));
  memset(self->lower_state, 0, sizeof(self->lower_state));

  // Initialize high pass filter states.
  memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));

  // Initialize mean value memory, for WebRtcVad_FindMinimum().
  for (i = 0; i < kNumChannels; i++) {
    self->mean_value[i] = 1600;
  }

  // Set aggressiveness mode to default (=|kDefaultMode|).
  if (WebRtcVad_set_mode_core(self, kDefaultMode) != 0) {
    return -1;
  }

  self->init_flag = kInitCheck;

  return 0;
}
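// A minimal usage sketch of the core API (normally reached through the
// public wrappers in webrtc_vad.c); |frame| is a hypothetical buffer of
// 80 samples, i.e. 10 ms at 8 kHz:
//
//   VadInstT vad;
//   if (WebRtcVad_InitCore(&vad) == 0 &&
//       WebRtcVad_set_mode_core(&vad, 2) == 0) {  // Mode 2, aggressive.
//     int vad_decision = WebRtcVad_CalcVad8khz(&vad, frame, 80);
//     // |vad_decision| is 0 for noise, nonzero for speech.
//   }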
// Set aggressiveness mode
int WebRtcVad_set_mode_core(VadInstT* self, int mode) {
  int return_value = 0;

  switch (mode) {
    case 0:
      // Quality mode.
      memcpy(self->over_hang_max_1, kOverHangMax1Q,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2Q,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdQ,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdQ,
             sizeof(self->total));
      break;
    case 1:
      // Low bitrate mode.
      memcpy(self->over_hang_max_1, kOverHangMax1LBR,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2LBR,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdLBR,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdLBR,
             sizeof(self->total));
      break;
    case 2:
      // Aggressive mode.
      memcpy(self->over_hang_max_1, kOverHangMax1AGG,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2AGG,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdAGG,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdAGG,
             sizeof(self->total));
      break;
    case 3:
      // Very aggressive mode.
      memcpy(self->over_hang_max_1, kOverHangMax1VAG,
             sizeof(self->over_hang_max_1));
      memcpy(self->over_hang_max_2, kOverHangMax2VAG,
             sizeof(self->over_hang_max_2));
      memcpy(self->individual, kLocalThresholdVAG,
             sizeof(self->individual));
      memcpy(self->total, kGlobalThresholdVAG,
             sizeof(self->total));
      break;
    default:
      return_value = -1;
      break;
  }

  return return_value;
}

// Calculates the VAD decision by first extracting feature values and then
// calculating the probability for both speech and background noise.

int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length) {
  int vad;
  size_t i;
  int16_t speech_nb[240];  // 30 ms in 8 kHz.
  // |tmp_mem| is a temporary memory used by the resample function, length is
  // frame length in 10 ms (480 samples) + 256 extra.
  int32_t tmp_mem[480 + 256] = { 0 };
  const size_t kFrameLen10ms48khz = 480;
  const size_t kFrameLen10ms8khz = 80;
  size_t num_10ms_frames = frame_length / kFrameLen10ms48khz;

  for (i = 0; i < num_10ms_frames; i++) {
    // Resample one 10 ms block at a time, advancing both the input and the
    // output pointer by one block per iteration.
    WebRtcSpl_Resample48khzTo8khz(&speech_frame[i * kFrameLen10ms48khz],
                                  &speech_nb[i * kFrameLen10ms8khz],
                                  &inst->state_48_to_8,
                                  tmp_mem);
  }

  // Do VAD on an 8 kHz signal.
  vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6);

  return vad;
}

int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length) {
  size_t len;
  int vad;
  int16_t speechWB[480];  // Downsampled frame: 480 samples (30 ms at 16 kHz).
  int16_t speechNB[240];  // Downsampled frame: 240 samples (30 ms at 8 kHz).

  // Downsample signal 32->16->8 kHz before doing VAD.
  WebRtcVad_Downsampling(speech_frame, speechWB,
                         &(inst->downsampling_filter_states[2]), frame_length);
  len = frame_length / 2;

  WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states,
                         len);
  len /= 2;

  // Do VAD on an 8 kHz signal.
  vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

  return vad;
}

int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length) {
  size_t len;
  int vad;
  int16_t speechNB[240];  // Downsampled frame: 240 samples (30 ms at 8 kHz).

  // Wideband: Downsample signal before doing VAD.
  WebRtcVad_Downsampling(speech_frame, speechNB,
                         inst->downsampling_filter_states, frame_length);

  len = frame_length / 2;
  vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

  return vad;
}
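// The 8 kHz core below expects 80, 160 or 240 samples (10, 20 or 30 ms);
// the wrappers above reduce the input length accordingly: by 2 from 16 kHz,
// by 4 from 32 kHz and by 6 from 48 kHz.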
int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame,
                          size_t frame_length) {
  int16_t feature_vector[kNumChannels], total_power;

  // Get power in the bands.
  total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
                                            feature_vector);

  // Make a VAD decision.
  inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length);

  return inst->vad;
}
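// Note on the decision value stored in |inst->vad| and returned above:
// GmmProbability() returns 0 for noise, 1 for speech, and 2 + |over_hang|
// during the hang-over period following speech, so callers should treat any
// nonzero value as speech.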