1 /* 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "webrtc/common_audio/vad/vad_filterbank.h" 12 13 #include <assert.h> 14 15 #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 16 #include "webrtc/typedefs.h" 17 18 // Constants used in LogOfEnergy(). 19 static const int16_t kLogConst = 24660; // 160*log10(2) in Q9. 20 static const int16_t kLogEnergyIntPart = 14336; // 14 in Q10 21 22 // Coefficients used by HighPassFilter, Q14. 23 static const int16_t kHpZeroCoefs[3] = { 6631, -13262, 6631 }; 24 static const int16_t kHpPoleCoefs[3] = { 16384, -7756, 5620 }; 25 26 // Allpass filter coefficients, upper and lower, in Q15. 27 // Upper: 0.64, Lower: 0.17 28 static const int16_t kAllPassCoefsQ15[2] = { 20972, 5571 }; 29 30 // Adjustment for division with two in SplitFilter. 31 static const int16_t kOffsetVector[6] = { 368, 368, 272, 176, 176, 176 }; 32 33 // High pass filtering, with a cut-off frequency at 80 Hz, if the |data_in| is 34 // sampled at 500 Hz. 35 // 36 // - data_in [i] : Input audio data sampled at 500 Hz. 37 // - data_length [i] : Length of input and output data. 38 // - filter_state [i/o] : State of the filter. 39 // - data_out [o] : Output audio data in the frequency interval 40 // 80 - 250 Hz. 41 static void HighPassFilter(const int16_t* data_in, int data_length, 42 int16_t* filter_state, int16_t* data_out) { 43 int i; 44 const int16_t* in_ptr = data_in; 45 int16_t* out_ptr = data_out; 46 int32_t tmp32 = 0; 47 48 49 // The sum of the absolute values of the impulse response: 50 // The zero/pole-filter has a max amplification of a single sample of: 1.4546 51 // Impulse response: 0.4047 -0.6179 -0.0266 0.1993 0.1035 -0.0194 52 // The all-zero section has a max amplification of a single sample of: 1.6189 53 // Impulse response: 0.4047 -0.8094 0.4047 0 0 0 54 // The all-pole section has a max amplification of a single sample of: 1.9931 55 // Impulse response: 1.0000 0.4734 -0.1189 -0.2187 -0.0627 0.04532 56 57 for (i = 0; i < data_length; i++) { 58 // All-zero section (filter coefficients in Q14). 59 tmp32 = WEBRTC_SPL_MUL_16_16(kHpZeroCoefs[0], *in_ptr); 60 tmp32 += WEBRTC_SPL_MUL_16_16(kHpZeroCoefs[1], filter_state[0]); 61 tmp32 += WEBRTC_SPL_MUL_16_16(kHpZeroCoefs[2], filter_state[1]); 62 filter_state[1] = filter_state[0]; 63 filter_state[0] = *in_ptr++; 64 65 // All-pole section (filter coefficients in Q14). 66 tmp32 -= WEBRTC_SPL_MUL_16_16(kHpPoleCoefs[1], filter_state[2]); 67 tmp32 -= WEBRTC_SPL_MUL_16_16(kHpPoleCoefs[2], filter_state[3]); 68 filter_state[3] = filter_state[2]; 69 filter_state[2] = (int16_t) (tmp32 >> 14); 70 *out_ptr++ = filter_state[2]; 71 } 72 } 73 74 // All pass filtering of |data_in|, used before splitting the signal into two 75 // frequency bands (low pass vs high pass). 76 // Note that |data_in| and |data_out| can NOT correspond to the same address. 77 // 78 // - data_in [i] : Input audio signal given in Q0. 79 // - data_length [i] : Length of input and output data. 80 // - filter_coefficient [i] : Given in Q15. 81 // - filter_state [i/o] : State of the filter given in Q(-1). 82 // - data_out [o] : Output audio signal given in Q(-1). 83 static void AllPassFilter(const int16_t* data_in, int data_length, 84 int16_t filter_coefficient, int16_t* filter_state, 85 int16_t* data_out) { 86 // The filter can only cause overflow (in the w16 output variable) 87 // if more than 4 consecutive input numbers are of maximum value and 88 // has the the same sign as the impulse responses first taps. 89 // First 6 taps of the impulse response: 90 // 0.6399 0.5905 -0.3779 0.2418 -0.1547 0.0990 91 92 int i; 93 int16_t tmp16 = 0; 94 int32_t tmp32 = 0; 95 int32_t state32 = ((int32_t) (*filter_state) << 16); // Q15 96 97 for (i = 0; i < data_length; i++) { 98 tmp32 = state32 + WEBRTC_SPL_MUL_16_16(filter_coefficient, *data_in); 99 tmp16 = (int16_t) (tmp32 >> 16); // Q(-1) 100 *data_out++ = tmp16; 101 state32 = (((int32_t) (*data_in)) << 14); // Q14 102 state32 -= WEBRTC_SPL_MUL_16_16(filter_coefficient, tmp16); // Q14 103 state32 <<= 1; // Q15. 104 data_in += 2; 105 } 106 107 *filter_state = (int16_t) (state32 >> 16); // Q(-1) 108 } 109 110 // Splits |data_in| into |hp_data_out| and |lp_data_out| corresponding to 111 // an upper (high pass) part and a lower (low pass) part respectively. 112 // 113 // - data_in [i] : Input audio data to be split into two frequency bands. 114 // - data_length [i] : Length of |data_in|. 115 // - upper_state [i/o] : State of the upper filter, given in Q(-1). 116 // - lower_state [i/o] : State of the lower filter, given in Q(-1). 117 // - hp_data_out [o] : Output audio data of the upper half of the spectrum. 118 // The length is |data_length| / 2. 119 // - lp_data_out [o] : Output audio data of the lower half of the spectrum. 120 // The length is |data_length| / 2. 121 static void SplitFilter(const int16_t* data_in, int data_length, 122 int16_t* upper_state, int16_t* lower_state, 123 int16_t* hp_data_out, int16_t* lp_data_out) { 124 int i; 125 int half_length = data_length >> 1; // Downsampling by 2. 126 int16_t tmp_out; 127 128 // All-pass filtering upper branch. 129 AllPassFilter(&data_in[0], half_length, kAllPassCoefsQ15[0], upper_state, 130 hp_data_out); 131 132 // All-pass filtering lower branch. 133 AllPassFilter(&data_in[1], half_length, kAllPassCoefsQ15[1], lower_state, 134 lp_data_out); 135 136 // Make LP and HP signals. 137 for (i = 0; i < half_length; i++) { 138 tmp_out = *hp_data_out; 139 *hp_data_out++ -= *lp_data_out; 140 *lp_data_out++ += tmp_out; 141 } 142 } 143 144 // Calculates the energy of |data_in| in dB, and also updates an overall 145 // |total_energy| if necessary. 146 // 147 // - data_in [i] : Input audio data for energy calculation. 148 // - data_length [i] : Length of input data. 149 // - offset [i] : Offset value added to |log_energy|. 150 // - total_energy [i/o] : An external energy updated with the energy of 151 // |data_in|. 152 // NOTE: |total_energy| is only updated if 153 // |total_energy| <= |kMinEnergy|. 154 // - log_energy [o] : 10 * log10("energy of |data_in|") given in Q4. 155 static void LogOfEnergy(const int16_t* data_in, int data_length, 156 int16_t offset, int16_t* total_energy, 157 int16_t* log_energy) { 158 // |tot_rshifts| accumulates the number of right shifts performed on |energy|. 159 int tot_rshifts = 0; 160 // The |energy| will be normalized to 15 bits. We use unsigned integer because 161 // we eventually will mask out the fractional part. 162 uint32_t energy = 0; 163 164 assert(data_in != NULL); 165 assert(data_length > 0); 166 167 energy = (uint32_t) WebRtcSpl_Energy((int16_t*) data_in, data_length, 168 &tot_rshifts); 169 170 if (energy != 0) { 171 // By construction, normalizing to 15 bits is equivalent with 17 leading 172 // zeros of an unsigned 32 bit value. 173 int normalizing_rshifts = 17 - WebRtcSpl_NormU32(energy); 174 // In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is 175 // (14 << 10), which is what we initialize |log2_energy| with. For a more 176 // detailed derivations, see below. 177 int16_t log2_energy = kLogEnergyIntPart; 178 179 tot_rshifts += normalizing_rshifts; 180 // Normalize |energy| to 15 bits. 181 // |tot_rshifts| is now the total number of right shifts performed on 182 // |energy| after normalization. This means that |energy| is in 183 // Q(-tot_rshifts). 184 if (normalizing_rshifts < 0) { 185 energy <<= -normalizing_rshifts; 186 } else { 187 energy >>= normalizing_rshifts; 188 } 189 190 // Calculate the energy of |data_in| in dB, in Q4. 191 // 192 // 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") = 193 // 160 * log10(|energy| * 2^|tot_rshifts|) = 194 // 160 * log10(2) * log2(|energy| * 2^|tot_rshifts|) = 195 // 160 * log10(2) * (log2(|energy|) + log2(2^|tot_rshifts|)) = 196 // (160 * log10(2)) * (log2(|energy|) + |tot_rshifts|) = 197 // |kLogConst| * (|log2_energy| + |tot_rshifts|) 198 // 199 // We know by construction that |energy| is normalized to 15 bits. Hence, 200 // |energy| = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15. 201 // Further, we'd like |log2_energy| in Q10 202 // log2(|energy|) in Q10 = 2^10 * log2(2^14 + frac_Q15) = 203 // 2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) = 204 // 2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~= 205 // (14 << 10) + 2^10 * (frac_Q15 * 2^-14) = 206 // (14 << 10) + (frac_Q15 * 2^-4) = (14 << 10) + (frac_Q15 >> 4) 207 // 208 // Note that frac_Q15 = (|energy| & 0x00003FFF) 209 210 // Calculate and add the fractional part to |log2_energy|. 211 log2_energy += (int16_t) ((energy & 0x00003FFF) >> 4); 212 213 // |kLogConst| is in Q9, |log2_energy| in Q10 and |tot_rshifts| in Q0. 214 // Note that we in our derivation above have accounted for an output in Q4. 215 *log_energy = (int16_t) (WEBRTC_SPL_MUL_16_16_RSFT( 216 kLogConst, log2_energy, 19) + 217 WEBRTC_SPL_MUL_16_16_RSFT(tot_rshifts, kLogConst, 9)); 218 219 if (*log_energy < 0) { 220 *log_energy = 0; 221 } 222 } else { 223 *log_energy = offset; 224 return; 225 } 226 227 *log_energy += offset; 228 229 // Update the approximate |total_energy| with the energy of |data_in|, if 230 // |total_energy| has not exceeded |kMinEnergy|. |total_energy| is used as an 231 // energy indicator in WebRtcVad_GmmProbability() in vad_core.c. 232 if (*total_energy <= kMinEnergy) { 233 if (tot_rshifts >= 0) { 234 // We know by construction that the |energy| > |kMinEnergy| in Q0, so add 235 // an arbitrary value such that |total_energy| exceeds |kMinEnergy|. 236 *total_energy += kMinEnergy + 1; 237 } else { 238 // By construction |energy| is represented by 15 bits, hence any number of 239 // right shifted |energy| will fit in an int16_t. In addition, adding the 240 // value to |total_energy| is wrap around safe as long as 241 // |kMinEnergy| < 8192. 242 *total_energy += (int16_t) (energy >> -tot_rshifts); // Q0. 243 } 244 } 245 } 246 247 int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in, 248 int data_length, int16_t* features) { 249 int16_t total_energy = 0; 250 // We expect |data_length| to be 80, 160 or 240 samples, which corresponds to 251 // 10, 20 or 30 ms in 8 kHz. Therefore, the intermediate downsampled data will 252 // have at most 120 samples after the first split and at most 60 samples after 253 // the second split. 254 int16_t hp_120[120], lp_120[120]; 255 int16_t hp_60[60], lp_60[60]; 256 const int half_data_length = data_length >> 1; 257 int length = half_data_length; // |data_length| / 2, corresponds to 258 // bandwidth = 2000 Hz after downsampling. 259 260 // Initialize variables for the first SplitFilter(). 261 int frequency_band = 0; 262 const int16_t* in_ptr = data_in; // [0 - 4000] Hz. 263 int16_t* hp_out_ptr = hp_120; // [2000 - 4000] Hz. 264 int16_t* lp_out_ptr = lp_120; // [0 - 2000] Hz. 265 266 assert(data_length >= 0); 267 assert(data_length <= 240); 268 assert(4 < kNumChannels - 1); // Checking maximum |frequency_band|. 269 270 // Split at 2000 Hz and downsample. 271 SplitFilter(in_ptr, data_length, &self->upper_state[frequency_band], 272 &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr); 273 274 // For the upper band (2000 Hz - 4000 Hz) split at 3000 Hz and downsample. 275 frequency_band = 1; 276 in_ptr = hp_120; // [2000 - 4000] Hz. 277 hp_out_ptr = hp_60; // [3000 - 4000] Hz. 278 lp_out_ptr = lp_60; // [2000 - 3000] Hz. 279 SplitFilter(in_ptr, length, &self->upper_state[frequency_band], 280 &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr); 281 282 // Energy in 3000 Hz - 4000 Hz. 283 length >>= 1; // |data_length| / 4 <=> bandwidth = 1000 Hz. 284 285 LogOfEnergy(hp_60, length, kOffsetVector[5], &total_energy, &features[5]); 286 287 // Energy in 2000 Hz - 3000 Hz. 288 LogOfEnergy(lp_60, length, kOffsetVector[4], &total_energy, &features[4]); 289 290 // For the lower band (0 Hz - 2000 Hz) split at 1000 Hz and downsample. 291 frequency_band = 2; 292 in_ptr = lp_120; // [0 - 2000] Hz. 293 hp_out_ptr = hp_60; // [1000 - 2000] Hz. 294 lp_out_ptr = lp_60; // [0 - 1000] Hz. 295 length = half_data_length; // |data_length| / 2 <=> bandwidth = 2000 Hz. 296 SplitFilter(in_ptr, length, &self->upper_state[frequency_band], 297 &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr); 298 299 // Energy in 1000 Hz - 2000 Hz. 300 length >>= 1; // |data_length| / 4 <=> bandwidth = 1000 Hz. 301 LogOfEnergy(hp_60, length, kOffsetVector[3], &total_energy, &features[3]); 302 303 // For the lower band (0 Hz - 1000 Hz) split at 500 Hz and downsample. 304 frequency_band = 3; 305 in_ptr = lp_60; // [0 - 1000] Hz. 306 hp_out_ptr = hp_120; // [500 - 1000] Hz. 307 lp_out_ptr = lp_120; // [0 - 500] Hz. 308 SplitFilter(in_ptr, length, &self->upper_state[frequency_band], 309 &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr); 310 311 // Energy in 500 Hz - 1000 Hz. 312 length >>= 1; // |data_length| / 8 <=> bandwidth = 500 Hz. 313 LogOfEnergy(hp_120, length, kOffsetVector[2], &total_energy, &features[2]); 314 315 // For the lower band (0 Hz - 500 Hz) split at 250 Hz and downsample. 316 frequency_band = 4; 317 in_ptr = lp_120; // [0 - 500] Hz. 318 hp_out_ptr = hp_60; // [250 - 500] Hz. 319 lp_out_ptr = lp_60; // [0 - 250] Hz. 320 SplitFilter(in_ptr, length, &self->upper_state[frequency_band], 321 &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr); 322 323 // Energy in 250 Hz - 500 Hz. 324 length >>= 1; // |data_length| / 16 <=> bandwidth = 250 Hz. 325 LogOfEnergy(hp_60, length, kOffsetVector[1], &total_energy, &features[1]); 326 327 // Remove 0 Hz - 80 Hz, by high pass filtering the lower band. 328 HighPassFilter(lp_60, length, self->hp_filter_state, hp_120); 329 330 // Energy in 80 Hz - 250 Hz. 331 LogOfEnergy(hp_120, length, kOffsetVector[0], &total_energy, &features[0]); 332 333 return total_energy; 334 } 335