Home | History | Annotate | Download | only in vad
      1 /*
      2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "webrtc/common_audio/vad/vad_filterbank.h"
     12 
     13 #include <assert.h>
     14 
     15 #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
     16 #include "webrtc/typedefs.h"
     17 
     18 // Constants used in LogOfEnergy().
     19 static const int16_t kLogConst = 24660;  // 160*log10(2) in Q9.
     20 static const int16_t kLogEnergyIntPart = 14336;  // 14 in Q10
     21 
     22 // Coefficients used by HighPassFilter, Q14.
     23 static const int16_t kHpZeroCoefs[3] = { 6631, -13262, 6631 };
     24 static const int16_t kHpPoleCoefs[3] = { 16384, -7756, 5620 };
     25 
     26 // Allpass filter coefficients, upper and lower, in Q15.
     27 // Upper: 0.64, Lower: 0.17
     28 static const int16_t kAllPassCoefsQ15[2] = { 20972, 5571 };
     29 
     30 // Adjustment for division with two in SplitFilter.
     31 static const int16_t kOffsetVector[6] = { 368, 368, 272, 176, 176, 176 };
     32 
     33 // High pass filtering, with a cut-off frequency at 80 Hz, if the |data_in| is
     34 // sampled at 500 Hz.
     35 //
     36 // - data_in      [i]   : Input audio data sampled at 500 Hz.
     37 // - data_length  [i]   : Length of input and output data.
     38 // - filter_state [i/o] : State of the filter.
     39 // - data_out     [o]   : Output audio data in the frequency interval
     40 //                        80 - 250 Hz.
     41 static void HighPassFilter(const int16_t* data_in, int data_length,
     42                            int16_t* filter_state, int16_t* data_out) {
     43   int i;
     44   const int16_t* in_ptr = data_in;
     45   int16_t* out_ptr = data_out;
     46   int32_t tmp32 = 0;
     47 
     48 
     49   // The sum of the absolute values of the impulse response:
     50   // The zero/pole-filter has a max amplification of a single sample of: 1.4546
     51   // Impulse response: 0.4047 -0.6179 -0.0266  0.1993  0.1035  -0.0194
     52   // The all-zero section has a max amplification of a single sample of: 1.6189
     53   // Impulse response: 0.4047 -0.8094  0.4047  0       0        0
     54   // The all-pole section has a max amplification of a single sample of: 1.9931
     55   // Impulse response: 1.0000  0.4734 -0.1189 -0.2187 -0.0627   0.04532
     56 
     57   for (i = 0; i < data_length; i++) {
     58     // All-zero section (filter coefficients in Q14).
     59     tmp32 = WEBRTC_SPL_MUL_16_16(kHpZeroCoefs[0], *in_ptr);
     60     tmp32 += WEBRTC_SPL_MUL_16_16(kHpZeroCoefs[1], filter_state[0]);
     61     tmp32 += WEBRTC_SPL_MUL_16_16(kHpZeroCoefs[2], filter_state[1]);
     62     filter_state[1] = filter_state[0];
     63     filter_state[0] = *in_ptr++;
     64 
     65     // All-pole section (filter coefficients in Q14).
     66     tmp32 -= WEBRTC_SPL_MUL_16_16(kHpPoleCoefs[1], filter_state[2]);
     67     tmp32 -= WEBRTC_SPL_MUL_16_16(kHpPoleCoefs[2], filter_state[3]);
     68     filter_state[3] = filter_state[2];
     69     filter_state[2] = (int16_t) (tmp32 >> 14);
     70     *out_ptr++ = filter_state[2];
     71   }
     72 }
     73 
     74 // All pass filtering of |data_in|, used before splitting the signal into two
     75 // frequency bands (low pass vs high pass).
     76 // Note that |data_in| and |data_out| can NOT correspond to the same address.
     77 //
     78 // - data_in            [i]   : Input audio signal given in Q0.
     79 // - data_length        [i]   : Length of input and output data.
     80 // - filter_coefficient [i]   : Given in Q15.
     81 // - filter_state       [i/o] : State of the filter given in Q(-1).
     82 // - data_out           [o]   : Output audio signal given in Q(-1).
     83 static void AllPassFilter(const int16_t* data_in, int data_length,
     84                           int16_t filter_coefficient, int16_t* filter_state,
     85                           int16_t* data_out) {
     86   // The filter can only cause overflow (in the w16 output variable)
     87   // if more than 4 consecutive input numbers are of maximum value and
     88   // has the the same sign as the impulse responses first taps.
     89   // First 6 taps of the impulse response:
     90   // 0.6399 0.5905 -0.3779 0.2418 -0.1547 0.0990
     91 
     92   int i;
     93   int16_t tmp16 = 0;
     94   int32_t tmp32 = 0;
     95   int32_t state32 = ((int32_t) (*filter_state) << 16);  // Q15
     96 
     97   for (i = 0; i < data_length; i++) {
     98     tmp32 = state32 + WEBRTC_SPL_MUL_16_16(filter_coefficient, *data_in);
     99     tmp16 = (int16_t) (tmp32 >> 16);  // Q(-1)
    100     *data_out++ = tmp16;
    101     state32 = (((int32_t) (*data_in)) << 14); // Q14
    102     state32 -= WEBRTC_SPL_MUL_16_16(filter_coefficient, tmp16);  // Q14
    103     state32 <<= 1;  // Q15.
    104     data_in += 2;
    105   }
    106 
    107   *filter_state = (int16_t) (state32 >> 16);  // Q(-1)
    108 }
    109 
    110 // Splits |data_in| into |hp_data_out| and |lp_data_out| corresponding to
    111 // an upper (high pass) part and a lower (low pass) part respectively.
    112 //
    113 // - data_in      [i]   : Input audio data to be split into two frequency bands.
    114 // - data_length  [i]   : Length of |data_in|.
    115 // - upper_state  [i/o] : State of the upper filter, given in Q(-1).
    116 // - lower_state  [i/o] : State of the lower filter, given in Q(-1).
    117 // - hp_data_out  [o]   : Output audio data of the upper half of the spectrum.
    118 //                        The length is |data_length| / 2.
    119 // - lp_data_out  [o]   : Output audio data of the lower half of the spectrum.
    120 //                        The length is |data_length| / 2.
    121 static void SplitFilter(const int16_t* data_in, int data_length,
    122                         int16_t* upper_state, int16_t* lower_state,
    123                         int16_t* hp_data_out, int16_t* lp_data_out) {
    124   int i;
    125   int half_length = data_length >> 1;  // Downsampling by 2.
    126   int16_t tmp_out;
    127 
    128   // All-pass filtering upper branch.
    129   AllPassFilter(&data_in[0], half_length, kAllPassCoefsQ15[0], upper_state,
    130                 hp_data_out);
    131 
    132   // All-pass filtering lower branch.
    133   AllPassFilter(&data_in[1], half_length, kAllPassCoefsQ15[1], lower_state,
    134                 lp_data_out);
    135 
    136   // Make LP and HP signals.
    137   for (i = 0; i < half_length; i++) {
    138     tmp_out = *hp_data_out;
    139     *hp_data_out++ -= *lp_data_out;
    140     *lp_data_out++ += tmp_out;
    141   }
    142 }
    143 
    144 // Calculates the energy of |data_in| in dB, and also updates an overall
    145 // |total_energy| if necessary.
    146 //
    147 // - data_in      [i]   : Input audio data for energy calculation.
    148 // - data_length  [i]   : Length of input data.
    149 // - offset       [i]   : Offset value added to |log_energy|.
    150 // - total_energy [i/o] : An external energy updated with the energy of
    151 //                        |data_in|.
    152 //                        NOTE: |total_energy| is only updated if
    153 //                        |total_energy| <= |kMinEnergy|.
    154 // - log_energy   [o]   : 10 * log10("energy of |data_in|") given in Q4.
    155 static void LogOfEnergy(const int16_t* data_in, int data_length,
    156                         int16_t offset, int16_t* total_energy,
    157                         int16_t* log_energy) {
    158   // |tot_rshifts| accumulates the number of right shifts performed on |energy|.
    159   int tot_rshifts = 0;
    160   // The |energy| will be normalized to 15 bits. We use unsigned integer because
    161   // we eventually will mask out the fractional part.
    162   uint32_t energy = 0;
    163 
    164   assert(data_in != NULL);
    165   assert(data_length > 0);
    166 
    167   energy = (uint32_t) WebRtcSpl_Energy((int16_t*) data_in, data_length,
    168                                        &tot_rshifts);
    169 
    170   if (energy != 0) {
    171     // By construction, normalizing to 15 bits is equivalent with 17 leading
    172     // zeros of an unsigned 32 bit value.
    173     int normalizing_rshifts = 17 - WebRtcSpl_NormU32(energy);
    174     // In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is
    175     // (14 << 10), which is what we initialize |log2_energy| with. For a more
    176     // detailed derivations, see below.
    177     int16_t log2_energy = kLogEnergyIntPart;
    178 
    179     tot_rshifts += normalizing_rshifts;
    180     // Normalize |energy| to 15 bits.
    181     // |tot_rshifts| is now the total number of right shifts performed on
    182     // |energy| after normalization. This means that |energy| is in
    183     // Q(-tot_rshifts).
    184     if (normalizing_rshifts < 0) {
    185       energy <<= -normalizing_rshifts;
    186     } else {
    187       energy >>= normalizing_rshifts;
    188     }
    189 
    190     // Calculate the energy of |data_in| in dB, in Q4.
    191     //
    192     // 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") =
    193     // 160 * log10(|energy| * 2^|tot_rshifts|) =
    194     // 160 * log10(2) * log2(|energy| * 2^|tot_rshifts|) =
    195     // 160 * log10(2) * (log2(|energy|) + log2(2^|tot_rshifts|)) =
    196     // (160 * log10(2)) * (log2(|energy|) + |tot_rshifts|) =
    197     // |kLogConst| * (|log2_energy| + |tot_rshifts|)
    198     //
    199     // We know by construction that |energy| is normalized to 15 bits. Hence,
    200     // |energy| = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15.
    201     // Further, we'd like |log2_energy| in Q10
    202     // log2(|energy|) in Q10 = 2^10 * log2(2^14 + frac_Q15) =
    203     // 2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) =
    204     // 2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~=
    205     // (14 << 10) + 2^10 * (frac_Q15 * 2^-14) =
    206     // (14 << 10) + (frac_Q15 * 2^-4) = (14 << 10) + (frac_Q15 >> 4)
    207     //
    208     // Note that frac_Q15 = (|energy| & 0x00003FFF)
    209 
    210     // Calculate and add the fractional part to |log2_energy|.
    211     log2_energy += (int16_t) ((energy & 0x00003FFF) >> 4);
    212 
    213     // |kLogConst| is in Q9, |log2_energy| in Q10 and |tot_rshifts| in Q0.
    214     // Note that we in our derivation above have accounted for an output in Q4.
    215     *log_energy = (int16_t) (WEBRTC_SPL_MUL_16_16_RSFT(
    216         kLogConst, log2_energy, 19) +
    217         WEBRTC_SPL_MUL_16_16_RSFT(tot_rshifts, kLogConst, 9));
    218 
    219     if (*log_energy < 0) {
    220       *log_energy = 0;
    221     }
    222   } else {
    223     *log_energy = offset;
    224     return;
    225   }
    226 
    227   *log_energy += offset;
    228 
    229   // Update the approximate |total_energy| with the energy of |data_in|, if
    230   // |total_energy| has not exceeded |kMinEnergy|. |total_energy| is used as an
    231   // energy indicator in WebRtcVad_GmmProbability() in vad_core.c.
    232   if (*total_energy <= kMinEnergy) {
    233     if (tot_rshifts >= 0) {
    234       // We know by construction that the |energy| > |kMinEnergy| in Q0, so add
    235       // an arbitrary value such that |total_energy| exceeds |kMinEnergy|.
    236       *total_energy += kMinEnergy + 1;
    237     } else {
    238       // By construction |energy| is represented by 15 bits, hence any number of
    239       // right shifted |energy| will fit in an int16_t. In addition, adding the
    240       // value to |total_energy| is wrap around safe as long as
    241       // |kMinEnergy| < 8192.
    242       *total_energy += (int16_t) (energy >> -tot_rshifts);  // Q0.
    243     }
    244   }
    245 }
    246 
    247 int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in,
    248                                     int data_length, int16_t* features) {
    249   int16_t total_energy = 0;
    250   // We expect |data_length| to be 80, 160 or 240 samples, which corresponds to
    251   // 10, 20 or 30 ms in 8 kHz. Therefore, the intermediate downsampled data will
    252   // have at most 120 samples after the first split and at most 60 samples after
    253   // the second split.
    254   int16_t hp_120[120], lp_120[120];
    255   int16_t hp_60[60], lp_60[60];
    256   const int half_data_length = data_length >> 1;
    257   int length = half_data_length;  // |data_length| / 2, corresponds to
    258                                   // bandwidth = 2000 Hz after downsampling.
    259 
    260   // Initialize variables for the first SplitFilter().
    261   int frequency_band = 0;
    262   const int16_t* in_ptr = data_in;  // [0 - 4000] Hz.
    263   int16_t* hp_out_ptr = hp_120;  // [2000 - 4000] Hz.
    264   int16_t* lp_out_ptr = lp_120;  // [0 - 2000] Hz.
    265 
    266   assert(data_length >= 0);
    267   assert(data_length <= 240);
    268   assert(4 < kNumChannels - 1);  // Checking maximum |frequency_band|.
    269 
    270   // Split at 2000 Hz and downsample.
    271   SplitFilter(in_ptr, data_length, &self->upper_state[frequency_band],
    272               &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
    273 
    274   // For the upper band (2000 Hz - 4000 Hz) split at 3000 Hz and downsample.
    275   frequency_band = 1;
    276   in_ptr = hp_120;  // [2000 - 4000] Hz.
    277   hp_out_ptr = hp_60;  // [3000 - 4000] Hz.
    278   lp_out_ptr = lp_60;  // [2000 - 3000] Hz.
    279   SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
    280               &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
    281 
    282   // Energy in 3000 Hz - 4000 Hz.
    283   length >>= 1;  // |data_length| / 4 <=> bandwidth = 1000 Hz.
    284 
    285   LogOfEnergy(hp_60, length, kOffsetVector[5], &total_energy, &features[5]);
    286 
    287   // Energy in 2000 Hz - 3000 Hz.
    288   LogOfEnergy(lp_60, length, kOffsetVector[4], &total_energy, &features[4]);
    289 
    290   // For the lower band (0 Hz - 2000 Hz) split at 1000 Hz and downsample.
    291   frequency_band = 2;
    292   in_ptr = lp_120;  // [0 - 2000] Hz.
    293   hp_out_ptr = hp_60;  // [1000 - 2000] Hz.
    294   lp_out_ptr = lp_60;  // [0 - 1000] Hz.
    295   length = half_data_length;  // |data_length| / 2 <=> bandwidth = 2000 Hz.
    296   SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
    297               &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
    298 
    299   // Energy in 1000 Hz - 2000 Hz.
    300   length >>= 1;  // |data_length| / 4 <=> bandwidth = 1000 Hz.
    301   LogOfEnergy(hp_60, length, kOffsetVector[3], &total_energy, &features[3]);
    302 
    303   // For the lower band (0 Hz - 1000 Hz) split at 500 Hz and downsample.
    304   frequency_band = 3;
    305   in_ptr = lp_60;  // [0 - 1000] Hz.
    306   hp_out_ptr = hp_120;  // [500 - 1000] Hz.
    307   lp_out_ptr = lp_120;  // [0 - 500] Hz.
    308   SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
    309               &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
    310 
    311   // Energy in 500 Hz - 1000 Hz.
    312   length >>= 1;  // |data_length| / 8 <=> bandwidth = 500 Hz.
    313   LogOfEnergy(hp_120, length, kOffsetVector[2], &total_energy, &features[2]);
    314 
    315   // For the lower band (0 Hz - 500 Hz) split at 250 Hz and downsample.
    316   frequency_band = 4;
    317   in_ptr = lp_120;  // [0 - 500] Hz.
    318   hp_out_ptr = hp_60;  // [250 - 500] Hz.
    319   lp_out_ptr = lp_60;  // [0 - 250] Hz.
    320   SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
    321               &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
    322 
    323   // Energy in 250 Hz - 500 Hz.
    324   length >>= 1;  // |data_length| / 16 <=> bandwidth = 250 Hz.
    325   LogOfEnergy(hp_60, length, kOffsetVector[1], &total_energy, &features[1]);
    326 
    327   // Remove 0 Hz - 80 Hz, by high pass filtering the lower band.
    328   HighPassFilter(lp_60, length, self->hp_filter_state, hp_120);
    329 
    330   // Energy in 80 Hz - 250 Hz.
    331   LogOfEnergy(hp_120, length, kOffsetVector[0], &total_energy, &features[0]);
    332 
    333   return total_energy;
    334 }
    335