1 /* 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "webrtc/modules/audio_processing/vad/vad_audio_proc.h" 12 13 #include <math.h> 14 #include <stdio.h> 15 16 #include "webrtc/common_audio/fft4g.h" 17 #include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h" 18 #include "webrtc/modules/audio_processing/vad/pitch_internal.h" 19 #include "webrtc/modules/audio_processing/vad/pole_zero_filter.h" 20 extern "C" { 21 #include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h" 22 #include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h" 23 #include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h" 24 #include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h" 25 } 26 #include "webrtc/modules/include/module_common_types.h" 27 28 namespace webrtc { 29 30 // The following structures are declared anonymous in iSAC's structs.h. To 31 // forward declare them, we use this derived class trick. 32 struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {}; 33 struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {}; 34 35 static const float kFrequencyResolution = 36 kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize); 37 static const int kSilenceRms = 5; 38 39 // TODO(turajs): Make a Create or Init for VadAudioProc. 40 VadAudioProc::VadAudioProc() 41 : audio_buffer_(), 42 num_buffer_samples_(kNumPastSignalSamples), 43 log_old_gain_(-2), 44 old_lag_(50), // Arbitrary but valid as pitch-lag (in samples). 45 pitch_analysis_handle_(new PitchAnalysisStruct), 46 pre_filter_handle_(new PreFiltBankstr), 47 high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator, 48 kFilterOrder, 49 kCoeffDenominator, 50 kFilterOrder)) { 51 static_assert(kNumPastSignalSamples + kNumSubframeSamples == 52 sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]), 53 "lpc analysis window incorrect size"); 54 static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]), 55 "correlation weight incorrect size"); 56 57 // TODO(turajs): Are we doing too much in the constructor? 58 float data[kDftSize]; 59 // Make FFT to initialize. 60 ip_[0] = 0; 61 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_); 62 // TODO(turajs): Need to initialize high-pass filter. 63 64 // Initialize iSAC components. 65 WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get()); 66 WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get()); 67 } 68 69 VadAudioProc::~VadAudioProc() { 70 } 71 72 void VadAudioProc::ResetBuffer() { 73 memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess], 74 sizeof(audio_buffer_[0]) * kNumPastSignalSamples); 75 num_buffer_samples_ = kNumPastSignalSamples; 76 } 77 78 int VadAudioProc::ExtractFeatures(const int16_t* frame, 79 size_t length, 80 AudioFeatures* features) { 81 features->num_frames = 0; 82 if (length != kNumSubframeSamples) { 83 return -1; 84 } 85 86 // High-pass filter to remove the DC component and very low frequency content. 87 // We have experienced that this high-pass filtering improves voice/non-voiced 88 // classification. 89 if (high_pass_filter_->Filter(frame, kNumSubframeSamples, 90 &audio_buffer_[num_buffer_samples_]) != 0) { 91 return -1; 92 } 93 94 num_buffer_samples_ += kNumSubframeSamples; 95 if (num_buffer_samples_ < kBufferLength) { 96 return 0; 97 } 98 assert(num_buffer_samples_ == kBufferLength); 99 features->num_frames = kNum10msSubframes; 100 features->silence = false; 101 102 Rms(features->rms, kMaxNumFrames); 103 for (size_t i = 0; i < kNum10msSubframes; ++i) { 104 if (features->rms[i] < kSilenceRms) { 105 // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence. 106 // Bail out here instead. 107 features->silence = true; 108 ResetBuffer(); 109 return 0; 110 } 111 } 112 113 PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz, 114 kMaxNumFrames); 115 FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames); 116 ResetBuffer(); 117 return 0; 118 } 119 120 // Computes |kLpcOrder + 1| correlation coefficients. 121 void VadAudioProc::SubframeCorrelation(double* corr, 122 size_t length_corr, 123 size_t subframe_index) { 124 assert(length_corr >= kLpcOrder + 1); 125 double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples]; 126 size_t buffer_index = subframe_index * kNumSubframeSamples; 127 128 for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++) 129 windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n]; 130 131 WebRtcIsac_AutoCorr(corr, windowed_audio, 132 kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder); 133 } 134 135 // Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input. 136 // The analysis window is 15 ms long and it is centered on the first half of 137 // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the 138 // first half of each 10 ms subframe. 139 void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) { 140 assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1)); 141 double corr[kLpcOrder + 1]; 142 double reflec_coeff[kLpcOrder]; 143 for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes; 144 i++, offset_lpc += kLpcOrder + 1) { 145 SubframeCorrelation(corr, kLpcOrder + 1, i); 146 corr[0] *= 1.0001; 147 // This makes Lev-Durb a bit more stable. 148 for (size_t k = 0; k < kLpcOrder + 1; k++) { 149 corr[k] *= kCorrWeight[k]; 150 } 151 WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder); 152 } 153 } 154 155 // Fit a second order curve to these 3 points and find the location of the 156 // extremum. The points are inverted before curve fitting. 157 static float QuadraticInterpolation(float prev_val, 158 float curr_val, 159 float next_val) { 160 // Doing the interpolation in |1 / A(z)|^2. 161 float fractional_index = 0; 162 next_val = 1.0f / next_val; 163 prev_val = 1.0f / prev_val; 164 curr_val = 1.0f / curr_val; 165 166 fractional_index = 167 -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val); 168 assert(fabs(fractional_index) < 1); 169 return fractional_index; 170 } 171 172 // 1 / A(z), where A(z) is defined by |lpc| is a model of the spectral envelope 173 // of the input signal. The local maximum of the spectral envelope corresponds 174 // with the local minimum of A(z). It saves complexity, as we save one 175 // inversion. Furthermore, we find the first local maximum of magnitude squared, 176 // to save on one square root. 177 void VadAudioProc::FindFirstSpectralPeaks(double* f_peak, 178 size_t length_f_peak) { 179 assert(length_f_peak >= kNum10msSubframes); 180 double lpc[kNum10msSubframes * (kLpcOrder + 1)]; 181 // For all sub-frames. 182 GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1)); 183 184 const size_t kNumDftCoefficients = kDftSize / 2 + 1; 185 float data[kDftSize]; 186 187 for (size_t i = 0; i < kNum10msSubframes; i++) { 188 // Convert to float with zero pad. 189 memset(data, 0, sizeof(data)); 190 for (size_t n = 0; n < kLpcOrder + 1; n++) { 191 data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]); 192 } 193 // Transform to frequency domain. 194 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_); 195 196 size_t index_peak = 0; 197 float prev_magn_sqr = data[0] * data[0]; 198 float curr_magn_sqr = data[2] * data[2] + data[3] * data[3]; 199 float next_magn_sqr; 200 bool found_peak = false; 201 for (size_t n = 2; n < kNumDftCoefficients - 1; n++) { 202 next_magn_sqr = 203 data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1]; 204 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) { 205 found_peak = true; 206 index_peak = n - 1; 207 break; 208 } 209 prev_magn_sqr = curr_magn_sqr; 210 curr_magn_sqr = next_magn_sqr; 211 } 212 float fractional_index = 0; 213 if (!found_peak) { 214 // Checking if |kNumDftCoefficients - 1| is the local minimum. 215 next_magn_sqr = data[1] * data[1]; 216 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) { 217 index_peak = kNumDftCoefficients - 1; 218 } 219 } else { 220 // A peak is found, do a simple quadratic interpolation to get a more 221 // accurate estimate of the peak location. 222 fractional_index = 223 QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr); 224 } 225 f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution; 226 } 227 } 228 229 // Using iSAC functions to estimate pitch gains & lags. 230 void VadAudioProc::PitchAnalysis(double* log_pitch_gains, 231 double* pitch_lags_hz, 232 size_t length) { 233 // TODO(turajs): This can be "imported" from iSAC & and the next two 234 // constants. 235 assert(length >= kNum10msSubframes); 236 const int kNumPitchSubframes = 4; 237 double gains[kNumPitchSubframes]; 238 double lags[kNumPitchSubframes]; 239 240 const int kNumSubbandFrameSamples = 240; 241 const int kNumLookaheadSamples = 24; 242 243 float lower[kNumSubbandFrameSamples]; 244 float upper[kNumSubbandFrameSamples]; 245 double lower_lookahead[kNumSubbandFrameSamples]; 246 double upper_lookahead[kNumSubbandFrameSamples]; 247 double lower_lookahead_pre_filter[kNumSubbandFrameSamples + 248 kNumLookaheadSamples]; 249 250 // Split signal to lower and upper bands 251 WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower, 252 upper, lower_lookahead, upper_lookahead, 253 pre_filter_handle_.get()); 254 WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter, 255 pitch_analysis_handle_.get(), lags, gains); 256 257 // Lags are computed on lower-band signal with sampling rate half of the 258 // input signal. 259 GetSubframesPitchParameters( 260 kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes, 261 &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz); 262 } 263 264 void VadAudioProc::Rms(double* rms, size_t length_rms) { 265 assert(length_rms >= kNum10msSubframes); 266 size_t offset = kNumPastSignalSamples; 267 for (size_t i = 0; i < kNum10msSubframes; i++) { 268 rms[i] = 0; 269 for (size_t n = 0; n < kNumSubframeSamples; n++, offset++) 270 rms[i] += audio_buffer_[offset] * audio_buffer_[offset]; 271 rms[i] = sqrt(rms[i] / kNumSubframeSamples); 272 } 273 } 274 275 } // namespace webrtc 276