1 /* 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "webrtc/modules/audio_coding/neteq/time_stretch.h" 12 13 #include <algorithm> // min, max 14 15 #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 16 #include "webrtc/modules/audio_coding/neteq/background_noise.h" 17 #include "webrtc/modules/audio_coding/neteq/dsp_helper.h" 18 #include "webrtc/system_wrappers/interface/scoped_ptr.h" 19 20 namespace webrtc { 21 22 TimeStretch::ReturnCodes TimeStretch::Process( 23 const int16_t* input, 24 size_t input_len, 25 AudioMultiVector* output, 26 int16_t* length_change_samples) { 27 28 // Pre-calculate common multiplication with |fs_mult_|. 29 int fs_mult_120 = fs_mult_ * 120; // Corresponds to 15 ms. 30 31 const int16_t* signal; 32 scoped_ptr<int16_t[]> signal_array; 33 size_t signal_len; 34 if (num_channels_ == 1) { 35 signal = input; 36 signal_len = input_len; 37 } else { 38 // We want |signal| to be only the first channel of |input|, which is 39 // interleaved. Thus, we take the first sample, skip forward |num_channels| 40 // samples, and continue like that. 41 signal_len = input_len / num_channels_; 42 signal_array.reset(new int16_t[signal_len]); 43 signal = signal_array.get(); 44 size_t j = master_channel_; 45 for (size_t i = 0; i < signal_len; ++i) { 46 signal_array[i] = input[j]; 47 j += num_channels_; 48 } 49 } 50 51 // Find maximum absolute value of input signal. 52 max_input_value_ = WebRtcSpl_MaxAbsValueW16(signal, 53 static_cast<int>(signal_len)); 54 55 // Downsample to 4 kHz sample rate and calculate auto-correlation. 56 DspHelper::DownsampleTo4kHz(signal, signal_len, kDownsampledLen, 57 sample_rate_hz_, true /* compensate delay*/, 58 downsampled_input_); 59 AutoCorrelation(); 60 61 // Find the strongest correlation peak. 62 static const int kNumPeaks = 1; 63 int peak_index; 64 int16_t peak_value; 65 DspHelper::PeakDetection(auto_correlation_, kCorrelationLen, kNumPeaks, 66 fs_mult_, &peak_index, &peak_value); 67 // Assert that |peak_index| stays within boundaries. 68 assert(peak_index >= 0); 69 assert(peak_index <= (2 * kCorrelationLen - 1) * fs_mult_); 70 71 // Compensate peak_index for displaced starting position. The displacement 72 // happens in AutoCorrelation(). Here, |kMinLag| is in the down-sampled 4 kHz 73 // domain, while the |peak_index| is in the original sample rate; hence, the 74 // multiplication by fs_mult_ * 2. 75 peak_index += kMinLag * fs_mult_ * 2; 76 // Assert that |peak_index| stays within boundaries. 77 assert(peak_index >= 20 * fs_mult_); 78 assert(peak_index <= 20 * fs_mult_ + (2 * kCorrelationLen - 1) * fs_mult_); 79 80 // Calculate scaling to ensure that |peak_index| samples can be square-summed 81 // without overflowing. 82 int scaling = 31 - WebRtcSpl_NormW32(max_input_value_ * max_input_value_) - 83 WebRtcSpl_NormW32(peak_index); 84 scaling = std::max(0, scaling); 85 86 // |vec1| starts at 15 ms minus one pitch period. 87 const int16_t* vec1 = &signal[fs_mult_120 - peak_index]; 88 // |vec2| start at 15 ms. 89 const int16_t* vec2 = &signal[fs_mult_120]; 90 // Calculate energies for |vec1| and |vec2|, assuming they both contain 91 // |peak_index| samples. 92 int32_t vec1_energy = 93 WebRtcSpl_DotProductWithScale(vec1, vec1, peak_index, scaling); 94 int32_t vec2_energy = 95 WebRtcSpl_DotProductWithScale(vec2, vec2, peak_index, scaling); 96 97 // Calculate cross-correlation between |vec1| and |vec2|. 98 int32_t cross_corr = 99 WebRtcSpl_DotProductWithScale(vec1, vec2, peak_index, scaling); 100 101 // Check if the signal seems to be active speech or not (simple VAD). 102 bool active_speech = SpeechDetection(vec1_energy, vec2_energy, peak_index, 103 scaling); 104 105 int16_t best_correlation; 106 if (!active_speech) { 107 SetParametersForPassiveSpeech(signal_len, &best_correlation, &peak_index); 108 } else { 109 // Calculate correlation: 110 // cross_corr / sqrt(vec1_energy * vec2_energy). 111 112 // Start with calculating scale values. 113 int energy1_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec1_energy)); 114 int energy2_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec2_energy)); 115 116 // Make sure total scaling is even (to simplify scale factor after sqrt). 117 if ((energy1_scale + energy2_scale) & 1) { 118 // The sum is odd. 119 energy1_scale += 1; 120 } 121 122 // Scale energies to int16_t. 123 int16_t vec1_energy_int16 = 124 static_cast<int16_t>(vec1_energy >> energy1_scale); 125 int16_t vec2_energy_int16 = 126 static_cast<int16_t>(vec2_energy >> energy2_scale); 127 128 // Calculate square-root of energy product. 129 int16_t sqrt_energy_prod = WebRtcSpl_SqrtFloor(vec1_energy_int16 * 130 vec2_energy_int16); 131 132 // Calculate cross_corr / sqrt(en1*en2) in Q14. 133 int temp_scale = 14 - (energy1_scale + energy2_scale) / 2; 134 cross_corr = WEBRTC_SPL_SHIFT_W32(cross_corr, temp_scale); 135 cross_corr = std::max(0, cross_corr); // Don't use if negative. 136 best_correlation = WebRtcSpl_DivW32W16(cross_corr, sqrt_energy_prod); 137 // Make sure |best_correlation| is no larger than 1 in Q14. 138 best_correlation = std::min(static_cast<int16_t>(16384), best_correlation); 139 } 140 141 142 // Check accelerate criteria and stretch the signal. 143 ReturnCodes return_value = CheckCriteriaAndStretch( 144 input, input_len, peak_index, best_correlation, active_speech, output); 145 switch (return_value) { 146 case kSuccess: 147 *length_change_samples = peak_index; 148 break; 149 case kSuccessLowEnergy: 150 *length_change_samples = peak_index; 151 break; 152 case kNoStretch: 153 case kError: 154 *length_change_samples = 0; 155 break; 156 } 157 return return_value; 158 } 159 160 void TimeStretch::AutoCorrelation() { 161 // Set scaling factor for cross correlation to protect against overflow. 162 int scaling = kLogCorrelationLen - WebRtcSpl_NormW32( 163 max_input_value_ * max_input_value_); 164 scaling = std::max(0, scaling); 165 166 // Calculate correlation from lag kMinLag to lag kMaxLag in 4 kHz domain. 167 int32_t auto_corr[kCorrelationLen]; 168 WebRtcSpl_CrossCorrelation(auto_corr, &downsampled_input_[kMaxLag], 169 &downsampled_input_[kMaxLag - kMinLag], 170 kCorrelationLen, kMaxLag - kMinLag, scaling, -1); 171 172 // Normalize correlation to 14 bits and write to |auto_correlation_|. 173 int32_t max_corr = WebRtcSpl_MaxAbsValueW32(auto_corr, kCorrelationLen); 174 scaling = std::max(0, 17 - WebRtcSpl_NormW32(max_corr)); 175 WebRtcSpl_VectorBitShiftW32ToW16(auto_correlation_, kCorrelationLen, 176 auto_corr, scaling); 177 } 178 179 bool TimeStretch::SpeechDetection(int32_t vec1_energy, int32_t vec2_energy, 180 int peak_index, int scaling) const { 181 // Check if the signal seems to be active speech or not (simple VAD). 182 // If (vec1_energy + vec2_energy) / (2 * peak_index) <= 183 // 8 * background_noise_energy, then we say that the signal contains no 184 // active speech. 185 // Rewrite the inequality as: 186 // (vec1_energy + vec2_energy) / 16 <= peak_index * background_noise_energy. 187 // The two sides of the inequality will be denoted |left_side| and 188 // |right_side|. 189 int32_t left_side = (vec1_energy + vec2_energy) / 16; 190 int32_t right_side; 191 if (background_noise_.initialized()) { 192 right_side = background_noise_.Energy(master_channel_); 193 } else { 194 // If noise parameters have not been estimated, use a fixed threshold. 195 right_side = 75000; 196 } 197 int right_scale = 16 - WebRtcSpl_NormW32(right_side); 198 right_scale = std::max(0, right_scale); 199 left_side = left_side >> right_scale; 200 right_side = peak_index * (right_side >> right_scale); 201 202 // Scale |left_side| properly before comparing with |right_side|. 203 // (|scaling| is the scale factor before energy calculation, thus the scale 204 // factor for the energy is 2 * scaling.) 205 if (WebRtcSpl_NormW32(left_side) < 2 * scaling) { 206 // Cannot scale only |left_side|, must scale |right_side| too. 207 int temp_scale = WebRtcSpl_NormW32(left_side); 208 left_side = left_side << temp_scale; 209 right_side = right_side >> (2 * scaling - temp_scale); 210 } else { 211 left_side = left_side << 2 * scaling; 212 } 213 return left_side > right_side; 214 } 215 216 } // namespace webrtc 217