Home | History | Annotate | Download | only in neteq
      1 /*
      2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "webrtc/modules/audio_coding/neteq/time_stretch.h"
     12 
     13 #include <algorithm>  // min, max
     14 
     15 #include "webrtc/base/safe_conversions.h"
     16 #include "webrtc/base/scoped_ptr.h"
     17 #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
     18 #include "webrtc/modules/audio_coding/neteq/background_noise.h"
     19 #include "webrtc/modules/audio_coding/neteq/dsp_helper.h"
     20 
     21 namespace webrtc {
     22 
     23 TimeStretch::ReturnCodes TimeStretch::Process(const int16_t* input,
     24                                               size_t input_len,
     25                                               bool fast_mode,
     26                                               AudioMultiVector* output,
     27                                               size_t* length_change_samples) {
     28   // Pre-calculate common multiplication with |fs_mult_|.
     29   size_t fs_mult_120 =
     30       static_cast<size_t>(fs_mult_ * 120);  // Corresponds to 15 ms.
     31 
     32   const int16_t* signal;
     33   rtc::scoped_ptr<int16_t[]> signal_array;
     34   size_t signal_len;
     35   if (num_channels_ == 1) {
     36     signal = input;
     37     signal_len = input_len;
     38   } else {
     39     // We want |signal| to be only the first channel of |input|, which is
     40     // interleaved. Thus, we take the first sample, skip forward |num_channels|
     41     // samples, and continue like that.
     42     signal_len = input_len / num_channels_;
     43     signal_array.reset(new int16_t[signal_len]);
     44     signal = signal_array.get();
     45     size_t j = master_channel_;
     46     for (size_t i = 0; i < signal_len; ++i) {
     47       signal_array[i] = input[j];
     48       j += num_channels_;
     49     }
     50   }
     51 
     52   // Find maximum absolute value of input signal.
     53   max_input_value_ = WebRtcSpl_MaxAbsValueW16(signal, signal_len);
     54 
     55   // Downsample to 4 kHz sample rate and calculate auto-correlation.
     56   DspHelper::DownsampleTo4kHz(signal, signal_len, kDownsampledLen,
     57                               sample_rate_hz_, true /* compensate delay*/,
     58                               downsampled_input_);
     59   AutoCorrelation();
     60 
     61   // Find the strongest correlation peak.
     62   static const size_t kNumPeaks = 1;
     63   size_t peak_index;
     64   int16_t peak_value;
     65   DspHelper::PeakDetection(auto_correlation_, kCorrelationLen, kNumPeaks,
     66                            fs_mult_, &peak_index, &peak_value);
     67   // Assert that |peak_index| stays within boundaries.
     68   assert(peak_index <= (2 * kCorrelationLen - 1) * fs_mult_);
     69 
     70   // Compensate peak_index for displaced starting position. The displacement
     71   // happens in AutoCorrelation(). Here, |kMinLag| is in the down-sampled 4 kHz
     72   // domain, while the |peak_index| is in the original sample rate; hence, the
     73   // multiplication by fs_mult_ * 2.
     74   peak_index += kMinLag * fs_mult_ * 2;
     75   // Assert that |peak_index| stays within boundaries.
     76   assert(peak_index >= static_cast<size_t>(20 * fs_mult_));
     77   assert(peak_index <= 20 * fs_mult_ + (2 * kCorrelationLen - 1) * fs_mult_);
     78 
     79   // Calculate scaling to ensure that |peak_index| samples can be square-summed
     80   // without overflowing.
     81   int scaling = 31 - WebRtcSpl_NormW32(max_input_value_ * max_input_value_) -
     82       WebRtcSpl_NormW32(static_cast<int32_t>(peak_index));
     83   scaling = std::max(0, scaling);
     84 
     85   // |vec1| starts at 15 ms minus one pitch period.
     86   const int16_t* vec1 = &signal[fs_mult_120 - peak_index];
     87   // |vec2| start at 15 ms.
     88   const int16_t* vec2 = &signal[fs_mult_120];
     89   // Calculate energies for |vec1| and |vec2|, assuming they both contain
     90   // |peak_index| samples.
     91   int32_t vec1_energy =
     92       WebRtcSpl_DotProductWithScale(vec1, vec1, peak_index, scaling);
     93   int32_t vec2_energy =
     94       WebRtcSpl_DotProductWithScale(vec2, vec2, peak_index, scaling);
     95 
     96   // Calculate cross-correlation between |vec1| and |vec2|.
     97   int32_t cross_corr =
     98       WebRtcSpl_DotProductWithScale(vec1, vec2, peak_index, scaling);
     99 
    100   // Check if the signal seems to be active speech or not (simple VAD).
    101   bool active_speech = SpeechDetection(vec1_energy, vec2_energy, peak_index,
    102                                        scaling);
    103 
    104   int16_t best_correlation;
    105   if (!active_speech) {
    106     SetParametersForPassiveSpeech(signal_len, &best_correlation, &peak_index);
    107   } else {
    108     // Calculate correlation:
    109     // cross_corr / sqrt(vec1_energy * vec2_energy).
    110 
    111     // Start with calculating scale values.
    112     int energy1_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec1_energy));
    113     int energy2_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec2_energy));
    114 
    115     // Make sure total scaling is even (to simplify scale factor after sqrt).
    116     if ((energy1_scale + energy2_scale) & 1) {
    117       // The sum is odd.
    118       energy1_scale += 1;
    119     }
    120 
    121     // Scale energies to int16_t.
    122     int16_t vec1_energy_int16 =
    123         static_cast<int16_t>(vec1_energy >> energy1_scale);
    124     int16_t vec2_energy_int16 =
    125         static_cast<int16_t>(vec2_energy >> energy2_scale);
    126 
    127     // Calculate square-root of energy product.
    128     int16_t sqrt_energy_prod = WebRtcSpl_SqrtFloor(vec1_energy_int16 *
    129                                                    vec2_energy_int16);
    130 
    131     // Calculate cross_corr / sqrt(en1*en2) in Q14.
    132     int temp_scale = 14 - (energy1_scale + energy2_scale) / 2;
    133     cross_corr = WEBRTC_SPL_SHIFT_W32(cross_corr, temp_scale);
    134     cross_corr = std::max(0, cross_corr);  // Don't use if negative.
    135     best_correlation = WebRtcSpl_DivW32W16(cross_corr, sqrt_energy_prod);
    136     // Make sure |best_correlation| is no larger than 1 in Q14.
    137     best_correlation = std::min(static_cast<int16_t>(16384), best_correlation);
    138   }
    139 
    140 
    141   // Check accelerate criteria and stretch the signal.
    142   ReturnCodes return_value =
    143       CheckCriteriaAndStretch(input, input_len, peak_index, best_correlation,
    144                               active_speech, fast_mode, output);
    145   switch (return_value) {
    146     case kSuccess:
    147       *length_change_samples = peak_index;
    148       break;
    149     case kSuccessLowEnergy:
    150       *length_change_samples = peak_index;
    151       break;
    152     case kNoStretch:
    153     case kError:
    154       *length_change_samples = 0;
    155       break;
    156   }
    157   return return_value;
    158 }
    159 
    160 void TimeStretch::AutoCorrelation() {
    161   // Set scaling factor for cross correlation to protect against overflow.
    162   int scaling = kLogCorrelationLen - WebRtcSpl_NormW32(
    163       max_input_value_ * max_input_value_);
    164   scaling = std::max(0, scaling);
    165 
    166   // Calculate correlation from lag kMinLag to lag kMaxLag in 4 kHz domain.
    167   int32_t auto_corr[kCorrelationLen];
    168   WebRtcSpl_CrossCorrelation(auto_corr, &downsampled_input_[kMaxLag],
    169                              &downsampled_input_[kMaxLag - kMinLag],
    170                              kCorrelationLen, kMaxLag - kMinLag, scaling, -1);
    171 
    172   // Normalize correlation to 14 bits and write to |auto_correlation_|.
    173   int32_t max_corr = WebRtcSpl_MaxAbsValueW32(auto_corr, kCorrelationLen);
    174   scaling = std::max(0, 17 - WebRtcSpl_NormW32(max_corr));
    175   WebRtcSpl_VectorBitShiftW32ToW16(auto_correlation_, kCorrelationLen,
    176                                    auto_corr, scaling);
    177 }
    178 
    179 bool TimeStretch::SpeechDetection(int32_t vec1_energy, int32_t vec2_energy,
    180                                   size_t peak_index, int scaling) const {
    181   // Check if the signal seems to be active speech or not (simple VAD).
    182   // If (vec1_energy + vec2_energy) / (2 * peak_index) <=
    183   // 8 * background_noise_energy, then we say that the signal contains no
    184   // active speech.
    185   // Rewrite the inequality as:
    186   // (vec1_energy + vec2_energy) / 16 <= peak_index * background_noise_energy.
    187   // The two sides of the inequality will be denoted |left_side| and
    188   // |right_side|.
    189   int32_t left_side = (vec1_energy + vec2_energy) / 16;
    190   int32_t right_side;
    191   if (background_noise_.initialized()) {
    192     right_side = background_noise_.Energy(master_channel_);
    193   } else {
    194     // If noise parameters have not been estimated, use a fixed threshold.
    195     right_side = 75000;
    196   }
    197   int right_scale = 16 - WebRtcSpl_NormW32(right_side);
    198   right_scale = std::max(0, right_scale);
    199   left_side = left_side >> right_scale;
    200   right_side =
    201       rtc::checked_cast<int32_t>(peak_index) * (right_side >> right_scale);
    202 
    203   // Scale |left_side| properly before comparing with |right_side|.
    204   // (|scaling| is the scale factor before energy calculation, thus the scale
    205   // factor for the energy is 2 * scaling.)
    206   if (WebRtcSpl_NormW32(left_side) < 2 * scaling) {
    207     // Cannot scale only |left_side|, must scale |right_side| too.
    208     int temp_scale = WebRtcSpl_NormW32(left_side);
    209     left_side = left_side << temp_scale;
    210     right_side = right_side >> (2 * scaling - temp_scale);
    211   } else {
    212     left_side = left_side << 2 * scaling;
    213   }
    214   return left_side > right_side;
    215 }
    216 
    217 }  // namespace webrtc
    218