Home | History | Annotate | Download | only in intelligibility
      1 /*
      2  *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 //
     12 //  Specifies core class for intelligibility enhancement.
     13 //
     14 
     15 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_
     16 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_
     17 
     18 #include <complex>
     19 #include <vector>
     20 
     21 #include "webrtc/base/scoped_ptr.h"
     22 #include "webrtc/common_audio/lapped_transform.h"
     23 #include "webrtc/common_audio/channel_buffer.h"
     24 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"
     25 
     26 namespace webrtc {
     27 
     28 // Speech intelligibility enhancement module. Reads render and capture
     29 // audio streams and modifies the render stream with a set of gains per
     30 // frequency bin to enhance speech against the noise background.
     31 // Note: assumes speech and noise streams are already separated.
        //
        // The render stream is treated as the clear speech signal and the capture
        // stream as the noise signal (see |AudioSource| below).
        // NOTE(review): the constructors are defined outside this header. C++
        // initializes data members in declaration order, so do not reorder the
        // private members below without checking those definitions.
     32 class IntelligibilityEnhancer {
     33  public:
          // Construction-time parameters. The default constructor fills in the
          // values given in its initializer list.
     34   struct Config {
     35     // |var_*| are parameters for the VarianceArray constructor for the
     36     // clear speech stream.
     37     // TODO(bercic): the |var_*|, |*_rate| and |gain_change_limit| parameters
            // should probably go away once fine tuning is done.
     38     Config()
     39         : sample_rate_hz(16000),
     40           num_capture_channels(1),
     41           num_render_channels(1),
     42           var_type(intelligibility::VarianceArray::kStepDecaying),
     43           var_decay_rate(0.9f),
     44           var_window_size(10),
     45           analysis_rate(800),
     46           gain_change_limit(0.1f),
     47           rho(0.02f) {}
            // Sample rate in Hz. Default: 16000.
     48     int sample_rate_hz;
            // Number of channels in the capture (noise) stream. Default: 1.
     49     size_t num_capture_channels;
            // Number of channels in the render (clear speech) stream. Default: 1.
     50     size_t num_render_channels;
            // VarianceArray step type. Default: kStepDecaying.
     51     intelligibility::VarianceArray::StepType var_type;
            // VarianceArray decay rate. Default: 0.9.
     52     float var_decay_rate;
            // VarianceArray window size. Default: 10.
     53     size_t var_window_size;
            // Default: 800. NOTE(review): presumably the number of blocks between
            // gain recalculations (cf. |analysis_rate_|) -- confirm in the .cc.
     54     int analysis_rate;
            // Default: 0.1. NOTE(review): presumably bounds how fast applied gains
            // may change (cf. |gain_applier_|) -- confirm in the .cc.
     55     float gain_change_limit;
            // Default: 0.02. NOTE(review): presumably the initial value for the
            // per-band |rho_| array -- confirm in the .cc.
     56     float rho;
     57   };
     58 
          // Constructs an enhancer from |config|.
     59   explicit IntelligibilityEnhancer(const Config& config);
     60   IntelligibilityEnhancer();  // Initialize with default config.
     61 
     62   // Reads and processes chunk of noise stream in time domain.
          // |audio| is an array of per-channel sample pointers.
          // NOTE(review): presumably |num_channels| entries of |chunk_length_|
          // samples each -- confirm against the implementation.
     63   void AnalyzeCaptureAudio(float* const* audio,
     64                            int sample_rate_hz,
     65                            size_t num_channels);
     66 
     67   // Reads chunk of speech in time domain and updates with modified signal.
          // |audio| is both input and output: the samples are overwritten in place.
          // NOTE(review): same layout assumption as AnalyzeCaptureAudio -- confirm.
     68   void ProcessRenderAudio(float* const* audio,
     69                           int sample_rate_hz,
     70                           size_t num_channels);
          // NOTE(review): presumably returns |active_| (whether render gains are
          // being updated) -- confirm in the .cc.
     71   bool active() const;
     72 
     73  private:
          // Identifies which of the two input streams a block belongs to.
     74   enum AudioSource {
     75     kRenderStream = 0,  // Clear speech stream.
     76     kCaptureStream,  // Noise stream.
     77   };
     78 
     79   // Provides access point to the frequency domain.
     80   class TransformCallback : public LappedTransform::Callback {
     81    public:
            // |parent| is the enhancer this callback reports to and |source| the
            // stream it handles. NOTE(review): presumably stored in |parent_| and
            // |source_| -- the definition is not in this header.
     82     TransformCallback(IntelligibilityEnhancer* parent, AudioSource source);
     83 
     84     // All in frequency domain, receives input |in_block|, applies
     85     // intelligibility enhancement, and writes result to |out_block|.
     86     void ProcessAudioBlock(const std::complex<float>* const* in_block,
     87                            size_t in_channels,
     88                            size_t frames,
     89                            size_t out_channels,
     90                            std::complex<float>* const* out_block) override;
     91 
     92    private:
            // Enhancer this callback belongs to. Raw pointer, so presumably not
            // owned -- confirm.
     93     IntelligibilityEnhancer* parent_;
            // Stream (render or capture) this callback handles.
     94     AudioSource source_;
     95   };
          // Grants TransformCallback access to this class's private members.
     96   friend class TransformCallback;
          // Grant the named unit tests access to private members.
          // NOTE(review): FRIEND_TEST_ALL_PREFIXES is not defined in this file; it
          // presumably arrives through one of the includes -- confirm that this
          // header is self-contained.
     97   FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation);
     98   FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains);
     99 
    100   // Sends streams to ProcessClearBlock or ProcessNoiseBlock based on source.
    101   void DispatchAudio(AudioSource source,
    102                      const std::complex<float>* in_block,
    103                      std::complex<float>* out_block);
    104 
    105   // Updates variance computation and analysis with |in_block|,
    106   // and writes modified speech to |out_block|.
    107   void ProcessClearBlock(const std::complex<float>* in_block,
    108                          std::complex<float>* out_block);
    109 
    110   // Computes and sets modified gains.
    111   void AnalyzeClearBlock(float power_target);
    112 
    113   // Bisection search for optimal |lambda|.
          // NOTE(review): |power_bot| and |power_top| presumably bracket
          // |power_target| at the ends of the search interval -- confirm.
    114   void SolveForLambda(float power_target, float power_bot, float power_top);
    115 
    116   // Transforms freq gains to ERB gains.
    117   void UpdateErbGains();
    118 
    119   // Updates variance calculation for noise input with |in_block|.
          // NOTE(review): what, if anything, is written to |out_block| is not
          // visible from this header -- confirm in the .cc.
    120   void ProcessNoiseBlock(const std::complex<float>* in_block,
    121                          std::complex<float>* out_block);
    122 
    123   // Returns number of ERB filters.
    124   static size_t GetBankSize(int sample_rate, size_t erb_resolution);
    125 
    126   // Initializes ERB filterbank.
    127   void CreateErbBank();
    128 
    129   // Analytically solves quadratic for optimal gains given |lambda|.
    130   // Negative gains are set to 0. Stores the results in |sols|.
    131   void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols);
    132 
    133   // Computes variance across ERB filters from freq variance |var|.
    134   // Stores in |result|.
    135   void FilterVariance(const float* var, float* result);
    136 
    137   // Returns dot product of vectors specified by size |length| arrays |a|,|b|.
    138   static float DotProduct(const float* a, const float* b, size_t length);
    139 
    140   const size_t freqs_;         // Num frequencies in frequency domain.
    141   const size_t window_size_;   // Window size in samples; also the block size.
    142   const size_t chunk_length_;  // Chunk size in samples.
    143   const size_t bank_size_;     // Num ERB filters.
    144   const int sample_rate_hz_;
    145   const int erb_resolution_;
    146   const size_t num_capture_channels_;
    147   const size_t num_render_channels_;
    148   const int analysis_rate_;    // Num blocks before gains recalculated.
    149 
    150   const bool active_;          // Whether render gains are being updated.
    151                                // TODO(ekm): Add logic for updating |active_|.
    152 
          // Variance estimators for the clear speech and noise streams.
    153   intelligibility::VarianceArray clear_variance_;
    154   intelligibility::VarianceArray noise_variance_;
          // NOTE(review): presumably the FilterVariance() outputs for the two
          // estimators above -- confirm.
    155   rtc::scoped_ptr<float[]> filtered_clear_var_;
    156   rtc::scoped_ptr<float[]> filtered_noise_var_;
          // ERB filterbank. NOTE(review): presumably filled by CreateErbBank().
    157   std::vector<std::vector<float>> filter_bank_;
    158   rtc::scoped_ptr<float[]> center_freqs_;
    159   size_t start_freq_;
    160   rtc::scoped_ptr<float[]> rho_;  // Production and interpretation SNR
    161                                   // for each ERB band.
    162   rtc::scoped_ptr<float[]> gains_eq_;  // Pre-filter modified gains.
    163   intelligibility::GainApplier gain_applier_;
    164 
    165   // Destination buffers used to reassemble blocked chunks before overwriting
    166   // the original input array with modifications.
    167   ChannelBuffer<float> temp_render_out_buffer_;
    168   ChannelBuffer<float> temp_capture_out_buffer_;
    169 
          // NOTE(review): presumably the analysis/synthesis window handed to the
          // LappedTransform instances below -- confirm.
    170   rtc::scoped_ptr<float[]> kbd_window_;
          // Frequency-domain callbacks, one per stream.
    171   TransformCallback render_callback_;
    172   TransformCallback capture_callback_;
          // Lapped transforms, one per stream. NOTE(review): presumably each
          // drives the matching callback above -- confirm.
    173   rtc::scoped_ptr<LappedTransform> render_mangler_;
    174   rtc::scoped_ptr<LappedTransform> capture_mangler_;
          // NOTE(review): presumably bookkeeping for when gains are recalculated
          // (cf. |analysis_rate_|) -- confirm in the .cc.
    175   int block_count_;
    176   int analysis_step_;
    177 };
    179 
    180 }  // namespace webrtc
    181 
    182 #endif  // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_
    183