1 /* 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 // 12 // Specifies core class for intelligbility enhancement. 13 // 14 15 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ 16 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ 17 18 #include <complex> 19 #include <vector> 20 21 #include "webrtc/base/scoped_ptr.h" 22 #include "webrtc/common_audio/lapped_transform.h" 23 #include "webrtc/common_audio/channel_buffer.h" 24 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h" 25 26 namespace webrtc { 27 28 // Speech intelligibility enhancement module. Reads render and capture 29 // audio streams and modifies the render stream with a set of gains per 30 // frequency bin to enhance speech against the noise background. 31 // Note: assumes speech and noise streams are already separated. 32 class IntelligibilityEnhancer { 33 public: 34 struct Config { 35 // |var_*| are parameters for the VarianceArray constructor for the 36 // clear speech stream. 37 // TODO(bercic): the |var_*|, |*_rate| and |gain_limit| parameters should 38 // probably go away once fine tuning is done. 39 Config() 40 : sample_rate_hz(16000), 41 num_capture_channels(1), 42 num_render_channels(1), 43 var_type(intelligibility::VarianceArray::kStepDecaying), 44 var_decay_rate(0.9f), 45 var_window_size(10), 46 analysis_rate(800), 47 gain_change_limit(0.1f), 48 rho(0.02f) {} 49 int sample_rate_hz; 50 size_t num_capture_channels; 51 size_t num_render_channels; 52 intelligibility::VarianceArray::StepType var_type; 53 float var_decay_rate; 54 size_t var_window_size; 55 int analysis_rate; 56 float gain_change_limit; 57 float rho; 58 }; 59 60 explicit IntelligibilityEnhancer(const Config& config); 61 IntelligibilityEnhancer(); // Initialize with default config. 62 63 // Reads and processes chunk of noise stream in time domain. 64 void AnalyzeCaptureAudio(float* const* audio, 65 int sample_rate_hz, 66 size_t num_channels); 67 68 // Reads chunk of speech in time domain and updates with modified signal. 69 void ProcessRenderAudio(float* const* audio, 70 int sample_rate_hz, 71 size_t num_channels); 72 bool active() const; 73 74 private: 75 enum AudioSource { 76 kRenderStream = 0, // Clear speech stream. 77 kCaptureStream, // Noise stream. 78 }; 79 80 // Provides access point to the frequency domain. 81 class TransformCallback : public LappedTransform::Callback { 82 public: 83 TransformCallback(IntelligibilityEnhancer* parent, AudioSource source); 84 85 // All in frequency domain, receives input |in_block|, applies 86 // intelligibility enhancement, and writes result to |out_block|. 87 void ProcessAudioBlock(const std::complex<float>* const* in_block, 88 size_t in_channels, 89 size_t frames, 90 size_t out_channels, 91 std::complex<float>* const* out_block) override; 92 93 private: 94 IntelligibilityEnhancer* parent_; 95 AudioSource source_; 96 }; 97 friend class TransformCallback; 98 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation); 99 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains); 100 101 // Sends streams to ProcessClearBlock or ProcessNoiseBlock based on source. 102 void DispatchAudio(AudioSource source, 103 const std::complex<float>* in_block, 104 std::complex<float>* out_block); 105 106 // Updates variance computation and analysis with |in_block_|, 107 // and writes modified speech to |out_block|. 108 void ProcessClearBlock(const std::complex<float>* in_block, 109 std::complex<float>* out_block); 110 111 // Computes and sets modified gains. 112 void AnalyzeClearBlock(float power_target); 113 114 // Bisection search for optimal |lambda|. 115 void SolveForLambda(float power_target, float power_bot, float power_top); 116 117 // Transforms freq gains to ERB gains. 118 void UpdateErbGains(); 119 120 // Updates variance calculation for noise input with |in_block|. 121 void ProcessNoiseBlock(const std::complex<float>* in_block, 122 std::complex<float>* out_block); 123 124 // Returns number of ERB filters. 125 static size_t GetBankSize(int sample_rate, size_t erb_resolution); 126 127 // Initializes ERB filterbank. 128 void CreateErbBank(); 129 130 // Analytically solves quadratic for optimal gains given |lambda|. 131 // Negative gains are set to 0. Stores the results in |sols|. 132 void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols); 133 134 // Computes variance across ERB filters from freq variance |var|. 135 // Stores in |result|. 136 void FilterVariance(const float* var, float* result); 137 138 // Returns dot product of vectors specified by size |length| arrays |a|,|b|. 139 static float DotProduct(const float* a, const float* b, size_t length); 140 141 const size_t freqs_; // Num frequencies in frequency domain. 142 const size_t window_size_; // Window size in samples; also the block size. 143 const size_t chunk_length_; // Chunk size in samples. 144 const size_t bank_size_; // Num ERB filters. 145 const int sample_rate_hz_; 146 const int erb_resolution_; 147 const size_t num_capture_channels_; 148 const size_t num_render_channels_; 149 const int analysis_rate_; // Num blocks before gains recalculated. 150 151 const bool active_; // Whether render gains are being updated. 152 // TODO(ekm): Add logic for updating |active_|. 153 154 intelligibility::VarianceArray clear_variance_; 155 intelligibility::VarianceArray noise_variance_; 156 rtc::scoped_ptr<float[]> filtered_clear_var_; 157 rtc::scoped_ptr<float[]> filtered_noise_var_; 158 std::vector<std::vector<float>> filter_bank_; 159 rtc::scoped_ptr<float[]> center_freqs_; 160 size_t start_freq_; 161 rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR. 162 // for each ERB band. 163 rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains. 164 intelligibility::GainApplier gain_applier_; 165 166 // Destination buffers used to reassemble blocked chunks before overwriting 167 // the original input array with modifications. 168 ChannelBuffer<float> temp_render_out_buffer_; 169 ChannelBuffer<float> temp_capture_out_buffer_; 170 171 rtc::scoped_ptr<float[]> kbd_window_; 172 TransformCallback render_callback_; 173 TransformCallback capture_callback_; 174 rtc::scoped_ptr<LappedTransform> render_mangler_; 175 rtc::scoped_ptr<LappedTransform> capture_mangler_; 176 int block_count_; 177 int analysis_step_; 178 }; 179 180 } // namespace webrtc 181 182 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ 183