Home | History | Annotate | Download | only in include
      1 /*
      2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_
     12 #define WEBRTC_MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_
     13 
     14 // MSVC++ requires this to be set before any other includes to get M_PI.
     15 #define _USE_MATH_DEFINES
     16 
     17 #include <math.h>
     18 #include <stddef.h>  // size_t
     19 #include <stdio.h>  // FILE
     20 #include <vector>
     21 
     22 #include "webrtc/base/arraysize.h"
     23 #include "webrtc/base/platform_file.h"
     24 #include "webrtc/common.h"
     25 #include "webrtc/modules/audio_processing/beamformer/array_util.h"
     26 #include "webrtc/typedefs.h"
     27 
     28 struct AecCore;
     29 
     30 namespace webrtc {
     31 
     32 class AudioFrame;
     33 
     34 template<typename T>
     35 class Beamformer;
     36 
     37 class StreamConfig;
     38 class ProcessingConfig;
     39 
     40 class EchoCancellation;
     41 class EchoControlMobile;
     42 class GainControl;
     43 class HighPassFilter;
     44 class LevelEstimator;
     45 class NoiseSuppression;
     46 class VoiceDetection;
     47 
     48 // Use to enable the extended filter mode in the AEC, along with robustness
     49 // measures around the reported system delays. It comes with a significant
     50 // increase in AEC complexity, but is much more robust to unreliable reported
     51 // delays.
     52 //
     53 // Detailed changes to the algorithm:
     54 // - The filter length is changed from 48 to 128 ms. This comes with tuning of
     55 //   several parameters: i) filter adaptation stepsize and error threshold;
     56 //   ii) non-linear processing smoothing and overdrive.
     57 // - Option to ignore the reported delays on platforms which we deem
     58 //   sufficiently unreliable. See WEBRTC_UNTRUSTED_DELAY in echo_cancellation.c.
     59 // - Faster startup times by removing the excessive "startup phase" processing
     60 //   of reported delays.
     61 // - Much more conservative adjustments to the far-end read pointer. We smooth
     62 //   the delay difference more heavily, and back off from the difference more.
     63 //   Adjustments force a readaptation of the filter, so they should be avoided
     64 //   except when really necessary.
     65 struct ExtendedFilter {
     66   ExtendedFilter() : enabled(false) {}
     67   explicit ExtendedFilter(bool enabled) : enabled(enabled) {}
     68   static const ConfigOptionID identifier = ConfigOptionID::kExtendedFilter;
     69   bool enabled;
     70 };
     71 
     72 // Enables delay-agnostic echo cancellation. This feature relies on internally
     73 // estimated delays between the process and reverse streams, thus not relying
     74 // on reported system delays. This configuration only applies to
     75 // EchoCancellation and not EchoControlMobile. It can be set in the constructor
     76 // or using AudioProcessing::SetExtraOptions().
     77 struct DelayAgnostic {
     78   DelayAgnostic() : enabled(false) {}
     79   explicit DelayAgnostic(bool enabled) : enabled(enabled) {}
     80   static const ConfigOptionID identifier = ConfigOptionID::kDelayAgnostic;
     81   bool enabled;
     82 };
     83 
     84 // Use to enable experimental gain control (AGC). At startup the experimental
     85 // AGC moves the microphone volume up to |startup_min_volume| if the current
     86 // microphone volume is set too low. The value is clamped to its operating range
     87 // [12, 255]. Here, 255 maps to 100%.
     88 //
// Must be provided through AudioProcessing::Create(Config&).
     90 #if defined(WEBRTC_CHROMIUM_BUILD)
     91 static const int kAgcStartupMinVolume = 85;
     92 #else
     93 static const int kAgcStartupMinVolume = 0;
     94 #endif  // defined(WEBRTC_CHROMIUM_BUILD)
     95 struct ExperimentalAgc {
     96   ExperimentalAgc() : enabled(true), startup_min_volume(kAgcStartupMinVolume) {}
     97   explicit ExperimentalAgc(bool enabled)
     98       : enabled(enabled), startup_min_volume(kAgcStartupMinVolume) {}
     99   ExperimentalAgc(bool enabled, int startup_min_volume)
    100       : enabled(enabled), startup_min_volume(startup_min_volume) {}
    101   static const ConfigOptionID identifier = ConfigOptionID::kExperimentalAgc;
    102   bool enabled;
    103   int startup_min_volume;
    104 };
    105 
    106 // Use to enable experimental noise suppression. It can be set in the
    107 // constructor or using AudioProcessing::SetExtraOptions().
    108 struct ExperimentalNs {
    109   ExperimentalNs() : enabled(false) {}
    110   explicit ExperimentalNs(bool enabled) : enabled(enabled) {}
    111   static const ConfigOptionID identifier = ConfigOptionID::kExperimentalNs;
    112   bool enabled;
    113 };
    114 
    115 // Use to enable beamforming. Must be provided through the constructor. It will
    116 // have no impact if used with AudioProcessing::SetExtraOptions().
    117 struct Beamforming {
    118   Beamforming()
    119       : enabled(false),
    120         array_geometry(),
    121         target_direction(
    122             SphericalPointf(static_cast<float>(M_PI) / 2.f, 0.f, 1.f)) {}
    123   Beamforming(bool enabled, const std::vector<Point>& array_geometry)
    124       : Beamforming(enabled,
    125                     array_geometry,
    126                     SphericalPointf(static_cast<float>(M_PI) / 2.f, 0.f, 1.f)) {
    127   }
    128   Beamforming(bool enabled,
    129               const std::vector<Point>& array_geometry,
    130               SphericalPointf target_direction)
    131       : enabled(enabled),
    132         array_geometry(array_geometry),
    133         target_direction(target_direction) {}
    134   static const ConfigOptionID identifier = ConfigOptionID::kBeamforming;
    135   const bool enabled;
    136   const std::vector<Point> array_geometry;
    137   const SphericalPointf target_direction;
    138 };
    139 
    140 // Use to enable intelligibility enhancer in audio processing. Must be provided
// through the constructor. It will have no impact if used with
    142 // AudioProcessing::SetExtraOptions().
    143 //
    144 // Note: If enabled and the reverse stream has more than one output channel,
    145 // the reverse stream will become an upmixed mono signal.
    146 struct Intelligibility {
    147   Intelligibility() : enabled(false) {}
    148   explicit Intelligibility(bool enabled) : enabled(enabled) {}
    149   static const ConfigOptionID identifier = ConfigOptionID::kIntelligibility;
    150   bool enabled;
    151 };
    152 
    153 // The Audio Processing Module (APM) provides a collection of voice processing
    154 // components designed for real-time communications software.
    155 //
    156 // APM operates on two audio streams on a frame-by-frame basis. Frames of the
    157 // primary stream, on which all processing is applied, are passed to
    158 // |ProcessStream()|. Frames of the reverse direction stream, which are used for
    159 // analysis by some components, are passed to |AnalyzeReverseStream()|. On the
    160 // client-side, this will typically be the near-end (capture) and far-end
    161 // (render) streams, respectively. APM should be placed in the signal chain as
    162 // close to the audio hardware abstraction layer (HAL) as possible.
    163 //
    164 // On the server-side, the reverse stream will normally not be used, with
    165 // processing occurring on each incoming stream.
    166 //
    167 // Component interfaces follow a similar pattern and are accessed through
    168 // corresponding getters in APM. All components are disabled at create-time,
    169 // with default settings that are recommended for most situations. New settings
    170 // can be applied without enabling a component. Enabling a component triggers
    171 // memory allocation and initialization to allow it to start processing the
    172 // streams.
    173 //
    174 // Thread safety is provided with the following assumptions to reduce locking
    175 // overhead:
    176 //   1. The stream getters and setters are called from the same thread as
    177 //      ProcessStream(). More precisely, stream functions are never called
    178 //      concurrently with ProcessStream().
    179 //   2. Parameter getters are never called concurrently with the corresponding
    180 //      setter.
    181 //
    182 // APM accepts only linear PCM audio data in chunks of 10 ms. The int16
    183 // interfaces use interleaved data, while the float interfaces use deinterleaved
    184 // data.
    185 //
    186 // Usage example, omitting error checking:
    187 // AudioProcessing* apm = AudioProcessing::Create(0);
    188 //
    189 // apm->high_pass_filter()->Enable(true);
    190 //
    191 // apm->echo_cancellation()->enable_drift_compensation(false);
    192 // apm->echo_cancellation()->Enable(true);
    193 //
    194 // apm->noise_reduction()->set_level(kHighSuppression);
    195 // apm->noise_reduction()->Enable(true);
    196 //
    197 // apm->gain_control()->set_analog_level_limits(0, 255);
    198 // apm->gain_control()->set_mode(kAdaptiveAnalog);
    199 // apm->gain_control()->Enable(true);
    200 //
    201 // apm->voice_detection()->Enable(true);
    202 //
    203 // // Start a voice call...
    204 //
    205 // // ... Render frame arrives bound for the audio HAL ...
    206 // apm->AnalyzeReverseStream(render_frame);
    207 //
    208 // // ... Capture frame arrives from the audio HAL ...
    209 // // Call required set_stream_ functions.
    210 // apm->set_stream_delay_ms(delay_ms);
    211 // apm->gain_control()->set_stream_analog_level(analog_level);
    212 //
    213 // apm->ProcessStream(capture_frame);
    214 //
    215 // // Call required stream_ functions.
    216 // analog_level = apm->gain_control()->stream_analog_level();
    217 // has_voice = apm->stream_has_voice();
    218 //
// // Repeat render and capture processing for the duration of the call...
    220 // // Start a new call...
    221 // apm->Initialize();
    222 //
    223 // // Close the application...
    224 // delete apm;
    225 //
// Abstract interface for the Audio Processing Module. Obtain instances via
// the static Create() factories; all processing methods are pure virtual and
// implemented elsewhere.
class AudioProcessing {
 public:
  // TODO(mgraczyk): Remove once all methods that use ChannelLayout are gone.
  enum ChannelLayout {
    kMono,
    // Left, right.
    kStereo,
    // Mono, keyboard mic.
    kMonoAndKeyboard,
    // Left, right, keyboard mic.
    kStereoAndKeyboard
  };

  // Creates an APM instance. Use one instance for every primary audio stream
  // requiring processing. On the client-side, this would typically be one
  // instance for the near-end stream, and additional instances for each far-end
  // stream which requires processing. On the server-side, this would typically
  // be one instance for every incoming stream.
  static AudioProcessing* Create();
  // Allows passing in an optional configuration at create-time.
  static AudioProcessing* Create(const Config& config);
  // Only for testing.
  static AudioProcessing* Create(const Config& config,
                                 Beamformer<float>* beamformer);
  virtual ~AudioProcessing() {}

  // Initializes internal states, while retaining all user settings. This
  // should be called before beginning to process a new audio stream. However,
  // it is not necessary to call before processing the first stream after
  // creation.
  //
  // It is also not necessary to call if the audio parameters (sample
  // rate and number of channels) have changed. Passing updated parameters
  // directly to |ProcessStream()| and |AnalyzeReverseStream()| is permissible.
  // If the parameters are known at init-time though, they may be provided.
  virtual int Initialize() = 0;

  // The int16 interfaces require:
  //   - only |NativeRate|s be used
  //   - that the input, output and reverse rates must match
  //   - that |processing_config.output_stream()| matches
  //     |processing_config.input_stream()|.
  //
  // The float interfaces accept arbitrary rates and support differing input and
  // output layouts, but the output must have either one channel or the same
  // number of channels as the input.
  virtual int Initialize(const ProcessingConfig& processing_config) = 0;

  // Initialize with unpacked parameters. See Initialize() above for details.
  //
  // TODO(mgraczyk): Remove once clients are updated to use the new interface.
  virtual int Initialize(int input_sample_rate_hz,
                         int output_sample_rate_hz,
                         int reverse_sample_rate_hz,
                         ChannelLayout input_layout,
                         ChannelLayout output_layout,
                         ChannelLayout reverse_layout) = 0;

  // Pass down additional options which don't have explicit setters. This
  // ensures the options are applied immediately.
  virtual void SetExtraOptions(const Config& config) = 0;

  // TODO(peah): Remove after voice engine no longer requires it to resample
  // the reverse stream to the forward rate.
  virtual int input_sample_rate_hz() const = 0;

  // TODO(ajm): Only intended for internal use. Make private and friend the
  // necessary classes?
  virtual int proc_sample_rate_hz() const = 0;
  virtual int proc_split_sample_rate_hz() const = 0;
  virtual size_t num_input_channels() const = 0;
  virtual size_t num_proc_channels() const = 0;
  virtual size_t num_output_channels() const = 0;
  virtual size_t num_reverse_channels() const = 0;

  // Set to true when the output of AudioProcessing will be muted or in some
  // other way not used. Ideally, the captured audio would still be processed,
  // but some components may change behavior based on this information.
  // Default false.
  virtual void set_output_will_be_muted(bool muted) = 0;

  // Processes a 10 ms |frame| of the primary audio stream. On the client-side,
  // this is the near-end (or captured) audio.
  //
  // If needed for enabled functionality, any function with the set_stream_ tag
  // must be called prior to processing the current frame. Any getter function
  // with the stream_ tag which is needed should be called after processing.
  //
  // The |sample_rate_hz_|, |num_channels_|, and |samples_per_channel_|
  // members of |frame| must be valid. If changed from the previous call to this
  // method, it will trigger an initialization.
  virtual int ProcessStream(AudioFrame* frame) = 0;

  // Accepts deinterleaved float audio with the range [-1, 1]. Each element
  // of |src| points to a channel buffer, arranged according to
  // |input_layout|. At output, the channels will be arranged according to
  // |output_layout| at |output_sample_rate_hz| in |dest|.
  //
  // The output layout must have one channel or as many channels as the input.
  // |src| and |dest| may use the same memory, if desired.
  //
  // TODO(mgraczyk): Remove once clients are updated to use the new interface.
  virtual int ProcessStream(const float* const* src,
                            size_t samples_per_channel,
                            int input_sample_rate_hz,
                            ChannelLayout input_layout,
                            int output_sample_rate_hz,
                            ChannelLayout output_layout,
                            float* const* dest) = 0;

  // Accepts deinterleaved float audio with the range [-1, 1]. Each element of
  // |src| points to a channel buffer, arranged according to |input_stream|. At
  // output, the channels will be arranged according to |output_stream| in
  // |dest|.
  //
  // The output must have one channel or as many channels as the input. |src|
  // and |dest| may use the same memory, if desired.
  virtual int ProcessStream(const float* const* src,
                            const StreamConfig& input_config,
                            const StreamConfig& output_config,
                            float* const* dest) = 0;

  // Analyzes a 10 ms |frame| of the reverse direction audio stream. The frame
  // will not be modified. On the client-side, this is the far-end (or to be
  // rendered) audio.
  //
  // It is only necessary to provide this if echo processing is enabled, as the
  // reverse stream forms the echo reference signal. It is recommended, but not
  // necessary, to provide if gain control is enabled. On the server-side this
  // typically will not be used. If you're not sure what to pass in here,
  // chances are you don't need to use it.
  //
  // The |sample_rate_hz_|, |num_channels_|, and |samples_per_channel_|
  // members of |frame| must be valid. |sample_rate_hz_| must correspond to
  // |input_sample_rate_hz()|
  //
  // TODO(ajm): add const to input; requires an implementation fix.
  // DEPRECATED: Use |ProcessReverseStream| instead.
  // TODO(ekm): Remove once all users have updated to |ProcessReverseStream|.
  virtual int AnalyzeReverseStream(AudioFrame* frame) = 0;

  // Same as |AnalyzeReverseStream|, but may modify |frame| if intelligibility
  // is enabled.
  virtual int ProcessReverseStream(AudioFrame* frame) = 0;

  // Accepts deinterleaved float audio with the range [-1, 1]. Each element
  // of |data| points to a channel buffer, arranged according to |layout|.
  // TODO(mgraczyk): Remove once clients are updated to use the new interface.
  virtual int AnalyzeReverseStream(const float* const* data,
                                   size_t samples_per_channel,
                                   int rev_sample_rate_hz,
                                   ChannelLayout layout) = 0;

  // Accepts deinterleaved float audio with the range [-1, 1]. Each element of
  // |data| points to a channel buffer, arranged according to |reverse_config|.
  virtual int ProcessReverseStream(const float* const* src,
                                   const StreamConfig& reverse_input_config,
                                   const StreamConfig& reverse_output_config,
                                   float* const* dest) = 0;

  // This must be called if and only if echo processing is enabled.
  //
  // Sets the |delay| in ms between AnalyzeReverseStream() receiving a far-end
  // frame and ProcessStream() receiving a near-end frame containing the
  // corresponding echo. On the client-side this can be expressed as
  //   delay = (t_render - t_analyze) + (t_process - t_capture)
  // where,
  //   - t_analyze is the time a frame is passed to AnalyzeReverseStream() and
  //     t_render is the time the first sample of the same frame is rendered by
  //     the audio hardware.
  //   - t_capture is the time the first sample of a frame is captured by the
  //     audio hardware and t_pull is the time the same frame is passed to
  //     ProcessStream().
  virtual int set_stream_delay_ms(int delay) = 0;
  virtual int stream_delay_ms() const = 0;
  virtual bool was_stream_delay_set() const = 0;

  // Call to signal that a key press occurred (true) or did not occur (false)
  // with this chunk of audio.
  virtual void set_stream_key_pressed(bool key_pressed) = 0;

  // Sets a delay |offset| in ms to add to the values passed in through
  // set_stream_delay_ms(). May be positive or negative.
  //
  // Note that this could cause an otherwise valid value passed to
  // set_stream_delay_ms() to return an error.
  virtual void set_delay_offset_ms(int offset) = 0;
  virtual int delay_offset_ms() const = 0;

  // Starts recording debugging information to a file specified by |filename|,
  // a NULL-terminated string. If there is an ongoing recording, the old file
  // will be closed, and recording will continue in the newly specified file.
  // An already existing file will be overwritten without warning.
  static const size_t kMaxFilenameSize = 1024;
  virtual int StartDebugRecording(const char filename[kMaxFilenameSize]) = 0;

  // Same as above but uses an existing file handle. Takes ownership
  // of |handle| and closes it at StopDebugRecording().
  virtual int StartDebugRecording(FILE* handle) = 0;

  // Same as above but uses an existing PlatformFile handle. Takes ownership
  // of |handle| and closes it at StopDebugRecording().
  // The default implementation is a stub returning -1 (i.e. failure) so that
  // implementations which predate this method still build.
  // TODO(xians): Make this interface pure virtual.
  virtual int StartDebugRecordingForPlatformFile(rtc::PlatformFile /*handle*/) {
      return -1;
  }

  // Stops recording debugging information, and closes the file. Recording
  // cannot be resumed in the same file (without overwriting it).
  virtual int StopDebugRecording() = 0;

  // Use to send UMA histograms at end of a call. Note that all histogram
  // specific member variables are reset.
  virtual void UpdateHistogramsOnCallEnd() = 0;

  // These provide access to the component interfaces and should never return
  // NULL. The pointers will be valid for the lifetime of the APM instance.
  // The memory for these objects is entirely managed internally.
  virtual EchoCancellation* echo_cancellation() const = 0;
  virtual EchoControlMobile* echo_control_mobile() const = 0;
  virtual GainControl* gain_control() const = 0;
  virtual HighPassFilter* high_pass_filter() const = 0;
  virtual LevelEstimator* level_estimator() const = 0;
  virtual NoiseSuppression* noise_suppression() const = 0;
  virtual VoiceDetection* voice_detection() const = 0;

  // Aggregated value of a metric reported by the components (see e.g.
  // EchoCancellation::Metrics).
  struct Statistic {
    int instant;  // Instantaneous value.
    int average;  // Long-term average.
    int maximum;  // Long-term maximum.
    int minimum;  // Long-term minimum.
  };

  // Return codes shared by the APM and component interfaces. Zero is success;
  // negative values are errors or warnings.
  enum Error {
    // Fatal errors.
    kNoError = 0,
    kUnspecifiedError = -1,
    kCreationFailedError = -2,
    kUnsupportedComponentError = -3,
    kUnsupportedFunctionError = -4,
    kNullPointerError = -5,
    kBadParameterError = -6,
    kBadSampleRateError = -7,
    kBadDataLengthError = -8,
    kBadNumberChannelsError = -9,
    kFileError = -10,
    kStreamParameterNotSetError = -11,
    kNotEnabledError = -12,

    // Warnings are non-fatal.
    // This results when a set_stream_ parameter is out of range. Processing
    // will continue, but the parameter may have been truncated.
    kBadStreamParameterWarning = -13
  };

  // Sample rates (in Hz) accepted by the int16 interfaces.
  enum NativeRate {
    kSampleRate8kHz = 8000,
    kSampleRate16kHz = 16000,
    kSampleRate32kHz = 32000,
    kSampleRate48kHz = 48000
  };

  // Constants describing the native rates; declared here and defined in the
  // implementation file.
  static const int kNativeSampleRatesHz[];
  static const size_t kNumNativeSampleRates;
  static const int kMaxNativeSampleRateHz;
  static const int kMaxAECMSampleRateHz;

  // APM accepts audio in chunks of this many milliseconds (see the class
  // comment above).
  static const int kChunkSizeMs = 10;
};
    495 
    496 class StreamConfig {
    497  public:
    498   // sample_rate_hz: The sampling rate of the stream.
    499   //
    500   // num_channels: The number of audio channels in the stream, excluding the
    501   //               keyboard channel if it is present. When passing a
    502   //               StreamConfig with an array of arrays T*[N],
    503   //
    504   //                N == {num_channels + 1  if  has_keyboard
    505   //                     {num_channels      if  !has_keyboard
    506   //
    507   // has_keyboard: True if the stream has a keyboard channel. When has_keyboard
    508   //               is true, the last channel in any corresponding list of
    509   //               channels is the keyboard channel.
    510   StreamConfig(int sample_rate_hz = 0,
    511                size_t num_channels = 0,
    512                bool has_keyboard = false)
    513       : sample_rate_hz_(sample_rate_hz),
    514         num_channels_(num_channels),
    515         has_keyboard_(has_keyboard),
    516         num_frames_(calculate_frames(sample_rate_hz)) {}
    517 
    518   void set_sample_rate_hz(int value) {
    519     sample_rate_hz_ = value;
    520     num_frames_ = calculate_frames(value);
    521   }
    522   void set_num_channels(size_t value) { num_channels_ = value; }
    523   void set_has_keyboard(bool value) { has_keyboard_ = value; }
    524 
    525   int sample_rate_hz() const { return sample_rate_hz_; }
    526 
    527   // The number of channels in the stream, not including the keyboard channel if
    528   // present.
    529   size_t num_channels() const { return num_channels_; }
    530 
    531   bool has_keyboard() const { return has_keyboard_; }
    532   size_t num_frames() const { return num_frames_; }
    533   size_t num_samples() const { return num_channels_ * num_frames_; }
    534 
    535   bool operator==(const StreamConfig& other) const {
    536     return sample_rate_hz_ == other.sample_rate_hz_ &&
    537            num_channels_ == other.num_channels_ &&
    538            has_keyboard_ == other.has_keyboard_;
    539   }
    540 
    541   bool operator!=(const StreamConfig& other) const { return !(*this == other); }
    542 
    543  private:
    544   static size_t calculate_frames(int sample_rate_hz) {
    545     return static_cast<size_t>(
    546         AudioProcessing::kChunkSizeMs * sample_rate_hz / 1000);
    547   }
    548 
    549   int sample_rate_hz_;
    550   size_t num_channels_;
    551   bool has_keyboard_;
    552   size_t num_frames_;
    553 };
    554 
    555 class ProcessingConfig {
    556  public:
    557   enum StreamName {
    558     kInputStream,
    559     kOutputStream,
    560     kReverseInputStream,
    561     kReverseOutputStream,
    562     kNumStreamNames,
    563   };
    564 
    565   const StreamConfig& input_stream() const {
    566     return streams[StreamName::kInputStream];
    567   }
    568   const StreamConfig& output_stream() const {
    569     return streams[StreamName::kOutputStream];
    570   }
    571   const StreamConfig& reverse_input_stream() const {
    572     return streams[StreamName::kReverseInputStream];
    573   }
    574   const StreamConfig& reverse_output_stream() const {
    575     return streams[StreamName::kReverseOutputStream];
    576   }
    577 
    578   StreamConfig& input_stream() { return streams[StreamName::kInputStream]; }
    579   StreamConfig& output_stream() { return streams[StreamName::kOutputStream]; }
    580   StreamConfig& reverse_input_stream() {
    581     return streams[StreamName::kReverseInputStream];
    582   }
    583   StreamConfig& reverse_output_stream() {
    584     return streams[StreamName::kReverseOutputStream];
    585   }
    586 
    587   bool operator==(const ProcessingConfig& other) const {
    588     for (int i = 0; i < StreamName::kNumStreamNames; ++i) {
    589       if (this->streams[i] != other.streams[i]) {
    590         return false;
    591       }
    592     }
    593     return true;
    594   }
    595 
    596   bool operator!=(const ProcessingConfig& other) const {
    597     return !(*this == other);
    598   }
    599 
    600   StreamConfig streams[StreamName::kNumStreamNames];
    601 };
    602 
    603 // The acoustic echo cancellation (AEC) component provides better performance
    604 // than AECM but also requires more processing power and is dependent on delay
    605 // stability and reporting accuracy. As such it is well-suited and recommended
    606 // for PC and IP phone applications.
    607 //
    608 // Not recommended to be enabled on the server-side.
    609 class EchoCancellation {
    610  public:
    611   // EchoCancellation and EchoControlMobile may not be enabled simultaneously.
    612   // Enabling one will disable the other.
    613   virtual int Enable(bool enable) = 0;
    614   virtual bool is_enabled() const = 0;
    615 
    616   // Differences in clock speed on the primary and reverse streams can impact
    617   // the AEC performance. On the client-side, this could be seen when different
    618   // render and capture devices are used, particularly with webcams.
    619   //
    620   // This enables a compensation mechanism, and requires that
    621   // set_stream_drift_samples() be called.
    622   virtual int enable_drift_compensation(bool enable) = 0;
    623   virtual bool is_drift_compensation_enabled() const = 0;
    624 
    625   // Sets the difference between the number of samples rendered and captured by
    626   // the audio devices since the last call to |ProcessStream()|. Must be called
    627   // if drift compensation is enabled, prior to |ProcessStream()|.
    628   virtual void set_stream_drift_samples(int drift) = 0;
    629   virtual int stream_drift_samples() const = 0;
    630 
    631   enum SuppressionLevel {
    632     kLowSuppression,
    633     kModerateSuppression,
    634     kHighSuppression
    635   };
    636 
    637   // Sets the aggressiveness of the suppressor. A higher level trades off
    638   // double-talk performance for increased echo suppression.
    639   virtual int set_suppression_level(SuppressionLevel level) = 0;
    640   virtual SuppressionLevel suppression_level() const = 0;
    641 
    642   // Returns false if the current frame almost certainly contains no echo
    643   // and true if it _might_ contain echo.
    644   virtual bool stream_has_echo() const = 0;
    645 
    646   // Enables the computation of various echo metrics. These are obtained
    647   // through |GetMetrics()|.
    648   virtual int enable_metrics(bool enable) = 0;
    649   virtual bool are_metrics_enabled() const = 0;
    650 
  // Echo metrics, retrieved via |GetMetrics()|. Each statistic is reported
  // in dB.
  // P_far:  Far-end (render) signal power.
  // P_echo: Near-end (capture) echo signal power.
  // P_out:  Signal power at the output of the AEC.
  // P_a:    Internal signal power at the point before the AEC's non-linear
  //         processor.
  struct Metrics {
    // Residual echo return loss: RERL = ERL + ERLE
    AudioProcessing::Statistic residual_echo_return_loss;

    // Echo return loss: ERL = 10log_10(P_far / P_echo)
    AudioProcessing::Statistic echo_return_loss;

    // Echo return loss enhancement: ERLE = 10log_10(P_echo / P_out)
    AudioProcessing::Statistic echo_return_loss_enhancement;

    // (Pre non-linear processing suppression) A_NLP = 10log_10(P_echo / P_a)
    AudioProcessing::Statistic a_nlp;
  };
    670 
    671   // TODO(ajm): discuss the metrics update period.
    672   virtual int GetMetrics(Metrics* metrics) = 0;
    673 
    674   // Enables computation and logging of delay values. Statistics are obtained
    675   // through |GetDelayMetrics()|.
    676   virtual int enable_delay_logging(bool enable) = 0;
    677   virtual bool is_delay_logging_enabled() const = 0;
    678 
    679   // The delay metrics consists of the delay |median| and the delay standard
    680   // deviation |std|. It also consists of the fraction of delay estimates
    681   // |fraction_poor_delays| that can make the echo cancellation perform poorly.
    682   // The values are aggregated until the first call to |GetDelayMetrics()| and
    683   // afterwards aggregated and updated every second.
    684   // Note that if there are several clients pulling metrics from
    685   // |GetDelayMetrics()| during a session the first call from any of them will
    686   // change to one second aggregation window for all.
    687   // TODO(bjornv): Deprecated, remove.
    688   virtual int GetDelayMetrics(int* median, int* std) = 0;
    689   virtual int GetDelayMetrics(int* median, int* std,
    690                               float* fraction_poor_delays) = 0;
    691 
    692   // Returns a pointer to the low level AEC component.  In case of multiple
    693   // channels, the pointer to the first one is returned.  A NULL pointer is
    694   // returned when the AEC component is disabled or has not been initialized
    695   // successfully.
    696   virtual struct AecCore* aec_core() const = 0;
    697 
    698  protected:
    699   virtual ~EchoCancellation() {}
    700 };
    701 
// The acoustic echo control for mobile (AECM) component is a low complexity
// robust option intended for use on mobile devices.
//
// Not recommended to be enabled on the server-side.
class EchoControlMobile {
 public:
  // Enables/disables the AECM component.
  // EchoCancellation and EchoControlMobile may not be enabled simultaneously.
  // Enabling one will disable the other.
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

  // Recommended settings for particular audio routes. In general, the louder
  // the echo is expected to be, the higher this value should be set. The
  // preferred setting may vary from device to device.
  enum RoutingMode {
    kQuietEarpieceOrHeadset,
    kEarpiece,
    kLoudEarpiece,
    kSpeakerphone,
    kLoudSpeakerphone
  };

  // Sets echo control appropriate for the audio routing |mode| on the device.
  // It can and should be updated during a call if the audio routing changes.
  virtual int set_routing_mode(RoutingMode mode) = 0;
  virtual RoutingMode routing_mode() const = 0;

  // Comfort noise replaces suppressed background noise to maintain a
  // consistent signal level.
  virtual int enable_comfort_noise(bool enable) = 0;
  virtual bool is_comfort_noise_enabled() const = 0;

  // A typical use case is to initialize the component with an echo path from a
  // previous call. The echo path is retrieved using |GetEchoPath()|, typically
  // at the end of a call. The data can then be stored for later use as an
  // initializer before the next call, using |SetEchoPath()|.
  //
  // Controlling the echo path this way requires the data |size_bytes| to match
  // the internal echo path size. This size can be acquired using
  // |echo_path_size_bytes()|. |SetEchoPath()| causes an entire reset, worth
  // noting if it is to be called during an ongoing call.
  //
  // It is possible that version incompatibilities may result in a stored echo
  // path of the incorrect size. In this case, the stored path should be
  // discarded.
  virtual int SetEchoPath(const void* echo_path, size_t size_bytes) = 0;
  virtual int GetEchoPath(void* echo_path, size_t size_bytes) const = 0;

  // The returned path size is guaranteed not to change for the lifetime of
  // the application.
  static size_t echo_path_size_bytes();

 protected:
  // Protected destructor: instances may not be deleted through a pointer to
  // this interface.
  virtual ~EchoControlMobile() {}
};
    757 
// The automatic gain control (AGC) component brings the signal to an
// appropriate range. This is done by applying a digital gain directly and, in
// the analog mode, prescribing an analog gain to be applied at the audio HAL.
//
// Recommended to be enabled on the client-side.
class GainControl {
 public:
  // Enables/disables the AGC component.
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

  // When an analog mode is set, this must be called prior to |ProcessStream()|
  // to pass the current analog level from the audio HAL. Must be within the
  // range provided to |set_analog_level_limits()|.
  virtual int set_stream_analog_level(int level) = 0;

  // When an analog mode is set, this should be called after |ProcessStream()|
  // to obtain the recommended new analog level for the audio HAL. It is the
  // user's responsibility to apply this level.
  virtual int stream_analog_level() = 0;

  // Operating modes; see the comment on each enumerator.
  enum Mode {
    // Adaptive mode intended for use if an analog volume control is available
    // on the capture device. It will require the user to provide coupling
    // between the OS mixer controls and AGC through the |stream_analog_level()|
    // functions.
    //
    // It consists of an analog gain prescription for the audio device and a
    // digital compression stage.
    kAdaptiveAnalog,

    // Adaptive mode intended for situations in which an analog volume control
    // is unavailable. It operates in a similar fashion to the adaptive analog
    // mode, but with scaling instead applied in the digital domain. As with
    // the analog mode, it additionally uses a digital compression stage.
    kAdaptiveDigital,

    // Fixed mode which enables only the digital compression stage also used by
    // the two adaptive modes.
    //
    // It is distinguished from the adaptive modes by considering only a
    // short time-window of the input signal. It applies a fixed gain through
    // most of the input level range, and compresses (gradually reduces gain
    // with increasing level) the input signal at higher levels. This mode is
    // preferred on embedded devices where the capture signal level is
    // predictable, so that a known gain can be applied.
    kFixedDigital
  };

  virtual int set_mode(Mode mode) = 0;
  virtual Mode mode() const = 0;

  // Sets the target peak |level| (or envelope) of the AGC in dBFs (decibels
  // from digital full-scale). The convention is to use positive values. For
  // instance, passing in a value of 3 corresponds to -3 dBFs, or a target
  // level 3 dB below full-scale. Limited to [0, 31].
  //
  // TODO(ajm): use a negative value here instead, if/when VoE will similarly
  //            update its interface.
  virtual int set_target_level_dbfs(int level) = 0;
  virtual int target_level_dbfs() const = 0;

  // Sets the maximum |gain| the digital compression stage may apply, in dB. A
  // higher number corresponds to greater compression, while a value of 0 will
  // leave the signal uncompressed. Limited to [0, 90].
  virtual int set_compression_gain_db(int gain) = 0;
  virtual int compression_gain_db() const = 0;

  // When enabled, the compression stage will hard limit the signal to the
  // target level. Otherwise, the signal will be compressed but not limited
  // above the target level.
  virtual int enable_limiter(bool enable) = 0;
  virtual bool is_limiter_enabled() const = 0;

  // Sets the |minimum| and |maximum| analog levels of the audio capture device.
  // Must be set if and only if an analog mode is used. Limited to [0, 65535].
  virtual int set_analog_level_limits(int minimum,
                                      int maximum) = 0;
  virtual int analog_level_minimum() const = 0;
  virtual int analog_level_maximum() const = 0;

  // Returns true if the AGC has detected a saturation event (period where the
  // signal reaches digital full-scale) in the current frame and the analog
  // level cannot be reduced.
  //
  // This could be used as an indicator to reduce or disable analog mic gain at
  // the audio HAL.
  virtual bool stream_is_saturated() const = 0;

 protected:
  // Protected destructor: instances may not be deleted through a pointer to
  // this interface.
  virtual ~GainControl() {}
};
    849 
// A filtering component which removes DC offset and low-frequency noise.
// Recommended to be enabled on the client-side.
class HighPassFilter {
 public:
  // Enables/disables the high-pass filter.
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

 protected:
  // Protected destructor: instances may not be deleted through a pointer to
  // this interface.
  virtual ~HighPassFilter() {}
};
    860 
// An estimation component used to retrieve level metrics.
class LevelEstimator {
 public:
  // Enables/disables the level estimator.
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

  // Returns the root mean square (RMS) level in dBFs (decibels from digital
  // full-scale), or alternately dBov. It is computed over all primary stream
  // frames since the last call to RMS(). The returned value is positive but
  // should be interpreted as negative. It is constrained to [0, 127].
  //
  // The computation follows: https://tools.ietf.org/html/rfc6465
  // with the intent that it can provide the RTP audio level indication.
  //
  // Frames passed to ProcessStream() with an |_energy| of zero are considered
  // to have been muted. The RMS of the frame will be interpreted as -127.
  virtual int RMS() = 0;

 protected:
  // Protected destructor: instances may not be deleted through a pointer to
  // this interface.
  virtual ~LevelEstimator() {}
};
    882 
// The noise suppression (NS) component attempts to remove noise while
// retaining speech.
//
// Recommended to be enabled on the client-side.
class NoiseSuppression {
 public:
  // Enables/disables the NS component.
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

  // Determines the aggressiveness of the suppression. Increasing the level
  // will reduce the noise level at the expense of a higher speech distortion.
  enum Level {
    kLow,
    kModerate,
    kHigh,
    kVeryHigh
  };

  virtual int set_level(Level level) = 0;
  virtual Level level() const = 0;

  // Returns the internally computed prior speech probability of current frame
  // averaged over output channels. This is not supported in fixed point, for
  // which |kUnsupportedFunctionError| is returned.
  virtual float speech_probability() const = 0;

 protected:
  // Protected destructor: instances may not be deleted through a pointer to
  // this interface.
  virtual ~NoiseSuppression() {}
};
    912 
// The voice activity detection (VAD) component analyzes the stream to
// determine if voice is present. A facility is also provided to pass in an
// external VAD decision.
//
// In addition to |stream_has_voice()| the VAD decision is provided through the
// |AudioFrame| passed to |ProcessStream()|. The |vad_activity_| member will be
// modified to reflect the current decision.
class VoiceDetection {
 public:
  // Enables/disables the VAD component.
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

  // Returns true if voice is detected in the current frame. Should be called
  // after |ProcessStream()|.
  virtual bool stream_has_voice() const = 0;

  // Some of the APM functionality requires a VAD decision. In the case that
  // a decision is externally available for the current frame, it can be passed
  // in here, before |ProcessStream()| is called.
  //
  // VoiceDetection does _not_ need to be enabled to use this. If it happens to
  // be enabled, detection will be skipped for any frame in which an external
  // VAD decision is provided.
  virtual int set_stream_has_voice(bool has_voice) = 0;

  // Specifies the likelihood that a frame will be declared to contain voice.
  // A higher value makes it more likely that speech will not be clipped, at
  // the expense of more noise being detected as voice.
  enum Likelihood {
    kVeryLowLikelihood,
    kLowLikelihood,
    kModerateLikelihood,
    kHighLikelihood
  };

  virtual int set_likelihood(Likelihood likelihood) = 0;
  virtual Likelihood likelihood() const = 0;

  // Sets the |size| of the frames in ms on which the VAD will operate. Larger
  // frames will improve detection accuracy, but reduce the frequency of
  // updates.
  //
  // This does not impact the size of frames passed to |ProcessStream()|.
  virtual int set_frame_size_ms(int size) = 0;
  virtual int frame_size_ms() const = 0;

 protected:
  // Protected destructor: instances may not be deleted through a pointer to
  // this interface.
  virtual ~VoiceDetection() {}
};
    962 }  // namespace webrtc
    963 
    964 #endif  // WEBRTC_MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_
    965