// modules/audio_processing/main/interface/audio_processing.h
/*
 *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_INTERFACE_AUDIO_PROCESSING_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_INTERFACE_AUDIO_PROCESSING_H_

#include "typedefs.h"
#include "module.h"
namespace webrtc {

class AudioFrame;
class EchoCancellation;
class EchoControlMobile;
class GainControl;
class HighPassFilter;
class LevelEstimator;
class NoiseSuppression;
class VoiceDetection;

// The Audio Processing Module (APM) provides a collection of voice processing
// components designed for real-time communications software.
//
// APM operates on two audio streams on a frame-by-frame basis. Frames of the
// primary stream, on which all processing is applied, are passed to
// |ProcessStream()|. Frames of the reverse direction stream, which are used for
// analysis by some components, are passed to |AnalyzeReverseStream()|. On the
// client-side, this will typically be the near-end (capture) and far-end
// (render) streams, respectively. APM should be placed in the signal chain as
// close to the audio hardware abstraction layer (HAL) as possible.
//
// On the server-side, the reverse stream will normally not be used, with
// processing occurring on each incoming stream.
//
// Component interfaces follow a similar pattern and are accessed through
// corresponding getters in APM. All components are disabled at create-time,
// with default settings that are recommended for most situations. New settings
// can be applied without enabling a component. Enabling a component triggers
// memory allocation and initialization to allow it to start processing the
// streams.
//
// Thread safety is provided with the following assumptions to reduce locking
// overhead:
//   1. The stream getters and setters are called from the same thread as
//      ProcessStream(). More precisely, stream functions are never called
//      concurrently with ProcessStream().
//   2. Parameter getters are never called concurrently with the corresponding
//      setter.
//
// APM accepts only 16-bit linear PCM audio data in frames of 10 ms. Multiple
// channels should be interleaved.
//
// Usage example, omitting error checking:
// AudioProcessing* apm = AudioProcessing::Create(0);
// apm->set_sample_rate_hz(32000); // Super-wideband processing.
//
// // Mono capture and stereo render.
// apm->set_num_channels(1, 1);
// apm->set_num_reverse_channels(2);
//
// apm->high_pass_filter()->Enable(true);
//
// apm->echo_cancellation()->enable_drift_compensation(false);
// apm->echo_cancellation()->Enable(true);
//
// apm->noise_suppression()->set_level(NoiseSuppression::kHigh);
// apm->noise_suppression()->Enable(true);
//
// apm->gain_control()->set_analog_level_limits(0, 255);
// apm->gain_control()->set_mode(GainControl::kAdaptiveAnalog);
// apm->gain_control()->Enable(true);
//
// apm->voice_detection()->Enable(true);
//
// // Start a voice call...
//
// // ... Render frame arrives bound for the audio HAL ...
// apm->AnalyzeReverseStream(render_frame);
//
// // ... Capture frame arrives from the audio HAL ...
// // Call required set_stream_ functions.
// apm->set_stream_delay_ms(delay_ms);
// apm->gain_control()->set_stream_analog_level(analog_level);
//
// apm->ProcessStream(capture_frame);
//
// // Call required stream_ functions.
// analog_level = apm->gain_control()->stream_analog_level();
// has_voice = apm->voice_detection()->stream_has_voice();
//
// // Repeat render and capture processing for the duration of the call...
// // Start a new call...
// apm->Initialize();
//
// // Close the application...
// AudioProcessing::Destroy(apm);
// apm = NULL;
//
    106 class AudioProcessing : public Module {
    107  public:
    108   // Creates a APM instance, with identifier |id|. Use one instance for every
    109   // primary audio stream requiring processing. On the client-side, this would
    110   // typically be one instance for the near-end stream, and additional instances
    111   // for each far-end stream which requires processing. On the server-side,
    112   // this would typically be one instance for every incoming stream.
    113   static AudioProcessing* Create(int id);
    114 
    115   // Destroys a |apm| instance.
    116   static void Destroy(AudioProcessing* apm);
    117 
    118   // Initializes internal states, while retaining all user settings. This
    119   // should be called before beginning to process a new audio stream. However,
    120   // it is not necessary to call before processing the first stream after
    121   // creation.
    122   virtual int Initialize() = 0;
    123 
    124   // Sets the sample |rate| in Hz for both the primary and reverse audio
    125   // streams. 8000, 16000 or 32000 Hz are permitted.
    126   virtual int set_sample_rate_hz(int rate) = 0;
    127   virtual int sample_rate_hz() const = 0;
    128 
    129   // Sets the number of channels for the primary audio stream. Input frames must
    130   // contain a number of channels given by |input_channels|, while output frames
    131   // will be returned with number of channels given by |output_channels|.
    132   virtual int set_num_channels(int input_channels, int output_channels) = 0;
    133   virtual int num_input_channels() const = 0;
    134   virtual int num_output_channels() const = 0;
    135 
    136   // Sets the number of channels for the reverse audio stream. Input frames must
    137   // contain a number of channels given by |channels|.
    138   virtual int set_num_reverse_channels(int channels) = 0;
    139   virtual int num_reverse_channels() const = 0;
    140 
    141   // Processes a 10 ms |frame| of the primary audio stream. On the client-side,
    142   // this is the near-end (or captured) audio.
    143   //
    144   // If needed for enabled functionality, any function with the set_stream_ tag
    145   // must be called prior to processing the current frame. Any getter function
    146   // with the stream_ tag which is needed should be called after processing.
    147   //
    148   // The |_frequencyInHz|, |_audioChannel|, and |_payloadDataLengthInSamples|
    149   // members of |frame| must be valid, and correspond to settings supplied
    150   // to APM.
    151   virtual int ProcessStream(AudioFrame* frame) = 0;
    152 
    153   // Analyzes a 10 ms |frame| of the reverse direction audio stream. The frame
    154   // will not be modified. On the client-side, this is the far-end (or to be
    155   // rendered) audio.
    156   //
    157   // It is only necessary to provide this if echo processing is enabled, as the
    158   // reverse stream forms the echo reference signal. It is recommended, but not
    159   // necessary, to provide if gain control is enabled. On the server-side this
    160   // typically will not be used. If you're not sure what to pass in here,
    161   // chances are you don't need to use it.
    162   //
    163   // The |_frequencyInHz|, |_audioChannel|, and |_payloadDataLengthInSamples|
    164   // members of |frame| must be valid.
    165   //
    166   // TODO(ajm): add const to input; requires an implementation fix.
    167   virtual int AnalyzeReverseStream(AudioFrame* frame) = 0;
    168 
    169   // This must be called if and only if echo processing is enabled.
    170   //
    171   // Sets the |delay| in ms between AnalyzeReverseStream() receiving a far-end
    172   // frame and ProcessStream() receiving a near-end frame containing the
    173   // corresponding echo. On the client-side this can be expressed as
    174   //   delay = (t_render - t_analyze) + (t_process - t_capture)
    175   // where,
    176   //   - t_analyze is the time a frame is passed to AnalyzeReverseStream() and
    177   //     t_render is the time the first sample of the same frame is rendered by
    178   //     the audio hardware.
    179   //   - t_capture is the time the first sample of a frame is captured by the
    180   //     audio hardware and t_pull is the time the same frame is passed to
    181   //     ProcessStream().
    182   virtual int set_stream_delay_ms(int delay) = 0;
    183   virtual int stream_delay_ms() const = 0;
    184 
    185   // Starts recording debugging information to a file specified by |filename|,
    186   // a NULL-terminated string. If there is an ongoing recording, the old file
    187   // will be closed, and recording will continue in the newly specified file.
    188   // An already existing file will be overwritten without warning.
    189   static const int kMaxFilenameSize = 1024;
    190   virtual int StartDebugRecording(const char filename[kMaxFilenameSize]) = 0;
    191 
    192   // Stops recording debugging information, and closes the file. Recording
    193   // cannot be resumed in the same file (without overwriting it).
    194   virtual int StopDebugRecording() = 0;
    195 
    196   // These provide access to the component interfaces and should never return
    197   // NULL. The pointers will be valid for the lifetime of the APM instance.
    198   // The memory for these objects is entirely managed internally.
    199   virtual EchoCancellation* echo_cancellation() const = 0;
    200   virtual EchoControlMobile* echo_control_mobile() const = 0;
    201   virtual GainControl* gain_control() const = 0;
    202   virtual HighPassFilter* high_pass_filter() const = 0;
    203   virtual LevelEstimator* level_estimator() const = 0;
    204   virtual NoiseSuppression* noise_suppression() const = 0;
    205   virtual VoiceDetection* voice_detection() const = 0;
    206 
    207   struct Statistic {
    208     int instant;  // Instantaneous value.
    209     int average;  // Long-term average.
    210     int maximum;  // Long-term maximum.
    211     int minimum;  // Long-term minimum.
    212   };
    213 
    214   // Fatal errors.
    215   enum Errors {
    216     kNoError = 0,
    217     kUnspecifiedError = -1,
    218     kCreationFailedError = -2,
    219     kUnsupportedComponentError = -3,
    220     kUnsupportedFunctionError = -4,
    221     kNullPointerError = -5,
    222     kBadParameterError = -6,
    223     kBadSampleRateError = -7,
    224     kBadDataLengthError = -8,
    225     kBadNumberChannelsError = -9,
    226     kFileError = -10,
    227     kStreamParameterNotSetError = -11,
    228     kNotEnabledError = -12
    229   };
    230 
    231   // Warnings are non-fatal.
    232   enum Warnings {
    233     // This results when a set_stream_ parameter is out of range. Processing
    234     // will continue, but the parameter may have been truncated.
    235     kBadStreamParameterWarning = -13,
    236   };
    237 
    238   // Inherited from Module.
    239   virtual WebRtc_Word32 TimeUntilNextProcess() { return -1; };
    240   virtual WebRtc_Word32 Process() { return -1; };
    241 
    242  protected:
    243   virtual ~AudioProcessing() {};
    244 };
    245 
    246 // The acoustic echo cancellation (AEC) component provides better performance
    247 // than AECM but also requires more processing power and is dependent on delay
    248 // stability and reporting accuracy. As such it is well-suited and recommended
    249 // for PC and IP phone applications.
    250 //
    251 // Not recommended to be enabled on the server-side.
    252 class EchoCancellation {
    253  public:
    254   // EchoCancellation and EchoControlMobile may not be enabled simultaneously.
    255   // Enabling one will disable the other.
    256   virtual int Enable(bool enable) = 0;
    257   virtual bool is_enabled() const = 0;
    258 
    259   // Differences in clock speed on the primary and reverse streams can impact
    260   // the AEC performance. On the client-side, this could be seen when different
    261   // render and capture devices are used, particularly with webcams.
    262   //
    263   // This enables a compensation mechanism, and requires that
    264   // |set_device_sample_rate_hz()| and |set_stream_drift_samples()| be called.
    265   virtual int enable_drift_compensation(bool enable) = 0;
    266   virtual bool is_drift_compensation_enabled() const = 0;
    267 
    268   // Provides the sampling rate of the audio devices. It is assumed the render
    269   // and capture devices use the same nominal sample rate. Required if and only
    270   // if drift compensation is enabled.
    271   virtual int set_device_sample_rate_hz(int rate) = 0;
    272   virtual int device_sample_rate_hz() const = 0;
    273 
    274   // Sets the difference between the number of samples rendered and captured by
    275   // the audio devices since the last call to |ProcessStream()|. Must be called
    276   // if and only if drift compensation is enabled, prior to |ProcessStream()|.
    277   virtual int set_stream_drift_samples(int drift) = 0;
    278   virtual int stream_drift_samples() const = 0;
    279 
    280   enum SuppressionLevel {
    281     kLowSuppression,
    282     kModerateSuppression,
    283     kHighSuppression
    284   };
    285 
    286   // Sets the aggressiveness of the suppressor. A higher level trades off
    287   // double-talk performance for increased echo suppression.
    288   virtual int set_suppression_level(SuppressionLevel level) = 0;
    289   virtual SuppressionLevel suppression_level() const = 0;
    290 
    291   // Returns false if the current frame almost certainly contains no echo
    292   // and true if it _might_ contain echo.
    293   virtual bool stream_has_echo() const = 0;
    294 
    295   // Enables the computation of various echo metrics. These are obtained
    296   // through |GetMetrics()|.
    297   virtual int enable_metrics(bool enable) = 0;
    298   virtual bool are_metrics_enabled() const = 0;
    299 
    300   // Each statistic is reported in dB.
    301   // P_far:  Far-end (render) signal power.
    302   // P_echo: Near-end (capture) echo signal power.
    303   // P_out:  Signal power at the output of the AEC.
    304   // P_a:    Internal signal power at the point before the AEC's non-linear
    305   //         processor.
    306   struct Metrics {
    307     // RERL = ERL + ERLE
    308     AudioProcessing::Statistic residual_echo_return_loss;
    309 
    310     // ERL = 10log_10(P_far / P_echo)
    311     AudioProcessing::Statistic echo_return_loss;
    312 
    313     // ERLE = 10log_10(P_echo / P_out)
    314     AudioProcessing::Statistic echo_return_loss_enhancement;
    315 
    316     // (Pre non-linear processing suppression) A_NLP = 10log_10(P_echo / P_a)
    317     AudioProcessing::Statistic a_nlp;
    318   };
    319 
    320   // TODO(ajm): discuss the metrics update period.
    321   virtual int GetMetrics(Metrics* metrics) = 0;
    322 
    323  protected:
    324   virtual ~EchoCancellation() {};
    325 };
    326 
    327 // The acoustic echo control for mobile (AECM) component is a low complexity
    328 // robust option intended for use on mobile devices.
    329 //
    330 // Not recommended to be enabled on the server-side.
    331 class EchoControlMobile {
    332  public:
    333   // EchoCancellation and EchoControlMobile may not be enabled simultaneously.
    334   // Enabling one will disable the other.
    335   virtual int Enable(bool enable) = 0;
    336   virtual bool is_enabled() const = 0;
    337 
    338   // Recommended settings for particular audio routes. In general, the louder
    339   // the echo is expected to be, the higher this value should be set. The
    340   // preferred setting may vary from device to device.
    341   enum RoutingMode {
    342     kQuietEarpieceOrHeadset,
    343     kEarpiece,
    344     kLoudEarpiece,
    345     kSpeakerphone,
    346     kLoudSpeakerphone
    347   };
    348 
    349   // Sets echo control appropriate for the audio routing |mode| on the device.
    350   // It can and should be updated during a call if the audio routing changes.
    351   virtual int set_routing_mode(RoutingMode mode) = 0;
    352   virtual RoutingMode routing_mode() const = 0;
    353 
    354   // Comfort noise replaces suppressed background noise to maintain a
    355   // consistent signal level.
    356   virtual int enable_comfort_noise(bool enable) = 0;
    357   virtual bool is_comfort_noise_enabled() const = 0;
    358 
    359  protected:
    360   virtual ~EchoControlMobile() {};
    361 };
    362 
    363 // The automatic gain control (AGC) component brings the signal to an
    364 // appropriate range. This is done by applying a digital gain directly and, in
    365 // the analog mode, prescribing an analog gain to be applied at the audio HAL.
    366 //
    367 // Recommended to be enabled on the client-side.
    368 class GainControl {
    369  public:
    370   virtual int Enable(bool enable) = 0;
    371   virtual bool is_enabled() const = 0;
    372 
    373   // When an analog mode is set, this must be called prior to |ProcessStream()|
    374   // to pass the current analog level from the audio HAL. Must be within the
    375   // range provided to |set_analog_level_limits()|.
    376   virtual int set_stream_analog_level(int level) = 0;
    377 
    378   // When an analog mode is set, this should be called after |ProcessStream()|
    379   // to obtain the recommended new analog level for the audio HAL. It is the
    380   // users responsibility to apply this level.
    381   virtual int stream_analog_level() = 0;
    382 
    383   enum Mode {
    384     // Adaptive mode intended for use if an analog volume control is available
    385     // on the capture device. It will require the user to provide coupling
    386     // between the OS mixer controls and AGC through the |stream_analog_level()|
    387     // functions.
    388     //
    389     // It consists of an analog gain prescription for the audio device and a
    390     // digital compression stage.
    391     kAdaptiveAnalog,
    392 
    393     // Adaptive mode intended for situations in which an analog volume control
    394     // is unavailable. It operates in a similar fashion to the adaptive analog
    395     // mode, but with scaling instead applied in the digital domain. As with
    396     // the analog mode, it additionally uses a digital compression stage.
    397     kAdaptiveDigital,
    398 
    399     // Fixed mode which enables only the digital compression stage also used by
    400     // the two adaptive modes.
    401     //
    402     // It is distinguished from the adaptive modes by considering only a
    403     // short time-window of the input signal. It applies a fixed gain through
    404     // most of the input level range, and compresses (gradually reduces gain
    405     // with increasing level) the input signal at higher levels. This mode is
    406     // preferred on embedded devices where the capture signal level is
    407     // predictable, so that a known gain can be applied.
    408     kFixedDigital
    409   };
    410 
    411   virtual int set_mode(Mode mode) = 0;
    412   virtual Mode mode() const = 0;
    413 
    414   // Sets the target peak |level| (or envelope) of the AGC in dBFs (decibels
    415   // from digital full-scale). The convention is to use positive values. For
    416   // instance, passing in a value of 3 corresponds to -3 dBFs, or a target
    417   // level 3 dB below full-scale. Limited to [0, 31].
    418   //
    419   // TODO(ajm): use a negative value here instead, if/when VoE will similarly
    420   //            update its interface.
    421   virtual int set_target_level_dbfs(int level) = 0;
    422   virtual int target_level_dbfs() const = 0;
    423 
    424   // Sets the maximum |gain| the digital compression stage may apply, in dB. A
    425   // higher number corresponds to greater compression, while a value of 0 will
    426   // leave the signal uncompressed. Limited to [0, 90].
    427   virtual int set_compression_gain_db(int gain) = 0;
    428   virtual int compression_gain_db() const = 0;
    429 
    430   // When enabled, the compression stage will hard limit the signal to the
    431   // target level. Otherwise, the signal will be compressed but not limited
    432   // above the target level.
    433   virtual int enable_limiter(bool enable) = 0;
    434   virtual bool is_limiter_enabled() const = 0;
    435 
    436   // Sets the |minimum| and |maximum| analog levels of the audio capture device.
    437   // Must be set if and only if an analog mode is used. Limited to [0, 65535].
    438   virtual int set_analog_level_limits(int minimum,
    439                                       int maximum) = 0;
    440   virtual int analog_level_minimum() const = 0;
    441   virtual int analog_level_maximum() const = 0;
    442 
    443   // Returns true if the AGC has detected a saturation event (period where the
    444   // signal reaches digital full-scale) in the current frame and the analog
    445   // level cannot be reduced.
    446   //
    447   // This could be used as an indicator to reduce or disable analog mic gain at
    448   // the audio HAL.
    449   virtual bool stream_is_saturated() const = 0;
    450 
    451  protected:
    452   virtual ~GainControl() {};
    453 };
    454 
    455 // A filtering component which removes DC offset and low-frequency noise.
    456 // Recommended to be enabled on the client-side.
    457 class HighPassFilter {
    458  public:
    459   virtual int Enable(bool enable) = 0;
    460   virtual bool is_enabled() const = 0;
    461 
    462  protected:
    463   virtual ~HighPassFilter() {};
    464 };
    465 
    466 // An estimation component used to retrieve level metrics.
    467 class LevelEstimator {
    468  public:
    469   virtual int Enable(bool enable) = 0;
    470   virtual bool is_enabled() const = 0;
    471 
    472   // The metrics are reported in dBFs calculated as:
    473   //   Level = 10log_10(P_s / P_max) [dBFs], where
    474   //   P_s is the signal power and P_max is the maximum possible (or peak)
    475   //   power. With 16-bit signals, P_max = (2^15)^2.
    476   struct Metrics {
    477     AudioProcessing::Statistic signal;  // Overall signal level.
    478     AudioProcessing::Statistic speech;  // Speech level.
    479     AudioProcessing::Statistic noise;   // Noise level.
    480   };
    481 
    482   virtual int GetMetrics(Metrics* metrics, Metrics* reverse_metrics) = 0;
    483 
    484   //virtual int enable_noise_warning(bool enable) = 0;
    485   //bool is_noise_warning_enabled() const = 0;
    486   //virtual bool stream_has_high_noise() const = 0;
    487 
    488  protected:
    489   virtual ~LevelEstimator() {};
    490 };
    491 
    492 // The noise suppression (NS) component attempts to remove noise while
    493 // retaining speech. Recommended to be enabled on the client-side.
    494 //
    495 // Recommended to be enabled on the client-side.
    496 class NoiseSuppression {
    497  public:
    498   virtual int Enable(bool enable) = 0;
    499   virtual bool is_enabled() const = 0;
    500 
    501   // Determines the aggressiveness of the suppression. Increasing the level
    502   // will reduce the noise level at the expense of a higher speech distortion.
    503   enum Level {
    504     kLow,
    505     kModerate,
    506     kHigh,
    507     kVeryHigh
    508   };
    509 
    510   virtual int set_level(Level level) = 0;
    511   virtual Level level() const = 0;
    512 
    513  protected:
    514   virtual ~NoiseSuppression() {};
    515 };
    516 
    517 // The voice activity detection (VAD) component analyzes the stream to
    518 // determine if voice is present. A facility is also provided to pass in an
    519 // external VAD decision.
    520 class VoiceDetection {
    521  public:
    522   virtual int Enable(bool enable) = 0;
    523   virtual bool is_enabled() const = 0;
    524 
    525   // Returns true if voice is detected in the current frame. Should be called
    526   // after |ProcessStream()|.
    527   virtual bool stream_has_voice() const = 0;
    528 
    529   // Some of the APM functionality requires a VAD decision. In the case that
    530   // a decision is externally available for the current frame, it can be passed
    531   // in here, before |ProcessStream()| is called.
    532   //
    533   // VoiceDetection does _not_ need to be enabled to use this. If it happens to
    534   // be enabled, detection will be skipped for any frame in which an external
    535   // VAD decision is provided.
    536   virtual int set_stream_has_voice(bool has_voice) = 0;
    537 
    538   // Specifies the likelihood that a frame will be declared to contain voice.
    539   // A higher value makes it more likely that speech will not be clipped, at
    540   // the expense of more noise being detected as voice.
    541   enum Likelihood {
    542     kVeryLowLikelihood,
    543     kLowLikelihood,
    544     kModerateLikelihood,
    545     kHighLikelihood
    546   };
    547 
    548   virtual int set_likelihood(Likelihood likelihood) = 0;
    549   virtual Likelihood likelihood() const = 0;
    550 
    551   // Sets the |size| of the frames in ms on which the VAD will operate. Larger
    552   // frames will improve detection accuracy, but reduce the frequency of
    553   // updates.
    554   //
    555   // This does not impact the size of frames passed to |ProcessStream()|.
    556   virtual int set_frame_size_ms(int size) = 0;
    557   virtual int frame_size_ms() const = 0;
    558 
    559  protected:
    560   virtual ~VoiceDetection() {};
    561 };
}  // namespace webrtc

#endif  // WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_INTERFACE_AUDIO_PROCESSING_H_