1 /* 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_INTERFACE_AUDIO_PROCESSING_H_ 12 #define WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_INTERFACE_AUDIO_PROCESSING_H_ 13 14 #include "typedefs.h" 15 #include "module.h" 16 17 namespace webrtc { 18 19 class AudioFrame; 20 class EchoCancellation; 21 class EchoControlMobile; 22 class GainControl; 23 class HighPassFilter; 24 class LevelEstimator; 25 class NoiseSuppression; 26 class VoiceDetection; 27 28 // The Audio Processing Module (APM) provides a collection of voice processing 29 // components designed for real-time communications software. 30 // 31 // APM operates on two audio streams on a frame-by-frame basis. Frames of the 32 // primary stream, on which all processing is applied, are passed to 33 // |ProcessStream()|. Frames of the reverse direction stream, which are used for 34 // analysis by some components, are passed to |AnalyzeReverseStream()|. On the 35 // client-side, this will typically be the near-end (capture) and far-end 36 // (render) streams, respectively. APM should be placed in the signal chain as 37 // close to the audio hardware abstraction layer (HAL) as possible. 38 // 39 // On the server-side, the reverse stream will normally not be used, with 40 // processing occurring on each incoming stream. 41 // 42 // Component interfaces follow a similar pattern and are accessed through 43 // corresponding getters in APM. All components are disabled at create-time, 44 // with default settings that are recommended for most situations. New settings 45 // can be applied without enabling a component. 
// Enabling a component triggers memory allocation and initialization to allow
// it to start processing the streams.
//
// Thread safety is provided with the following assumptions to reduce locking
// overhead:
//   1. The stream getters and setters are called from the same thread as
//      ProcessStream(). More precisely, stream functions are never called
//      concurrently with ProcessStream().
//   2. Parameter getters are never called concurrently with the corresponding
//      setter.
//
// APM accepts only 16-bit linear PCM audio data in frames of 10 ms. Multiple
// channels should be interleaved.
//
// Usage example, omitting error checking:
// AudioProcessing* apm = AudioProcessing::Create(0);
// apm->set_sample_rate_hz(32000); // Super-wideband processing.
//
// // Mono capture and stereo render.
// apm->set_num_channels(1, 1);
// apm->set_num_reverse_channels(2);
//
// apm->high_pass_filter()->Enable(true);
//
// apm->echo_cancellation()->enable_drift_compensation(false);
// apm->echo_cancellation()->Enable(true);
//
// apm->noise_suppression()->set_level(NoiseSuppression::kHigh);
// apm->noise_suppression()->Enable(true);
//
// apm->gain_control()->set_analog_level_limits(0, 255);
// apm->gain_control()->set_mode(kAdaptiveAnalog);
// apm->gain_control()->Enable(true);
//
// apm->voice_detection()->Enable(true);
//
// // Start a voice call...
//
// // ... Render frame arrives bound for the audio HAL ...
// apm->AnalyzeReverseStream(render_frame);
//
// // ... Capture frame arrives from the audio HAL ...
// // Call required set_stream_ functions.
// apm->set_stream_delay_ms(delay_ms);
// apm->gain_control()->set_stream_analog_level(analog_level);
//
// apm->ProcessStream(capture_frame);
//
// // Call required stream_ functions.
95 // analog_level = apm->gain_control()->stream_analog_level(); 96 // has_voice = apm->stream_has_voice(); 97 // 98 // // Repeate render and capture processing for the duration of the call... 99 // // Start a new call... 100 // apm->Initialize(); 101 // 102 // // Close the application... 103 // AudioProcessing::Destroy(apm); 104 // apm = NULL; 105 // 106 class AudioProcessing : public Module { 107 public: 108 // Creates a APM instance, with identifier |id|. Use one instance for every 109 // primary audio stream requiring processing. On the client-side, this would 110 // typically be one instance for the near-end stream, and additional instances 111 // for each far-end stream which requires processing. On the server-side, 112 // this would typically be one instance for every incoming stream. 113 static AudioProcessing* Create(int id); 114 115 // Destroys a |apm| instance. 116 static void Destroy(AudioProcessing* apm); 117 118 // Initializes internal states, while retaining all user settings. This 119 // should be called before beginning to process a new audio stream. However, 120 // it is not necessary to call before processing the first stream after 121 // creation. 122 virtual int Initialize() = 0; 123 124 // Sets the sample |rate| in Hz for both the primary and reverse audio 125 // streams. 8000, 16000 or 32000 Hz are permitted. 126 virtual int set_sample_rate_hz(int rate) = 0; 127 virtual int sample_rate_hz() const = 0; 128 129 // Sets the number of channels for the primary audio stream. Input frames must 130 // contain a number of channels given by |input_channels|, while output frames 131 // will be returned with number of channels given by |output_channels|. 132 virtual int set_num_channels(int input_channels, int output_channels) = 0; 133 virtual int num_input_channels() const = 0; 134 virtual int num_output_channels() const = 0; 135 136 // Sets the number of channels for the reverse audio stream. 
Input frames must 137 // contain a number of channels given by |channels|. 138 virtual int set_num_reverse_channels(int channels) = 0; 139 virtual int num_reverse_channels() const = 0; 140 141 // Processes a 10 ms |frame| of the primary audio stream. On the client-side, 142 // this is the near-end (or captured) audio. 143 // 144 // If needed for enabled functionality, any function with the set_stream_ tag 145 // must be called prior to processing the current frame. Any getter function 146 // with the stream_ tag which is needed should be called after processing. 147 // 148 // The |_frequencyInHz|, |_audioChannel|, and |_payloadDataLengthInSamples| 149 // members of |frame| must be valid, and correspond to settings supplied 150 // to APM. 151 virtual int ProcessStream(AudioFrame* frame) = 0; 152 153 // Analyzes a 10 ms |frame| of the reverse direction audio stream. The frame 154 // will not be modified. On the client-side, this is the far-end (or to be 155 // rendered) audio. 156 // 157 // It is only necessary to provide this if echo processing is enabled, as the 158 // reverse stream forms the echo reference signal. It is recommended, but not 159 // necessary, to provide if gain control is enabled. On the server-side this 160 // typically will not be used. If you're not sure what to pass in here, 161 // chances are you don't need to use it. 162 // 163 // The |_frequencyInHz|, |_audioChannel|, and |_payloadDataLengthInSamples| 164 // members of |frame| must be valid. 165 // 166 // TODO(ajm): add const to input; requires an implementation fix. 167 virtual int AnalyzeReverseStream(AudioFrame* frame) = 0; 168 169 // This must be called if and only if echo processing is enabled. 170 // 171 // Sets the |delay| in ms between AnalyzeReverseStream() receiving a far-end 172 // frame and ProcessStream() receiving a near-end frame containing the 173 // corresponding echo. 
On the client-side this can be expressed as 174 // delay = (t_render - t_analyze) + (t_process - t_capture) 175 // where, 176 // - t_analyze is the time a frame is passed to AnalyzeReverseStream() and 177 // t_render is the time the first sample of the same frame is rendered by 178 // the audio hardware. 179 // - t_capture is the time the first sample of a frame is captured by the 180 // audio hardware and t_pull is the time the same frame is passed to 181 // ProcessStream(). 182 virtual int set_stream_delay_ms(int delay) = 0; 183 virtual int stream_delay_ms() const = 0; 184 185 // Starts recording debugging information to a file specified by |filename|, 186 // a NULL-terminated string. If there is an ongoing recording, the old file 187 // will be closed, and recording will continue in the newly specified file. 188 // An already existing file will be overwritten without warning. 189 static const int kMaxFilenameSize = 1024; 190 virtual int StartDebugRecording(const char filename[kMaxFilenameSize]) = 0; 191 192 // Stops recording debugging information, and closes the file. Recording 193 // cannot be resumed in the same file (without overwriting it). 194 virtual int StopDebugRecording() = 0; 195 196 // These provide access to the component interfaces and should never return 197 // NULL. The pointers will be valid for the lifetime of the APM instance. 198 // The memory for these objects is entirely managed internally. 199 virtual EchoCancellation* echo_cancellation() const = 0; 200 virtual EchoControlMobile* echo_control_mobile() const = 0; 201 virtual GainControl* gain_control() const = 0; 202 virtual HighPassFilter* high_pass_filter() const = 0; 203 virtual LevelEstimator* level_estimator() const = 0; 204 virtual NoiseSuppression* noise_suppression() const = 0; 205 virtual VoiceDetection* voice_detection() const = 0; 206 207 struct Statistic { 208 int instant; // Instantaneous value. 209 int average; // Long-term average. 210 int maximum; // Long-term maximum. 
211 int minimum; // Long-term minimum. 212 }; 213 214 // Fatal errors. 215 enum Errors { 216 kNoError = 0, 217 kUnspecifiedError = -1, 218 kCreationFailedError = -2, 219 kUnsupportedComponentError = -3, 220 kUnsupportedFunctionError = -4, 221 kNullPointerError = -5, 222 kBadParameterError = -6, 223 kBadSampleRateError = -7, 224 kBadDataLengthError = -8, 225 kBadNumberChannelsError = -9, 226 kFileError = -10, 227 kStreamParameterNotSetError = -11, 228 kNotEnabledError = -12 229 }; 230 231 // Warnings are non-fatal. 232 enum Warnings { 233 // This results when a set_stream_ parameter is out of range. Processing 234 // will continue, but the parameter may have been truncated. 235 kBadStreamParameterWarning = -13, 236 }; 237 238 // Inherited from Module. 239 virtual WebRtc_Word32 TimeUntilNextProcess() { return -1; }; 240 virtual WebRtc_Word32 Process() { return -1; }; 241 242 protected: 243 virtual ~AudioProcessing() {}; 244 }; 245 246 // The acoustic echo cancellation (AEC) component provides better performance 247 // than AECM but also requires more processing power and is dependent on delay 248 // stability and reporting accuracy. As such it is well-suited and recommended 249 // for PC and IP phone applications. 250 // 251 // Not recommended to be enabled on the server-side. 252 class EchoCancellation { 253 public: 254 // EchoCancellation and EchoControlMobile may not be enabled simultaneously. 255 // Enabling one will disable the other. 256 virtual int Enable(bool enable) = 0; 257 virtual bool is_enabled() const = 0; 258 259 // Differences in clock speed on the primary and reverse streams can impact 260 // the AEC performance. On the client-side, this could be seen when different 261 // render and capture devices are used, particularly with webcams. 262 // 263 // This enables a compensation mechanism, and requires that 264 // |set_device_sample_rate_hz()| and |set_stream_drift_samples()| be called. 
265 virtual int enable_drift_compensation(bool enable) = 0; 266 virtual bool is_drift_compensation_enabled() const = 0; 267 268 // Provides the sampling rate of the audio devices. It is assumed the render 269 // and capture devices use the same nominal sample rate. Required if and only 270 // if drift compensation is enabled. 271 virtual int set_device_sample_rate_hz(int rate) = 0; 272 virtual int device_sample_rate_hz() const = 0; 273 274 // Sets the difference between the number of samples rendered and captured by 275 // the audio devices since the last call to |ProcessStream()|. Must be called 276 // if and only if drift compensation is enabled, prior to |ProcessStream()|. 277 virtual int set_stream_drift_samples(int drift) = 0; 278 virtual int stream_drift_samples() const = 0; 279 280 enum SuppressionLevel { 281 kLowSuppression, 282 kModerateSuppression, 283 kHighSuppression 284 }; 285 286 // Sets the aggressiveness of the suppressor. A higher level trades off 287 // double-talk performance for increased echo suppression. 288 virtual int set_suppression_level(SuppressionLevel level) = 0; 289 virtual SuppressionLevel suppression_level() const = 0; 290 291 // Returns false if the current frame almost certainly contains no echo 292 // and true if it _might_ contain echo. 293 virtual bool stream_has_echo() const = 0; 294 295 // Enables the computation of various echo metrics. These are obtained 296 // through |GetMetrics()|. 297 virtual int enable_metrics(bool enable) = 0; 298 virtual bool are_metrics_enabled() const = 0; 299 300 // Each statistic is reported in dB. 301 // P_far: Far-end (render) signal power. 302 // P_echo: Near-end (capture) echo signal power. 303 // P_out: Signal power at the output of the AEC. 304 // P_a: Internal signal power at the point before the AEC's non-linear 305 // processor. 
306 struct Metrics { 307 // RERL = ERL + ERLE 308 AudioProcessing::Statistic residual_echo_return_loss; 309 310 // ERL = 10log_10(P_far / P_echo) 311 AudioProcessing::Statistic echo_return_loss; 312 313 // ERLE = 10log_10(P_echo / P_out) 314 AudioProcessing::Statistic echo_return_loss_enhancement; 315 316 // (Pre non-linear processing suppression) A_NLP = 10log_10(P_echo / P_a) 317 AudioProcessing::Statistic a_nlp; 318 }; 319 320 // TODO(ajm): discuss the metrics update period. 321 virtual int GetMetrics(Metrics* metrics) = 0; 322 323 protected: 324 virtual ~EchoCancellation() {}; 325 }; 326 327 // The acoustic echo control for mobile (AECM) component is a low complexity 328 // robust option intended for use on mobile devices. 329 // 330 // Not recommended to be enabled on the server-side. 331 class EchoControlMobile { 332 public: 333 // EchoCancellation and EchoControlMobile may not be enabled simultaneously. 334 // Enabling one will disable the other. 335 virtual int Enable(bool enable) = 0; 336 virtual bool is_enabled() const = 0; 337 338 // Recommended settings for particular audio routes. In general, the louder 339 // the echo is expected to be, the higher this value should be set. The 340 // preferred setting may vary from device to device. 341 enum RoutingMode { 342 kQuietEarpieceOrHeadset, 343 kEarpiece, 344 kLoudEarpiece, 345 kSpeakerphone, 346 kLoudSpeakerphone 347 }; 348 349 // Sets echo control appropriate for the audio routing |mode| on the device. 350 // It can and should be updated during a call if the audio routing changes. 351 virtual int set_routing_mode(RoutingMode mode) = 0; 352 virtual RoutingMode routing_mode() const = 0; 353 354 // Comfort noise replaces suppressed background noise to maintain a 355 // consistent signal level. 
356 virtual int enable_comfort_noise(bool enable) = 0; 357 virtual bool is_comfort_noise_enabled() const = 0; 358 359 protected: 360 virtual ~EchoControlMobile() {}; 361 }; 362 363 // The automatic gain control (AGC) component brings the signal to an 364 // appropriate range. This is done by applying a digital gain directly and, in 365 // the analog mode, prescribing an analog gain to be applied at the audio HAL. 366 // 367 // Recommended to be enabled on the client-side. 368 class GainControl { 369 public: 370 virtual int Enable(bool enable) = 0; 371 virtual bool is_enabled() const = 0; 372 373 // When an analog mode is set, this must be called prior to |ProcessStream()| 374 // to pass the current analog level from the audio HAL. Must be within the 375 // range provided to |set_analog_level_limits()|. 376 virtual int set_stream_analog_level(int level) = 0; 377 378 // When an analog mode is set, this should be called after |ProcessStream()| 379 // to obtain the recommended new analog level for the audio HAL. It is the 380 // users responsibility to apply this level. 381 virtual int stream_analog_level() = 0; 382 383 enum Mode { 384 // Adaptive mode intended for use if an analog volume control is available 385 // on the capture device. It will require the user to provide coupling 386 // between the OS mixer controls and AGC through the |stream_analog_level()| 387 // functions. 388 // 389 // It consists of an analog gain prescription for the audio device and a 390 // digital compression stage. 391 kAdaptiveAnalog, 392 393 // Adaptive mode intended for situations in which an analog volume control 394 // is unavailable. It operates in a similar fashion to the adaptive analog 395 // mode, but with scaling instead applied in the digital domain. As with 396 // the analog mode, it additionally uses a digital compression stage. 397 kAdaptiveDigital, 398 399 // Fixed mode which enables only the digital compression stage also used by 400 // the two adaptive modes. 
401 // 402 // It is distinguished from the adaptive modes by considering only a 403 // short time-window of the input signal. It applies a fixed gain through 404 // most of the input level range, and compresses (gradually reduces gain 405 // with increasing level) the input signal at higher levels. This mode is 406 // preferred on embedded devices where the capture signal level is 407 // predictable, so that a known gain can be applied. 408 kFixedDigital 409 }; 410 411 virtual int set_mode(Mode mode) = 0; 412 virtual Mode mode() const = 0; 413 414 // Sets the target peak |level| (or envelope) of the AGC in dBFs (decibels 415 // from digital full-scale). The convention is to use positive values. For 416 // instance, passing in a value of 3 corresponds to -3 dBFs, or a target 417 // level 3 dB below full-scale. Limited to [0, 31]. 418 // 419 // TODO(ajm): use a negative value here instead, if/when VoE will similarly 420 // update its interface. 421 virtual int set_target_level_dbfs(int level) = 0; 422 virtual int target_level_dbfs() const = 0; 423 424 // Sets the maximum |gain| the digital compression stage may apply, in dB. A 425 // higher number corresponds to greater compression, while a value of 0 will 426 // leave the signal uncompressed. Limited to [0, 90]. 427 virtual int set_compression_gain_db(int gain) = 0; 428 virtual int compression_gain_db() const = 0; 429 430 // When enabled, the compression stage will hard limit the signal to the 431 // target level. Otherwise, the signal will be compressed but not limited 432 // above the target level. 433 virtual int enable_limiter(bool enable) = 0; 434 virtual bool is_limiter_enabled() const = 0; 435 436 // Sets the |minimum| and |maximum| analog levels of the audio capture device. 437 // Must be set if and only if an analog mode is used. Limited to [0, 65535]. 
438 virtual int set_analog_level_limits(int minimum, 439 int maximum) = 0; 440 virtual int analog_level_minimum() const = 0; 441 virtual int analog_level_maximum() const = 0; 442 443 // Returns true if the AGC has detected a saturation event (period where the 444 // signal reaches digital full-scale) in the current frame and the analog 445 // level cannot be reduced. 446 // 447 // This could be used as an indicator to reduce or disable analog mic gain at 448 // the audio HAL. 449 virtual bool stream_is_saturated() const = 0; 450 451 protected: 452 virtual ~GainControl() {}; 453 }; 454 455 // A filtering component which removes DC offset and low-frequency noise. 456 // Recommended to be enabled on the client-side. 457 class HighPassFilter { 458 public: 459 virtual int Enable(bool enable) = 0; 460 virtual bool is_enabled() const = 0; 461 462 protected: 463 virtual ~HighPassFilter() {}; 464 }; 465 466 // An estimation component used to retrieve level metrics. 467 class LevelEstimator { 468 public: 469 virtual int Enable(bool enable) = 0; 470 virtual bool is_enabled() const = 0; 471 472 // The metrics are reported in dBFs calculated as: 473 // Level = 10log_10(P_s / P_max) [dBFs], where 474 // P_s is the signal power and P_max is the maximum possible (or peak) 475 // power. With 16-bit signals, P_max = (2^15)^2. 476 struct Metrics { 477 AudioProcessing::Statistic signal; // Overall signal level. 478 AudioProcessing::Statistic speech; // Speech level. 479 AudioProcessing::Statistic noise; // Noise level. 480 }; 481 482 virtual int GetMetrics(Metrics* metrics, Metrics* reverse_metrics) = 0; 483 484 //virtual int enable_noise_warning(bool enable) = 0; 485 //bool is_noise_warning_enabled() const = 0; 486 //virtual bool stream_has_high_noise() const = 0; 487 488 protected: 489 virtual ~LevelEstimator() {}; 490 }; 491 492 // The noise suppression (NS) component attempts to remove noise while 493 // retaining speech. Recommended to be enabled on the client-side. 
494 // 495 // Recommended to be enabled on the client-side. 496 class NoiseSuppression { 497 public: 498 virtual int Enable(bool enable) = 0; 499 virtual bool is_enabled() const = 0; 500 501 // Determines the aggressiveness of the suppression. Increasing the level 502 // will reduce the noise level at the expense of a higher speech distortion. 503 enum Level { 504 kLow, 505 kModerate, 506 kHigh, 507 kVeryHigh 508 }; 509 510 virtual int set_level(Level level) = 0; 511 virtual Level level() const = 0; 512 513 protected: 514 virtual ~NoiseSuppression() {}; 515 }; 516 517 // The voice activity detection (VAD) component analyzes the stream to 518 // determine if voice is present. A facility is also provided to pass in an 519 // external VAD decision. 520 class VoiceDetection { 521 public: 522 virtual int Enable(bool enable) = 0; 523 virtual bool is_enabled() const = 0; 524 525 // Returns true if voice is detected in the current frame. Should be called 526 // after |ProcessStream()|. 527 virtual bool stream_has_voice() const = 0; 528 529 // Some of the APM functionality requires a VAD decision. In the case that 530 // a decision is externally available for the current frame, it can be passed 531 // in here, before |ProcessStream()| is called. 532 // 533 // VoiceDetection does _not_ need to be enabled to use this. If it happens to 534 // be enabled, detection will be skipped for any frame in which an external 535 // VAD decision is provided. 536 virtual int set_stream_has_voice(bool has_voice) = 0; 537 538 // Specifies the likelihood that a frame will be declared to contain voice. 539 // A higher value makes it more likely that speech will not be clipped, at 540 // the expense of more noise being detected as voice. 
541 enum Likelihood { 542 kVeryLowLikelihood, 543 kLowLikelihood, 544 kModerateLikelihood, 545 kHighLikelihood 546 }; 547 548 virtual int set_likelihood(Likelihood likelihood) = 0; 549 virtual Likelihood likelihood() const = 0; 550 551 // Sets the |size| of the frames in ms on which the VAD will operate. Larger 552 // frames will improve detection accuracy, but reduce the frequency of 553 // updates. 554 // 555 // This does not impact the size of frames passed to |ProcessStream()|. 556 virtual int set_frame_size_ms(int size) = 0; 557 virtual int frame_size_ms() const = 0; 558 559 protected: 560 virtual ~VoiceDetection() {}; 561 }; 562 } // namespace webrtc 563 564 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_INTERFACE_AUDIO_PROCESSING_H_ 565