Home | History | Annotate | Download | only in endpointer
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // The EnergyEndpointer class finds likely speech onset and offset points.
      6 //
      7 // The implementation described here is about the simplest possible.
      8 // It is based on timings of threshold crossings for overall signal
      9 // RMS. It is suitable for light weight applications.
     10 //
     11 // As written, the basic idea is that one specifies intervals that
     12 // must be occupied by super- and sub-threshold energy levels, and
     13 // defers decisions re onset and offset times until these
     14 // specifications have been met.  Three basic intervals are tested: an
     15 // onset window, a speech-on window, and an offset window.  We require
     16 // super-threshold to exceed some minimum total durations in the onset
     17 // and speech-on windows before declaring the speech onset time, and
     18 // we specify a required sub-threshold residency in the offset window
     19 // before declaring speech offset. As the various residency requirements are
     20 // met, the EnergyEndpointer instance assumes various states, and can return the
     21 // ID of these states to the client (see EpStatus below).
     22 //
     23 // The levels of the speech and background noise are continuously updated. It is
     24 // important that the background noise level be estimated initially for
     25 // robustness in noisy conditions. The first frames are assumed to be background
     26 // noise and a fast update rate is used for the noise level. The duration for
     27 // fast update is controlled by the fast_update_dur_ parameter.
     28 //
     29 // If used in noisy conditions, the endpointer should be started and run in the
     30 // EnvironmentEstimation mode, for at least 200ms, before switching to
     31 // UserInputMode.
     32 // Audio feedback contamination can appear in the input audio, if not cut
     33 // out or handled by echo cancellation. Audio feedback can trigger a false
     34 // accept. The false accepts can be ignored by setting
     35 // ep_contamination_rejection_period.
     36 
     37 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
     38 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
     39 
     40 #include <vector>
     41 
     42 #include "base/basictypes.h"
     43 #include "base/memory/scoped_ptr.h"
     44 #include "content/browser/speech/endpointer/energy_endpointer_params.h"
     45 #include "content/common/content_export.h"
     46 
     47 namespace content {
     48 
     49 // Endpointer status codes. Only EP_PRE_SPEECH has an explicit value (10); the
        // remaining enumerators take consecutive values 11 through 14 in declaration
        // order.
        // NOTE(review): the per-state meanings below are inferred from the
        // enumerator names and the file header; confirm against the implementation.
     50 enum EpStatus {
     51   EP_PRE_SPEECH = 10,  // Presumably: no speech onset detected yet.
     52   EP_POSSIBLE_ONSET,   // 11. Presumably: onset candidate, not yet confirmed.
     53   EP_SPEECH_PRESENT,   // 12. Presumably: speech onset has been declared.
     54   EP_POSSIBLE_OFFSET,  // 13. Presumably: offset candidate, not yet confirmed.
     55   EP_POST_SPEECH,      // 14. Presumably: speech offset has been declared.
     56 };
     57 
        // Energy-based speech endpointer: finds likely speech onset and offset points
        // from the audio frames it is fed (see the description at the top of this
        // file). The class is exported with CONTENT_EXPORT and is marked non-copyable
        // via DISALLOW_COPY_AND_ASSIGN.
     58 class CONTENT_EXPORT EnergyEndpointer {
     59  public:
     60   // The default construction MUST be followed by Init(), before any
     61   // other use can be made of the instance.
     62   EnergyEndpointer();
          // NOTE(review): the destructor is virtual although no other virtual member
          // is declared in this class; presumably to allow subclassing -- confirm.
     63   virtual ~EnergyEndpointer();
     64 
          // Initializes the instance with |params|. Per the constructor comment, this
          // must be called before any other use of a default-constructed instance.
     65   void Init(const EnergyEndpointerParams& params);
     66 
     67   // Start the endpointer. This should be called at the beginning of a session.
     68   void StartSession();
     69 
     70   // Stop the endpointer.
     71   void EndSession();
     72 
     73   // Start environment estimation. Audio will be used for environment
     74   // estimation, i.e. noise level estimation.
     75   void SetEnvironmentEstimationMode();
     76 
     77   // Start user input. This should be called when the user indicates start of
     78   // input, e.g. by pressing a button.
     79   void SetUserInputMode();
     80 
     81   // Processes the next input frame and modifies EnergyEndpointer status as
     82   // appropriate based on the computation.
          //   time_us:     timestamp for this frame; presumably in microseconds,
          //                going by the _us suffix -- TODO confirm the clock used.
          //   samples:     the frame's audio as 16-bit samples.
          //   num_samples: number of entries in |samples|.
          //   rms_out:     output pointer; presumably receives the RMS computed for
          //                this frame. Whether NULL is accepted is not visible in
          //                this header -- TODO confirm.
     83   void ProcessAudioFrame(int64 time_us,
     84                          const int16* samples, int num_samples,
     85                          float* rms_out);
     86 
     87   // Returns the current state of the EnergyEndpointer and the time
     88   // corresponding to the most recently computed frame. The state is the
          // return value; the time is delivered through |status_time_us|
          // (presumably in microseconds, going by the _us suffix).
     89   EpStatus Status(int64* status_time_us) const;
     90 
          // Returns the estimating_environment_ flag, i.e. whether audio is currently
          // being used to estimate the environment rather than as user input.
     91   bool estimating_environment() const {
     92     return estimating_environment_;
     93   }
     94 
     95   // Returns estimated noise level in dB.
          // NOTE(review): presumably derived from noise_level_ -- confirm in the
          // implementation.
     96   float GetNoiseLevelDb() const;
     97 
     98  private:
          // Forward declaration only; the definition is not part of this header.
          // An instance is owned through |history_| below.
     99   class HistoryRing;
    100 
    101   // Resets the endpointer internal state.  If reset_threshold is true, the
    102   // state will be reset completely, including adaptive thresholds and the
    103   // removal of all history information.
    104   void Restart(bool reset_threshold);
    105 
    106   // Update internal speech and noise levels.
    107   void UpdateLevels(float rms);
    108 
    109   // Returns the number of frames (or frame number) corresponding to
    110   // the 'time' (in seconds).
    111   int TimeToFrame(float time) const;
    112 
    113   EpStatus status_;  // The current state of this instance.
    114   float offset_confirm_dur_sec_;  // max on time allowed to confirm POST_SPEECH
    115   int64 endpointer_time_us_;  // Time of the most recently received audio frame.
    116   int64 fast_update_frames_; // Number of frames for initial level adaptation.
    117   int64 frame_counter_;  // Number of frames seen. Used for initial adaptation.
    118   float max_window_dur_;  // Largest search window size (seconds)
    119   float sample_rate_;  // Sampling rate.
    120 
    121   // Ring buffer holding the speech activity history; owned by this instance
          // through scoped_ptr.
    122   scoped_ptr<HistoryRing> history_;
    123 
    124   // Configuration parameters.
    125   EnergyEndpointerParams params_;
    126 
    127   // RMS which must be exceeded to conclude frame is speech.
    128   float decision_threshold_;
    129 
    130   // Flag to indicate that audio should be used to estimate environment, prior
    131   // to receiving user input.
    132   bool estimating_environment_;
    133 
    134   // Estimate of the background noise level. Used externally for UI feedback.
    135   float noise_level_;
    136 
    137   // An adaptive threshold used to update decision_threshold_ when appropriate.
    138   float rms_adapt_;
    139 
    140   // Start lag corresponds to the highest fundamental frequency.
          // NOTE(review): nothing else declared in this header refers to lags or
          // fundamental frequency; check whether start_lag_/end_lag_ are still used.
    141   int start_lag_;
    142 
    143   // End lag corresponds to the lowest fundamental frequency.
    144   int end_lag_;
    145 
    146   // Time when mode switched from environment estimation to user input. This
    147   // is used to time forced rejection of audio feedback contamination.
    148   int64 user_input_start_time_us_;
    149 
          // Disables copy construction and assignment for this class.
    150   DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer);
    151 };
    152 
    153 }  // namespace content
    154 
    155 #endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
    156