// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// The EnergyEndpointer class finds likely speech onset and offset points.
//
// The implementation described here is about the simplest possible.
// It is based on timings of threshold crossings for overall signal
// RMS. It is suitable for lightweight applications.
//
// As written, the basic idea is that one specifies intervals that
// must be occupied by super- and sub-threshold energy levels, and
// defers decisions regarding onset and offset times until these
// specifications have been met. Three basic intervals are tested: an
// onset window, a speech-on window, and an offset window. We require
// super-threshold to exceed some minimum total durations in the onset
// and speech-on windows before declaring the speech onset time, and
// we specify a required sub-threshold residency in the offset window
// before declaring speech offset. As the various residency requirements are
// met, the EnergyEndpointer instance assumes various states, and can return the
// ID of these states to the client (see EpStatus below).
//
// The levels of the speech and background noise are continuously updated. It is
// important that the background noise level be estimated initially for
// robustness in noisy conditions. The first frames are assumed to be background
// noise and a fast update rate is used for the noise level. The duration for
// fast update is controlled by the fast_update_dur_ parameter.
//
// If used in noisy conditions, the endpointer should be started and run in the
// EnvironmentEstimation mode, for at least 200ms, before switching to
// UserInputMode.
// Audio feedback contamination can appear in the input audio, if not cut
// out or handled by echo cancellation.
Audio feedback can trigger a false 34 // accept. The false accepts can be ignored by setting 35 // ep_contamination_rejection_period. 36 37 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ 38 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ 39 40 #include <vector> 41 42 #include "base/basictypes.h" 43 #include "base/memory/scoped_ptr.h" 44 #include "content/browser/speech/endpointer/energy_endpointer_params.h" 45 #include "content/common/content_export.h" 46 47 namespace content { 48 49 // Endpointer status codes 50 enum EpStatus { 51 EP_PRE_SPEECH = 10, 52 EP_POSSIBLE_ONSET, 53 EP_SPEECH_PRESENT, 54 EP_POSSIBLE_OFFSET, 55 EP_POST_SPEECH, 56 }; 57 58 class CONTENT_EXPORT EnergyEndpointer { 59 public: 60 // The default construction MUST be followed by Init(), before any 61 // other use can be made of the instance. 62 EnergyEndpointer(); 63 virtual ~EnergyEndpointer(); 64 65 void Init(const EnergyEndpointerParams& params); 66 67 // Start the endpointer. This should be called at the beginning of a session. 68 void StartSession(); 69 70 // Stop the endpointer. 71 void EndSession(); 72 73 // Start environment estimation. Audio will be used for environment estimation 74 // i.e. noise level estimation. 75 void SetEnvironmentEstimationMode(); 76 77 // Start user input. This should be called when the user indicates start of 78 // input, e.g. by pressing a button. 79 void SetUserInputMode(); 80 81 // Computes the next input frame and modifies EnergyEndpointer status as 82 // appropriate based on the computation. 83 void ProcessAudioFrame(int64 time_us, 84 const int16* samples, int num_samples, 85 float* rms_out); 86 87 // Returns the current state of the EnergyEndpointer and the time 88 // corresponding to the most recently computed frame. 89 EpStatus Status(int64* status_time_us) const; 90 91 bool estimating_environment() const { 92 return estimating_environment_; 93 } 94 95 // Returns estimated noise level in dB. 
96 float GetNoiseLevelDb() const; 97 98 private: 99 class HistoryRing; 100 101 // Resets the endpointer internal state. If reset_threshold is true, the 102 // state will be reset completely, including adaptive thresholds and the 103 // removal of all history information. 104 void Restart(bool reset_threshold); 105 106 // Update internal speech and noise levels. 107 void UpdateLevels(float rms); 108 109 // Returns the number of frames (or frame number) corresponding to 110 // the 'time' (in seconds). 111 int TimeToFrame(float time) const; 112 113 EpStatus status_; // The current state of this instance. 114 float offset_confirm_dur_sec_; // max on time allowed to confirm POST_SPEECH 115 int64 endpointer_time_us_; // Time of the most recently received audio frame. 116 int64 fast_update_frames_; // Number of frames for initial level adaptation. 117 int64 frame_counter_; // Number of frames seen. Used for initial adaptation. 118 float max_window_dur_; // Largest search window size (seconds) 119 float sample_rate_; // Sampling rate. 120 121 // Ring buffers to hold the speech activity history. 122 scoped_ptr<HistoryRing> history_; 123 124 // Configuration parameters. 125 EnergyEndpointerParams params_; 126 127 // RMS which must be exceeded to conclude frame is speech. 128 float decision_threshold_; 129 130 // Flag to indicate that audio should be used to estimate environment, prior 131 // to receiving user input. 132 bool estimating_environment_; 133 134 // Estimate of the background noise level. Used externally for UI feedback. 135 float noise_level_; 136 137 // An adaptive threshold used to update decision_threshold_ when appropriate. 138 float rms_adapt_; 139 140 // Start lag corresponds to the highest fundamental frequency. 141 int start_lag_; 142 143 // End lag corresponds to the lowest fundamental frequency. 144 int end_lag_; 145 146 // Time when mode switched from environment estimation to user input. 
This 147 // is used to time forced rejection of audio feedback contamination. 148 int64 user_input_start_time_us_; 149 150 DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer); 151 }; 152 153 } // namespace content 154 155 #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ 156