// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_

#include "base/basictypes.h"
#include "content/browser/speech/endpointer/energy_endpointer.h"
#include "content/common/content_export.h"

class EpStatus;

namespace content {

class AudioChunk;

// A simple interface to the underlying energy-endpointer implementation. This
// class lets callers provide audio as it is being recorded and lets them poll
// to find when the user has stopped speaking.
//
// There are two events that may trigger the end of speech:
//
// speechInputPossiblyComplete event:
//
// Signals that silence/noise has been detected for a *short* amount of
// time after some speech has been detected. It can be used for low latency
// UI feedback. To disable it, set it to a large amount.
//
// speechInputComplete event:
//
// This event is intended to signal end of input and to stop recording.
// The amount of time to wait after speech is set by
// speech_input_complete_silence_length_us_ and optionally two other
// parameters (see below).
// This time can be held constant, or can change as more speech is detected.
// In the latter case, the time changes after a set amount of time from the
// *beginning* of speech. This is motivated by the expectation that there
// will be two distinct types of inputs: short search queries and longer
// dictation style input.
//
// Three parameters are used to define the piecewise constant timeout function.
// The timeout length is speech_input_complete_silence_length until
// long_speech_length, when it changes to
// long_speech_input_complete_silence_length.
46 class CONTENT_EXPORT Endpointer { 47 public: 48 explicit Endpointer(int sample_rate); 49 50 // Start the endpointer. This should be called at the beginning of a session. 51 void StartSession(); 52 53 // Stop the endpointer. 54 void EndSession(); 55 56 // Start environment estimation. Audio will be used for environment estimation 57 // i.e. noise level estimation. 58 void SetEnvironmentEstimationMode(); 59 60 // Start user input. This should be called when the user indicates start of 61 // input, e.g. by pressing a button. 62 void SetUserInputMode(); 63 64 // Process a segment of audio, which may be more than one frame. 65 // The status of the last frame will be returned. 66 EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out); 67 68 // Get the status of the endpointer. 69 EpStatus Status(int64 *time_us); 70 71 // Returns true if the endpointer detected reasonable audio levels above 72 // background noise which could be user speech, false if not. 73 bool DidStartReceivingSpeech() const { 74 return speech_previously_detected_; 75 } 76 77 bool IsEstimatingEnvironment() const { 78 return energy_endpointer_.estimating_environment(); 79 } 80 81 void set_speech_input_complete_silence_length(int64 time_us) { 82 speech_input_complete_silence_length_us_ = time_us; 83 } 84 85 void set_long_speech_input_complete_silence_length(int64 time_us) { 86 long_speech_input_complete_silence_length_us_ = time_us; 87 } 88 89 void set_speech_input_possibly_complete_silence_length(int64 time_us) { 90 speech_input_possibly_complete_silence_length_us_ = time_us; 91 } 92 93 void set_long_speech_length(int64 time_us) { 94 long_speech_length_us_ = time_us; 95 } 96 97 bool speech_input_complete() const { 98 return speech_input_complete_; 99 } 100 101 // RMS background noise level in dB. 102 float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); } 103 104 private: 105 // Reset internal states. 
Helper method common to initial input utterance 106 // and following input utternaces. 107 void Reset(); 108 109 // Minimum allowable length of speech input. 110 int64 speech_input_minimum_length_us_; 111 112 // The speechInputPossiblyComplete event signals that silence/noise has been 113 // detected for a *short* amount of time after some speech has been detected. 114 // This proporty specifies the time period. 115 int64 speech_input_possibly_complete_silence_length_us_; 116 117 // The speechInputComplete event signals that silence/noise has been 118 // detected for a *long* amount of time after some speech has been detected. 119 // This property specifies the time period. 120 int64 speech_input_complete_silence_length_us_; 121 122 // Same as above, this specifies the required silence period after speech 123 // detection. This period is used instead of 124 // speech_input_complete_silence_length_ when the utterance is longer than 125 // long_speech_length_. This parameter is optional. 126 int64 long_speech_input_complete_silence_length_us_; 127 128 // The period of time after which the endpointer should consider 129 // long_speech_input_complete_silence_length_ as a valid silence period 130 // instead of speech_input_complete_silence_length_. This parameter is 131 // optional. 132 int64 long_speech_length_us_; 133 134 // First speech onset time, used in determination of speech complete timeout. 135 int64 speech_start_time_us_; 136 137 // Most recent end time, used in determination of speech complete timeout. 138 int64 speech_end_time_us_; 139 140 int64 audio_frame_time_us_; 141 EpStatus old_ep_status_; 142 bool waiting_for_speech_possibly_complete_timeout_; 143 bool waiting_for_speech_complete_timeout_; 144 bool speech_previously_detected_; 145 bool speech_input_complete_; 146 EnergyEndpointer energy_endpointer_; 147 int sample_rate_; 148 int32 frame_size_; 149 }; 150 151 } // namespace content 152 153 #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ 154