// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_
#define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_

// NOTE(review): this header uses std::string, scoped_refptr and CONTENT_EXPORT
// but includes no header for them directly (e.g. <string>). Presumably they
// arrive transitively through the includes below -- confirm.
#include "base/basictypes.h"
#include "base/memory/scoped_ptr.h"
#include "content/browser/speech/endpointer/endpointer.h"
#include "content/browser/speech/speech_recognition_engine.h"
#include "content/browser/speech/speech_recognizer.h"
#include "content/public/common/speech_recognition_error.h"
#include "content/public/common/speech_recognition_result.h"
#include "media/audio/audio_input_controller.h"
#include "media/audio/audio_logging.h"
#include "net/url_request/url_request_context_getter.h"

// Forward declarations: these media types only appear behind pointers in this
// header.
namespace media {
class AudioBus;
class AudioManager;
}  // namespace media

namespace content {

// Forward declaration: only used as a pointer type in this header.
class SpeechRecognitionEventListener;

// Handles speech recognition for a session (identified by |session_id|), taking
// care of audio capture, silence detection/endpointer and interaction with the
// SpeechRecognitionEngine.
// The recognizer is organized as a finite state machine (FSM): inputs are
// expressed as FSMEvent values, carried in an FSMEventArgs, and pushed into
// the machine through DispatchEvent().
class CONTENT_EXPORT SpeechRecognizerImpl
    : public SpeechRecognizer,
      public media::AudioInputController::EventHandler,
      public NON_EXPORTED_BASE(SpeechRecognitionEngineDelegate) {
 public:
  // Static configuration constants. Only declared here; their values are
  // defined outside this header.
  static const int kAudioSampleRate;
  static const media::ChannelLayout kChannelLayout;
  static const int kNumBitsPerAudioSample;
  static const int kNoSpeechTimeoutMs;
  static const int kEndpointerEstimationTimeMs;

  // Test hook. Presumably stores |audio_manager| in the static
  // |audio_manager_for_tests_| member declared below -- the definition is not
  // visible in this header.
  static void SetAudioManagerForTesting(media::AudioManager* audio_manager);

  // NOTE(review): |engine| is passed as a raw pointer while the
  // |recognition_engine_| member is a scoped_ptr; presumably the recognizer
  // takes ownership of |engine| -- confirm against the definition.
  SpeechRecognizerImpl(SpeechRecognitionEventListener* listener,
                       int session_id,
                       bool continuous,
                       bool provisional_results,
                       SpeechRecognitionEngine* engine);

  // Overridden virtuals (marked OVERRIDE); presumably declared by
  // SpeechRecognizer, since the overrides of the other two bases are grouped
  // separately further down.
  virtual void StartRecognition(const std::string& device_id) OVERRIDE;
  virtual void AbortRecognition() OVERRIDE;
  virtual void StopAudioCapture() OVERRIDE;
  virtual bool IsActive() const OVERRIDE;
  virtual bool IsCapturingAudio() const OVERRIDE;

  // Read-only accessor returning a reference to a SpeechRecognitionEngine
  // (presumably the one held in |recognition_engine_|).
  const SpeechRecognitionEngine& recognition_engine() const;

 private:
  friend class SpeechRecognizerTest;

  // States of the recognizer FSM. STATE_MAX_VALUE is an alias of the last
  // enumerator (STATE_ENDED), not an additional state.
  enum FSMState {
    STATE_IDLE = 0,
    STATE_STARTING,
    STATE_ESTIMATING_ENVIRONMENT,
    STATE_WAITING_FOR_SPEECH,
    STATE_RECOGNIZING,
    STATE_WAITING_FINAL_RESULT,
    STATE_ENDED,
    STATE_MAX_VALUE = STATE_ENDED
  };

  // Events accepted by the recognizer FSM. EVENT_MAX_VALUE is an alias of the
  // last enumerator (EVENT_AUDIO_ERROR), not an additional event.
  enum FSMEvent {
    EVENT_ABORT = 0,
    EVENT_START,
    EVENT_STOP_CAPTURE,
    EVENT_AUDIO_DATA,
    EVENT_ENGINE_RESULT,
    EVENT_ENGINE_ERROR,
    EVENT_AUDIO_ERROR,
    EVENT_MAX_VALUE = EVENT_AUDIO_ERROR
  };

  // Argument bundle handed to the FSM together with each event.
  struct FSMEventArgs {
    // Explicit, so a bare FSMEvent never converts implicitly to FSMEventArgs.
    explicit FSMEventArgs(FSMEvent event_value);
    ~FSMEventArgs();

    // The event being delivered.
    FSMEvent event;
    // Payload fields. Presumably only the one matching |event| is meaningful
    // for a given dispatch -- confirm against the dispatching code.
    scoped_refptr<AudioChunk> audio_data;
    SpeechRecognitionResults engine_results;
    SpeechRecognitionError engine_error;
  };

  // Private destructor: instances cannot be destroyed directly by outside
  // code. NOTE(review): presumably lifetime is managed through a base class
  // (e.g. reference counting) -- confirm in SpeechRecognizer.
  virtual ~SpeechRecognizerImpl();

  // Entry point for pushing any new external event into the recognizer FSM.
  void DispatchEvent(const FSMEventArgs& event_args);

  // Defines the behavior of the recognizer FSM, selecting the appropriate
  // transition according to the current state and event.
  FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& args);

  // Process a new audio chunk in the audio pipeline (endpointer, vumeter, etc).
  void ProcessAudioPipeline(const AudioChunk& raw_audio);

  // The methods below handle transitions of the recognizer FSM.
  // Each returns an FSMState, presumably the state the machine moves to after
  // the transition. All but Abort() receive the FSMEventArgs of the event;
  // Abort() receives the SpeechRecognitionError directly.
  FSMState StartRecording(const FSMEventArgs& event_args);
  FSMState StartRecognitionEngine(const FSMEventArgs& event_args);
  FSMState WaitEnvironmentEstimationCompletion(const FSMEventArgs& event_args);
  FSMState DetectUserSpeechOrTimeout(const FSMEventArgs& event_args);
  FSMState StopCaptureAndWaitForResult(const FSMEventArgs& event_args);
  FSMState ProcessIntermediateResult(const FSMEventArgs& event_args);
  FSMState ProcessFinalResult(const FSMEventArgs& event_args);
  FSMState AbortSilently(const FSMEventArgs& event_args);
  FSMState AbortWithError(const FSMEventArgs& event_args);
  FSMState Abort(const SpeechRecognitionError& error);
  FSMState DetectEndOfSpeech(const FSMEventArgs& event_args);
  // The only transition handler declared const.
  FSMState DoNothing(const FSMEventArgs& event_args) const;
  FSMState NotFeasible(const FSMEventArgs& event_args);

  // Returns the time span of captured audio samples since the start of capture.
  int GetElapsedTimeMs() const;

  // Calculates the input volume to be displayed in the UI, triggering the
  // OnAudioLevelsChange event accordingly.
  // NOTE(review): |rms| is a float taken by const reference; by-value would be
  // the usual choice, but the signature has to match the out-of-line
  // definition, so it is left untouched here.
  void UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected);

  void CloseAudioControllerAsynchronously();

  // Callback called on IO thread by audio_controller->Close().
  void OnAudioClosed(media::AudioInputController*);

  // AudioInputController::EventHandler methods.
  // OnCreated, OnRecording and OnLog are implemented inline with empty bodies;
  // OnError and OnData are defined out of line.
  virtual void OnCreated(media::AudioInputController* controller) OVERRIDE {}
  virtual void OnRecording(media::AudioInputController* controller) OVERRIDE {}
  virtual void OnError(media::AudioInputController* controller,
      media::AudioInputController::ErrorCode error_code) OVERRIDE;
  virtual void OnData(media::AudioInputController* controller,
                      const media::AudioBus* data) OVERRIDE;
  virtual void OnLog(media::AudioInputController* controller,
                     const std::string& message) OVERRIDE {}

  // SpeechRecognitionEngineDelegate methods.
  virtual void OnSpeechRecognitionEngineResults(
      const SpeechRecognitionResults& results) OVERRIDE;
  virtual void OnSpeechRecognitionEngineError(
      const SpeechRecognitionError& error) OVERRIDE;

  // Static (shared by all instances); see SetAudioManagerForTesting().
  static media::AudioManager* audio_manager_for_tests_;

  // Per-instance state. The declaration order below also fixes the member
  // initialization order, so it should not be rearranged casually. The exact
  // meaning of each field is established by the out-of-line definitions, which
  // are not visible in this header.
  scoped_ptr<SpeechRecognitionEngine> recognition_engine_;
  Endpointer endpointer_;
  scoped_refptr<media::AudioInputController> audio_controller_;
  scoped_ptr<media::AudioLog> audio_log_;
  int num_samples_recorded_;
  float audio_level_;
  bool is_dispatching_event_;
  bool provisional_results_;
  FSMState state_;
  std::string device_id_;

  // Nested helper type; only forward-declared here and defined elsewhere.
  class OnDataConverter;

  // Converts data between native input format and a WebSpeech specific
  // output format.
  scoped_ptr<SpeechRecognizerImpl::OnDataConverter> audio_converter_;

  DISALLOW_COPY_AND_ASSIGN(SpeechRecognizerImpl);
};

}  // namespace content

#endif  // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_