// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_
#define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_

#include <string>

#include "base/basictypes.h"
#include "base/memory/scoped_ptr.h"
#include "content/browser/speech/endpointer/endpointer.h"
#include "content/browser/speech/speech_recognition_engine.h"
#include "content/browser/speech/speech_recognizer.h"
#include "content/public/common/speech_recognition_error.h"
#include "content/public/common/speech_recognition_result.h"
#include "media/audio/audio_input_controller.h"
#include "net/url_request/url_request_context_getter.h"

namespace media {
class AudioBus;
class AudioManager;
}

namespace content {

class SpeechRecognitionEventListener;

// Handles speech recognition for a session (identified by |session_id|), taking
// care of audio capture, silence detection/endpointer and interaction with the
// SpeechRecognitionEngine.
30 class CONTENT_EXPORT SpeechRecognizerImpl 31 : public SpeechRecognizer, 32 public media::AudioInputController::EventHandler, 33 public NON_EXPORTED_BASE(SpeechRecognitionEngineDelegate) { 34 public: 35 static const int kAudioSampleRate; 36 static const media::ChannelLayout kChannelLayout; 37 static const int kNumBitsPerAudioSample; 38 static const int kNoSpeechTimeoutMs; 39 static const int kEndpointerEstimationTimeMs; 40 41 static void SetAudioManagerForTesting(media::AudioManager* audio_manager); 42 43 SpeechRecognizerImpl(SpeechRecognitionEventListener* listener, 44 int session_id, 45 bool continuous, 46 bool provisional_results, 47 SpeechRecognitionEngine* engine); 48 49 virtual void StartRecognition(const std::string& device_id) OVERRIDE; 50 virtual void AbortRecognition() OVERRIDE; 51 virtual void StopAudioCapture() OVERRIDE; 52 virtual bool IsActive() const OVERRIDE; 53 virtual bool IsCapturingAudio() const OVERRIDE; 54 const SpeechRecognitionEngine& recognition_engine() const; 55 56 private: 57 friend class SpeechRecognizerTest; 58 59 enum FSMState { 60 STATE_IDLE = 0, 61 STATE_STARTING, 62 STATE_ESTIMATING_ENVIRONMENT, 63 STATE_WAITING_FOR_SPEECH, 64 STATE_RECOGNIZING, 65 STATE_WAITING_FINAL_RESULT, 66 STATE_ENDED, 67 STATE_MAX_VALUE = STATE_ENDED 68 }; 69 70 enum FSMEvent { 71 EVENT_ABORT = 0, 72 EVENT_START, 73 EVENT_STOP_CAPTURE, 74 EVENT_AUDIO_DATA, 75 EVENT_ENGINE_RESULT, 76 EVENT_ENGINE_ERROR, 77 EVENT_AUDIO_ERROR, 78 EVENT_MAX_VALUE = EVENT_AUDIO_ERROR 79 }; 80 81 struct FSMEventArgs { 82 explicit FSMEventArgs(FSMEvent event_value); 83 ~FSMEventArgs(); 84 85 FSMEvent event; 86 scoped_refptr<AudioChunk> audio_data; 87 SpeechRecognitionResults engine_results; 88 SpeechRecognitionError engine_error; 89 }; 90 91 virtual ~SpeechRecognizerImpl(); 92 93 // Entry point for pushing any new external event into the recognizer FSM. 
94 void DispatchEvent(const FSMEventArgs& event_args); 95 96 // Defines the behavior of the recognizer FSM, selecting the appropriate 97 // transition according to the current state and event. 98 FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& args); 99 100 // Process a new audio chunk in the audio pipeline (endpointer, vumeter, etc). 101 void ProcessAudioPipeline(const AudioChunk& raw_audio); 102 103 // The methods below handle transitions of the recognizer FSM. 104 FSMState StartRecording(const FSMEventArgs& event_args); 105 FSMState StartRecognitionEngine(const FSMEventArgs& event_args); 106 FSMState WaitEnvironmentEstimationCompletion(const FSMEventArgs& event_args); 107 FSMState DetectUserSpeechOrTimeout(const FSMEventArgs& event_args); 108 FSMState StopCaptureAndWaitForResult(const FSMEventArgs& event_args); 109 FSMState ProcessIntermediateResult(const FSMEventArgs& event_args); 110 FSMState ProcessFinalResult(const FSMEventArgs& event_args); 111 FSMState AbortSilently(const FSMEventArgs& event_args); 112 FSMState AbortWithError(const FSMEventArgs& event_args); 113 FSMState Abort(const SpeechRecognitionError& error); 114 FSMState DetectEndOfSpeech(const FSMEventArgs& event_args); 115 FSMState DoNothing(const FSMEventArgs& event_args) const; 116 FSMState NotFeasible(const FSMEventArgs& event_args); 117 118 // Returns the time span of captured audio samples since the start of capture. 119 int GetElapsedTimeMs() const; 120 121 // Calculates the input volume to be displayed in the UI, triggering the 122 // OnAudioLevelsChange event accordingly. 123 void UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected); 124 125 void CloseAudioControllerAsynchronously(); 126 127 // Callback called on IO thread by audio_controller->Close(). 128 void OnAudioClosed(media::AudioInputController*); 129 130 // AudioInputController::EventHandler methods. 
131 virtual void OnCreated(media::AudioInputController* controller) OVERRIDE {} 132 virtual void OnRecording(media::AudioInputController* controller) OVERRIDE {} 133 virtual void OnError(media::AudioInputController* controller, 134 media::AudioInputController::ErrorCode error_code) OVERRIDE; 135 virtual void OnData(media::AudioInputController* controller, 136 const media::AudioBus* data) OVERRIDE; 137 virtual void OnLog(media::AudioInputController* controller, 138 const std::string& message) OVERRIDE {} 139 140 // SpeechRecognitionEngineDelegate methods. 141 virtual void OnSpeechRecognitionEngineResults( 142 const SpeechRecognitionResults& results) OVERRIDE; 143 virtual void OnSpeechRecognitionEngineError( 144 const SpeechRecognitionError& error) OVERRIDE; 145 146 static media::AudioManager* audio_manager_for_tests_; 147 148 scoped_ptr<SpeechRecognitionEngine> recognition_engine_; 149 Endpointer endpointer_; 150 scoped_refptr<media::AudioInputController> audio_controller_; 151 int num_samples_recorded_; 152 float audio_level_; 153 bool is_dispatching_event_; 154 bool provisional_results_; 155 FSMState state_; 156 std::string device_id_; 157 158 class OnDataConverter; 159 160 // Converts data between native input format and a WebSpeech specific 161 // output format. 162 scoped_ptr<SpeechRecognizerImpl::OnDataConverter> audio_converter_; 163 164 DISALLOW_COPY_AND_ASSIGN(SpeechRecognizerImpl); 165 }; 166 167 } // namespace content 168 169 #endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_ 170