1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_ 6 #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_ 7 8 #include "base/basictypes.h" 9 #include "base/memory/scoped_ptr.h" 10 #include "content/browser/speech/endpointer/endpointer.h" 11 #include "content/browser/speech/speech_recognition_engine.h" 12 #include "content/browser/speech/speech_recognizer.h" 13 #include "content/public/common/speech_recognition_error.h" 14 #include "content/public/common/speech_recognition_result.h" 15 #include "media/audio/audio_input_controller.h" 16 #include "net/url_request/url_request_context_getter.h" 17 18 namespace media { 19 class AudioManager; 20 } 21 22 namespace content { 23 24 class SpeechRecognitionEventListener; 25 26 // Handles speech recognition for a session (identified by |session_id|), taking 27 // care of audio capture, silence detection/endpointer and interaction with the 28 // SpeechRecognitionEngine. 29 class CONTENT_EXPORT SpeechRecognizerImpl 30 : public SpeechRecognizer, 31 public media::AudioInputController::EventHandler, 32 public NON_EXPORTED_BASE(SpeechRecognitionEngineDelegate) { 33 public: 34 static const int kAudioSampleRate; 35 static const media::ChannelLayout kChannelLayout; 36 static const int kNumBitsPerAudioSample; 37 static const int kNoSpeechTimeoutMs; 38 static const int kEndpointerEstimationTimeMs; 39 40 static void SetAudioManagerForTests(media::AudioManager* audio_manager); 41 42 SpeechRecognizerImpl(SpeechRecognitionEventListener* listener, 43 int session_id, 44 bool is_single_shot, 45 SpeechRecognitionEngine* engine); 46 47 virtual void StartRecognition(const std::string& device_id) OVERRIDE; 48 virtual void AbortRecognition() OVERRIDE; 49 virtual void StopAudioCapture() OVERRIDE; 50 virtual bool IsActive() const OVERRIDE; 51 virtual bool IsCapturingAudio() const OVERRIDE; 52 const SpeechRecognitionEngine& recognition_engine() const; 53 54 private: 55 friend class SpeechRecognizerTest; 56 57 enum FSMState { 58 STATE_IDLE = 0, 59 STATE_STARTING, 60 STATE_ESTIMATING_ENVIRONMENT, 61 STATE_WAITING_FOR_SPEECH, 62 STATE_RECOGNIZING, 63 STATE_WAITING_FINAL_RESULT, 64 STATE_ENDED, 65 STATE_MAX_VALUE = STATE_ENDED 66 }; 67 68 enum FSMEvent { 69 EVENT_ABORT = 0, 70 EVENT_START, 71 EVENT_STOP_CAPTURE, 72 EVENT_AUDIO_DATA, 73 EVENT_ENGINE_RESULT, 74 EVENT_ENGINE_ERROR, 75 EVENT_AUDIO_ERROR, 76 EVENT_MAX_VALUE = EVENT_AUDIO_ERROR 77 }; 78 79 struct FSMEventArgs { 80 explicit FSMEventArgs(FSMEvent event_value); 81 ~FSMEventArgs(); 82 83 FSMEvent event; 84 scoped_refptr<AudioChunk> audio_data; 85 SpeechRecognitionResults engine_results; 86 SpeechRecognitionError engine_error; 87 }; 88 89 virtual ~SpeechRecognizerImpl(); 90 91 // Entry point for pushing any new external event into the recognizer FSM. 92 void DispatchEvent(const FSMEventArgs& event_args); 93 94 // Defines the behavior of the recognizer FSM, selecting the appropriate 95 // transition according to the current state and event. 96 FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& args); 97 98 // Process a new audio chunk in the audio pipeline (endpointer, vumeter, etc). 99 void ProcessAudioPipeline(const AudioChunk& raw_audio); 100 101 // The methods below handle transitions of the recognizer FSM. 102 FSMState StartRecording(const FSMEventArgs& event_args); 103 FSMState StartRecognitionEngine(const FSMEventArgs& event_args); 104 FSMState WaitEnvironmentEstimationCompletion(const FSMEventArgs& event_args); 105 FSMState DetectUserSpeechOrTimeout(const FSMEventArgs& event_args); 106 FSMState StopCaptureAndWaitForResult(const FSMEventArgs& event_args); 107 FSMState ProcessIntermediateResult(const FSMEventArgs& event_args); 108 FSMState ProcessFinalResult(const FSMEventArgs& event_args); 109 FSMState AbortSilently(const FSMEventArgs& event_args); 110 FSMState AbortWithError(const FSMEventArgs& event_args); 111 FSMState Abort(const SpeechRecognitionError& error); 112 FSMState DetectEndOfSpeech(const FSMEventArgs& event_args); 113 FSMState DoNothing(const FSMEventArgs& event_args) const; 114 FSMState NotFeasible(const FSMEventArgs& event_args); 115 116 // Returns the time span of captured audio samples since the start of capture. 117 int GetElapsedTimeMs() const; 118 119 // Calculates the input volume to be displayed in the UI, triggering the 120 // OnAudioLevelsChange event accordingly. 121 void UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected); 122 123 void CloseAudioControllerAsynchronously(); 124 125 // Callback called on IO thread by audio_controller->Close(). 126 void OnAudioClosed(media::AudioInputController*); 127 128 // AudioInputController::EventHandler methods. 129 virtual void OnCreated(media::AudioInputController* controller) OVERRIDE {} 130 virtual void OnRecording(media::AudioInputController* controller) OVERRIDE {} 131 virtual void OnError(media::AudioInputController* controller) OVERRIDE; 132 virtual void OnData(media::AudioInputController* controller, 133 const uint8* data, uint32 size) OVERRIDE; 134 135 // SpeechRecognitionEngineDelegate methods. 136 virtual void OnSpeechRecognitionEngineResults( 137 const SpeechRecognitionResults& results) OVERRIDE; 138 virtual void OnSpeechRecognitionEngineError( 139 const SpeechRecognitionError& error) OVERRIDE; 140 141 static media::AudioManager* audio_manager_for_tests_; 142 143 scoped_ptr<SpeechRecognitionEngine> recognition_engine_; 144 Endpointer endpointer_; 145 scoped_refptr<media::AudioInputController> audio_controller_; 146 int num_samples_recorded_; 147 float audio_level_; 148 bool is_dispatching_event_; 149 bool is_single_shot_; 150 FSMState state_; 151 std::string device_id_; 152 153 class OnDataConverter; 154 155 // Converts data between native input format and a WebSpeech specific 156 // output format. 157 scoped_ptr<SpeechRecognizerImpl::OnDataConverter> audio_converter_; 158 159 DISALLOW_COPY_AND_ASSIGN(SpeechRecognizerImpl); 160 }; 161 162 } // namespace content 163 164 #endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_ 165