Home | History | Annotate | Download | only in speech
      1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_
      6 #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_
      7 
      8 #include "base/basictypes.h"
      9 #include "base/memory/scoped_ptr.h"
     10 #include "content/browser/speech/endpointer/endpointer.h"
     11 #include "content/browser/speech/speech_recognition_engine.h"
     12 #include "content/browser/speech/speech_recognizer.h"
     13 #include "content/public/common/speech_recognition_error.h"
     14 #include "content/public/common/speech_recognition_result.h"
     15 #include "media/audio/audio_input_controller.h"
     16 #include "media/audio/audio_logging.h"
     17 #include "net/url_request/url_request_context_getter.h"
     18 
        // Forward declarations of media types that this header only refers to
        // through pointers (see SetAudioManagerForTesting() and OnData() below).
     19 namespace media {
     20 class AudioBus;
     21 class AudioManager;
     22 }
     23 
     24 namespace content {
     25 
        // Forward declaration; this header only passes the listener by pointer
        // (see the SpeechRecognizerImpl constructor).
     26 class SpeechRecognitionEventListener;
     27 
     28 // Handles speech recognition for a session (identified by |session_id|), taking
     29 // care of audio capture, silence detection/endpointer and interaction with the
     30 // SpeechRecognitionEngine.
        //
        // Besides the SpeechRecognizer interface, this class is also a
        // media::AudioInputController::EventHandler and a
        // SpeechRecognitionEngineDelegate, so it receives callbacks from both the
        // audio controller and the recognition engine.
        // NOTE(review): those callbacks are presumably translated into FSMEvent
        // values and routed through DispatchEvent() -- confirm in the .cc file.
     31 class CONTENT_EXPORT SpeechRecognizerImpl
     32     : public SpeechRecognizer,
     33       public media::AudioInputController::EventHandler,
     34       public NON_EXPORTED_BASE(SpeechRecognitionEngineDelegate) {
     35  public:
          // Audio format and timing constants. Only declared here; their values
          // are defined out of line and are not visible in this header.
          // NOTE(review): the "Ms" suffix suggests milliseconds -- confirm.
     36   static const int kAudioSampleRate;
     37   static const media::ChannelLayout kChannelLayout;
     38   static const int kNumBitsPerAudioSample;
     39   static const int kNoSpeechTimeoutMs;
     40   static const int kEndpointerEstimationTimeMs;
     41 
          // Test-only hook taking a media::AudioManager.
          // NOTE(review): presumably stores |audio_manager| in
          // audio_manager_for_tests_ -- confirm in the .cc file.
     42   static void SetAudioManagerForTesting(media::AudioManager* audio_manager);
     43 
          // NOTE(review): |engine| is passed as a raw pointer while
          // recognition_engine_ is a scoped_ptr, so ownership is presumably
          // transferred to this object -- confirm in the .cc file.
     44   SpeechRecognizerImpl(SpeechRecognitionEventListener* listener,
     45                        int session_id,
     46                        bool continuous,
     47                        bool provisional_results,
     48                        SpeechRecognitionEngine* engine);
     49 
          // SpeechRecognizer methods.
     50   virtual void StartRecognition(const std::string& device_id) OVERRIDE;
     51   virtual void AbortRecognition() OVERRIDE;
     52   virtual void StopAudioCapture() OVERRIDE;
     53   virtual bool IsActive() const OVERRIDE;
     54   virtual bool IsCapturingAudio() const OVERRIDE;
          // Read-only access to the recognition engine.
     55   const SpeechRecognitionEngine& recognition_engine() const;
     56 
     57  private:
          // Lets the unit test fixture reach private members.
     58   friend class SpeechRecognizerTest;
     59 
          // States of the recognizer FSM. STATE_MAX_VALUE is an alias of the last
          // enumerator (STATE_ENDED), not an additional state.
     60   enum FSMState {
     61     STATE_IDLE = 0,
     62     STATE_STARTING,
     63     STATE_ESTIMATING_ENVIRONMENT,
     64     STATE_WAITING_FOR_SPEECH,
     65     STATE_RECOGNIZING,
     66     STATE_WAITING_FINAL_RESULT,
     67     STATE_ENDED,
     68     STATE_MAX_VALUE = STATE_ENDED
     69   };
     70 
          // Events accepted by the recognizer FSM. EVENT_MAX_VALUE is an alias of
          // the last enumerator (EVENT_AUDIO_ERROR), not an additional event.
     71   enum FSMEvent {
     72     EVENT_ABORT = 0,
     73     EVENT_START,
     74     EVENT_STOP_CAPTURE,
     75     EVENT_AUDIO_DATA,
     76     EVENT_ENGINE_RESULT,
     77     EVENT_ENGINE_ERROR,
     78     EVENT_AUDIO_ERROR,
     79     EVENT_MAX_VALUE = EVENT_AUDIO_ERROR
     80   };
     81 
          // Argument bundle handed to DispatchEvent() and to the transition
          // handlers: the event itself plus the payload fields below.
          // NOTE(review): which payload fields are meaningful presumably depends
          // on |event| -- confirm in the .cc file.
     82   struct FSMEventArgs {
     83     explicit FSMEventArgs(FSMEvent event_value);
     84     ~FSMEventArgs();
     85 
     86     FSMEvent event;
     87     scoped_refptr<AudioChunk> audio_data;
     88     SpeechRecognitionResults engine_results;
     89     SpeechRecognitionError engine_error;
     90   };
     91 
          // The destructor is private, so code outside this class (and its
          // friends) cannot destroy an instance directly.
          // NOTE(review): lifetime is presumably managed by ref-counting in the
          // SpeechRecognizer base -- confirm against speech_recognizer.h.
     92   virtual ~SpeechRecognizerImpl();
     93 
     94   // Entry point for pushing any new external event into the recognizer FSM.
     95   void DispatchEvent(const FSMEventArgs& event_args);
     96 
     97   // Defines the behavior of the recognizer FSM, selecting the appropriate
     98   // transition according to the current state and event.
     99   FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& args);
    100 
    101   // Process a new audio chunk in the audio pipeline (endpointer, vumeter, etc).
    102   void ProcessAudioPipeline(const AudioChunk& raw_audio);
    103 
    104   // The methods below handle transitions of the recognizer FSM.
          // Each one returns an FSMState.
          // NOTE(review): presumably the state the FSM should move to next (cf.
          // ExecuteTransitionAndGetNextState()) -- confirm in the .cc file.
    105   FSMState StartRecording(const FSMEventArgs& event_args);
    106   FSMState StartRecognitionEngine(const FSMEventArgs& event_args);
    107   FSMState WaitEnvironmentEstimationCompletion(const FSMEventArgs& event_args);
    108   FSMState DetectUserSpeechOrTimeout(const FSMEventArgs& event_args);
    109   FSMState StopCaptureAndWaitForResult(const FSMEventArgs& event_args);
    110   FSMState ProcessIntermediateResult(const FSMEventArgs& event_args);
    111   FSMState ProcessFinalResult(const FSMEventArgs& event_args);
    112   FSMState AbortSilently(const FSMEventArgs& event_args);
    113   FSMState AbortWithError(const FSMEventArgs& event_args);
          // Unlike its siblings, takes the error directly rather than an
          // FSMEventArgs.
    114   FSMState Abort(const SpeechRecognitionError& error);
    115   FSMState DetectEndOfSpeech(const FSMEventArgs& event_args);
          // The only transition handler declared const.
    116   FSMState DoNothing(const FSMEventArgs& event_args) const;
    117   FSMState NotFeasible(const FSMEventArgs& event_args);
    118 
    119   // Returns the time span of captured audio samples since the start of capture.
    120   int GetElapsedTimeMs() const;
    121 
    122   // Calculates the input volume to be displayed in the UI, triggering the
    123   // OnAudioLevelsChange event accordingly.
    124   void UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected);
    125 
    126   void CloseAudioControllerAsynchronously();
    127 
    128   // Callback called on IO thread by audio_controller->Close().
    129   void OnAudioClosed(media::AudioInputController*);
    130 
    131   // AudioInputController::EventHandler methods.
          // OnCreated(), OnRecording() and OnLog() have empty inline bodies; only
          // OnError() and OnData() are implemented out of line.
    132   virtual void OnCreated(media::AudioInputController* controller) OVERRIDE {}
    133   virtual void OnRecording(media::AudioInputController* controller) OVERRIDE {}
    134   virtual void OnError(media::AudioInputController* controller,
    135       media::AudioInputController::ErrorCode error_code) OVERRIDE;
    136   virtual void OnData(media::AudioInputController* controller,
    137                       const media::AudioBus* data) OVERRIDE;
    138   virtual void OnLog(media::AudioInputController* controller,
    139                      const std::string& message) OVERRIDE {}
    140 
    141   // SpeechRecognitionEngineDelegate methods.
    142   virtual void OnSpeechRecognitionEngineResults(
    143       const SpeechRecognitionResults& results) OVERRIDE;
    144   virtual void OnSpeechRecognitionEngineError(
    145       const SpeechRecognitionError& error) OVERRIDE;
    146 
          // Static, so shared by all instances of this class.
          // NOTE(review): presumably set through SetAudioManagerForTesting() --
          // confirm in the .cc file.
    147   static media::AudioManager* audio_manager_for_tests_;
    148 
          // Per-instance state. The engine and the audio log are uniquely owned
          // (scoped_ptr); the audio controller is held through a scoped_refptr.
    149   scoped_ptr<SpeechRecognitionEngine> recognition_engine_;
    150   Endpointer endpointer_;
    151   scoped_refptr<media::AudioInputController> audio_controller_;
    152   scoped_ptr<media::AudioLog> audio_log_;
    153   int num_samples_recorded_;
    154   float audio_level_;
    155   bool is_dispatching_event_;
    156   bool provisional_results_;
    157   FSMState state_;
    158   std::string device_id_;
    159 
          // Nested helper class; only forward-declared in this header.
    160   class OnDataConverter;
    161 
    162   // Converts data between native input format and a WebSpeech specific
    163   // output format.
    164   scoped_ptr<SpeechRecognizerImpl::OnDataConverter> audio_converter_;
    165 
    166   DISALLOW_COPY_AND_ASSIGN(SpeechRecognizerImpl);
    167 };
    168 
    169 }  // namespace content
    170 
    171 #endif  // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_
    172