Home | History | Annotate | Download | only in speech
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
      6 #define CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
      7 
      8 #include <string>
      9 #include <vector>
     10 
     11 #include "base/basictypes.h"
     12 #include "base/memory/ref_counted.h"
     13 #include "base/memory/scoped_ptr.h"
     14 #include "base/threading/non_thread_safe.h"
     15 #include "content/browser/speech/audio_encoder.h"
     16 #include "content/browser/speech/chunked_byte_buffer.h"
     17 #include "content/browser/speech/speech_recognition_engine.h"
     18 #include "content/common/content_export.h"
     19 #include "content/public/common/speech_recognition_error.h"
     20 #include "net/url_request/url_fetcher_delegate.h"
     21 
     22 namespace net {
     23 class URLRequestContextGetter;
     24 }
     25 
     26 namespace content {
     27 
     28 class AudioChunk;
     29 struct SpeechRecognitionError;
     30 struct SpeechRecognitionResult;
     31 
     32 // Implements a SpeechRecognitionEngine supporting continuous recognition by
     33 // means of interaction with Google streaming speech recognition webservice.
     34 // In more detail, this class establishes two HTTP(S) connections with the
     35 // webservice for each session, herein called "upstream" and "downstream".
     36 // Audio chunks are sent on the upstream by means of a chunked HTTP POST upload.
     37 // Recognition results are retrieved in a full-duplex fashion (i.e. while
     38 // pushing audio on the upstream) on the downstream by means of a chunked
     39 // HTTP GET request. Pairing between the two streams is handled through a
     40 // randomly generated key, unique for each request, which is passed in the
     41 // &pair= arg to both stream request URLs.
     42 // In the case of a regular session, the upstream is closed when the audio
     43 // capture ends (notified through an |AudioChunksEnded| call) and the downstream
     44 // waits for a corresponding server closure (some late results may still
     45 // arrive after the upstream has been closed).
     46 // Both streams are guaranteed to be closed when |EndRecognition| is called.
     47 class CONTENT_EXPORT GoogleStreamingRemoteEngine
     48     : public NON_EXPORTED_BASE(SpeechRecognitionEngine),
     49       public net::URLFetcherDelegate,
     50       public NON_EXPORTED_BASE(base::NonThreadSafe) {
     51  public:
          // NOTE(review): |context| is presumably retained in |url_context_| and
          // used for both fetchers -- confirm in the .cc file.
     52   explicit GoogleStreamingRemoteEngine(net::URLRequestContextGetter* context);
     53   virtual ~GoogleStreamingRemoteEngine();
     54 
     55   // SpeechRecognitionEngine methods.
     56   virtual void SetConfig(const SpeechRecognitionEngineConfig& config) OVERRIDE;
     57   virtual void StartRecognition() OVERRIDE;
     58   virtual void EndRecognition() OVERRIDE;
     59   virtual void TakeAudioChunk(const AudioChunk& data) OVERRIDE;
     60   virtual void AudioChunksEnded() OVERRIDE;
     61   virtual bool IsRecognitionPending() const OVERRIDE;
     62   virtual int GetDesiredAudioChunkDurationMs() const OVERRIDE;
     63 
     64   // net::URLFetcherDelegate methods.
          // NOTE(review): going by the |DispatchHTTPResponse| comment further
          // down, both callbacks are expected to forward to it -- confirm in the
          // .cc file.
     65   virtual void OnURLFetchComplete(const net::URLFetcher* source) OVERRIDE;
     66   virtual void OnURLFetchDownloadProgress(const net::URLFetcher* source,
     67                                           int64 current, int64 total) OVERRIDE;
     68 
     69  private:
          // Gives GoogleStreamingRemoteEngineTest access to the private members.
     70   friend class GoogleStreamingRemoteEngineTest;
     71 
     72   // IDs passed to URLFetcher::Create(). Used for testing.
          // Only declared here; the values are defined out of line.
     73   static const int kUpstreamUrlFetcherIdForTests;
     74   static const int kDownstreamUrlFetcherIdForTests;
     75 
     76   // Response status codes from the speech recognition webservice.
          // Only declared here; the values are defined out of line.
     77   static const int kWebserviceStatusNoError;
     78   static const int kWebserviceStatusErrorNoMatch;
     79 
     80   // Data types for the internal Finite State Machine (FSM).
          // States of the recognizer FSM. STATE_MAX_VALUE aliases the last state
          // and must be updated whenever a state is appended.
     81   enum FSMState {
     82     STATE_IDLE = 0,
     83     STATE_BOTH_STREAMS_CONNECTED,
     84     STATE_WAITING_DOWNSTREAM_RESULTS,
     85     STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS
     86   };
     87 
          // Events pushed into the FSM through |DispatchEvent|. EVENT_MAX_VALUE
          // aliases the last event and must be updated whenever an event is
          // appended.
     88   enum FSMEvent {
     89     EVENT_END_RECOGNITION = 0,
     90     EVENT_START_RECOGNITION,
     91     EVENT_AUDIO_CHUNK,
     92     EVENT_AUDIO_CHUNKS_ENDED,
     93     EVENT_UPSTREAM_ERROR,
     94     EVENT_DOWNSTREAM_ERROR,
     95     EVENT_DOWNSTREAM_RESPONSE,
     96     EVENT_DOWNSTREAM_CLOSED,
     97     EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED
     98   };
     99 
          // An event together with its optional payload. The type is
          // non-copyable, which is why every FSM method takes it by const
          // reference.
    100   struct FSMEventArgs {
    101     explicit FSMEventArgs(FSMEvent event_value);
    102     ~FSMEventArgs();
    103 
            // The event being dispatched; determines which payload field below
            // (if any) is meaningful.
    104     FSMEvent event;
    105 
    106     // In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|.
    107     scoped_refptr<const AudioChunk> audio_data;
    108 
    109     // In case of EVENT_DOWNSTREAM_RESPONSE, holds the current chunk bytes.
    110     scoped_ptr<std::vector<uint8> > response;
    111 
    112    private:
    113     DISALLOW_COPY_AND_ASSIGN(FSMEventArgs);
    114   };
    115 
    116   // Invoked by both upstream and downstream URLFetcher callbacks to handle
    117   // new chunk data, connection closed or errors notifications.
          // NOTE(review): |end_of_response| presumably distinguishes a completed
          // fetch from an intermediate progress notification -- confirm in the
          // .cc file.
    118   void DispatchHTTPResponse(const net::URLFetcher* source,
    119                             bool end_of_response);
    120 
    121   // Entry point for pushing any new external event into the recognizer FSM.
    122   void DispatchEvent(const FSMEventArgs& event_args);
    123 
    124   // Defines the behavior of the recognizer FSM, selecting the appropriate
    125   // transition according to the current state and event.
    126   FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args);
    127 
    128   // The methods below handle transitions of the recognizer FSM.
          // Each one returns an FSMState (presumably the state the FSM moves to
          // next -- confirm in the .cc file). |Abort| is the only one that takes
          // an error code instead of the event arguments.
    129   FSMState ConnectBothStreams(const FSMEventArgs& event_args);
    130   FSMState TransmitAudioUpstream(const FSMEventArgs& event_args);
    131   FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args);
    132   FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args);
    133   FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args);
    134   FSMState CloseDownstream(const FSMEventArgs& event_args);
    135   FSMState AbortSilently(const FSMEventArgs& event_args);
    136   FSMState AbortWithError(const FSMEventArgs& event_args);
    137   FSMState Abort(SpeechRecognitionErrorCode error);
    138   FSMState DoNothing(const FSMEventArgs& event_args);
    139   FSMState NotFeasible(const FSMEventArgs& event_args);
    140 
          // NOTE(review): presumably helpers for building the stream request
          // URLs; |GenerateRequestKey| likely produces the per-request "&pair="
          // key mentioned in the class comment -- confirm in the .cc file.
    141   std::string GetAcceptedLanguages() const;
    142   std::string GenerateRequestKey() const;
    143 
          // NOTE(review): presumably the configuration received through
          // |SetConfig| -- confirm in the .cc file.
    144   SpeechRecognitionEngineConfig config_;
          // Owned fetchers, one per connection ("upstream" / "downstream" in the
          // class comment).
    145   scoped_ptr<net::URLFetcher> upstream_fetcher_;
    146   scoped_ptr<net::URLFetcher> downstream_fetcher_;
          // Ref-counted handle to the URL request context getter.
    147   scoped_refptr<net::URLRequestContextGetter> url_context_;
          // Owned audio encoder. NOTE(review): presumably encodes chunks before
          // they are sent upstream -- confirm in the .cc file.
    148   scoped_ptr<AudioEncoder> encoder_;
          // NOTE(review): the next two members presumably track and reassemble
          // the incrementally received downstream response -- confirm in the .cc
          // file.
    149   ChunkedByteBuffer chunked_byte_buffer_;
    150   size_t previous_response_length_;
          // NOTE(review): presumably consulted by
          // |RaiseNoMatchErrorIfGotNoResults| -- confirm in the .cc file.
    151   bool got_last_definitive_result_;
          // NOTE(review): presumably a guard against re-entrant |DispatchEvent|
          // calls -- confirm in the .cc file.
    152   bool is_dispatching_event_;
          // Current state of the recognizer FSM.
    153   FSMState state_;
    154 
    155   DISALLOW_COPY_AND_ASSIGN(GoogleStreamingRemoteEngine);
    156 };
    157 
    158 }  // namespace content
    159 
    160 #endif  // CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
    161