1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_ 6 #define CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_ 7 8 #include <string> 9 #include <vector> 10 11 #include "base/basictypes.h" 12 #include "base/memory/ref_counted.h" 13 #include "base/memory/scoped_ptr.h" 14 #include "base/threading/non_thread_safe.h" 15 #include "content/browser/speech/audio_encoder.h" 16 #include "content/browser/speech/chunked_byte_buffer.h" 17 #include "content/browser/speech/speech_recognition_engine.h" 18 #include "content/common/content_export.h" 19 #include "content/public/common/speech_recognition_error.h" 20 #include "net/url_request/url_fetcher_delegate.h" 21 22 namespace net { 23 class URLRequestContextGetter; 24 } 25 26 namespace content { 27 28 class AudioChunk; 29 struct SpeechRecognitionError; 30 struct SpeechRecognitionResult; 31 32 // Implements a SpeechRecognitionEngine supporting continuous recognition by 33 // means of interaction with Google streaming speech recognition webservice. 34 // More in details, this class establishes two HTTP(S) connections with the 35 // webservice, for each session, herein called "upstream" and "downstream". 36 // Audio chunks are sent on the upstream by means of a chunked HTTP POST upload. 37 // Recognition results are retrieved in a full-duplex fashion (i.e. while 38 // pushing audio on the upstream) on the downstream by means of a chunked 39 // HTTP GET request. Pairing between the two stream is handled through a 40 // randomly generated key, unique for each request, which is passed in the 41 // &pair= arg to both stream request URLs. 42 // In the case of a regular session, the upstream is closed when the audio 43 // capture ends (notified through a |AudioChunksEnded| call) and the downstream 44 // waits for a corresponding server closure (eventually some late results can 45 // come after closing the upstream). 46 // Both stream are guaranteed to be closed when |EndRecognition| call is issued. 47 class CONTENT_EXPORT GoogleStreamingRemoteEngine 48 : public NON_EXPORTED_BASE(SpeechRecognitionEngine), 49 public net::URLFetcherDelegate, 50 public NON_EXPORTED_BASE(base::NonThreadSafe) { 51 public: 52 explicit GoogleStreamingRemoteEngine(net::URLRequestContextGetter* context); 53 virtual ~GoogleStreamingRemoteEngine(); 54 55 // SpeechRecognitionEngine methods. 56 virtual void SetConfig(const SpeechRecognitionEngineConfig& config) OVERRIDE; 57 virtual void StartRecognition() OVERRIDE; 58 virtual void EndRecognition() OVERRIDE; 59 virtual void TakeAudioChunk(const AudioChunk& data) OVERRIDE; 60 virtual void AudioChunksEnded() OVERRIDE; 61 virtual bool IsRecognitionPending() const OVERRIDE; 62 virtual int GetDesiredAudioChunkDurationMs() const OVERRIDE; 63 64 // net::URLFetcherDelegate methods. 65 virtual void OnURLFetchComplete(const net::URLFetcher* source) OVERRIDE; 66 virtual void OnURLFetchDownloadProgress(const net::URLFetcher* source, 67 int64 current, int64 total) OVERRIDE; 68 69 private: 70 friend class GoogleStreamingRemoteEngineTest; 71 72 // IDs passed to URLFetcher::Create(). Used for testing. 73 static const int kUpstreamUrlFetcherIdForTests; 74 static const int kDownstreamUrlFetcherIdForTests; 75 76 // Response status codes from the speech recognition webservice. 77 static const int kWebserviceStatusNoError; 78 static const int kWebserviceStatusErrorNoMatch; 79 80 // Data types for the internal Finite State Machine (FSM). 81 enum FSMState { 82 STATE_IDLE = 0, 83 STATE_BOTH_STREAMS_CONNECTED, 84 STATE_WAITING_DOWNSTREAM_RESULTS, 85 STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS 86 }; 87 88 enum FSMEvent { 89 EVENT_END_RECOGNITION = 0, 90 EVENT_START_RECOGNITION, 91 EVENT_AUDIO_CHUNK, 92 EVENT_AUDIO_CHUNKS_ENDED, 93 EVENT_UPSTREAM_ERROR, 94 EVENT_DOWNSTREAM_ERROR, 95 EVENT_DOWNSTREAM_RESPONSE, 96 EVENT_DOWNSTREAM_CLOSED, 97 EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED 98 }; 99 100 struct FSMEventArgs { 101 explicit FSMEventArgs(FSMEvent event_value); 102 ~FSMEventArgs(); 103 104 FSMEvent event; 105 106 // In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|. 107 scoped_refptr<const AudioChunk> audio_data; 108 109 // In case of EVENT_DOWNSTREAM_RESPONSE, hold the current chunk bytes. 110 scoped_ptr<std::vector<uint8> > response; 111 112 private: 113 DISALLOW_COPY_AND_ASSIGN(FSMEventArgs); 114 }; 115 116 // Invoked by both upstream and downstream URLFetcher callbacks to handle 117 // new chunk data, connection closed or errors notifications. 118 void DispatchHTTPResponse(const net::URLFetcher* source, 119 bool end_of_response); 120 121 // Entry point for pushing any new external event into the recognizer FSM. 122 void DispatchEvent(const FSMEventArgs& event_args); 123 124 // Defines the behavior of the recognizer FSM, selecting the appropriate 125 // transition according to the current state and event. 126 FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args); 127 128 // The methods below handle transitions of the recognizer FSM. 129 FSMState ConnectBothStreams(const FSMEventArgs& event_args); 130 FSMState TransmitAudioUpstream(const FSMEventArgs& event_args); 131 FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args); 132 FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args); 133 FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args); 134 FSMState CloseDownstream(const FSMEventArgs& event_args); 135 FSMState AbortSilently(const FSMEventArgs& event_args); 136 FSMState AbortWithError(const FSMEventArgs& event_args); 137 FSMState Abort(SpeechRecognitionErrorCode error); 138 FSMState DoNothing(const FSMEventArgs& event_args); 139 FSMState NotFeasible(const FSMEventArgs& event_args); 140 141 std::string GetAcceptedLanguages() const; 142 std::string GenerateRequestKey() const; 143 144 SpeechRecognitionEngineConfig config_; 145 scoped_ptr<net::URLFetcher> upstream_fetcher_; 146 scoped_ptr<net::URLFetcher> downstream_fetcher_; 147 scoped_refptr<net::URLRequestContextGetter> url_context_; 148 scoped_ptr<AudioEncoder> encoder_; 149 ChunkedByteBuffer chunked_byte_buffer_; 150 size_t previous_response_length_; 151 bool got_last_definitive_result_; 152 bool is_dispatching_event_; 153 FSMState state_; 154 155 DISALLOW_COPY_AND_ASSIGN(GoogleStreamingRemoteEngine); 156 }; 157 158 } // namespace content 159 160 #endif // CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_ 161