// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/browser/speech/google_one_shot_remote_engine.h"

#include <vector>

#include "base/json/json_reader.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "base/values.h"
#include "content/browser/speech/audio_buffer.h"
#include "content/public/common/speech_recognition_error.h"
#include "content/public/common/speech_recognition_result.h"
#include "google_apis/google_api_keys.h"
#include "net/base/escape.h"
#include "net/base/load_flags.h"
#include "net/url_request/http_user_agent_settings.h"
#include "net/url_request/url_fetcher.h"
#include "net/url_request/url_request_context.h"
#include "net/url_request/url_request_context_getter.h"
#include "net/url_request/url_request_status.h"

namespace content {
namespace {

// Base URL of the Google (v1) speech recognition web service. The query
// parameters built in StartRecognition() are appended after the trailing '&'.
const char* const kDefaultSpeechRecognitionUrl =
    "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";
// JSON attribute names expected in the web-service response.
const char* const kStatusString = "status";
const char* const kHypothesesString = "hypotheses";
const char* const kUtteranceString = "utterance";
const char* const kConfidenceString = "confidence";
// Known values of the "status" attribute; any other value is treated as an
// unexpected server error (see ParseServerResponse).
const int kWebServiceStatusNoError = 0;
const int kWebServiceStatusNoSpeech = 4;
const int kWebServiceStatusNoMatch = 5;
const AudioEncoder::Codec kDefaultAudioCodec = AudioEncoder::CODEC_FLAC;

// Parses the JSON |response_body| returned by the speech web service.
// On success, appends the recognition hypotheses to |result->hypotheses| and
// returns true. On failure, returns false; |error->code| is overwritten only
// when the response carries an explicit status (no-speech, no-match, or an
// unexpected code) — for plain parse failures the caller's preset error code
// is left untouched.
bool ParseServerResponse(const std::string& response_body,
                         SpeechRecognitionResult* result,
                         SpeechRecognitionError* error) {
  if (response_body.empty()) {
    LOG(WARNING) << "ParseServerResponse: Response was empty.";
    return false;
  }
  DVLOG(1) << "ParseServerResponse: Parsing response " << response_body;

  // Parse the response, ignoring comments.
  std::string error_msg;
  scoped_ptr<base::Value> response_value(base::JSONReader::ReadAndReturnError(
      response_body, base::JSON_PARSE_RFC, NULL, &error_msg));
  if (response_value == NULL) {
    LOG(WARNING) << "ParseServerResponse: JSONReader failed : " << error_msg;
    return false;
  }

  // The top-level JSON value must be an object.
  if (!response_value->IsType(base::Value::TYPE_DICTIONARY)) {
    VLOG(1) << "ParseServerResponse: Unexpected response type "
            << response_value->GetType();
    return false;
  }
  const base::DictionaryValue* response_object =
      static_cast<const base::DictionaryValue*>(response_value.get());

  // Get the status.
  int status;
  if (!response_object->GetInteger(kStatusString, &status)) {
    VLOG(1) << "ParseServerResponse: " << kStatusString
            << " is not a valid integer value.";
    return false;
  }

  // Process the status. Only kWebServiceStatusNoError continues on to the
  // hypotheses; the recognized error statuses are mapped to the corresponding
  // SpeechRecognitionError codes.
  switch (status) {
    case kWebServiceStatusNoError:
      break;
    case kWebServiceStatusNoSpeech:
      error->code = SPEECH_RECOGNITION_ERROR_NO_SPEECH;
      return false;
    case kWebServiceStatusNoMatch:
      error->code = SPEECH_RECOGNITION_ERROR_NO_MATCH;
      return false;
    default:
      error->code = SPEECH_RECOGNITION_ERROR_NETWORK;
      // Other status codes should not be returned by the server.
      VLOG(1) << "ParseServerResponse: unexpected status code " << status;
      return false;
  }

  // Get the hypotheses.
  const base::Value* hypotheses_value = NULL;
  if (!response_object->Get(kHypothesesString, &hypotheses_value)) {
    VLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";
    return false;
  }

  DCHECK(hypotheses_value);
  if (!hypotheses_value->IsType(base::Value::TYPE_LIST)) {
    VLOG(1) << "ParseServerResponse: Unexpected hypotheses type "
            << hypotheses_value->GetType();
    return false;
  }

  const base::ListValue* hypotheses_list =
      static_cast<const base::ListValue*>(hypotheses_value);

  // For now we support only single shot recognition, so we are giving only a
  // final result, consisting of one fragment (with one or more hypotheses).
  // A malformed entry breaks out of the loop early; |index| is then used
  // below to detect the partial parse.
  size_t index = 0;
  for (; index < hypotheses_list->GetSize(); ++index) {
    const base::Value* hypothesis = NULL;
    if (!hypotheses_list->Get(index, &hypothesis)) {
      LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value.";
      break;
    }
    DCHECK(hypothesis);
    if (!hypothesis->IsType(base::Value::TYPE_DICTIONARY)) {
      LOG(WARNING) << "ParseServerResponse: Unexpected value type "
                   << hypothesis->GetType();
      break;
    }

    const base::DictionaryValue* hypothesis_value =
        static_cast<const base::DictionaryValue*>(hypothesis);
    base::string16 utterance;

    if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {
      LOG(WARNING) << "ParseServerResponse: Missing utterance value.";
      break;
    }

    // It is not an error if the 'confidence' field is missing.
    double confidence = 0.0;
    hypothesis_value->GetDouble(kConfidenceString, &confidence);
    result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance,
                                                             confidence));
  }

  // If the loop broke early, one of the hypotheses was malformed: discard
  // everything rather than report partial results.
  if (index < hypotheses_list->GetSize()) {
    result->hypotheses.clear();
    return false;
  }
  return true;
}

}  // namespace

const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100;
int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0;

GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine(
    net::URLRequestContextGetter* context)
    : url_context_(context) {
}

GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {}

void GoogleOneShotRemoteEngine::SetConfig(
    const SpeechRecognitionEngineConfig& config) {
  config_ = config;
}

// Builds the recognition request URL from |config_| (language, grammar,
// hardware info, max hypotheses, profanity filter, API key), creates the
// FLAC encoder and starts a chunked HTTP POST. Audio is streamed to the
// request by TakeAudioChunk() and finalized by AudioChunksEnded().
void GoogleOneShotRemoteEngine::StartRecognition() {
  DCHECK(delegate());
  DCHECK(!url_fetcher_.get());
  std::string lang_param = config_.language;

  if (lang_param.empty() && url_context_.get()) {
    // If no language is provided then we use the first from the accepted
    // language list. If this list is empty then it defaults to "en-US".
    // Example of the contents of this list: "es,en-GB;q=0.8", ""
    net::URLRequestContext* request_context =
        url_context_->GetURLRequestContext();
    DCHECK(request_context);
    // TODO(pauljensen): GoogleOneShotRemoteEngine should be constructed with
    // a reference to the HttpUserAgentSettings rather than accessing the
    // accept language through the URLRequestContext.
    if (request_context->http_user_agent_settings()) {
      std::string accepted_language_list =
          request_context->http_user_agent_settings()->GetAcceptLanguage();
      // Keep everything before the first ',' or ';', i.e. the first language
      // tag without its quality factor. If neither separator is present,
      // find_first_of() returns npos and substr() copies the whole string.
      size_t separator = accepted_language_list.find_first_of(",;");
      lang_param = accepted_language_list.substr(0, separator);
    }
  }

  if (lang_param.empty())
    lang_param = "en-US";

  // Accumulate "key=value" query fragments, then join them with '&'.
  std::vector<std::string> parts;
  parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));

  if (!config_.grammars.empty()) {
    DCHECK_EQ(config_.grammars.size(), 1U);
    parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammars[0].url,
                                                       true));
  }

  if (!config_.hardware_info.empty())
    parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info,
                                                        true));
  parts.push_back("maxresults=" + base::UintToString(config_.max_hypotheses));
  parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0");

  std::string api_key = google_apis::GetAPIKey();
  parts.push_back("key=" + net::EscapeQueryParamValue(api_key, true));

  GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));

  encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec,
                                      config_.audio_sample_rate,
                                      config_.audio_num_bits_per_sample));
  DCHECK(encoder_.get());
  url_fetcher_.reset(net::URLFetcher::Create(url_fetcher_id_for_tests,
                                             url,
                                             net::URLFetcher::POST,
                                             this));
  url_fetcher_->SetChunkedUpload(encoder_->mime_type());
  url_fetcher_->SetRequestContext(url_context_.get());
  url_fetcher_->SetReferrer(config_.origin_url);

  // The speech recognition API does not require user identification as part
  // of requests, so we don't send cookies or auth data for these requests to
  // prevent any accidental connection between users who are logged into the
  // domain for other services (e.g. bookmark sync) with the speech requests.
  url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
                             net::LOAD_DO_NOT_SEND_COOKIES |
                             net::LOAD_DO_NOT_SEND_AUTH_DATA);
  url_fetcher_->Start();
}

// Cancels any in-flight request: destroying the URLFetcher aborts it, and
// IsRecognitionPending() becomes false.
void GoogleOneShotRemoteEngine::EndRecognition() {
  url_fetcher_.reset();
}

// Encodes the raw audio in |data| and appends the encoded bytes as a
// non-final chunk of the ongoing upload. Must only be called between
// StartRecognition() and AudioChunksEnded() (enforced by the DCHECKs).
void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) {
  DCHECK(url_fetcher_.get());
  DCHECK(encoder_.get());
  DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
  encoder_->Encode(data);
  scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
  url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
}

// Flushes the encoder and uploads the final chunk, closing the chunked POST.
void GoogleOneShotRemoteEngine::AudioChunksEnded() {
  DCHECK(url_fetcher_.get());
  DCHECK(encoder_.get());

  // UploadAudioChunk requires a non-empty final buffer. So we encode a packet
  // of silence in case encoder had no data already.
  // NOTE(review): assumes audio_sample_rate >= 1000 / kAudioPacketIntervalMs
  // so that |samples| is non-empty; &samples[0] on an empty vector would be
  // undefined behavior — confirm against the allowed configs.
  std::vector<int16> samples(
      config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);
  scoped_refptr<AudioChunk> dummy_chunk(
      new AudioChunk(reinterpret_cast<uint8*>(&samples[0]),
                     samples.size() * sizeof(int16),
                     encoder_->bits_per_sample() / 8));
  encoder_->Encode(*dummy_chunk.get());
  encoder_->Flush();
  scoped_refptr<AudioChunk> encoded_dummy_data(
      encoder_->GetEncodedDataAndClear());
  DCHECK(!encoded_dummy_data->IsEmpty());
  encoder_.reset();

  // |true| marks this as the last chunk of the chunked upload (contrast with
  // TakeAudioChunk(), which passes false).
  url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
}

// URLFetcher completion callback. Treats anything other than a successful
// HTTP 200 with a parseable body as an error, then notifies the delegate
// with either the parsed results or the (possibly refined) error. The
// fetcher is released before invoking the delegate.
void GoogleOneShotRemoteEngine::OnURLFetchComplete(
    const net::URLFetcher* source) {
  DCHECK_EQ(url_fetcher_.get(), source);
  SpeechRecognitionResults results;
  results.push_back(SpeechRecognitionResult());
  SpeechRecognitionResult& result = results.back();
  SpeechRecognitionError error(SPEECH_RECOGNITION_ERROR_NETWORK);
  std::string data;

  // The default error code in case of parse errors is NETWORK_FAILURE, however
  // ParseServerResponse can change the error to a more appropriate one.
  // Note: || short-circuits, so ParseServerResponse only runs when the
  // transfer succeeded and the body was retrieved.
  bool error_occurred = (!source->GetStatus().is_success() ||
                         source->GetResponseCode() != 200 ||
                         !source->GetResponseAsString(&data) ||
                         !ParseServerResponse(data, &result, &error));
  url_fetcher_.reset();
  if (error_occurred) {
    DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error.code;
    delegate()->OnSpeechRecognitionEngineError(error);
  } else {
    DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result.";
    delegate()->OnSpeechRecognitionEngineResults(results);
  }
}

// A recognition is pending exactly while a URL fetch is alive (created in
// StartRecognition(), released in EndRecognition()/OnURLFetchComplete()).
bool GoogleOneShotRemoteEngine::IsRecognitionPending() const {
  return url_fetcher_ != NULL;
}

int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const {
  return kAudioPacketIntervalMs;
}

}  // namespace content