// Home | History | Annotate | Download | only in speech
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "content/browser/speech/google_one_shot_remote_engine.h"
      6 
      7 #include <vector>
      8 
      9 #include "base/json/json_reader.h"
     10 #include "base/strings/string_number_conversions.h"
     11 #include "base/strings/string_util.h"
     12 #include "base/values.h"
     13 #include "content/browser/speech/audio_buffer.h"
     14 #include "content/public/common/speech_recognition_error.h"
     15 #include "content/public/common/speech_recognition_result.h"
     16 #include "google_apis/google_api_keys.h"
     17 #include "net/base/escape.h"
     18 #include "net/base/load_flags.h"
     19 #include "net/url_request/http_user_agent_settings.h"
     20 #include "net/url_request/url_fetcher.h"
     21 #include "net/url_request/url_request_context.h"
     22 #include "net/url_request/url_request_context_getter.h"
     23 #include "net/url_request/url_request_status.h"
     24 
     25 namespace content {
     26 namespace {
     27 
// Endpoint of the legacy (v1) one-shot Google speech web service; remaining
// query parameters are appended in StartRecognition().
const char* const kDefaultSpeechRecognitionUrl =
    "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";
// Keys of the JSON response object returned by the web service.
const char* const kStatusString = "status";
const char* const kHypothesesString = "hypotheses";
const char* const kUtteranceString = "utterance";
const char* const kConfidenceString = "confidence";
// Status codes embedded in the JSON response (see ParseServerResponse()).
const int kWebServiceStatusNoError = 0;
const int kWebServiceStatusNoSpeech = 4;
const int kWebServiceStatusNoMatch = 5;
// Audio is always uploaded FLAC-encoded.
const AudioEncoder::Codec kDefaultAudioCodec = AudioEncoder::CODEC_FLAC;
     38 
     39 bool ParseServerResponse(const std::string& response_body,
     40                          SpeechRecognitionResult* result,
     41                          SpeechRecognitionError* error) {
     42   if (response_body.empty()) {
     43     LOG(WARNING) << "ParseServerResponse: Response was empty.";
     44     return false;
     45   }
     46   DVLOG(1) << "ParseServerResponse: Parsing response " << response_body;
     47 
     48   // Parse the response, ignoring comments.
     49   std::string error_msg;
     50   scoped_ptr<base::Value> response_value(base::JSONReader::ReadAndReturnError(
     51       response_body, base::JSON_PARSE_RFC, NULL, &error_msg));
     52   if (response_value == NULL) {
     53     LOG(WARNING) << "ParseServerResponse: JSONReader failed : " << error_msg;
     54     return false;
     55   }
     56 
     57   if (!response_value->IsType(base::Value::TYPE_DICTIONARY)) {
     58     VLOG(1) << "ParseServerResponse: Unexpected response type "
     59             << response_value->GetType();
     60     return false;
     61   }
     62   const base::DictionaryValue* response_object =
     63       static_cast<const base::DictionaryValue*>(response_value.get());
     64 
     65   // Get the status.
     66   int status;
     67   if (!response_object->GetInteger(kStatusString, &status)) {
     68     VLOG(1) << "ParseServerResponse: " << kStatusString
     69             << " is not a valid integer value.";
     70     return false;
     71   }
     72 
     73   // Process the status.
     74   switch (status) {
     75     case kWebServiceStatusNoError:
     76       break;
     77     case kWebServiceStatusNoSpeech:
     78       error->code = SPEECH_RECOGNITION_ERROR_NO_SPEECH;
     79       return false;
     80     case kWebServiceStatusNoMatch:
     81       error->code = SPEECH_RECOGNITION_ERROR_NO_MATCH;
     82       return false;
     83     default:
     84       error->code = SPEECH_RECOGNITION_ERROR_NETWORK;
     85       // Other status codes should not be returned by the server.
     86       VLOG(1) << "ParseServerResponse: unexpected status code " << status;
     87       return false;
     88   }
     89 
     90   // Get the hypotheses.
     91   const base::Value* hypotheses_value = NULL;
     92   if (!response_object->Get(kHypothesesString, &hypotheses_value)) {
     93     VLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";
     94     return false;
     95   }
     96 
     97   DCHECK(hypotheses_value);
     98   if (!hypotheses_value->IsType(base::Value::TYPE_LIST)) {
     99     VLOG(1) << "ParseServerResponse: Unexpected hypotheses type "
    100             << hypotheses_value->GetType();
    101     return false;
    102   }
    103 
    104   const base::ListValue* hypotheses_list =
    105       static_cast<const base::ListValue*>(hypotheses_value);
    106 
    107   // For now we support only single shot recognition, so we are giving only a
    108   // final result, consisting of one fragment (with one or more hypotheses).
    109   size_t index = 0;
    110   for (; index < hypotheses_list->GetSize(); ++index) {
    111     const base::Value* hypothesis = NULL;
    112     if (!hypotheses_list->Get(index, &hypothesis)) {
    113       LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value.";
    114       break;
    115     }
    116     DCHECK(hypothesis);
    117     if (!hypothesis->IsType(base::Value::TYPE_DICTIONARY)) {
    118       LOG(WARNING) << "ParseServerResponse: Unexpected value type "
    119                    << hypothesis->GetType();
    120       break;
    121     }
    122 
    123     const base::DictionaryValue* hypothesis_value =
    124         static_cast<const base::DictionaryValue*>(hypothesis);
    125     base::string16 utterance;
    126 
    127     if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {
    128       LOG(WARNING) << "ParseServerResponse: Missing utterance value.";
    129       break;
    130     }
    131 
    132     // It is not an error if the 'confidence' field is missing.
    133     double confidence = 0.0;
    134     hypothesis_value->GetDouble(kConfidenceString, &confidence);
    135     result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance,
    136                                                              confidence));
    137   }
    138 
    139   if (index < hypotheses_list->GetSize()) {
    140     result->hypotheses.clear();
    141     return false;
    142   }
    143   return true;
    144 }
    145 
    146 }  // namespace
    147 
// Duration of each uploaded audio packet; also used in AudioChunksEnded() to
// size the trailing packet of silence.
const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100;
// Overridable fetcher id so tests can intercept the URLFetcher.
int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0;
    150 
// |context| supplies the URLRequestContext used for the recognition request
// and (in StartRecognition) the browser's Accept-Language fallback.
GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine(
    net::URLRequestContextGetter* context)
    : url_context_(context) {
}

GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {}
    157 
// Stores the recognition configuration; it is read when StartRecognition()
// builds the request URL and creates the encoder.
void GoogleOneShotRemoteEngine::SetConfig(
    const SpeechRecognitionEngineConfig& config) {
  config_ = config;
}
    162 
    163 void GoogleOneShotRemoteEngine::StartRecognition() {
    164   DCHECK(delegate());
    165   DCHECK(!url_fetcher_.get());
    166   std::string lang_param = config_.language;
    167 
    168   if (lang_param.empty() && url_context_.get()) {
    169     // If no language is provided then we use the first from the accepted
    170     // language list. If this list is empty then it defaults to "en-US".
    171     // Example of the contents of this list: "es,en-GB;q=0.8", ""
    172     net::URLRequestContext* request_context =
    173         url_context_->GetURLRequestContext();
    174     DCHECK(request_context);
    175     // TODO(pauljensen): GoogleOneShotRemoteEngine should be constructed with
    176     // a reference to the HttpUserAgentSettings rather than accessing the
    177     // accept language through the URLRequestContext.
    178     if (request_context->http_user_agent_settings()) {
    179       std::string accepted_language_list =
    180           request_context->http_user_agent_settings()->GetAcceptLanguage();
    181       size_t separator = accepted_language_list.find_first_of(",;");
    182       lang_param = accepted_language_list.substr(0, separator);
    183     }
    184   }
    185 
    186   if (lang_param.empty())
    187     lang_param = "en-US";
    188 
    189   std::vector<std::string> parts;
    190   parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));
    191 
    192   if (!config_.grammars.empty()) {
    193     DCHECK_EQ(config_.grammars.size(), 1U);
    194     parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammars[0].url,
    195                                                        true));
    196   }
    197 
    198   if (!config_.hardware_info.empty())
    199     parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info,
    200                                                         true));
    201   parts.push_back("maxresults=" + base::UintToString(config_.max_hypotheses));
    202   parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0");
    203 
    204   std::string api_key = google_apis::GetAPIKey();
    205   parts.push_back("key=" + net::EscapeQueryParamValue(api_key, true));
    206 
    207   GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));
    208 
    209   encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec,
    210                                       config_.audio_sample_rate,
    211                                       config_.audio_num_bits_per_sample));
    212   DCHECK(encoder_.get());
    213   url_fetcher_.reset(net::URLFetcher::Create(url_fetcher_id_for_tests,
    214                                              url,
    215                                              net::URLFetcher::POST,
    216                                              this));
    217   url_fetcher_->SetChunkedUpload(encoder_->mime_type());
    218   url_fetcher_->SetRequestContext(url_context_.get());
    219   url_fetcher_->SetReferrer(config_.origin_url);
    220 
    221   // The speech recognition API does not require user identification as part
    222   // of requests, so we don't send cookies or auth data for these requests to
    223   // prevent any accidental connection between users who are logged into the
    224   // domain for other services (e.g. bookmark sync) with the speech requests.
    225   url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
    226                              net::LOAD_DO_NOT_SEND_COOKIES |
    227                              net::LOAD_DO_NOT_SEND_AUTH_DATA);
    228   url_fetcher_->Start();
    229 }
    230 
// Aborts any in-flight request; destroying the fetcher cancels the upload.
void GoogleOneShotRemoteEngine::EndRecognition() {
  url_fetcher_.reset();
}
    234 
// Encodes |data| and streams the encoded bytes as a non-final chunk of the
// in-progress chunked upload. Must only be called between StartRecognition()
// and AudioChunksEnded().
void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) {
  DCHECK(url_fetcher_.get());
  DCHECK(encoder_.get());
  DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
  encoder_->Encode(data);
  scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
  url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
}
    243 
    244 void GoogleOneShotRemoteEngine::AudioChunksEnded() {
    245   DCHECK(url_fetcher_.get());
    246   DCHECK(encoder_.get());
    247 
    248   // UploadAudioChunk requires a non-empty final buffer. So we encode a packet
    249   // of silence in case encoder had no data already.
    250   std::vector<int16> samples(
    251       config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);
    252   scoped_refptr<AudioChunk> dummy_chunk(
    253       new AudioChunk(reinterpret_cast<uint8*>(&samples[0]),
    254                      samples.size() * sizeof(int16),
    255                      encoder_->bits_per_sample() / 8));
    256   encoder_->Encode(*dummy_chunk.get());
    257   encoder_->Flush();
    258   scoped_refptr<AudioChunk> encoded_dummy_data(
    259       encoder_->GetEncodedDataAndClear());
    260   DCHECK(!encoded_dummy_data->IsEmpty());
    261   encoder_.reset();
    262 
    263   url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
    264 }
    265 
    266 void GoogleOneShotRemoteEngine::OnURLFetchComplete(
    267     const net::URLFetcher* source) {
    268   DCHECK_EQ(url_fetcher_.get(), source);
    269   SpeechRecognitionResults results;
    270   results.push_back(SpeechRecognitionResult());
    271   SpeechRecognitionResult& result = results.back();
    272   SpeechRecognitionError error(SPEECH_RECOGNITION_ERROR_NETWORK);
    273   std::string data;
    274 
    275   // The default error code in case of parse errors is NETWORK_FAILURE, however
    276   // ParseServerResponse can change the error to a more appropriate one.
    277   bool error_occurred = (!source->GetStatus().is_success() ||
    278                         source->GetResponseCode() != 200 ||
    279                         !source->GetResponseAsString(&data) ||
    280                         !ParseServerResponse(data, &result, &error));
    281   url_fetcher_.reset();
    282   if (error_occurred) {
    283     DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error.code;
    284     delegate()->OnSpeechRecognitionEngineError(error);
    285   } else {
    286     DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result.";
    287     delegate()->OnSpeechRecognitionEngineResults(results);
    288   }
    289 }
    290 
    291 bool GoogleOneShotRemoteEngine::IsRecognitionPending() const {
    292   return url_fetcher_ != NULL;
    293 }
    294 
// Callers should feed audio in packets of this duration (100 ms).
int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const {
  return kAudioPacketIntervalMs;
}
    298 
    299 }  // namespace content
    300