// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <queue>

#include "base/memory/scoped_ptr.h"
#include "base/message_loop/message_loop.h"
#include "base/strings/utf_string_conversions.h"
#include "content/browser/speech/audio_buffer.h"
#include "content/browser/speech/google_streaming_remote_engine.h"
#include "content/browser/speech/proto/google_streaming_api.pb.h"
#include "content/public/common/speech_recognition_error.h"
#include "content/public/common/speech_recognition_result.h"
#include "net/url_request/test_url_fetcher_factory.h"
#include "net/url_request/url_request_context_getter.h"
#include "net/url_request/url_request_status.h"
#include "testing/gtest/include/gtest/gtest.h"

using net::URLRequestStatus;
using net::TestURLFetcher;
using net::TestURLFetcherFactory;

namespace content {

// Note: the terms upstream and downstream are from the point-of-view of the
// client (engine_under_test_).

class GoogleStreamingRemoteEngineTest : public SpeechRecognitionEngineDelegate,
                                        public testing::Test {
 public:
  GoogleStreamingRemoteEngineTest()
      : last_number_of_upstream_chunks_seen_(0U),
        error_(SPEECH_RECOGNITION_ERROR_NONE) { }

  // Creates a speech recognition request and invokes its URL fetcher delegate
  // with the given test data.
  void CreateAndTestRequest(bool success, const std::string& http_response);

  // SpeechRecognitionEngineDelegate methods.
  virtual void OnSpeechRecognitionEngineResults(
      const SpeechRecognitionResults& results) OVERRIDE {
    results_.push(results);
  }
  virtual void OnSpeechRecognitionEngineError(
      const SpeechRecognitionError& error) OVERRIDE {
    error_ = error.code;
  }

  // testing::Test methods.
  virtual void SetUp() OVERRIDE;
  virtual void TearDown() OVERRIDE;

 protected:
  enum DownstreamError {
    DOWNSTREAM_ERROR_NONE,
    DOWNSTREAM_ERROR_HTTP500,
    DOWNSTREAM_ERROR_NETWORK,
    DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH
  };
  static bool ResultsAreEqual(const SpeechRecognitionResults& a,
                              const SpeechRecognitionResults& b);
  static std::string SerializeProtobufResponse(
      const proto::SpeechRecognitionEvent& msg);
  static std::string ToBigEndian32(uint32 value);

  TestURLFetcher* GetUpstreamFetcher();
  TestURLFetcher* GetDownstreamFetcher();
  void StartMockRecognition();
  void EndMockRecognition();
  void InjectDummyAudioChunk();
  size_t UpstreamChunksUploadedFromLastCall();
  void ProvideMockProtoResultDownstream(
      const proto::SpeechRecognitionEvent& result);
  void ProvideMockResultDownstream(const SpeechRecognitionResult& result);
  void ExpectResultsReceived(const SpeechRecognitionResults& result);
  void CloseMockDownstream(DownstreamError error);

  scoped_ptr<GoogleStreamingRemoteEngine> engine_under_test_;
  TestURLFetcherFactory url_fetcher_factory_;
  size_t last_number_of_upstream_chunks_seen_;
  base::MessageLoop message_loop_;
  std::string response_buffer_;
  SpeechRecognitionErrorCode error_;
  std::queue<SpeechRecognitionResults> results_;
};

TEST_F(GoogleStreamingRemoteEngineTest, SingleDefinitiveResult) {
  StartMockRecognition();
  ASSERT_TRUE(GetUpstreamFetcher());
  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());

  // Inject some dummy audio chunks and check a corresponding chunked upload
  // is performed every time on the server.
  for (int i = 0; i < 3; ++i) {
    InjectDummyAudioChunk();
    ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
  }

  // Ensure that a final (empty) audio chunk is uploaded on chunks end.
  engine_under_test_->AudioChunksEnded();
  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // Simulate a protobuf message streamed from the server containing a single
  // result with two hypotheses.
  SpeechRecognitionResults results;
  results.push_back(SpeechRecognitionResult());
  SpeechRecognitionResult& result = results.back();
  result.is_provisional = false;
  result.hypotheses.push_back(
      SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis 1"), 0.1F));
  result.hypotheses.push_back(
      SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis 2"), 0.2F));

  ProvideMockResultDownstream(result);
  ExpectResultsReceived(results);
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // Ensure everything is closed cleanly after the downstream is closed.
  CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  EndMockRecognition();
  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
  ASSERT_EQ(0U, results_.size());
}

TEST_F(GoogleStreamingRemoteEngineTest, SeveralStreamingResults) {
  StartMockRecognition();
  ASSERT_TRUE(GetUpstreamFetcher());
  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());

  for (int i = 0; i < 4; ++i) {
    InjectDummyAudioChunk();
    ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());

    SpeechRecognitionResults results;
    results.push_back(SpeechRecognitionResult());
    SpeechRecognitionResult& result = results.back();
    result.is_provisional = (i % 2 == 0);  // Alternate result types.
    float confidence = result.is_provisional ? 0.0F : (i * 0.1F);
    result.hypotheses.push_back(
        SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis"), confidence));

    ProvideMockResultDownstream(result);
    ExpectResultsReceived(results);
    ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
  }

  // Ensure that a final (empty) audio chunk is uploaded on chunks end.
  engine_under_test_->AudioChunksEnded();
  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // Simulate a final definitive result.
  SpeechRecognitionResults results;
  results.push_back(SpeechRecognitionResult());
  SpeechRecognitionResult& result = results.back();
  result.is_provisional = false;
  result.hypotheses.push_back(
      SpeechRecognitionHypothesis(UTF8ToUTF16("The final result"), 1.0F));
  ProvideMockResultDownstream(result);
  ExpectResultsReceived(results);
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // Ensure everything is closed cleanly after the downstream is closed.
  CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  EndMockRecognition();
  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
  ASSERT_EQ(0U, results_.size());
}

TEST_F(GoogleStreamingRemoteEngineTest, NoFinalResultAfterAudioChunksEnded) {
  StartMockRecognition();
  ASSERT_TRUE(GetUpstreamFetcher());
  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());

  // Simulate one pushed audio chunk.
  InjectDummyAudioChunk();
  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());

  // Simulate the corresponding definitive result.
  SpeechRecognitionResults results;
  results.push_back(SpeechRecognitionResult());
  SpeechRecognitionResult& result = results.back();
  result.hypotheses.push_back(
      SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis"), 1.0F));
  ProvideMockResultDownstream(result);
  ExpectResultsReceived(results);
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // Simulate a silent downstream closure after |AudioChunksEnded|.
  engine_under_test_->AudioChunksEnded();
  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
  CloseMockDownstream(DOWNSTREAM_ERROR_NONE);

  // Expect an empty result, notifying that recognition ended with no actual
  // results or errors.
  SpeechRecognitionResults empty_results;
  ExpectResultsReceived(empty_results);

  // Ensure everything is closed cleanly after the downstream is closed.
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  EndMockRecognition();
  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
  ASSERT_EQ(0U, results_.size());
}

TEST_F(GoogleStreamingRemoteEngineTest, NoMatchError) {
  StartMockRecognition();
  ASSERT_TRUE(GetUpstreamFetcher());
  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());

  for (int i = 0; i < 3; ++i)
    InjectDummyAudioChunk();
  engine_under_test_->AudioChunksEnded();
  ASSERT_EQ(4U, UpstreamChunksUploadedFromLastCall());
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // Simulate only a provisional result.
  SpeechRecognitionResults results;
  results.push_back(SpeechRecognitionResult());
  SpeechRecognitionResult& result = results.back();
  result.is_provisional = true;
  result.hypotheses.push_back(
      SpeechRecognitionHypothesis(UTF8ToUTF16("The final result"), 0.0F));
  ProvideMockResultDownstream(result);
  ExpectResultsReceived(results);
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  CloseMockDownstream(DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH);

  // Expect an empty result.
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  EndMockRecognition();
  SpeechRecognitionResults empty_result;
  ExpectResultsReceived(empty_result);
}

TEST_F(GoogleStreamingRemoteEngineTest, HTTPError) {
  StartMockRecognition();
  ASSERT_TRUE(GetUpstreamFetcher());
  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());

  InjectDummyAudioChunk();
  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());

  // Close the downstream with an HTTP 500 error.
  CloseMockDownstream(DOWNSTREAM_ERROR_HTTP500);

  // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised.
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  EndMockRecognition();
  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
  ASSERT_EQ(0U, results_.size());
}

TEST_F(GoogleStreamingRemoteEngineTest, NetworkError) {
  StartMockRecognition();
  ASSERT_TRUE(GetUpstreamFetcher());
  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());

  InjectDummyAudioChunk();
  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());

  // Close the downstream fetcher simulating a network failure.
  CloseMockDownstream(DOWNSTREAM_ERROR_NETWORK);

  // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised.
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  EndMockRecognition();
  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
  ASSERT_EQ(0U, results_.size());
}

TEST_F(GoogleStreamingRemoteEngineTest, Stability) {
  StartMockRecognition();
  ASSERT_TRUE(GetUpstreamFetcher());
  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());

  // Upload a dummy audio chunk.
  InjectDummyAudioChunk();
  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
  engine_under_test_->AudioChunksEnded();

  // Simulate a protobuf message with an intermediate result without
  // confidence, but with stability.
  proto::SpeechRecognitionEvent proto_event;
  proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
  proto::SpeechRecognitionResult* proto_result = proto_event.add_result();
  proto_result->set_stability(0.5);
  proto::SpeechRecognitionAlternative* proto_alternative =
      proto_result->add_alternative();
  proto_alternative->set_transcript("foo");
  ProvideMockProtoResultDownstream(proto_event);

  // Set up expectations.
  SpeechRecognitionResults results;
  results.push_back(SpeechRecognitionResult());
  SpeechRecognitionResult& result = results.back();
  result.is_provisional = true;
  result.hypotheses.push_back(
      SpeechRecognitionHypothesis(UTF8ToUTF16("foo"), 0.5));

  // Check that the protobuf generated the expected result.
  ExpectResultsReceived(results);

  // Since it was a provisional result, recognition is still pending.
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // Shut down.
  CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  EndMockRecognition();

  // Since there was no final result, we get an empty "no match" result.
  SpeechRecognitionResults empty_result;
  ExpectResultsReceived(empty_result);
  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
  ASSERT_EQ(0U, results_.size());
}

void GoogleStreamingRemoteEngineTest::SetUp() {
  engine_under_test_.reset(
      new GoogleStreamingRemoteEngine(NULL /*URLRequestContextGetter*/));
  engine_under_test_->set_delegate(this);
}

void GoogleStreamingRemoteEngineTest::TearDown() {
  engine_under_test_.reset();
}

TestURLFetcher* GoogleStreamingRemoteEngineTest::GetUpstreamFetcher() {
  return url_fetcher_factory_.GetFetcherByID(
      GoogleStreamingRemoteEngine::kUpstreamUrlFetcherIdForTests);
}

TestURLFetcher* GoogleStreamingRemoteEngineTest::GetDownstreamFetcher() {
  return url_fetcher_factory_.GetFetcherByID(
      GoogleStreamingRemoteEngine::kDownstreamUrlFetcherIdForTests);
}

// Starts recognition on the engine, ensuring that both stream fetchers are
// created.
void GoogleStreamingRemoteEngineTest::StartMockRecognition() {
  DCHECK(engine_under_test_.get());

  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());

  engine_under_test_->StartRecognition();
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  TestURLFetcher* upstream_fetcher = GetUpstreamFetcher();
  ASSERT_TRUE(upstream_fetcher);
  upstream_fetcher->set_url(upstream_fetcher->GetOriginalURL());

  TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
  ASSERT_TRUE(downstream_fetcher);
  downstream_fetcher->set_url(downstream_fetcher->GetOriginalURL());
}

void GoogleStreamingRemoteEngineTest::EndMockRecognition() {
  DCHECK(engine_under_test_.get());
  engine_under_test_->EndRecognition();
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());

  // TODO(primiano): In order to be very pedantic we should check that both the
  // upstream and downstream URL fetchers have been disposed at this time.
  // Unfortunately it seems that there is no direct way to detect (in tests)
  // if a url_fetcher has been freed or not, since they are not automatically
  // de-registered from the TestURLFetcherFactory on destruction.
}

void GoogleStreamingRemoteEngineTest::InjectDummyAudioChunk() {
  unsigned char dummy_audio_buffer_data[2] = {'\0', '\0'};
  scoped_refptr<AudioChunk> dummy_audio_chunk(
      new AudioChunk(&dummy_audio_buffer_data[0],
                     sizeof(dummy_audio_buffer_data),
                     2 /* bytes per sample */));
  DCHECK(engine_under_test_.get());
  engine_under_test_->TakeAudioChunk(*dummy_audio_chunk.get());
}

// Returns the number of upstream chunks uploaded since the last call to this
// method.
size_t GoogleStreamingRemoteEngineTest::UpstreamChunksUploadedFromLastCall() {
  TestURLFetcher* upstream_fetcher = GetUpstreamFetcher();
  DCHECK(upstream_fetcher);
  const size_t number_of_chunks = upstream_fetcher->upload_chunks().size();
  DCHECK_GE(number_of_chunks, last_number_of_upstream_chunks_seen_);
  const size_t new_chunks = number_of_chunks -
                            last_number_of_upstream_chunks_seen_;
  last_number_of_upstream_chunks_seen_ = number_of_chunks;
  return new_chunks;
}

void GoogleStreamingRemoteEngineTest::ProvideMockProtoResultDownstream(
    const proto::SpeechRecognitionEvent& result) {
  TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();

  ASSERT_TRUE(downstream_fetcher);
  downstream_fetcher->set_status(URLRequestStatus(/* default=SUCCESS */));
  downstream_fetcher->set_response_code(200);

  std::string response_string = SerializeProtobufResponse(result);
  response_buffer_.append(response_string);
  downstream_fetcher->SetResponseString(response_buffer_);
  downstream_fetcher->delegate()->OnURLFetchDownloadProgress(
      downstream_fetcher,
      response_buffer_.size(),
      -1 /* total response length not used */);
}

void GoogleStreamingRemoteEngineTest::ProvideMockResultDownstream(
    const SpeechRecognitionResult& result) {
  proto::SpeechRecognitionEvent proto_event;
  proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
  proto::SpeechRecognitionResult* proto_result = proto_event.add_result();
  proto_result->set_final(!result.is_provisional);
  for (size_t i = 0; i < result.hypotheses.size(); ++i) {
    proto::SpeechRecognitionAlternative* proto_alternative =
        proto_result->add_alternative();
    const SpeechRecognitionHypothesis& hypothesis = result.hypotheses[i];
    proto_alternative->set_confidence(hypothesis.confidence);
    proto_alternative->set_transcript(UTF16ToUTF8(hypothesis.utterance));
  }
  ProvideMockProtoResultDownstream(proto_event);
}

void GoogleStreamingRemoteEngineTest::CloseMockDownstream(
    DownstreamError error) {
  TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
  ASSERT_TRUE(downstream_fetcher);

  const URLRequestStatus::Status fetcher_status =
      (error == DOWNSTREAM_ERROR_NETWORK) ? URLRequestStatus::FAILED :
                                            URLRequestStatus::SUCCESS;
  downstream_fetcher->set_status(URLRequestStatus(fetcher_status, 0));
  downstream_fetcher->set_response_code(
      (error == DOWNSTREAM_ERROR_HTTP500) ? 500 : 200);

  if (error == DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH) {
    // Send empty response.
    proto::SpeechRecognitionEvent response;
    response_buffer_.append(SerializeProtobufResponse(response));
  }
  downstream_fetcher->SetResponseString(response_buffer_);
  downstream_fetcher->delegate()->OnURLFetchComplete(downstream_fetcher);
}

void GoogleStreamingRemoteEngineTest::ExpectResultsReceived(
    const SpeechRecognitionResults& results) {
  ASSERT_GE(1U, results_.size());
  ASSERT_TRUE(ResultsAreEqual(results, results_.front()));
  results_.pop();
}

bool GoogleStreamingRemoteEngineTest::ResultsAreEqual(
    const SpeechRecognitionResults& a, const SpeechRecognitionResults& b) {
  if (a.size() != b.size())
    return false;

  SpeechRecognitionResults::const_iterator it_a = a.begin();
  SpeechRecognitionResults::const_iterator it_b = b.begin();
  for (; it_a != a.end() && it_b != b.end(); ++it_a, ++it_b) {
    if (it_a->is_provisional != it_b->is_provisional ||
        it_a->hypotheses.size() != it_b->hypotheses.size()) {
      return false;
    }
    for (size_t i = 0; i < it_a->hypotheses.size(); ++i) {
      const SpeechRecognitionHypothesis& hyp_a = it_a->hypotheses[i];
      const SpeechRecognitionHypothesis& hyp_b = it_b->hypotheses[i];
      if (hyp_a.utterance != hyp_b.utterance ||
          hyp_a.confidence != hyp_b.confidence) {
        return false;
      }
    }
  }

  return true;
}

std::string GoogleStreamingRemoteEngineTest::SerializeProtobufResponse(
    const proto::SpeechRecognitionEvent& msg) {
  std::string msg_string;
  msg.SerializeToString(&msg_string);

  // Prepend a 4-byte, big-endian length prefix to the protobuf message, as
  // expected by the Google streaming recognition web service protocol. For
  // example, a 2-byte payload is framed as {0x00, 0x00, 0x00, 0x02, <payload>}.
  msg_string.insert(0, ToBigEndian32(msg_string.size()));
  return msg_string;
}

std::string GoogleStreamingRemoteEngineTest::ToBigEndian32(uint32 value) {
  char raw_data[4];
  raw_data[0] = static_cast<uint8>((value >> 24) & 0xFF);
  raw_data[1] = static_cast<uint8>((value >> 16) & 0xFF);
  raw_data[2] = static_cast<uint8>((value >> 8) & 0xFF);
  raw_data[3] = static_cast<uint8>(value & 0xFF);
  return std::string(raw_data, sizeof(raw_data));
}

}  // namespace content