1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <queue> 6 7 #include "base/memory/scoped_ptr.h" 8 #include "base/message_loop/message_loop.h" 9 #include "base/numerics/safe_conversions.h" 10 #include "base/strings/utf_string_conversions.h" 11 #include "base/sys_byteorder.h" 12 #include "content/browser/speech/audio_buffer.h" 13 #include "content/browser/speech/google_streaming_remote_engine.h" 14 #include "content/browser/speech/proto/google_streaming_api.pb.h" 15 #include "content/public/common/speech_recognition_error.h" 16 #include "content/public/common/speech_recognition_result.h" 17 #include "net/url_request/test_url_fetcher_factory.h" 18 #include "net/url_request/url_request_context_getter.h" 19 #include "net/url_request/url_request_status.h" 20 #include "testing/gtest/include/gtest/gtest.h" 21 22 using base::HostToNet32; 23 using base::checked_cast; 24 using net::URLRequestStatus; 25 using net::TestURLFetcher; 26 using net::TestURLFetcherFactory; 27 28 namespace content { 29 30 // Note: the terms upstream and downstream are from the point-of-view of the 31 // client (engine_under_test_). 32 33 class GoogleStreamingRemoteEngineTest : public SpeechRecognitionEngineDelegate, 34 public testing::Test { 35 public: 36 GoogleStreamingRemoteEngineTest() 37 : last_number_of_upstream_chunks_seen_(0U), 38 error_(SPEECH_RECOGNITION_ERROR_NONE) { } 39 40 // Creates a speech recognition request and invokes its URL fetcher delegate 41 // with the given test data. 42 void CreateAndTestRequest(bool success, const std::string& http_response); 43 44 // SpeechRecognitionRequestDelegate methods. 45 virtual void OnSpeechRecognitionEngineResults( 46 const SpeechRecognitionResults& results) OVERRIDE { 47 results_.push(results); 48 } 49 virtual void OnSpeechRecognitionEngineError( 50 const SpeechRecognitionError& error) OVERRIDE { 51 error_ = error.code; 52 } 53 54 // testing::Test methods. 55 virtual void SetUp() OVERRIDE; 56 virtual void TearDown() OVERRIDE; 57 58 protected: 59 enum DownstreamError { 60 DOWNSTREAM_ERROR_NONE, 61 DOWNSTREAM_ERROR_HTTP500, 62 DOWNSTREAM_ERROR_NETWORK, 63 DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH 64 }; 65 static bool ResultsAreEqual(const SpeechRecognitionResults& a, 66 const SpeechRecognitionResults& b); 67 static std::string SerializeProtobufResponse( 68 const proto::SpeechRecognitionEvent& msg); 69 70 TestURLFetcher* GetUpstreamFetcher(); 71 TestURLFetcher* GetDownstreamFetcher(); 72 void StartMockRecognition(); 73 void EndMockRecognition(); 74 void InjectDummyAudioChunk(); 75 size_t UpstreamChunksUploadedFromLastCall(); 76 void ProvideMockProtoResultDownstream( 77 const proto::SpeechRecognitionEvent& result); 78 void ProvideMockResultDownstream(const SpeechRecognitionResult& result); 79 void ExpectResultsReceived(const SpeechRecognitionResults& result); 80 void CloseMockDownstream(DownstreamError error); 81 82 scoped_ptr<GoogleStreamingRemoteEngine> engine_under_test_; 83 TestURLFetcherFactory url_fetcher_factory_; 84 size_t last_number_of_upstream_chunks_seen_; 85 base::MessageLoop message_loop_; 86 std::string response_buffer_; 87 SpeechRecognitionErrorCode error_; 88 std::queue<SpeechRecognitionResults> results_; 89 }; 90 91 TEST_F(GoogleStreamingRemoteEngineTest, SingleDefinitiveResult) { 92 StartMockRecognition(); 93 ASSERT_TRUE(GetUpstreamFetcher()); 94 ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); 95 96 // Inject some dummy audio chunks and check a corresponding chunked upload 97 // is performed every time on the server. 98 for (int i = 0; i < 3; ++i) { 99 InjectDummyAudioChunk(); 100 ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); 101 } 102 103 // Ensure that a final (empty) audio chunk is uploaded on chunks end. 104 engine_under_test_->AudioChunksEnded(); 105 ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); 106 ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); 107 108 // Simulate a protobuf message streamed from the server containing a single 109 // result with two hypotheses. 110 SpeechRecognitionResults results; 111 results.push_back(SpeechRecognitionResult()); 112 SpeechRecognitionResult& result = results.back(); 113 result.is_provisional = false; 114 result.hypotheses.push_back( 115 SpeechRecognitionHypothesis(base::UTF8ToUTF16("hypothesis 1"), 0.1F)); 116 result.hypotheses.push_back( 117 SpeechRecognitionHypothesis(base::UTF8ToUTF16("hypothesis 2"), 0.2F)); 118 119 ProvideMockResultDownstream(result); 120 ExpectResultsReceived(results); 121 ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); 122 123 // Ensure everything is closed cleanly after the downstream is closed. 124 CloseMockDownstream(DOWNSTREAM_ERROR_NONE); 125 ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); 126 EndMockRecognition(); 127 ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 128 ASSERT_EQ(0U, results_.size()); 129 } 130 131 TEST_F(GoogleStreamingRemoteEngineTest, SeveralStreamingResults) { 132 StartMockRecognition(); 133 ASSERT_TRUE(GetUpstreamFetcher()); 134 ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); 135 136 for (int i = 0; i < 4; ++i) { 137 InjectDummyAudioChunk(); 138 ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); 139 140 SpeechRecognitionResults results; 141 results.push_back(SpeechRecognitionResult()); 142 SpeechRecognitionResult& result = results.back(); 143 result.is_provisional = (i % 2 == 0); // Alternate result types. 144 float confidence = result.is_provisional ? 0.0F : (i * 0.1F); 145 result.hypotheses.push_back(SpeechRecognitionHypothesis( 146 base::UTF8ToUTF16("hypothesis"), confidence)); 147 148 ProvideMockResultDownstream(result); 149 ExpectResultsReceived(results); 150 ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); 151 } 152 153 // Ensure that a final (empty) audio chunk is uploaded on chunks end. 154 engine_under_test_->AudioChunksEnded(); 155 ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); 156 ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); 157 158 // Simulate a final definitive result. 159 SpeechRecognitionResults results; 160 results.push_back(SpeechRecognitionResult()); 161 SpeechRecognitionResult& result = results.back(); 162 result.is_provisional = false; 163 result.hypotheses.push_back( 164 SpeechRecognitionHypothesis(base::UTF8ToUTF16("The final result"), 1.0F)); 165 ProvideMockResultDownstream(result); 166 ExpectResultsReceived(results); 167 ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); 168 169 // Ensure everything is closed cleanly after the downstream is closed. 170 CloseMockDownstream(DOWNSTREAM_ERROR_NONE); 171 ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); 172 EndMockRecognition(); 173 ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 174 ASSERT_EQ(0U, results_.size()); 175 } 176 177 TEST_F(GoogleStreamingRemoteEngineTest, NoFinalResultAfterAudioChunksEnded) { 178 StartMockRecognition(); 179 ASSERT_TRUE(GetUpstreamFetcher()); 180 ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); 181 182 // Simulate one pushed audio chunk. 183 InjectDummyAudioChunk(); 184 ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); 185 186 // Simulate the corresponding definitive result. 187 SpeechRecognitionResults results; 188 results.push_back(SpeechRecognitionResult()); 189 SpeechRecognitionResult& result = results.back(); 190 result.hypotheses.push_back( 191 SpeechRecognitionHypothesis(base::UTF8ToUTF16("hypothesis"), 1.0F)); 192 ProvideMockResultDownstream(result); 193 ExpectResultsReceived(results); 194 ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); 195 196 // Simulate a silent downstream closure after |AudioChunksEnded|. 197 engine_under_test_->AudioChunksEnded(); 198 ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); 199 ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); 200 CloseMockDownstream(DOWNSTREAM_ERROR_NONE); 201 202 // Expect an empty result, aimed at notifying recognition ended with no 203 // actual results nor errors. 204 SpeechRecognitionResults empty_results; 205 ExpectResultsReceived(empty_results); 206 207 // Ensure everything is closed cleanly after the downstream is closed. 208 ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); 209 EndMockRecognition(); 210 ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 211 ASSERT_EQ(0U, results_.size()); 212 } 213 214 TEST_F(GoogleStreamingRemoteEngineTest, NoMatchError) { 215 StartMockRecognition(); 216 ASSERT_TRUE(GetUpstreamFetcher()); 217 ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); 218 219 for (int i = 0; i < 3; ++i) 220 InjectDummyAudioChunk(); 221 engine_under_test_->AudioChunksEnded(); 222 ASSERT_EQ(4U, UpstreamChunksUploadedFromLastCall()); 223 ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); 224 225 // Simulate only a provisional result. 226 SpeechRecognitionResults results; 227 results.push_back(SpeechRecognitionResult()); 228 SpeechRecognitionResult& result = results.back(); 229 result.is_provisional = true; 230 result.hypotheses.push_back( 231 SpeechRecognitionHypothesis(base::UTF8ToUTF16("The final result"), 0.0F)); 232 ProvideMockResultDownstream(result); 233 ExpectResultsReceived(results); 234 ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); 235 236 CloseMockDownstream(DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH); 237 238 // Expect an empty result. 239 ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); 240 EndMockRecognition(); 241 SpeechRecognitionResults empty_result; 242 ExpectResultsReceived(empty_result); 243 } 244 245 TEST_F(GoogleStreamingRemoteEngineTest, HTTPError) { 246 StartMockRecognition(); 247 ASSERT_TRUE(GetUpstreamFetcher()); 248 ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); 249 250 InjectDummyAudioChunk(); 251 ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); 252 253 // Close the downstream with a HTTP 500 error. 254 CloseMockDownstream(DOWNSTREAM_ERROR_HTTP500); 255 256 // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised. 257 ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); 258 EndMockRecognition(); 259 ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_); 260 ASSERT_EQ(0U, results_.size()); 261 } 262 263 TEST_F(GoogleStreamingRemoteEngineTest, NetworkError) { 264 StartMockRecognition(); 265 ASSERT_TRUE(GetUpstreamFetcher()); 266 ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); 267 268 InjectDummyAudioChunk(); 269 ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); 270 271 // Close the downstream fetcher simulating a network failure. 272 CloseMockDownstream(DOWNSTREAM_ERROR_NETWORK); 273 274 // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised. 275 ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); 276 EndMockRecognition(); 277 ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_); 278 ASSERT_EQ(0U, results_.size()); 279 } 280 281 TEST_F(GoogleStreamingRemoteEngineTest, Stability) { 282 StartMockRecognition(); 283 ASSERT_TRUE(GetUpstreamFetcher()); 284 ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); 285 286 // Upload a dummy audio chunk. 287 InjectDummyAudioChunk(); 288 ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); 289 engine_under_test_->AudioChunksEnded(); 290 291 // Simulate a protobuf message with an intermediate result without confidence, 292 // but with stability. 293 proto::SpeechRecognitionEvent proto_event; 294 proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS); 295 proto::SpeechRecognitionResult* proto_result = proto_event.add_result(); 296 proto_result->set_stability(0.5); 297 proto::SpeechRecognitionAlternative *proto_alternative = 298 proto_result->add_alternative(); 299 proto_alternative->set_transcript("foo"); 300 ProvideMockProtoResultDownstream(proto_event); 301 302 // Set up expectations. 303 SpeechRecognitionResults results; 304 results.push_back(SpeechRecognitionResult()); 305 SpeechRecognitionResult& result = results.back(); 306 result.is_provisional = true; 307 result.hypotheses.push_back( 308 SpeechRecognitionHypothesis(base::UTF8ToUTF16("foo"), 0.5)); 309 310 // Check that the protobuf generated the expected result. 311 ExpectResultsReceived(results); 312 313 // Since it was a provisional result, recognition is still pending. 314 ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); 315 316 // Shut down. 317 CloseMockDownstream(DOWNSTREAM_ERROR_NONE); 318 ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); 319 EndMockRecognition(); 320 321 // Since there was no final result, we get an empty "no match" result. 322 SpeechRecognitionResults empty_result; 323 ExpectResultsReceived(empty_result); 324 ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 325 ASSERT_EQ(0U, results_.size()); 326 } 327 328 void GoogleStreamingRemoteEngineTest::SetUp() { 329 engine_under_test_.reset( 330 new GoogleStreamingRemoteEngine(NULL /*URLRequestContextGetter*/)); 331 engine_under_test_->set_delegate(this); 332 } 333 334 void GoogleStreamingRemoteEngineTest::TearDown() { 335 engine_under_test_.reset(); 336 } 337 338 TestURLFetcher* GoogleStreamingRemoteEngineTest::GetUpstreamFetcher() { 339 return url_fetcher_factory_.GetFetcherByID( 340 GoogleStreamingRemoteEngine::kUpstreamUrlFetcherIdForTesting); 341 } 342 343 TestURLFetcher* GoogleStreamingRemoteEngineTest::GetDownstreamFetcher() { 344 return url_fetcher_factory_.GetFetcherByID( 345 GoogleStreamingRemoteEngine::kDownstreamUrlFetcherIdForTesting); 346 } 347 348 // Starts recognition on the engine, ensuring that both stream fetchers are 349 // created. 350 void GoogleStreamingRemoteEngineTest::StartMockRecognition() { 351 DCHECK(engine_under_test_.get()); 352 353 ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); 354 355 engine_under_test_->StartRecognition(); 356 ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); 357 358 TestURLFetcher* upstream_fetcher = GetUpstreamFetcher(); 359 ASSERT_TRUE(upstream_fetcher); 360 upstream_fetcher->set_url(upstream_fetcher->GetOriginalURL()); 361 362 TestURLFetcher* downstream_fetcher = GetDownstreamFetcher(); 363 ASSERT_TRUE(downstream_fetcher); 364 downstream_fetcher->set_url(downstream_fetcher->GetOriginalURL()); 365 } 366 367 void GoogleStreamingRemoteEngineTest::EndMockRecognition() { 368 DCHECK(engine_under_test_.get()); 369 engine_under_test_->EndRecognition(); 370 ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); 371 372 // TODO(primiano): In order to be very pedantic we should check that both the 373 // upstream and downstream URL fetchers have been disposed at this time. 374 // Unfortunately it seems that there is no direct way to detect (in tests) 375 // if a url_fetcher has been freed or not, since they are not automatically 376 // de-registered from the TestURLFetcherFactory on destruction. 377 } 378 379 void GoogleStreamingRemoteEngineTest::InjectDummyAudioChunk() { 380 unsigned char dummy_audio_buffer_data[2] = {'\0', '\0'}; 381 scoped_refptr<AudioChunk> dummy_audio_chunk( 382 new AudioChunk(&dummy_audio_buffer_data[0], 383 sizeof(dummy_audio_buffer_data), 384 2 /* bytes per sample */)); 385 DCHECK(engine_under_test_.get()); 386 engine_under_test_->TakeAudioChunk(*dummy_audio_chunk.get()); 387 } 388 389 size_t GoogleStreamingRemoteEngineTest::UpstreamChunksUploadedFromLastCall() { 390 TestURLFetcher* upstream_fetcher = GetUpstreamFetcher(); 391 DCHECK(upstream_fetcher); 392 const size_t number_of_chunks = upstream_fetcher->upload_chunks().size(); 393 DCHECK_GE(number_of_chunks, last_number_of_upstream_chunks_seen_); 394 const size_t new_chunks = number_of_chunks - 395 last_number_of_upstream_chunks_seen_; 396 last_number_of_upstream_chunks_seen_ = number_of_chunks; 397 return new_chunks; 398 } 399 400 void GoogleStreamingRemoteEngineTest::ProvideMockProtoResultDownstream( 401 const proto::SpeechRecognitionEvent& result) { 402 TestURLFetcher* downstream_fetcher = GetDownstreamFetcher(); 403 404 ASSERT_TRUE(downstream_fetcher); 405 downstream_fetcher->set_status(URLRequestStatus(/* default=SUCCESS */)); 406 downstream_fetcher->set_response_code(200); 407 408 std::string response_string = SerializeProtobufResponse(result); 409 response_buffer_.append(response_string); 410 downstream_fetcher->SetResponseString(response_buffer_); 411 downstream_fetcher->delegate()->OnURLFetchDownloadProgress( 412 downstream_fetcher, 413 response_buffer_.size(), 414 -1 /* total response length not used */); 415 } 416 417 void GoogleStreamingRemoteEngineTest::ProvideMockResultDownstream( 418 const SpeechRecognitionResult& result) { 419 proto::SpeechRecognitionEvent proto_event; 420 proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS); 421 proto::SpeechRecognitionResult* proto_result = proto_event.add_result(); 422 proto_result->set_final(!result.is_provisional); 423 for (size_t i = 0; i < result.hypotheses.size(); ++i) { 424 proto::SpeechRecognitionAlternative* proto_alternative = 425 proto_result->add_alternative(); 426 const SpeechRecognitionHypothesis& hypothesis = result.hypotheses[i]; 427 proto_alternative->set_confidence(hypothesis.confidence); 428 proto_alternative->set_transcript(base::UTF16ToUTF8(hypothesis.utterance)); 429 } 430 ProvideMockProtoResultDownstream(proto_event); 431 } 432 433 void GoogleStreamingRemoteEngineTest::CloseMockDownstream( 434 DownstreamError error) { 435 TestURLFetcher* downstream_fetcher = GetDownstreamFetcher(); 436 ASSERT_TRUE(downstream_fetcher); 437 438 const URLRequestStatus::Status fetcher_status = 439 (error == DOWNSTREAM_ERROR_NETWORK) ? URLRequestStatus::FAILED : 440 URLRequestStatus::SUCCESS; 441 downstream_fetcher->set_status(URLRequestStatus(fetcher_status, 0)); 442 downstream_fetcher->set_response_code( 443 (error == DOWNSTREAM_ERROR_HTTP500) ? 500 : 200); 444 445 if (error == DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH) { 446 // Send empty response. 447 proto::SpeechRecognitionEvent response; 448 response_buffer_.append(SerializeProtobufResponse(response)); 449 } 450 downstream_fetcher->SetResponseString(response_buffer_); 451 downstream_fetcher->delegate()->OnURLFetchComplete(downstream_fetcher); 452 } 453 454 void GoogleStreamingRemoteEngineTest::ExpectResultsReceived( 455 const SpeechRecognitionResults& results) { 456 ASSERT_GE(1U, results_.size()); 457 ASSERT_TRUE(ResultsAreEqual(results, results_.front())); 458 results_.pop(); 459 } 460 461 bool GoogleStreamingRemoteEngineTest::ResultsAreEqual( 462 const SpeechRecognitionResults& a, const SpeechRecognitionResults& b) { 463 if (a.size() != b.size()) 464 return false; 465 466 SpeechRecognitionResults::const_iterator it_a = a.begin(); 467 SpeechRecognitionResults::const_iterator it_b = b.begin(); 468 for (; it_a != a.end() && it_b != b.end(); ++it_a, ++it_b) { 469 if (it_a->is_provisional != it_b->is_provisional || 470 it_a->hypotheses.size() != it_b->hypotheses.size()) { 471 return false; 472 } 473 for (size_t i = 0; i < it_a->hypotheses.size(); ++i) { 474 const SpeechRecognitionHypothesis& hyp_a = it_a->hypotheses[i]; 475 const SpeechRecognitionHypothesis& hyp_b = it_b->hypotheses[i]; 476 if (hyp_a.utterance != hyp_b.utterance || 477 hyp_a.confidence != hyp_b.confidence) { 478 return false; 479 } 480 } 481 } 482 483 return true; 484 } 485 486 std::string GoogleStreamingRemoteEngineTest::SerializeProtobufResponse( 487 const proto::SpeechRecognitionEvent& msg) { 488 std::string msg_string; 489 msg.SerializeToString(&msg_string); 490 491 // Prepend 4 byte prefix length indication to the protobuf message as 492 // envisaged by the google streaming recognition webservice protocol. 493 uint32 prefix = HostToNet32(checked_cast<uint32>(msg_string.size())); 494 msg_string.insert(0, reinterpret_cast<char*>(&prefix), sizeof(prefix)); 495 496 return msg_string; 497 } 498 499 } // namespace content 500