1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <vector> 6 7 #include "content/browser/browser_thread_impl.h" 8 #include "content/browser/speech/google_one_shot_remote_engine.h" 9 #include "content/browser/speech/speech_recognizer_impl.h" 10 #include "content/public/browser/speech_recognition_event_listener.h" 11 #include "media/audio/audio_manager_base.h" 12 #include "media/audio/fake_audio_input_stream.h" 13 #include "media/audio/fake_audio_output_stream.h" 14 #include "media/audio/mock_audio_manager.h" 15 #include "media/audio/test_audio_input_controller_factory.h" 16 #include "net/base/net_errors.h" 17 #include "net/url_request/test_url_fetcher_factory.h" 18 #include "net/url_request/url_request_status.h" 19 #include "testing/gtest/include/gtest/gtest.h" 20 21 using base::MessageLoopProxy; 22 using media::AudioInputController; 23 using media::AudioInputStream; 24 using media::AudioManager; 25 using media::AudioOutputStream; 26 using media::AudioParameters; 27 using media::TestAudioInputController; 28 using media::TestAudioInputControllerFactory; 29 30 namespace content { 31 32 class SpeechRecognizerImplTest : public SpeechRecognitionEventListener, 33 public testing::Test { 34 public: 35 SpeechRecognizerImplTest() 36 : io_thread_(BrowserThread::IO, &message_loop_), 37 recognition_started_(false), 38 recognition_ended_(false), 39 result_received_(false), 40 audio_started_(false), 41 audio_ended_(false), 42 sound_started_(false), 43 sound_ended_(false), 44 error_(SPEECH_RECOGNITION_ERROR_NONE), 45 volume_(-1.0f) { 46 // SpeechRecognizer takes ownership of sr_engine. 47 SpeechRecognitionEngine* sr_engine = 48 new GoogleOneShotRemoteEngine(NULL /* URLRequestContextGetter */); 49 SpeechRecognitionEngineConfig config; 50 config.audio_num_bits_per_sample = 51 SpeechRecognizerImpl::kNumBitsPerAudioSample; 52 config.audio_sample_rate = SpeechRecognizerImpl::kAudioSampleRate; 53 config.filter_profanities = false; 54 sr_engine->SetConfig(config); 55 56 const int kTestingSessionId = 1; 57 const bool kOneShotMode = true; 58 recognizer_ = new SpeechRecognizerImpl( 59 this, kTestingSessionId, kOneShotMode, sr_engine); 60 audio_manager_.reset(new media::MockAudioManager( 61 base::MessageLoop::current()->message_loop_proxy().get())); 62 recognizer_->SetAudioManagerForTesting(audio_manager_.get()); 63 64 int audio_packet_length_bytes = 65 (SpeechRecognizerImpl::kAudioSampleRate * 66 GoogleOneShotRemoteEngine::kAudioPacketIntervalMs * 67 ChannelLayoutToChannelCount(SpeechRecognizerImpl::kChannelLayout) * 68 SpeechRecognizerImpl::kNumBitsPerAudioSample) / (8 * 1000); 69 audio_packet_.resize(audio_packet_length_bytes); 70 } 71 72 void CheckEventsConsistency() { 73 // Note: "!x || y" == "x implies y". 74 EXPECT_TRUE(!recognition_ended_ || recognition_started_); 75 EXPECT_TRUE(!audio_ended_ || audio_started_); 76 EXPECT_TRUE(!sound_ended_ || sound_started_); 77 EXPECT_TRUE(!audio_started_ || recognition_started_); 78 EXPECT_TRUE(!sound_started_ || audio_started_); 79 EXPECT_TRUE(!audio_ended_ || (sound_ended_ || !sound_started_)); 80 EXPECT_TRUE(!recognition_ended_ || (audio_ended_ || !audio_started_)); 81 } 82 83 void CheckFinalEventsConsistency() { 84 // Note: "!(x ^ y)" == "(x && y) || (!x && !x)". 85 EXPECT_FALSE(recognition_started_ ^ recognition_ended_); 86 EXPECT_FALSE(audio_started_ ^ audio_ended_); 87 EXPECT_FALSE(sound_started_ ^ sound_ended_); 88 } 89 90 // Overridden from SpeechRecognitionEventListener: 91 virtual void OnAudioStart(int session_id) OVERRIDE { 92 audio_started_ = true; 93 CheckEventsConsistency(); 94 } 95 96 virtual void OnAudioEnd(int session_id) OVERRIDE { 97 audio_ended_ = true; 98 CheckEventsConsistency(); 99 } 100 101 virtual void OnRecognitionResults( 102 int session_id, const SpeechRecognitionResults& results) OVERRIDE { 103 result_received_ = true; 104 } 105 106 virtual void OnRecognitionError( 107 int session_id, const SpeechRecognitionError& error) OVERRIDE { 108 EXPECT_TRUE(recognition_started_); 109 EXPECT_FALSE(recognition_ended_); 110 error_ = error.code; 111 } 112 113 virtual void OnAudioLevelsChange(int session_id, float volume, 114 float noise_volume) OVERRIDE { 115 volume_ = volume; 116 noise_volume_ = noise_volume; 117 } 118 119 virtual void OnRecognitionEnd(int session_id) OVERRIDE { 120 recognition_ended_ = true; 121 CheckEventsConsistency(); 122 } 123 124 virtual void OnRecognitionStart(int session_id) OVERRIDE { 125 recognition_started_ = true; 126 CheckEventsConsistency(); 127 } 128 129 virtual void OnEnvironmentEstimationComplete(int session_id) OVERRIDE {} 130 131 virtual void OnSoundStart(int session_id) OVERRIDE { 132 sound_started_ = true; 133 CheckEventsConsistency(); 134 } 135 136 virtual void OnSoundEnd(int session_id) OVERRIDE { 137 sound_ended_ = true; 138 CheckEventsConsistency(); 139 } 140 141 // testing::Test methods. 142 virtual void SetUp() OVERRIDE { 143 AudioInputController::set_factory_for_testing( 144 &audio_input_controller_factory_); 145 } 146 147 virtual void TearDown() OVERRIDE { 148 AudioInputController::set_factory_for_testing(NULL); 149 } 150 151 void FillPacketWithTestWaveform() { 152 // Fill the input with a simple pattern, a 125Hz sawtooth waveform. 153 for (size_t i = 0; i < audio_packet_.size(); ++i) 154 audio_packet_[i] = static_cast<uint8>(i); 155 } 156 157 void FillPacketWithNoise() { 158 int value = 0; 159 int factor = 175; 160 for (size_t i = 0; i < audio_packet_.size(); ++i) { 161 value += factor; 162 audio_packet_[i] = value % 100; 163 } 164 } 165 166 protected: 167 base::MessageLoopForIO message_loop_; 168 BrowserThreadImpl io_thread_; 169 scoped_refptr<SpeechRecognizerImpl> recognizer_; 170 scoped_ptr<AudioManager> audio_manager_; 171 bool recognition_started_; 172 bool recognition_ended_; 173 bool result_received_; 174 bool audio_started_; 175 bool audio_ended_; 176 bool sound_started_; 177 bool sound_ended_; 178 SpeechRecognitionErrorCode error_; 179 net::TestURLFetcherFactory url_fetcher_factory_; 180 TestAudioInputControllerFactory audio_input_controller_factory_; 181 std::vector<uint8> audio_packet_; 182 float volume_; 183 float noise_volume_; 184 }; 185 186 TEST_F(SpeechRecognizerImplTest, StopNoData) { 187 // Check for callbacks when stopping record before any audio gets recorded. 188 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 189 recognizer_->StopAudioCapture(); 190 base::MessageLoop::current()->RunUntilIdle(); 191 EXPECT_TRUE(recognition_started_); 192 EXPECT_FALSE(audio_started_); 193 EXPECT_FALSE(result_received_); 194 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 195 CheckFinalEventsConsistency(); 196 } 197 198 TEST_F(SpeechRecognizerImplTest, CancelNoData) { 199 // Check for callbacks when canceling recognition before any audio gets 200 // recorded. 201 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 202 recognizer_->AbortRecognition(); 203 base::MessageLoop::current()->RunUntilIdle(); 204 EXPECT_TRUE(recognition_started_); 205 EXPECT_FALSE(audio_started_); 206 EXPECT_FALSE(result_received_); 207 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_ABORTED, error_); 208 CheckFinalEventsConsistency(); 209 } 210 211 TEST_F(SpeechRecognizerImplTest, StopWithData) { 212 // Start recording, give some data and then stop. This should wait for the 213 // network callback to arrive before completion. 214 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 215 base::MessageLoop::current()->RunUntilIdle(); 216 TestAudioInputController* controller = 217 audio_input_controller_factory_.controller(); 218 ASSERT_TRUE(controller); 219 220 // Try sending 5 chunks of mock audio data and verify that each of them 221 // resulted immediately in a packet sent out via the network. This verifies 222 // that we are streaming out encoded data as chunks without waiting for the 223 // full recording to complete. 224 const size_t kNumChunks = 5; 225 for (size_t i = 0; i < kNumChunks; ++i) { 226 controller->event_handler()->OnData(controller, &audio_packet_[0], 227 audio_packet_.size()); 228 base::MessageLoop::current()->RunUntilIdle(); 229 net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); 230 ASSERT_TRUE(fetcher); 231 EXPECT_EQ(i + 1, fetcher->upload_chunks().size()); 232 } 233 234 recognizer_->StopAudioCapture(); 235 base::MessageLoop::current()->RunUntilIdle(); 236 EXPECT_TRUE(audio_started_); 237 EXPECT_TRUE(audio_ended_); 238 EXPECT_FALSE(recognition_ended_); 239 EXPECT_FALSE(result_received_); 240 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 241 242 // Issue the network callback to complete the process. 243 net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); 244 ASSERT_TRUE(fetcher); 245 246 fetcher->set_url(fetcher->GetOriginalURL()); 247 net::URLRequestStatus status; 248 status.set_status(net::URLRequestStatus::SUCCESS); 249 fetcher->set_status(status); 250 fetcher->set_response_code(200); 251 fetcher->SetResponseString( 252 "{\"status\":0,\"hypotheses\":[{\"utterance\":\"123\"}]}"); 253 fetcher->delegate()->OnURLFetchComplete(fetcher); 254 base::MessageLoop::current()->RunUntilIdle(); 255 EXPECT_TRUE(recognition_ended_); 256 EXPECT_TRUE(result_received_); 257 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 258 CheckFinalEventsConsistency(); 259 } 260 261 TEST_F(SpeechRecognizerImplTest, CancelWithData) { 262 // Start recording, give some data and then cancel. 263 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 264 base::MessageLoop::current()->RunUntilIdle(); 265 TestAudioInputController* controller = 266 audio_input_controller_factory_.controller(); 267 ASSERT_TRUE(controller); 268 controller->event_handler()->OnData(controller, &audio_packet_[0], 269 audio_packet_.size()); 270 base::MessageLoop::current()->RunUntilIdle(); 271 recognizer_->AbortRecognition(); 272 base::MessageLoop::current()->RunUntilIdle(); 273 ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0)); 274 EXPECT_TRUE(recognition_started_); 275 EXPECT_TRUE(audio_started_); 276 EXPECT_FALSE(result_received_); 277 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_ABORTED, error_); 278 CheckFinalEventsConsistency(); 279 } 280 281 TEST_F(SpeechRecognizerImplTest, ConnectionError) { 282 // Start recording, give some data and then stop. Issue the network callback 283 // with a connection error and verify that the recognizer bubbles the error up 284 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 285 base::MessageLoop::current()->RunUntilIdle(); 286 TestAudioInputController* controller = 287 audio_input_controller_factory_.controller(); 288 ASSERT_TRUE(controller); 289 controller->event_handler()->OnData(controller, &audio_packet_[0], 290 audio_packet_.size()); 291 base::MessageLoop::current()->RunUntilIdle(); 292 net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); 293 ASSERT_TRUE(fetcher); 294 295 recognizer_->StopAudioCapture(); 296 base::MessageLoop::current()->RunUntilIdle(); 297 EXPECT_TRUE(audio_started_); 298 EXPECT_TRUE(audio_ended_); 299 EXPECT_FALSE(recognition_ended_); 300 EXPECT_FALSE(result_received_); 301 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 302 303 // Issue the network callback to complete the process. 304 fetcher->set_url(fetcher->GetOriginalURL()); 305 net::URLRequestStatus status; 306 status.set_status(net::URLRequestStatus::FAILED); 307 status.set_error(net::ERR_CONNECTION_REFUSED); 308 fetcher->set_status(status); 309 fetcher->set_response_code(0); 310 fetcher->SetResponseString(std::string()); 311 fetcher->delegate()->OnURLFetchComplete(fetcher); 312 base::MessageLoop::current()->RunUntilIdle(); 313 EXPECT_TRUE(recognition_ended_); 314 EXPECT_FALSE(result_received_); 315 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_); 316 CheckFinalEventsConsistency(); 317 } 318 319 TEST_F(SpeechRecognizerImplTest, ServerError) { 320 // Start recording, give some data and then stop. Issue the network callback 321 // with a 500 error and verify that the recognizer bubbles the error up 322 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 323 base::MessageLoop::current()->RunUntilIdle(); 324 TestAudioInputController* controller = 325 audio_input_controller_factory_.controller(); 326 ASSERT_TRUE(controller); 327 controller->event_handler()->OnData(controller, &audio_packet_[0], 328 audio_packet_.size()); 329 base::MessageLoop::current()->RunUntilIdle(); 330 net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); 331 ASSERT_TRUE(fetcher); 332 333 recognizer_->StopAudioCapture(); 334 base::MessageLoop::current()->RunUntilIdle(); 335 EXPECT_TRUE(audio_started_); 336 EXPECT_TRUE(audio_ended_); 337 EXPECT_FALSE(recognition_ended_); 338 EXPECT_FALSE(result_received_); 339 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 340 341 // Issue the network callback to complete the process. 342 fetcher->set_url(fetcher->GetOriginalURL()); 343 net::URLRequestStatus status; 344 status.set_status(net::URLRequestStatus::SUCCESS); 345 fetcher->set_status(status); 346 fetcher->set_response_code(500); 347 fetcher->SetResponseString("Internal Server Error"); 348 fetcher->delegate()->OnURLFetchComplete(fetcher); 349 base::MessageLoop::current()->RunUntilIdle(); 350 EXPECT_TRUE(recognition_ended_); 351 EXPECT_FALSE(result_received_); 352 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_); 353 CheckFinalEventsConsistency(); 354 } 355 356 TEST_F(SpeechRecognizerImplTest, AudioControllerErrorNoData) { 357 // Check if things tear down properly if AudioInputController threw an error. 358 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 359 base::MessageLoop::current()->RunUntilIdle(); 360 TestAudioInputController* controller = 361 audio_input_controller_factory_.controller(); 362 ASSERT_TRUE(controller); 363 controller->event_handler()->OnError(controller); 364 base::MessageLoop::current()->RunUntilIdle(); 365 EXPECT_TRUE(recognition_started_); 366 EXPECT_FALSE(audio_started_); 367 EXPECT_FALSE(result_received_); 368 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_AUDIO, error_); 369 CheckFinalEventsConsistency(); 370 } 371 372 TEST_F(SpeechRecognizerImplTest, AudioControllerErrorWithData) { 373 // Check if things tear down properly if AudioInputController threw an error 374 // after giving some audio data. 375 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 376 base::MessageLoop::current()->RunUntilIdle(); 377 TestAudioInputController* controller = 378 audio_input_controller_factory_.controller(); 379 ASSERT_TRUE(controller); 380 controller->event_handler()->OnData(controller, &audio_packet_[0], 381 audio_packet_.size()); 382 controller->event_handler()->OnError(controller); 383 base::MessageLoop::current()->RunUntilIdle(); 384 ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0)); 385 EXPECT_TRUE(recognition_started_); 386 EXPECT_TRUE(audio_started_); 387 EXPECT_FALSE(result_received_); 388 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_AUDIO, error_); 389 CheckFinalEventsConsistency(); 390 } 391 392 TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackIssued) { 393 // Start recording and give a lot of packets with audio samples set to zero. 394 // This should trigger the no-speech detector and issue a callback. 395 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 396 base::MessageLoop::current()->RunUntilIdle(); 397 TestAudioInputController* controller = 398 audio_input_controller_factory_.controller(); 399 ASSERT_TRUE(controller); 400 401 int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) / 402 GoogleOneShotRemoteEngine::kAudioPacketIntervalMs + 1; 403 // The vector is already filled with zero value samples on create. 404 for (int i = 0; i < num_packets; ++i) { 405 controller->event_handler()->OnData(controller, &audio_packet_[0], 406 audio_packet_.size()); 407 } 408 base::MessageLoop::current()->RunUntilIdle(); 409 EXPECT_TRUE(recognition_started_); 410 EXPECT_TRUE(audio_started_); 411 EXPECT_FALSE(result_received_); 412 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NO_SPEECH, error_); 413 CheckFinalEventsConsistency(); 414 } 415 416 TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackNotIssued) { 417 // Start recording and give a lot of packets with audio samples set to zero 418 // and then some more with reasonably loud audio samples. This should be 419 // treated as normal speech input and the no-speech detector should not get 420 // triggered. 421 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 422 base::MessageLoop::current()->RunUntilIdle(); 423 TestAudioInputController* controller = 424 audio_input_controller_factory_.controller(); 425 ASSERT_TRUE(controller); 426 controller = audio_input_controller_factory_.controller(); 427 ASSERT_TRUE(controller); 428 429 int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) / 430 GoogleOneShotRemoteEngine::kAudioPacketIntervalMs; 431 432 // The vector is already filled with zero value samples on create. 433 for (int i = 0; i < num_packets / 2; ++i) { 434 controller->event_handler()->OnData(controller, &audio_packet_[0], 435 audio_packet_.size()); 436 } 437 438 FillPacketWithTestWaveform(); 439 for (int i = 0; i < num_packets / 2; ++i) { 440 controller->event_handler()->OnData(controller, &audio_packet_[0], 441 audio_packet_.size()); 442 } 443 444 base::MessageLoop::current()->RunUntilIdle(); 445 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 446 EXPECT_TRUE(audio_started_); 447 EXPECT_FALSE(audio_ended_); 448 EXPECT_FALSE(recognition_ended_); 449 recognizer_->AbortRecognition(); 450 base::MessageLoop::current()->RunUntilIdle(); 451 CheckFinalEventsConsistency(); 452 } 453 454 TEST_F(SpeechRecognizerImplTest, SetInputVolumeCallback) { 455 // Start recording and give a lot of packets with audio samples set to zero 456 // and then some more with reasonably loud audio samples. Check that we don't 457 // get the callback during estimation phase, then get zero for the silence 458 // samples and proper volume for the loud audio. 459 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 460 base::MessageLoop::current()->RunUntilIdle(); 461 TestAudioInputController* controller = 462 audio_input_controller_factory_.controller(); 463 ASSERT_TRUE(controller); 464 controller = audio_input_controller_factory_.controller(); 465 ASSERT_TRUE(controller); 466 467 // Feed some samples to begin with for the endpointer to do noise estimation. 468 int num_packets = SpeechRecognizerImpl::kEndpointerEstimationTimeMs / 469 GoogleOneShotRemoteEngine::kAudioPacketIntervalMs; 470 FillPacketWithNoise(); 471 for (int i = 0; i < num_packets; ++i) { 472 controller->event_handler()->OnData(controller, &audio_packet_[0], 473 audio_packet_.size()); 474 } 475 base::MessageLoop::current()->RunUntilIdle(); 476 EXPECT_EQ(-1.0f, volume_); // No audio volume set yet. 477 478 // The vector is already filled with zero value samples on create. 479 controller->event_handler()->OnData(controller, &audio_packet_[0], 480 audio_packet_.size()); 481 base::MessageLoop::current()->RunUntilIdle(); 482 EXPECT_FLOAT_EQ(0.74939233f, volume_); 483 484 FillPacketWithTestWaveform(); 485 controller->event_handler()->OnData(controller, &audio_packet_[0], 486 audio_packet_.size()); 487 base::MessageLoop::current()->RunUntilIdle(); 488 EXPECT_NEAR(0.89926866f, volume_, 0.00001f); 489 EXPECT_FLOAT_EQ(0.75071919f, noise_volume_); 490 491 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 492 EXPECT_FALSE(audio_ended_); 493 EXPECT_FALSE(recognition_ended_); 494 recognizer_->AbortRecognition(); 495 base::MessageLoop::current()->RunUntilIdle(); 496 CheckFinalEventsConsistency(); 497 } 498 499 } // namespace content 500