1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <vector> 6 7 #include "content/browser/browser_thread_impl.h" 8 #include "content/browser/speech/google_one_shot_remote_engine.h" 9 #include "content/browser/speech/speech_recognizer_impl.h" 10 #include "content/public/browser/speech_recognition_event_listener.h" 11 #include "media/audio/audio_manager_base.h" 12 #include "media/audio/fake_audio_input_stream.h" 13 #include "media/audio/fake_audio_output_stream.h" 14 #include "media/audio/mock_audio_manager.h" 15 #include "media/audio/test_audio_input_controller_factory.h" 16 #include "media/base/audio_bus.h" 17 #include "net/base/net_errors.h" 18 #include "net/url_request/test_url_fetcher_factory.h" 19 #include "net/url_request/url_request_status.h" 20 #include "testing/gtest/include/gtest/gtest.h" 21 22 using base::MessageLoopProxy; 23 using media::AudioInputController; 24 using media::AudioInputStream; 25 using media::AudioManager; 26 using media::AudioOutputStream; 27 using media::AudioParameters; 28 using media::TestAudioInputController; 29 using media::TestAudioInputControllerFactory; 30 31 namespace content { 32 33 class SpeechRecognizerImplTest : public SpeechRecognitionEventListener, 34 public testing::Test { 35 public: 36 SpeechRecognizerImplTest() 37 : io_thread_(BrowserThread::IO, &message_loop_), 38 recognition_started_(false), 39 recognition_ended_(false), 40 result_received_(false), 41 audio_started_(false), 42 audio_ended_(false), 43 sound_started_(false), 44 sound_ended_(false), 45 error_(SPEECH_RECOGNITION_ERROR_NONE), 46 volume_(-1.0f) { 47 // SpeechRecognizer takes ownership of sr_engine. 48 SpeechRecognitionEngine* sr_engine = 49 new GoogleOneShotRemoteEngine(NULL /* URLRequestContextGetter */); 50 SpeechRecognitionEngineConfig config; 51 config.audio_num_bits_per_sample = 52 SpeechRecognizerImpl::kNumBitsPerAudioSample; 53 config.audio_sample_rate = SpeechRecognizerImpl::kAudioSampleRate; 54 config.filter_profanities = false; 55 sr_engine->SetConfig(config); 56 57 const int kTestingSessionId = 1; 58 recognizer_ = new SpeechRecognizerImpl( 59 this, kTestingSessionId, false, false, sr_engine); 60 audio_manager_.reset(new media::MockAudioManager( 61 base::MessageLoop::current()->message_loop_proxy().get())); 62 recognizer_->SetAudioManagerForTesting(audio_manager_.get()); 63 64 int audio_packet_length_bytes = 65 (SpeechRecognizerImpl::kAudioSampleRate * 66 GoogleOneShotRemoteEngine::kAudioPacketIntervalMs * 67 ChannelLayoutToChannelCount(SpeechRecognizerImpl::kChannelLayout) * 68 SpeechRecognizerImpl::kNumBitsPerAudioSample) / (8 * 1000); 69 audio_packet_.resize(audio_packet_length_bytes); 70 71 const int channels = 72 ChannelLayoutToChannelCount(SpeechRecognizerImpl::kChannelLayout); 73 bytes_per_sample_ = SpeechRecognizerImpl::kNumBitsPerAudioSample / 8; 74 const int frames = audio_packet_length_bytes / channels / bytes_per_sample_; 75 audio_bus_ = media::AudioBus::Create(channels, frames); 76 audio_bus_->Zero(); 77 } 78 79 void CheckEventsConsistency() { 80 // Note: "!x || y" == "x implies y". 81 EXPECT_TRUE(!recognition_ended_ || recognition_started_); 82 EXPECT_TRUE(!audio_ended_ || audio_started_); 83 EXPECT_TRUE(!sound_ended_ || sound_started_); 84 EXPECT_TRUE(!audio_started_ || recognition_started_); 85 EXPECT_TRUE(!sound_started_ || audio_started_); 86 EXPECT_TRUE(!audio_ended_ || (sound_ended_ || !sound_started_)); 87 EXPECT_TRUE(!recognition_ended_ || (audio_ended_ || !audio_started_)); 88 } 89 90 void CheckFinalEventsConsistency() { 91 // Note: "!(x ^ y)" == "(x && y) || (!x && !x)". 92 EXPECT_FALSE(recognition_started_ ^ recognition_ended_); 93 EXPECT_FALSE(audio_started_ ^ audio_ended_); 94 EXPECT_FALSE(sound_started_ ^ sound_ended_); 95 } 96 97 // Overridden from SpeechRecognitionEventListener: 98 virtual void OnAudioStart(int session_id) OVERRIDE { 99 audio_started_ = true; 100 CheckEventsConsistency(); 101 } 102 103 virtual void OnAudioEnd(int session_id) OVERRIDE { 104 audio_ended_ = true; 105 CheckEventsConsistency(); 106 } 107 108 virtual void OnRecognitionResults( 109 int session_id, const SpeechRecognitionResults& results) OVERRIDE { 110 result_received_ = true; 111 } 112 113 virtual void OnRecognitionError( 114 int session_id, const SpeechRecognitionError& error) OVERRIDE { 115 EXPECT_TRUE(recognition_started_); 116 EXPECT_FALSE(recognition_ended_); 117 error_ = error.code; 118 } 119 120 virtual void OnAudioLevelsChange(int session_id, float volume, 121 float noise_volume) OVERRIDE { 122 volume_ = volume; 123 noise_volume_ = noise_volume; 124 } 125 126 virtual void OnRecognitionEnd(int session_id) OVERRIDE { 127 recognition_ended_ = true; 128 CheckEventsConsistency(); 129 } 130 131 virtual void OnRecognitionStart(int session_id) OVERRIDE { 132 recognition_started_ = true; 133 CheckEventsConsistency(); 134 } 135 136 virtual void OnEnvironmentEstimationComplete(int session_id) OVERRIDE {} 137 138 virtual void OnSoundStart(int session_id) OVERRIDE { 139 sound_started_ = true; 140 CheckEventsConsistency(); 141 } 142 143 virtual void OnSoundEnd(int session_id) OVERRIDE { 144 sound_ended_ = true; 145 CheckEventsConsistency(); 146 } 147 148 // testing::Test methods. 149 virtual void SetUp() OVERRIDE { 150 AudioInputController::set_factory_for_testing( 151 &audio_input_controller_factory_); 152 } 153 154 virtual void TearDown() OVERRIDE { 155 AudioInputController::set_factory_for_testing(NULL); 156 } 157 158 void CopyPacketToAudioBus() { 159 // Copy the created signal into an audio bus in a deinterleaved format. 160 audio_bus_->FromInterleaved( 161 &audio_packet_[0], audio_bus_->frames(), bytes_per_sample_); 162 } 163 164 void FillPacketWithTestWaveform() { 165 // Fill the input with a simple pattern, a 125Hz sawtooth waveform. 166 for (size_t i = 0; i < audio_packet_.size(); ++i) 167 audio_packet_[i] = static_cast<uint8>(i); 168 CopyPacketToAudioBus(); 169 } 170 171 void FillPacketWithNoise() { 172 int value = 0; 173 int factor = 175; 174 for (size_t i = 0; i < audio_packet_.size(); ++i) { 175 value += factor; 176 audio_packet_[i] = value % 100; 177 } 178 CopyPacketToAudioBus(); 179 } 180 181 protected: 182 base::MessageLoopForIO message_loop_; 183 BrowserThreadImpl io_thread_; 184 scoped_refptr<SpeechRecognizerImpl> recognizer_; 185 scoped_ptr<AudioManager> audio_manager_; 186 bool recognition_started_; 187 bool recognition_ended_; 188 bool result_received_; 189 bool audio_started_; 190 bool audio_ended_; 191 bool sound_started_; 192 bool sound_ended_; 193 SpeechRecognitionErrorCode error_; 194 net::TestURLFetcherFactory url_fetcher_factory_; 195 TestAudioInputControllerFactory audio_input_controller_factory_; 196 std::vector<uint8> audio_packet_; 197 scoped_ptr<media::AudioBus> audio_bus_; 198 int bytes_per_sample_; 199 float volume_; 200 float noise_volume_; 201 }; 202 203 TEST_F(SpeechRecognizerImplTest, StopNoData) { 204 // Check for callbacks when stopping record before any audio gets recorded. 205 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 206 recognizer_->StopAudioCapture(); 207 base::MessageLoop::current()->RunUntilIdle(); 208 EXPECT_TRUE(recognition_started_); 209 EXPECT_FALSE(audio_started_); 210 EXPECT_FALSE(result_received_); 211 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 212 CheckFinalEventsConsistency(); 213 } 214 215 TEST_F(SpeechRecognizerImplTest, CancelNoData) { 216 // Check for callbacks when canceling recognition before any audio gets 217 // recorded. 218 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 219 recognizer_->AbortRecognition(); 220 base::MessageLoop::current()->RunUntilIdle(); 221 EXPECT_TRUE(recognition_started_); 222 EXPECT_FALSE(audio_started_); 223 EXPECT_FALSE(result_received_); 224 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_ABORTED, error_); 225 CheckFinalEventsConsistency(); 226 } 227 228 TEST_F(SpeechRecognizerImplTest, StopWithData) { 229 // Start recording, give some data and then stop. This should wait for the 230 // network callback to arrive before completion. 231 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 232 base::MessageLoop::current()->RunUntilIdle(); 233 TestAudioInputController* controller = 234 audio_input_controller_factory_.controller(); 235 ASSERT_TRUE(controller); 236 237 // Try sending 5 chunks of mock audio data and verify that each of them 238 // resulted immediately in a packet sent out via the network. This verifies 239 // that we are streaming out encoded data as chunks without waiting for the 240 // full recording to complete. 241 const size_t kNumChunks = 5; 242 for (size_t i = 0; i < kNumChunks; ++i) { 243 controller->event_handler()->OnData(controller, audio_bus_.get()); 244 base::MessageLoop::current()->RunUntilIdle(); 245 net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); 246 ASSERT_TRUE(fetcher); 247 EXPECT_EQ(i + 1, fetcher->upload_chunks().size()); 248 } 249 250 recognizer_->StopAudioCapture(); 251 base::MessageLoop::current()->RunUntilIdle(); 252 EXPECT_TRUE(audio_started_); 253 EXPECT_TRUE(audio_ended_); 254 EXPECT_FALSE(recognition_ended_); 255 EXPECT_FALSE(result_received_); 256 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 257 258 // Issue the network callback to complete the process. 259 net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); 260 ASSERT_TRUE(fetcher); 261 262 fetcher->set_url(fetcher->GetOriginalURL()); 263 net::URLRequestStatus status; 264 status.set_status(net::URLRequestStatus::SUCCESS); 265 fetcher->set_status(status); 266 fetcher->set_response_code(200); 267 fetcher->SetResponseString( 268 "{\"status\":0,\"hypotheses\":[{\"utterance\":\"123\"}]}"); 269 fetcher->delegate()->OnURLFetchComplete(fetcher); 270 base::MessageLoop::current()->RunUntilIdle(); 271 EXPECT_TRUE(recognition_ended_); 272 EXPECT_TRUE(result_received_); 273 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 274 CheckFinalEventsConsistency(); 275 } 276 277 TEST_F(SpeechRecognizerImplTest, CancelWithData) { 278 // Start recording, give some data and then cancel. 279 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 280 base::MessageLoop::current()->RunUntilIdle(); 281 TestAudioInputController* controller = 282 audio_input_controller_factory_.controller(); 283 ASSERT_TRUE(controller); 284 controller->event_handler()->OnData(controller, audio_bus_.get()); 285 base::MessageLoop::current()->RunUntilIdle(); 286 recognizer_->AbortRecognition(); 287 base::MessageLoop::current()->RunUntilIdle(); 288 ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0)); 289 EXPECT_TRUE(recognition_started_); 290 EXPECT_TRUE(audio_started_); 291 EXPECT_FALSE(result_received_); 292 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_ABORTED, error_); 293 CheckFinalEventsConsistency(); 294 } 295 296 TEST_F(SpeechRecognizerImplTest, ConnectionError) { 297 // Start recording, give some data and then stop. Issue the network callback 298 // with a connection error and verify that the recognizer bubbles the error up 299 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 300 base::MessageLoop::current()->RunUntilIdle(); 301 TestAudioInputController* controller = 302 audio_input_controller_factory_.controller(); 303 ASSERT_TRUE(controller); 304 controller->event_handler()->OnData(controller, audio_bus_.get()); 305 base::MessageLoop::current()->RunUntilIdle(); 306 net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); 307 ASSERT_TRUE(fetcher); 308 309 recognizer_->StopAudioCapture(); 310 base::MessageLoop::current()->RunUntilIdle(); 311 EXPECT_TRUE(audio_started_); 312 EXPECT_TRUE(audio_ended_); 313 EXPECT_FALSE(recognition_ended_); 314 EXPECT_FALSE(result_received_); 315 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 316 317 // Issue the network callback to complete the process. 318 fetcher->set_url(fetcher->GetOriginalURL()); 319 net::URLRequestStatus status; 320 status.set_status(net::URLRequestStatus::FAILED); 321 status.set_error(net::ERR_CONNECTION_REFUSED); 322 fetcher->set_status(status); 323 fetcher->set_response_code(0); 324 fetcher->SetResponseString(std::string()); 325 fetcher->delegate()->OnURLFetchComplete(fetcher); 326 base::MessageLoop::current()->RunUntilIdle(); 327 EXPECT_TRUE(recognition_ended_); 328 EXPECT_FALSE(result_received_); 329 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_); 330 CheckFinalEventsConsistency(); 331 } 332 333 TEST_F(SpeechRecognizerImplTest, ServerError) { 334 // Start recording, give some data and then stop. Issue the network callback 335 // with a 500 error and verify that the recognizer bubbles the error up 336 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 337 base::MessageLoop::current()->RunUntilIdle(); 338 TestAudioInputController* controller = 339 audio_input_controller_factory_.controller(); 340 ASSERT_TRUE(controller); 341 controller->event_handler()->OnData(controller, audio_bus_.get()); 342 base::MessageLoop::current()->RunUntilIdle(); 343 net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); 344 ASSERT_TRUE(fetcher); 345 346 recognizer_->StopAudioCapture(); 347 base::MessageLoop::current()->RunUntilIdle(); 348 EXPECT_TRUE(audio_started_); 349 EXPECT_TRUE(audio_ended_); 350 EXPECT_FALSE(recognition_ended_); 351 EXPECT_FALSE(result_received_); 352 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 353 354 // Issue the network callback to complete the process. 355 fetcher->set_url(fetcher->GetOriginalURL()); 356 net::URLRequestStatus status; 357 status.set_status(net::URLRequestStatus::SUCCESS); 358 fetcher->set_status(status); 359 fetcher->set_response_code(500); 360 fetcher->SetResponseString("Internal Server Error"); 361 fetcher->delegate()->OnURLFetchComplete(fetcher); 362 base::MessageLoop::current()->RunUntilIdle(); 363 EXPECT_TRUE(recognition_ended_); 364 EXPECT_FALSE(result_received_); 365 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_); 366 CheckFinalEventsConsistency(); 367 } 368 369 TEST_F(SpeechRecognizerImplTest, AudioControllerErrorNoData) { 370 // Check if things tear down properly if AudioInputController threw an error. 371 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 372 base::MessageLoop::current()->RunUntilIdle(); 373 TestAudioInputController* controller = 374 audio_input_controller_factory_.controller(); 375 ASSERT_TRUE(controller); 376 controller->event_handler()->OnError(controller, 377 AudioInputController::UNKNOWN_ERROR); 378 base::MessageLoop::current()->RunUntilIdle(); 379 EXPECT_TRUE(recognition_started_); 380 EXPECT_FALSE(audio_started_); 381 EXPECT_FALSE(result_received_); 382 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_AUDIO, error_); 383 CheckFinalEventsConsistency(); 384 } 385 386 TEST_F(SpeechRecognizerImplTest, AudioControllerErrorWithData) { 387 // Check if things tear down properly if AudioInputController threw an error 388 // after giving some audio data. 389 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 390 base::MessageLoop::current()->RunUntilIdle(); 391 TestAudioInputController* controller = 392 audio_input_controller_factory_.controller(); 393 ASSERT_TRUE(controller); 394 controller->event_handler()->OnData(controller, audio_bus_.get()); 395 controller->event_handler()->OnError(controller, 396 AudioInputController::UNKNOWN_ERROR); 397 base::MessageLoop::current()->RunUntilIdle(); 398 ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0)); 399 EXPECT_TRUE(recognition_started_); 400 EXPECT_TRUE(audio_started_); 401 EXPECT_FALSE(result_received_); 402 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_AUDIO, error_); 403 CheckFinalEventsConsistency(); 404 } 405 406 TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackIssued) { 407 // Start recording and give a lot of packets with audio samples set to zero. 408 // This should trigger the no-speech detector and issue a callback. 409 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 410 base::MessageLoop::current()->RunUntilIdle(); 411 TestAudioInputController* controller = 412 audio_input_controller_factory_.controller(); 413 ASSERT_TRUE(controller); 414 415 int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) / 416 GoogleOneShotRemoteEngine::kAudioPacketIntervalMs + 1; 417 // The vector is already filled with zero value samples on create. 418 for (int i = 0; i < num_packets; ++i) { 419 controller->event_handler()->OnData(controller, audio_bus_.get()); 420 } 421 base::MessageLoop::current()->RunUntilIdle(); 422 EXPECT_TRUE(recognition_started_); 423 EXPECT_TRUE(audio_started_); 424 EXPECT_FALSE(result_received_); 425 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NO_SPEECH, error_); 426 CheckFinalEventsConsistency(); 427 } 428 429 TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackNotIssued) { 430 // Start recording and give a lot of packets with audio samples set to zero 431 // and then some more with reasonably loud audio samples. This should be 432 // treated as normal speech input and the no-speech detector should not get 433 // triggered. 434 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 435 base::MessageLoop::current()->RunUntilIdle(); 436 TestAudioInputController* controller = 437 audio_input_controller_factory_.controller(); 438 ASSERT_TRUE(controller); 439 controller = audio_input_controller_factory_.controller(); 440 ASSERT_TRUE(controller); 441 442 int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) / 443 GoogleOneShotRemoteEngine::kAudioPacketIntervalMs; 444 445 // The vector is already filled with zero value samples on create. 446 for (int i = 0; i < num_packets / 2; ++i) { 447 controller->event_handler()->OnData(controller, audio_bus_.get()); 448 } 449 450 FillPacketWithTestWaveform(); 451 for (int i = 0; i < num_packets / 2; ++i) { 452 controller->event_handler()->OnData(controller, audio_bus_.get()); 453 } 454 455 base::MessageLoop::current()->RunUntilIdle(); 456 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 457 EXPECT_TRUE(audio_started_); 458 EXPECT_FALSE(audio_ended_); 459 EXPECT_FALSE(recognition_ended_); 460 recognizer_->AbortRecognition(); 461 base::MessageLoop::current()->RunUntilIdle(); 462 CheckFinalEventsConsistency(); 463 } 464 465 TEST_F(SpeechRecognizerImplTest, SetInputVolumeCallback) { 466 // Start recording and give a lot of packets with audio samples set to zero 467 // and then some more with reasonably loud audio samples. Check that we don't 468 // get the callback during estimation phase, then get zero for the silence 469 // samples and proper volume for the loud audio. 470 recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId); 471 base::MessageLoop::current()->RunUntilIdle(); 472 TestAudioInputController* controller = 473 audio_input_controller_factory_.controller(); 474 ASSERT_TRUE(controller); 475 controller = audio_input_controller_factory_.controller(); 476 ASSERT_TRUE(controller); 477 478 // Feed some samples to begin with for the endpointer to do noise estimation. 479 int num_packets = SpeechRecognizerImpl::kEndpointerEstimationTimeMs / 480 GoogleOneShotRemoteEngine::kAudioPacketIntervalMs; 481 FillPacketWithNoise(); 482 for (int i = 0; i < num_packets; ++i) { 483 controller->event_handler()->OnData(controller, audio_bus_.get()); 484 } 485 base::MessageLoop::current()->RunUntilIdle(); 486 EXPECT_EQ(-1.0f, volume_); // No audio volume set yet. 487 488 // The vector is already filled with zero value samples on create. 489 controller->event_handler()->OnData(controller, audio_bus_.get()); 490 base::MessageLoop::current()->RunUntilIdle(); 491 EXPECT_FLOAT_EQ(0.74939233f, volume_); 492 493 FillPacketWithTestWaveform(); 494 controller->event_handler()->OnData(controller, audio_bus_.get()); 495 base::MessageLoop::current()->RunUntilIdle(); 496 EXPECT_NEAR(0.89926866f, volume_, 0.00001f); 497 EXPECT_FLOAT_EQ(0.75071919f, noise_volume_); 498 499 EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); 500 EXPECT_FALSE(audio_ended_); 501 EXPECT_FALSE(recognition_ended_); 502 recognizer_->AbortRecognition(); 503 base::MessageLoop::current()->RunUntilIdle(); 504 CheckFinalEventsConsistency(); 505 } 506 507 } // namespace content 508