Home | History | Annotate | Download | only in speech
      1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <vector>
      6 
      7 #include "content/browser/browser_thread_impl.h"
      8 #include "content/browser/speech/google_one_shot_remote_engine.h"
      9 #include "content/browser/speech/speech_recognizer_impl.h"
     10 #include "content/public/browser/speech_recognition_event_listener.h"
     11 #include "media/audio/audio_manager_base.h"
     12 #include "media/audio/fake_audio_input_stream.h"
     13 #include "media/audio/fake_audio_output_stream.h"
     14 #include "media/audio/mock_audio_manager.h"
     15 #include "media/audio/test_audio_input_controller_factory.h"
     16 #include "net/base/net_errors.h"
     17 #include "net/url_request/test_url_fetcher_factory.h"
     18 #include "net/url_request/url_request_status.h"
     19 #include "testing/gtest/include/gtest/gtest.h"
     20 
     21 using base::MessageLoopProxy;
     22 using media::AudioInputController;
     23 using media::AudioInputStream;
     24 using media::AudioManager;
     25 using media::AudioOutputStream;
     26 using media::AudioParameters;
     27 using media::TestAudioInputController;
     28 using media::TestAudioInputControllerFactory;
     29 
     30 namespace content {
     31 
     32 class SpeechRecognizerImplTest : public SpeechRecognitionEventListener,
     33                                  public testing::Test {
     34  public:
     35   SpeechRecognizerImplTest()
     36       : io_thread_(BrowserThread::IO, &message_loop_),
     37         recognition_started_(false),
     38         recognition_ended_(false),
     39         result_received_(false),
     40         audio_started_(false),
     41         audio_ended_(false),
     42         sound_started_(false),
     43         sound_ended_(false),
     44         error_(SPEECH_RECOGNITION_ERROR_NONE),
     45         volume_(-1.0f) {
     46     // SpeechRecognizer takes ownership of sr_engine.
     47     SpeechRecognitionEngine* sr_engine =
     48         new GoogleOneShotRemoteEngine(NULL /* URLRequestContextGetter */);
     49     SpeechRecognitionEngineConfig config;
     50     config.audio_num_bits_per_sample =
     51         SpeechRecognizerImpl::kNumBitsPerAudioSample;
     52     config.audio_sample_rate = SpeechRecognizerImpl::kAudioSampleRate;
     53     config.filter_profanities = false;
     54     sr_engine->SetConfig(config);
     55 
     56     const int kTestingSessionId = 1;
     57     const bool kOneShotMode = true;
     58     recognizer_ = new SpeechRecognizerImpl(
     59         this, kTestingSessionId, kOneShotMode, sr_engine);
     60     audio_manager_.reset(new media::MockAudioManager(
     61         base::MessageLoop::current()->message_loop_proxy().get()));
     62     recognizer_->SetAudioManagerForTesting(audio_manager_.get());
     63 
     64     int audio_packet_length_bytes =
     65         (SpeechRecognizerImpl::kAudioSampleRate *
     66          GoogleOneShotRemoteEngine::kAudioPacketIntervalMs *
     67          ChannelLayoutToChannelCount(SpeechRecognizerImpl::kChannelLayout) *
     68          SpeechRecognizerImpl::kNumBitsPerAudioSample) / (8 * 1000);
     69     audio_packet_.resize(audio_packet_length_bytes);
     70   }
     71 
     72   void CheckEventsConsistency() {
     73     // Note: "!x || y" == "x implies y".
     74     EXPECT_TRUE(!recognition_ended_ || recognition_started_);
     75     EXPECT_TRUE(!audio_ended_ || audio_started_);
     76     EXPECT_TRUE(!sound_ended_ || sound_started_);
     77     EXPECT_TRUE(!audio_started_ || recognition_started_);
     78     EXPECT_TRUE(!sound_started_ || audio_started_);
     79     EXPECT_TRUE(!audio_ended_ || (sound_ended_ || !sound_started_));
     80     EXPECT_TRUE(!recognition_ended_ || (audio_ended_ || !audio_started_));
     81   }
     82 
     83   void CheckFinalEventsConsistency() {
     84     // Note: "!(x ^ y)" == "(x && y) || (!x && !x)".
     85     EXPECT_FALSE(recognition_started_ ^ recognition_ended_);
     86     EXPECT_FALSE(audio_started_ ^ audio_ended_);
     87     EXPECT_FALSE(sound_started_ ^ sound_ended_);
     88   }
     89 
     90   // Overridden from SpeechRecognitionEventListener:
     91   virtual void OnAudioStart(int session_id) OVERRIDE {
     92     audio_started_ = true;
     93     CheckEventsConsistency();
     94   }
     95 
     96   virtual void OnAudioEnd(int session_id) OVERRIDE {
     97     audio_ended_ = true;
     98     CheckEventsConsistency();
     99   }
    100 
    101   virtual void OnRecognitionResults(
    102       int session_id, const SpeechRecognitionResults& results) OVERRIDE {
    103     result_received_ = true;
    104   }
    105 
    106   virtual void OnRecognitionError(
    107       int session_id, const SpeechRecognitionError& error) OVERRIDE {
    108     EXPECT_TRUE(recognition_started_);
    109     EXPECT_FALSE(recognition_ended_);
    110     error_ = error.code;
    111   }
    112 
    113   virtual void OnAudioLevelsChange(int session_id, float volume,
    114                                    float noise_volume) OVERRIDE {
    115     volume_ = volume;
    116     noise_volume_ = noise_volume;
    117   }
    118 
    119   virtual void OnRecognitionEnd(int session_id) OVERRIDE {
    120     recognition_ended_ = true;
    121     CheckEventsConsistency();
    122   }
    123 
    124   virtual void OnRecognitionStart(int session_id) OVERRIDE {
    125     recognition_started_ = true;
    126     CheckEventsConsistency();
    127   }
    128 
    129   virtual void OnEnvironmentEstimationComplete(int session_id) OVERRIDE {}
    130 
    131   virtual void OnSoundStart(int session_id) OVERRIDE {
    132     sound_started_ = true;
    133     CheckEventsConsistency();
    134   }
    135 
    136   virtual void OnSoundEnd(int session_id) OVERRIDE {
    137     sound_ended_ = true;
    138     CheckEventsConsistency();
    139   }
    140 
    141   // testing::Test methods.
    142   virtual void SetUp() OVERRIDE {
    143     AudioInputController::set_factory_for_testing(
    144         &audio_input_controller_factory_);
    145   }
    146 
    147   virtual void TearDown() OVERRIDE {
    148     AudioInputController::set_factory_for_testing(NULL);
    149   }
    150 
    151   void FillPacketWithTestWaveform() {
    152     // Fill the input with a simple pattern, a 125Hz sawtooth waveform.
    153     for (size_t i = 0; i < audio_packet_.size(); ++i)
    154       audio_packet_[i] = static_cast<uint8>(i);
    155   }
    156 
    157   void FillPacketWithNoise() {
    158     int value = 0;
    159     int factor = 175;
    160     for (size_t i = 0; i < audio_packet_.size(); ++i) {
    161       value += factor;
    162       audio_packet_[i] = value % 100;
    163     }
    164   }
    165 
    166  protected:
    167   base::MessageLoopForIO message_loop_;
    168   BrowserThreadImpl io_thread_;
    169   scoped_refptr<SpeechRecognizerImpl> recognizer_;
    170   scoped_ptr<AudioManager> audio_manager_;
    171   bool recognition_started_;
    172   bool recognition_ended_;
    173   bool result_received_;
    174   bool audio_started_;
    175   bool audio_ended_;
    176   bool sound_started_;
    177   bool sound_ended_;
    178   SpeechRecognitionErrorCode error_;
    179   net::TestURLFetcherFactory url_fetcher_factory_;
    180   TestAudioInputControllerFactory audio_input_controller_factory_;
    181   std::vector<uint8> audio_packet_;
    182   float volume_;
    183   float noise_volume_;
    184 };
    185 
    186 TEST_F(SpeechRecognizerImplTest, StopNoData) {
    187   // Check for callbacks when stopping record before any audio gets recorded.
    188   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
    189   recognizer_->StopAudioCapture();
    190   base::MessageLoop::current()->RunUntilIdle();
    191   EXPECT_TRUE(recognition_started_);
    192   EXPECT_FALSE(audio_started_);
    193   EXPECT_FALSE(result_received_);
    194   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
    195   CheckFinalEventsConsistency();
    196 }
    197 
    198 TEST_F(SpeechRecognizerImplTest, CancelNoData) {
    199   // Check for callbacks when canceling recognition before any audio gets
    200   // recorded.
    201   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
    202   recognizer_->AbortRecognition();
    203   base::MessageLoop::current()->RunUntilIdle();
    204   EXPECT_TRUE(recognition_started_);
    205   EXPECT_FALSE(audio_started_);
    206   EXPECT_FALSE(result_received_);
    207   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_ABORTED, error_);
    208   CheckFinalEventsConsistency();
    209 }
    210 
    211 TEST_F(SpeechRecognizerImplTest, StopWithData) {
    212   // Start recording, give some data and then stop. This should wait for the
    213   // network callback to arrive before completion.
    214   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
    215   base::MessageLoop::current()->RunUntilIdle();
    216   TestAudioInputController* controller =
    217       audio_input_controller_factory_.controller();
    218   ASSERT_TRUE(controller);
    219 
    220   // Try sending 5 chunks of mock audio data and verify that each of them
    221   // resulted immediately in a packet sent out via the network. This verifies
    222   // that we are streaming out encoded data as chunks without waiting for the
    223   // full recording to complete.
    224   const size_t kNumChunks = 5;
    225   for (size_t i = 0; i < kNumChunks; ++i) {
    226     controller->event_handler()->OnData(controller, &audio_packet_[0],
    227                                         audio_packet_.size());
    228     base::MessageLoop::current()->RunUntilIdle();
    229     net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
    230     ASSERT_TRUE(fetcher);
    231     EXPECT_EQ(i + 1, fetcher->upload_chunks().size());
    232   }
    233 
    234   recognizer_->StopAudioCapture();
    235   base::MessageLoop::current()->RunUntilIdle();
    236   EXPECT_TRUE(audio_started_);
    237   EXPECT_TRUE(audio_ended_);
    238   EXPECT_FALSE(recognition_ended_);
    239   EXPECT_FALSE(result_received_);
    240   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
    241 
    242   // Issue the network callback to complete the process.
    243   net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
    244   ASSERT_TRUE(fetcher);
    245 
    246   fetcher->set_url(fetcher->GetOriginalURL());
    247   net::URLRequestStatus status;
    248   status.set_status(net::URLRequestStatus::SUCCESS);
    249   fetcher->set_status(status);
    250   fetcher->set_response_code(200);
    251   fetcher->SetResponseString(
    252       "{\"status\":0,\"hypotheses\":[{\"utterance\":\"123\"}]}");
    253   fetcher->delegate()->OnURLFetchComplete(fetcher);
    254   base::MessageLoop::current()->RunUntilIdle();
    255   EXPECT_TRUE(recognition_ended_);
    256   EXPECT_TRUE(result_received_);
    257   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
    258   CheckFinalEventsConsistency();
    259 }
    260 
    261 TEST_F(SpeechRecognizerImplTest, CancelWithData) {
    262   // Start recording, give some data and then cancel.
    263   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
    264   base::MessageLoop::current()->RunUntilIdle();
    265   TestAudioInputController* controller =
    266       audio_input_controller_factory_.controller();
    267   ASSERT_TRUE(controller);
    268   controller->event_handler()->OnData(controller, &audio_packet_[0],
    269                                       audio_packet_.size());
    270   base::MessageLoop::current()->RunUntilIdle();
    271   recognizer_->AbortRecognition();
    272   base::MessageLoop::current()->RunUntilIdle();
    273   ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0));
    274   EXPECT_TRUE(recognition_started_);
    275   EXPECT_TRUE(audio_started_);
    276   EXPECT_FALSE(result_received_);
    277   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_ABORTED, error_);
    278   CheckFinalEventsConsistency();
    279 }
    280 
    281 TEST_F(SpeechRecognizerImplTest, ConnectionError) {
    282   // Start recording, give some data and then stop. Issue the network callback
    283   // with a connection error and verify that the recognizer bubbles the error up
    284   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
    285   base::MessageLoop::current()->RunUntilIdle();
    286   TestAudioInputController* controller =
    287       audio_input_controller_factory_.controller();
    288   ASSERT_TRUE(controller);
    289   controller->event_handler()->OnData(controller, &audio_packet_[0],
    290                                       audio_packet_.size());
    291   base::MessageLoop::current()->RunUntilIdle();
    292   net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
    293   ASSERT_TRUE(fetcher);
    294 
    295   recognizer_->StopAudioCapture();
    296   base::MessageLoop::current()->RunUntilIdle();
    297   EXPECT_TRUE(audio_started_);
    298   EXPECT_TRUE(audio_ended_);
    299   EXPECT_FALSE(recognition_ended_);
    300   EXPECT_FALSE(result_received_);
    301   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
    302 
    303   // Issue the network callback to complete the process.
    304   fetcher->set_url(fetcher->GetOriginalURL());
    305   net::URLRequestStatus status;
    306   status.set_status(net::URLRequestStatus::FAILED);
    307   status.set_error(net::ERR_CONNECTION_REFUSED);
    308   fetcher->set_status(status);
    309   fetcher->set_response_code(0);
    310   fetcher->SetResponseString(std::string());
    311   fetcher->delegate()->OnURLFetchComplete(fetcher);
    312   base::MessageLoop::current()->RunUntilIdle();
    313   EXPECT_TRUE(recognition_ended_);
    314   EXPECT_FALSE(result_received_);
    315   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
    316   CheckFinalEventsConsistency();
    317 }
    318 
    319 TEST_F(SpeechRecognizerImplTest, ServerError) {
    320   // Start recording, give some data and then stop. Issue the network callback
    321   // with a 500 error and verify that the recognizer bubbles the error up
    322   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
    323   base::MessageLoop::current()->RunUntilIdle();
    324   TestAudioInputController* controller =
    325       audio_input_controller_factory_.controller();
    326   ASSERT_TRUE(controller);
    327   controller->event_handler()->OnData(controller, &audio_packet_[0],
    328                                       audio_packet_.size());
    329   base::MessageLoop::current()->RunUntilIdle();
    330   net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
    331   ASSERT_TRUE(fetcher);
    332 
    333   recognizer_->StopAudioCapture();
    334   base::MessageLoop::current()->RunUntilIdle();
    335   EXPECT_TRUE(audio_started_);
    336   EXPECT_TRUE(audio_ended_);
    337   EXPECT_FALSE(recognition_ended_);
    338   EXPECT_FALSE(result_received_);
    339   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
    340 
    341   // Issue the network callback to complete the process.
    342   fetcher->set_url(fetcher->GetOriginalURL());
    343   net::URLRequestStatus status;
    344   status.set_status(net::URLRequestStatus::SUCCESS);
    345   fetcher->set_status(status);
    346   fetcher->set_response_code(500);
    347   fetcher->SetResponseString("Internal Server Error");
    348   fetcher->delegate()->OnURLFetchComplete(fetcher);
    349   base::MessageLoop::current()->RunUntilIdle();
    350   EXPECT_TRUE(recognition_ended_);
    351   EXPECT_FALSE(result_received_);
    352   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
    353   CheckFinalEventsConsistency();
    354 }
    355 
    356 TEST_F(SpeechRecognizerImplTest, AudioControllerErrorNoData) {
    357   // Check if things tear down properly if AudioInputController threw an error.
    358   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
    359   base::MessageLoop::current()->RunUntilIdle();
    360   TestAudioInputController* controller =
    361       audio_input_controller_factory_.controller();
    362   ASSERT_TRUE(controller);
    363   controller->event_handler()->OnError(controller);
    364   base::MessageLoop::current()->RunUntilIdle();
    365   EXPECT_TRUE(recognition_started_);
    366   EXPECT_FALSE(audio_started_);
    367   EXPECT_FALSE(result_received_);
    368   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_AUDIO, error_);
    369   CheckFinalEventsConsistency();
    370 }
    371 
    372 TEST_F(SpeechRecognizerImplTest, AudioControllerErrorWithData) {
    373   // Check if things tear down properly if AudioInputController threw an error
    374   // after giving some audio data.
    375   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
    376   base::MessageLoop::current()->RunUntilIdle();
    377   TestAudioInputController* controller =
    378       audio_input_controller_factory_.controller();
    379   ASSERT_TRUE(controller);
    380   controller->event_handler()->OnData(controller, &audio_packet_[0],
    381                                       audio_packet_.size());
    382   controller->event_handler()->OnError(controller);
    383   base::MessageLoop::current()->RunUntilIdle();
    384   ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0));
    385   EXPECT_TRUE(recognition_started_);
    386   EXPECT_TRUE(audio_started_);
    387   EXPECT_FALSE(result_received_);
    388   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_AUDIO, error_);
    389   CheckFinalEventsConsistency();
    390 }
    391 
    392 TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackIssued) {
    393   // Start recording and give a lot of packets with audio samples set to zero.
    394   // This should trigger the no-speech detector and issue a callback.
    395   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
    396   base::MessageLoop::current()->RunUntilIdle();
    397   TestAudioInputController* controller =
    398       audio_input_controller_factory_.controller();
    399   ASSERT_TRUE(controller);
    400 
    401   int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) /
    402                      GoogleOneShotRemoteEngine::kAudioPacketIntervalMs + 1;
    403   // The vector is already filled with zero value samples on create.
    404   for (int i = 0; i < num_packets; ++i) {
    405     controller->event_handler()->OnData(controller, &audio_packet_[0],
    406                                         audio_packet_.size());
    407   }
    408   base::MessageLoop::current()->RunUntilIdle();
    409   EXPECT_TRUE(recognition_started_);
    410   EXPECT_TRUE(audio_started_);
    411   EXPECT_FALSE(result_received_);
    412   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NO_SPEECH, error_);
    413   CheckFinalEventsConsistency();
    414 }
    415 
    416 TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackNotIssued) {
    417   // Start recording and give a lot of packets with audio samples set to zero
    418   // and then some more with reasonably loud audio samples. This should be
    419   // treated as normal speech input and the no-speech detector should not get
    420   // triggered.
    421   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
    422   base::MessageLoop::current()->RunUntilIdle();
    423   TestAudioInputController* controller =
    424       audio_input_controller_factory_.controller();
    425   ASSERT_TRUE(controller);
    426   controller = audio_input_controller_factory_.controller();
    427   ASSERT_TRUE(controller);
    428 
    429   int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) /
    430                      GoogleOneShotRemoteEngine::kAudioPacketIntervalMs;
    431 
    432   // The vector is already filled with zero value samples on create.
    433   for (int i = 0; i < num_packets / 2; ++i) {
    434     controller->event_handler()->OnData(controller, &audio_packet_[0],
    435                                         audio_packet_.size());
    436   }
    437 
    438   FillPacketWithTestWaveform();
    439   for (int i = 0; i < num_packets / 2; ++i) {
    440     controller->event_handler()->OnData(controller, &audio_packet_[0],
    441                                         audio_packet_.size());
    442   }
    443 
    444   base::MessageLoop::current()->RunUntilIdle();
    445   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
    446   EXPECT_TRUE(audio_started_);
    447   EXPECT_FALSE(audio_ended_);
    448   EXPECT_FALSE(recognition_ended_);
    449   recognizer_->AbortRecognition();
    450   base::MessageLoop::current()->RunUntilIdle();
    451   CheckFinalEventsConsistency();
    452 }
    453 
    454 TEST_F(SpeechRecognizerImplTest, SetInputVolumeCallback) {
    455   // Start recording and give a lot of packets with audio samples set to zero
    456   // and then some more with reasonably loud audio samples. Check that we don't
    457   // get the callback during estimation phase, then get zero for the silence
    458   // samples and proper volume for the loud audio.
    459   recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
    460   base::MessageLoop::current()->RunUntilIdle();
    461   TestAudioInputController* controller =
    462       audio_input_controller_factory_.controller();
    463   ASSERT_TRUE(controller);
    464   controller = audio_input_controller_factory_.controller();
    465   ASSERT_TRUE(controller);
    466 
    467   // Feed some samples to begin with for the endpointer to do noise estimation.
    468   int num_packets = SpeechRecognizerImpl::kEndpointerEstimationTimeMs /
    469                     GoogleOneShotRemoteEngine::kAudioPacketIntervalMs;
    470   FillPacketWithNoise();
    471   for (int i = 0; i < num_packets; ++i) {
    472     controller->event_handler()->OnData(controller, &audio_packet_[0],
    473                                         audio_packet_.size());
    474   }
    475   base::MessageLoop::current()->RunUntilIdle();
    476   EXPECT_EQ(-1.0f, volume_);  // No audio volume set yet.
    477 
    478   // The vector is already filled with zero value samples on create.
    479   controller->event_handler()->OnData(controller, &audio_packet_[0],
    480                                       audio_packet_.size());
    481   base::MessageLoop::current()->RunUntilIdle();
    482   EXPECT_FLOAT_EQ(0.74939233f, volume_);
    483 
    484   FillPacketWithTestWaveform();
    485   controller->event_handler()->OnData(controller, &audio_packet_[0],
    486                                       audio_packet_.size());
    487   base::MessageLoop::current()->RunUntilIdle();
    488   EXPECT_NEAR(0.89926866f, volume_, 0.00001f);
    489   EXPECT_FLOAT_EQ(0.75071919f, noise_volume_);
    490 
    491   EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
    492   EXPECT_FALSE(audio_ended_);
    493   EXPECT_FALSE(recognition_ended_);
    494   recognizer_->AbortRecognition();
    495   base::MessageLoop::current()->RunUntilIdle();
    496   CheckFinalEventsConsistency();
    497 }
    498 
    499 }  // namespace content
    500