// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/browser/speech/speech_recognizer_impl.h"

#include "base/basictypes.h"
#include "base/bind.h"
#include "base/time/time.h"
#include "content/browser/browser_main_loop.h"
#include "content/browser/speech/audio_buffer.h"
#include "content/browser/speech/google_one_shot_remote_engine.h"
#include "content/public/browser/speech_recognition_event_listener.h"
#include "media/base/audio_converter.h"
#include "net/url_request/url_request_context_getter.h"

#if defined(OS_WIN)
#include "media/audio/win/core_audio_util_win.h"
#endif

using media::AudioBus;
using media::AudioConverter;
using media::AudioInputController;
using media::AudioManager;
using media::AudioParameters;
using media::ChannelLayout;

namespace content {

// Private class which encapsulates the audio converter and the
// AudioConverter::InputCallback. It handles resampling, buffering and
// channel mixing between input and output parameters.
class SpeechRecognizerImpl::OnDataConverter
    : public media::AudioConverter::InputCallback {
 public:
  OnDataConverter(const AudioParameters& input_params,
                  const AudioParameters& output_params);
  virtual ~OnDataConverter();

  // Converts the input |data| buffer into an AudioChunk, where the input
  // format is given by |input_parameters_| and the output format by
  // |output_parameters_|.
  scoped_refptr<AudioChunk> Convert(const uint8* data, size_t size);

 private:
  // media::AudioConverter::InputCallback implementation.
  virtual double ProvideInput(AudioBus* dest,
                              base::TimeDelta buffer_delay) OVERRIDE;

  // Handles resampling, buffering, and channel mixing between input and output
  // parameters.
  AudioConverter audio_converter_;

  scoped_ptr<AudioBus> input_bus_;
  scoped_ptr<AudioBus> output_bus_;
  const AudioParameters input_parameters_;
  const AudioParameters output_parameters_;
  bool waiting_for_input_;
  scoped_ptr<uint8[]> converted_data_;

  DISALLOW_COPY_AND_ASSIGN(OnDataConverter);
};

namespace {

// The following constants are related to the volume level indicator shown in
// the UI for recorded audio.
// Multiplier used when the new volume is greater than the previous level.
const float kUpSmoothingFactor = 1.0f;
// Multiplier used when the new volume is less than the previous level.
const float kDownSmoothingFactor = 0.7f;
// RMS dB value of a maximum (unclipped) sine wave for int16 samples.
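// (For reference, 90.31 is numerically 20 * log10(32768).)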
const float kAudioMeterMaxDb = 90.31f;
// This value corresponds to the RMS dB for an int16 signal whose 6
// most-significant bits are 0. Values lower than this display as an empty
// level meter.
const float kAudioMeterMinDb = 30.0f;
const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb;

// Maximum level to draw to display an unclipped meter. (1.0f displays
// clipping.)
const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f;

// Returns true if more than 5% of the samples are at the min or max value.
bool DetectClipping(const AudioChunk& chunk) {
  const int num_samples = chunk.NumSamples();
  const int16* samples = chunk.SamplesData16();
  const int kThreshold = num_samples / 20;
  int clipping_samples = 0;

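  // Note: the int16 range is [-32768, 32767], so the symmetric +/-32767
  // threshold below also counts the most-negative value (-32768) as clipped.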
  for (int i = 0; i < num_samples; ++i) {
    if (samples[i] <= -32767 || samples[i] >= 32767) {
      if (++clipping_samples > kThreshold)
        return true;
    }
  }
  return false;
}

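// Empty callback whose only purpose is to keep the bound AudioInputController
// refcounted until its asynchronous Close() has completed; see
// ~SpeechRecognizerImpl().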
void KeepAudioControllerRefcountedForDtor(scoped_refptr<AudioInputController>) {
}

}  // namespace

const int SpeechRecognizerImpl::kAudioSampleRate = 16000;
const ChannelLayout SpeechRecognizerImpl::kChannelLayout =
    media::CHANNEL_LAYOUT_MONO;
const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;
const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;
const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;
media::AudioManager* SpeechRecognizerImpl::audio_manager_for_tests_ = NULL;

COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,
               kNumBitsPerAudioSample_must_be_a_multiple_of_8);

// SpeechRecognizerImpl::OnDataConverter implementation

SpeechRecognizerImpl::OnDataConverter::OnDataConverter(
    const AudioParameters& input_params, const AudioParameters& output_params)
    : audio_converter_(input_params, output_params, false),
      input_bus_(AudioBus::Create(input_params)),
      output_bus_(AudioBus::Create(output_params)),
      input_parameters_(input_params),
      output_parameters_(output_params),
      waiting_for_input_(false),
      converted_data_(new uint8[output_parameters_.GetBytesPerBuffer()]) {
  audio_converter_.AddInput(this);
}

SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() {
  // It should now be safe to unregister the converter since no more OnData()
  // callbacks are outstanding at this point.
  audio_converter_.RemoveInput(this);
}

scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert(
    const uint8* data, size_t size) {
  CHECK_EQ(size, static_cast<size_t>(input_parameters_.GetBytesPerBuffer()));

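  // De-interleave the raw capture buffer into the (planar, float) input bus.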
  input_bus_->FromInterleaved(
      data, input_bus_->frames(), input_parameters_.bits_per_sample() / 8);

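  // Pull the data through the converter; this synchronously invokes
  // ProvideInput() below exactly once (enforced via |waiting_for_input_|).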
  waiting_for_input_ = true;
  audio_converter_.Convert(output_bus_.get());

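  // Re-interleave the converted audio into the fixed-format byte buffer that
  // backs the returned AudioChunk.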
  output_bus_->ToInterleaved(
      output_bus_->frames(), output_parameters_.bits_per_sample() / 8,
      converted_data_.get());

  // TODO(primiano): Refactor AudioChunk to avoid the extra copy here
  // (see http://crbug.com/249316 for details).
  return scoped_refptr<AudioChunk>(new AudioChunk(
      converted_data_.get(),
      output_parameters_.GetBytesPerBuffer(),
      output_parameters_.bits_per_sample() / 8));
}

double SpeechRecognizerImpl::OnDataConverter::ProvideInput(
    AudioBus* dest, base::TimeDelta buffer_delay) {
  // The audio converter should never ask for more than one bus in each call
  // to Convert(). If it does, we have a serious issue in our design, since we
  // might miss recorded chunks of 100 ms audio data.
  CHECK(waiting_for_input_);

  // Read from the input bus to feed the converter.
  input_bus_->CopyTo(dest);

  // |input_bus_| should only be provided once.
  waiting_for_input_ = false;
  return 1;
}

// SpeechRecognizerImpl implementation

SpeechRecognizerImpl::SpeechRecognizerImpl(
    SpeechRecognitionEventListener* listener,
    int session_id,
    bool is_single_shot,
    SpeechRecognitionEngine* engine)
    : SpeechRecognizer(listener, session_id),
      recognition_engine_(engine),
      endpointer_(kAudioSampleRate),
      is_dispatching_event_(false),
      is_single_shot_(is_single_shot),
      state_(STATE_IDLE) {
  DCHECK(recognition_engine_ != NULL);
  if (is_single_shot) {
    // In single-shot recognition, the session is automatically ended after:
    //  - 0.5 seconds of silence if time <  3 seconds
    //  - 1   second  of silence if time >= 3 seconds
    endpointer_.set_speech_input_complete_silence_length(
        base::Time::kMicrosecondsPerSecond / 2);
    endpointer_.set_long_speech_input_complete_silence_length(
        base::Time::kMicrosecondsPerSecond);
    endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
  } else {
    // In continuous recognition, the session is automatically ended after 15
    // seconds of silence.
    const int64 cont_timeout_us = base::Time::kMicrosecondsPerSecond * 15;
    endpointer_.set_speech_input_complete_silence_length(cont_timeout_us);
    endpointer_.set_long_speech_length(0);  // Use only a single timeout.
  }
  endpointer_.StartSession();
  recognition_engine_->set_delegate(this);
}

// -------  Methods that trigger Finite State Machine (FSM) events ------------

// NOTE: All external events and requests should be enqueued (PostTask), even
// if they come from the same (IO) thread, in order to preserve the causal
// ordering of events and to avoid interleaved event processing due to
// synchronous callbacks.

void SpeechRecognizerImpl::StartRecognition(const std::string& device_id) {
  DCHECK(!device_id.empty());
  device_id_ = device_id;

  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, FSMEventArgs(EVENT_START)));
}

void SpeechRecognizerImpl::AbortRecognition() {
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, FSMEventArgs(EVENT_ABORT)));
}

void SpeechRecognizerImpl::StopAudioCapture() {
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, FSMEventArgs(EVENT_STOP_CAPTURE)));
}

bool SpeechRecognizerImpl::IsActive() const {
  // Checking the FSM state from another thread (thus, while the FSM is
  // potentially concurrently evolving) is meaningless.
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  return state_ != STATE_IDLE && state_ != STATE_ENDED;
}

bool SpeechRecognizerImpl::IsCapturingAudio() const {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));  // See IsActive().
  const bool is_capturing_audio = state_ >= STATE_STARTING &&
                                  state_ <= STATE_RECOGNIZING;
  DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) ||
         (!is_capturing_audio && audio_controller_.get() == NULL));
  return is_capturing_audio;
}

const SpeechRecognitionEngine&
SpeechRecognizerImpl::recognition_engine() const {
  return *(recognition_engine_.get());
}

SpeechRecognizerImpl::~SpeechRecognizerImpl() {
  endpointer_.EndSession();
  if (audio_controller_.get()) {
    audio_controller_->Close(
        base::Bind(&KeepAudioControllerRefcountedForDtor, audio_controller_));
  }
}

// Invoked on the audio thread.
void SpeechRecognizerImpl::OnError(AudioInputController* controller) {
  FSMEventArgs event_args(EVENT_AUDIO_ERROR);
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

void SpeechRecognizerImpl::OnData(AudioInputController* controller,
                                  const uint8* data, uint32 size) {
  if (size == 0)  // This can happen when audio capture stops; it is normal.
    return;

  // Convert audio from the native format to the fixed format used by
  // WebSpeech.
  FSMEventArgs event_args(EVENT_AUDIO_DATA);
  event_args.audio_data = audio_converter_->Convert(data, size);

  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}

void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults(
    const SpeechRecognitionResults& results) {
  FSMEventArgs event_args(EVENT_ENGINE_RESULT);
  event_args.engine_results = results;
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(
    const SpeechRecognitionError& error) {
  FSMEventArgs event_args(EVENT_ENGINE_ERROR);
  event_args.engine_error = error;
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

// -----------------------  Core FSM implementation ---------------------------
// TODO(primiano): After the changes in the media package (r129173), this class
// slightly violates the SpeechRecognitionEventListener interface contract. In
// particular, it is no longer true that this class can be freed after the
// OnRecognitionEnd event, since the asynchronous audio_controller_.Close()
// call can still be in progress after the end event. Currently, this is not a
// problem for the browser itself, since refcounting protects us against such
// race conditions. However, we should fix this in upcoming CLs. For instance,
// tests currently work only because TestAudioInputController does not close
// asynchronously as the real controller does, but they will become flaky if
// TestAudioInputController is fixed.

void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  DCHECK_LE(event_args.event, EVENT_MAX_VALUE);
  DCHECK_LE(state_, STATE_MAX_VALUE);

  // Event dispatching must be sequential; otherwise it would break the
  // assumptions of the finite state automaton model.
  DCHECK(!is_dispatching_event_);
  is_dispatching_event_ = true;

  // Guard against the delegate freeing us until we finish processing the event.
  scoped_refptr<SpeechRecognizerImpl> me(this);

  if (event_args.event == EVENT_AUDIO_DATA) {
    DCHECK(event_args.audio_data.get() != NULL);
    ProcessAudioPipeline(*event_args.audio_data.get());
  }

  // The audio pipeline must be processed before the event dispatch; otherwise
  // it would take actions according to the future state instead of the current
  // one.
  state_ = ExecuteTransitionAndGetNextState(event_args);
  is_dispatching_event_ = false;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::ExecuteTransitionAndGetNextState(
    const FSMEventArgs& event_args) {
  const FSMEvent event = event_args.event;
  switch (state_) {
    case STATE_IDLE:
      switch (event) {
        // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and
        // EVENT_STOP_CAPTURE below once speech input extensions are fixed.
        case EVENT_ABORT:
          return AbortSilently(event_args);
        case EVENT_START:
          return StartRecording(event_args);
        case EVENT_STOP_CAPTURE:
          return AbortSilently(event_args);
        case EVENT_AUDIO_DATA:     // Corner cases related to queued messages
        case EVENT_ENGINE_RESULT:  // being dispatched late.
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return DoNothing(event_args);
      }
      break;
    case STATE_STARTING:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return AbortSilently(event_args);
        case EVENT_AUDIO_DATA:
          return StartRecognitionEngine(event_args);
        case EVENT_ENGINE_RESULT:
          return NotFeasible(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_ESTIMATING_ENVIRONMENT:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return StopCaptureAndWaitForResult(event_args);
        case EVENT_AUDIO_DATA:
          return WaitEnvironmentEstimationCompletion(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessIntermediateResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return StopCaptureAndWaitForResult(event_args);
        case EVENT_AUDIO_DATA:
          return DetectUserSpeechOrTimeout(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessIntermediateResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_RECOGNIZING:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return StopCaptureAndWaitForResult(event_args);
        case EVENT_AUDIO_DATA:
          return DetectEndOfSpeech(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessIntermediateResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_WAITING_FINAL_RESULT:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
        case EVENT_AUDIO_DATA:
          return DoNothing(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessFinalResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;

    // TODO(primiano): remove this state when speech input extension support
    // is removed and STATE_IDLE's EVENT_ABORT and EVENT_STOP_CAPTURE are
    // reset to NotFeasible (see the TODO above).
    case STATE_ENDED:
      return DoNothing(event_args);
  }
  return NotFeasible(event_args);
}

// ----------- Contract for all the FSM evolution functions below -------------
//  - They are guaranteed to be executed on the IO thread;
//  - They are guaranteed not to be reentrant (with themselves or each other);
//  - event_args members are guaranteed to be stable during the call;
//  - The class won't be freed in the meanwhile due to callbacks;
//  - IsCapturingAudio() returns true if and only if audio_controller_ != NULL.

// TODO(primiano): the audio pipeline is currently serial. However, the
// clipper->endpointer->vumeter chain and the sr_engine could be parallelized.
// We should profile the execution to see whether it would be worthwhile.
void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) {
  const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT &&
                                   state_ <= STATE_RECOGNIZING;
  const bool route_to_sr_engine = route_to_endpointer;
  const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH &&
                                state_ <= STATE_RECOGNIZING;
  const bool clip_detected = DetectClipping(raw_audio);
  float rms = 0.0f;

  num_samples_recorded_ += raw_audio.NumSamples();

  if (route_to_endpointer)
    endpointer_.ProcessAudio(raw_audio, &rms);

  if (route_to_vumeter) {
    DCHECK(route_to_endpointer);  // Depends on the endpointer due to |rms|.
    UpdateSignalAndNoiseLevels(rms, clip_detected);
  }
  if (route_to_sr_engine) {
    DCHECK(recognition_engine_.get() != NULL);
    recognition_engine_->TakeAudioChunk(raw_audio);
  }
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) {
  DCHECK(recognition_engine_.get() != NULL);
  DCHECK(!IsCapturingAudio());
  const bool unit_test_is_active = (audio_manager_for_tests_ != NULL);
  AudioManager* audio_manager = unit_test_is_active ?
                                audio_manager_for_tests_ :
                                AudioManager::Get();
  DCHECK(audio_manager != NULL);

  DVLOG(1) << "SpeechRecognizerImpl starting audio capture.";
  num_samples_recorded_ = 0;
  audio_level_ = 0;
  listener()->OnRecognitionStart(session_id());

  // TODO(xians): Check if the OS has the device with |device_id_|, and return
  // |SPEECH_AUDIO_ERROR_DETAILS_NO_MIC| if the target device does not exist.
  if (!audio_manager->HasAudioInputDevices()) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO,
                                        SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));
  }

  int chunk_duration_ms = recognition_engine_->GetDesiredAudioChunkDurationMs();

  AudioParameters in_params = audio_manager->GetInputStreamParameters(
      device_id_);
  if (!in_params.IsValid() && !unit_test_is_active) {
    DLOG(ERROR) << "Invalid native audio input parameters";
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  }

  // The audio converter must provide audio in this format as output.
  // Hard-coded, WebSpeech-specific parameters are used here.
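  // (For example, the default 100 ms chunk at the fixed 16 kHz sample rate
  // gives 1600 frames per buffer.)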
  int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000;
  AudioParameters output_parameters = AudioParameters(
      AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate,
      kNumBitsPerAudioSample, frames_per_buffer);

  // The audio converter will receive audio in this format as input.
  // On Windows, we start by verifying that Core Audio is supported. If not,
  // the WaveIn API is used, and we might as well avoid all audio conversions
  // since WaveIn does the conversion for us.
  // TODO(henrika): this code should be moved to the platform-dependent audio
  // managers.
  bool use_native_audio_params = true;
#if defined(OS_WIN)
  use_native_audio_params = media::CoreAudioUtil::IsSupported();
  DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech";
#endif

  AudioParameters input_parameters = output_parameters;
  if (use_native_audio_params && !unit_test_is_active) {
    // Use the native audio parameters, but avoid opening at the native buffer
    // size. Instead, use the same frame size (in milliseconds) as WebSpeech.
    // We rely on internal buffers in the audio back-end to fulfill this
    // request, and the idea is to simplify the audio conversion, since each
    // Convert() call will then map to exactly one ProvideInput() call.
    // Due to implementation details in the audio converter, 2 milliseconds
    // are added to the default frame size (100 ms) to ensure there is enough
    // data to generate 100 ms of output when resampling.
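    // The computation below rounds to the nearest integer; for example, at a
    // 44.1 kHz native rate, 44100 * 102 / 1000 = 4498.2 yields 4498 frames
    // per buffer.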
    frames_per_buffer =
        ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5;
    input_parameters.Reset(in_params.format(),
                           in_params.channel_layout(),
                           in_params.channels(),
                           in_params.input_channels(),
                           in_params.sample_rate(),
                           in_params.bits_per_sample(),
                           frames_per_buffer);
  }

  // Create an audio converter which converts data between the native input
  // format and the WebSpeech-specific output format.
  audio_converter_.reset(
      new OnDataConverter(input_parameters, output_parameters));

  audio_controller_ = AudioInputController::Create(
      audio_manager, this, input_parameters, device_id_);

  if (!audio_controller_.get()) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  }

  // The endpointer needs to estimate the environment/background noise before
  // starting to treat the audio as user input. We wait in the
  // ESTIMATING_ENVIRONMENT state until that interval has elapsed before
  // switching to user input mode.
  endpointer_.SetEnvironmentEstimationMode();
  audio_controller_->Record();
  return STATE_STARTING;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) {
  // This is the first audio packet captured, so the recognition engine is
  // started and the listener is notified about the event.
  DCHECK(recognition_engine_.get() != NULL);
  recognition_engine_->StartRecognition();
  listener()->OnAudioStart(session_id());

  // This is a little hack, since TakeAudioChunk() is already called by
  // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping
  // the first audio chunk captured after opening the audio device.
  recognition_engine_->TakeAudioChunk(*(event_args.audio_data.get()));
  return STATE_ESTIMATING_ENVIRONMENT;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) {
  DCHECK(endpointer_.IsEstimatingEnvironment());
  if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {
    endpointer_.SetUserInputMode();
    listener()->OnEnvironmentEstimationComplete(session_id());
    return STATE_WAITING_FOR_SPEECH;
  } else {
    return STATE_ESTIMATING_ENVIRONMENT;
  }
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {
  if (endpointer_.DidStartReceivingSpeech()) {
    listener()->OnSoundStart(session_id());
    return STATE_RECOGNIZING;
  } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NO_SPEECH));
  }
  return STATE_WAITING_FOR_SPEECH;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {
  if (endpointer_.speech_input_complete())
    return StopCaptureAndWaitForResult(event_args);
  return STATE_RECOGNIZING;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) {
  DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING);

  DVLOG(1) << "Concluding recognition";
  CloseAudioControllerAsynchronously();
  recognition_engine_->AudioChunksEnded();

  if (state_ > STATE_WAITING_FOR_SPEECH)
    listener()->OnSoundEnd(session_id());

  listener()->OnAudioEnd(session_id());
  return STATE_WAITING_FINAL_RESULT;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::AbortSilently(const FSMEventArgs& event_args) {
  DCHECK_NE(event_args.event, EVENT_AUDIO_ERROR);
  DCHECK_NE(event_args.event, EVENT_ENGINE_ERROR);
  return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NONE));
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::AbortWithError(const FSMEventArgs& event_args) {
  if (event_args.event == EVENT_AUDIO_ERROR) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  } else if (event_args.event == EVENT_ENGINE_ERROR) {
    return Abort(event_args.engine_error);
  }
  return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED));
}

SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort(
    const SpeechRecognitionError& error) {
  if (IsCapturingAudio())
    CloseAudioControllerAsynchronously();

  DVLOG(1) << "SpeechRecognizerImpl canceling recognition.";

  // The recognition engine is initialized only after STATE_STARTING.
  if (state_ > STATE_STARTING) {
    DCHECK(recognition_engine_.get() != NULL);
    recognition_engine_->EndRecognition();
  }

  if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)
    listener()->OnSoundEnd(session_id());

  if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)
    listener()->OnAudioEnd(session_id());

  if (error.code != SPEECH_RECOGNITION_ERROR_NONE)
    listener()->OnRecognitionError(session_id(), error);

  listener()->OnRecognitionEnd(session_id());

  return STATE_ENDED;
}

SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessIntermediateResult(
    const FSMEventArgs& event_args) {
  // Provisional results can occur only during continuous (non-one-shot) mode.
  // If this DCHECK fires, it means that a continuous speech recognition engine
  // is being used for a one-shot recognition.
  DCHECK_EQ(false, is_single_shot_);

  // In continuous recognition, intermediate results can occur even when we are
  // in the ESTIMATING_ENVIRONMENT or WAITING_FOR_SPEECH states (if the
  // recognition engine is "faster" than our endpointer). In these cases we
  // skip the endpointer and fast-forward to the RECOGNIZING state, while
  // preserving the event triggering order.
  if (state_ == STATE_ESTIMATING_ENVIRONMENT) {
    DCHECK(endpointer_.IsEstimatingEnvironment());
    endpointer_.SetUserInputMode();
    listener()->OnEnvironmentEstimationComplete(session_id());
  } else if (state_ == STATE_WAITING_FOR_SPEECH) {
    listener()->OnSoundStart(session_id());
  } else {
    DCHECK_EQ(STATE_RECOGNIZING, state_);
  }

  listener()->OnRecognitionResults(session_id(), event_args.engine_results);
  return STATE_RECOGNIZING;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {
  const SpeechRecognitionResults& results = event_args.engine_results;
  SpeechRecognitionResults::const_iterator i = results.begin();
  bool provisional_results_pending = false;
  bool results_are_empty = true;
  for (; i != results.end(); ++i) {
    const SpeechRecognitionResult& result = *i;
    if (result.is_provisional) {
      provisional_results_pending = true;
      DCHECK(!is_single_shot_);
    } else if (results_are_empty) {
      results_are_empty = result.hypotheses.empty();
    }
  }

  if (provisional_results_pending) {
    listener()->OnRecognitionResults(session_id(), results);
    // We don't end the recognition if a provisional result is received in
    // STATE_WAITING_FINAL_RESULT. A definitive result will come next and will
    // end the recognition.
    return state_;
  }

  recognition_engine_->EndRecognition();

  if (!results_are_empty) {
    // We could receive an empty result (which we won't propagate further)
    // in the following (continuous) scenario:
    //  1. The caller starts pushing audio and receives some results;
    //  2. A |StopAudioCapture| is issued later;
    //  3. The final audio frames captured in the interval ]1,2] do not lead
    //     to any result (nor any error);
    //  4. The speech recognition engine therefore emits an empty result to
    //     notify that the recognition has ended with no error and with no
    //     further results.
    listener()->OnRecognitionResults(session_id(), results);
  }

  listener()->OnRecognitionEnd(session_id());
  return STATE_ENDED;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const {
  return state_;  // Just keep the current state.
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) {
  NOTREACHED() << "Unfeasible event " << event_args.event
               << " in state " << state_;
  return state_;
}

void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() {
  DCHECK(IsCapturingAudio());
  DVLOG(1) << "SpeechRecognizerImpl closing audio controller.";
  // Issue a Close on the audio controller, passing an empty callback. The only
  // purpose of this callback is to keep the audio controller refcounted until
  // Close has completed (on the audio thread) and to automatically destroy it
  // afterwards (upon return from OnAudioClosed).
  audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed,
                                      this, audio_controller_));
  audio_controller_ = NULL;  // The controller is still refcounted by Bind.
}

int SpeechRecognizerImpl::GetElapsedTimeMs() const {
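  // Convert the samples recorded so far into milliseconds of audio at the
  // fixed 16 kHz sample rate (e.g., 4800 samples correspond to 300 ms).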
  return (num_samples_recorded_ * 1000) / kAudioSampleRate;
}

void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms,
                                                      bool clip_detected) {
  // Calculate the input volume to display in the UI, smoothing towards the
  // new level.
  // TODO(primiano): Do we really need all this floating point arithmetic here?
  // It might be quite expensive on mobile.
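  // |level| linearly rescales the RMS dB value from the range
  // [kAudioMeterMinDb, kAudioMeterMaxDb] onto
  // [0, kAudioMeterRangeMaxUnclipped], clamping values outside that range.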
  float level = (rms - kAudioMeterMinDb) /
      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
  const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor :
                                                          kDownSmoothingFactor;
  audio_level_ += (level - audio_level_) * smoothing_factor;

  float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  noise_level = std::min(std::max(0.0f, noise_level),
                         kAudioMeterRangeMaxUnclipped);

  listener()->OnAudioLevelsChange(
      session_id(), clip_detected ? 1.0f : audio_level_, noise_level);
}

void SpeechRecognizerImpl::SetAudioManagerForTests(
    AudioManager* audio_manager) {
  audio_manager_for_tests_ = audio_manager;
}

SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
    : event(event_value),
      audio_data(NULL),
      engine_error(SPEECH_RECOGNITION_ERROR_NONE) {
}

SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {
}

}  // namespace content