// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/browser/speech/speech_recognizer_impl.h"

#include "base/basictypes.h"
#include "base/bind.h"
#include "base/time/time.h"
#include "content/browser/browser_main_loop.h"
#include "content/browser/media/media_internals.h"
#include "content/browser/speech/audio_buffer.h"
#include "content/browser/speech/google_one_shot_remote_engine.h"
#include "content/public/browser/speech_recognition_event_listener.h"
#include "media/base/audio_converter.h"
#include "net/url_request/url_request_context_getter.h"

#if defined(OS_WIN)
#include "media/audio/win/core_audio_util_win.h"
#endif

using media::AudioBus;
using media::AudioConverter;
using media::AudioInputController;
using media::AudioManager;
using media::AudioParameters;
using media::ChannelLayout;

namespace content {

// Private class which encapsulates the audio converter and the
// AudioConverter::InputCallback. It handles resampling, buffering and
// channel mixing between input and output parameters.
class SpeechRecognizerImpl::OnDataConverter
    : public media::AudioConverter::InputCallback {
 public:
  OnDataConverter(const AudioParameters& input_params,
                  const AudioParameters& output_params);
  virtual ~OnDataConverter();

  // Converts input audio |data| bus into an AudioChunk where the input format
  // is given by |input_parameters_| and the output format by
  // |output_parameters_|.
  scoped_refptr<AudioChunk> Convert(const AudioBus* data);

 private:
  // media::AudioConverter::InputCallback implementation.
  virtual double ProvideInput(AudioBus* dest,
                              base::TimeDelta buffer_delay) OVERRIDE;

  // Handles resampling, buffering, and channel mixing between input and output
  // parameters.
  AudioConverter audio_converter_;

  scoped_ptr<AudioBus> input_bus_;
  scoped_ptr<AudioBus> output_bus_;
  const AudioParameters input_parameters_;
  const AudioParameters output_parameters_;
  bool waiting_for_input_;
  scoped_ptr<uint8[]> converted_data_;

  DISALLOW_COPY_AND_ASSIGN(OnDataConverter);
};

namespace {

// The following constants are related to the volume level indicator shown in
// the UI for recorded audio.
// Multiplier used when new volume is greater than previous level.
const float kUpSmoothingFactor = 1.0f;
// Multiplier used when new volume is lesser than previous level.
const float kDownSmoothingFactor = 0.7f;
// RMS dB value of a maximum (unclipped) sine wave for int16 samples.
const float kAudioMeterMaxDb = 90.31f;
// This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0.
// Values lower than this will display as empty level-meter.
const float kAudioMeterMinDb = 30.0f;
const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb;

// Maximum level to draw to display unclipped meter. (1.0f displays clipping.)
const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f;

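// As a worked example of how these constants are combined in
// UpdateSignalAndNoiseLevels() below: an RMS level of 60.31 dB maps to
// (60.31 - 30.0) / ((90.31 - 30.0) / (47.0 / 48.0)) ~= 0.49, i.e. roughly the
// middle of the level meter.
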
// Returns true if more than 5% of the samples are at min or max value.
bool DetectClipping(const AudioChunk& chunk) {
  const int num_samples = chunk.NumSamples();
  const int16* samples = chunk.SamplesData16();
  const int kThreshold = num_samples / 20;
  int clipping_samples = 0;

  for (int i = 0; i < num_samples; ++i) {
    if (samples[i] <= -32767 || samples[i] >= 32767) {
      if (++clipping_samples > kThreshold)
        return true;
    }
  }
  return false;
}

void KeepAudioControllerRefcountedForDtor(scoped_refptr<AudioInputController>) {
}

}  // namespace

const int SpeechRecognizerImpl::kAudioSampleRate = 16000;
const ChannelLayout SpeechRecognizerImpl::kChannelLayout =
    media::CHANNEL_LAYOUT_MONO;
const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;
const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;
const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;
media::AudioManager* SpeechRecognizerImpl::audio_manager_for_tests_ = NULL;

COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,
               kNumBitsPerAudioSample_must_be_a_multiple_of_8);

// SpeechRecognizerImpl::OnDataConverter implementation

SpeechRecognizerImpl::OnDataConverter::OnDataConverter(
    const AudioParameters& input_params, const AudioParameters& output_params)
    : audio_converter_(input_params, output_params, false),
      input_bus_(AudioBus::Create(input_params)),
      output_bus_(AudioBus::Create(output_params)),
      input_parameters_(input_params),
      output_parameters_(output_params),
      waiting_for_input_(false),
      converted_data_(new uint8[output_parameters_.GetBytesPerBuffer()]) {
  audio_converter_.AddInput(this);
}

SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() {
  // It should now be safe to unregister the converter since no more OnData()
  // callbacks are outstanding at this point.
  audio_converter_.RemoveInput(this);
}

scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert(
    const AudioBus* data) {
  CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer());

  data->CopyTo(input_bus_.get());

  waiting_for_input_ = true;
  audio_converter_.Convert(output_bus_.get());

  output_bus_->ToInterleaved(
      output_bus_->frames(), output_parameters_.bits_per_sample() / 8,
      converted_data_.get());

  // TODO(primiano): Refactor AudioChunk to avoid the extra-copy here
  // (see http://crbug.com/249316 for details).
  return scoped_refptr<AudioChunk>(new AudioChunk(
      converted_data_.get(),
      output_parameters_.GetBytesPerBuffer(),
      output_parameters_.bits_per_sample() / 8));
}

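// Called back synchronously by |audio_converter_| from within Convert(): each
// Convert() call pushes exactly one input bus, which the converter pulls back
// through a single ProvideInput() call; |waiting_for_input_| guards against
// the converter requesting more data than was provided.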
double SpeechRecognizerImpl::OnDataConverter::ProvideInput(
    AudioBus* dest, base::TimeDelta buffer_delay) {
  // The audio converter should never ask for more than one bus in each call
  // to Convert(). If so, we have a serious issue in our design since we might
  // miss recorded chunks of 100 ms audio data.
  CHECK(waiting_for_input_);

  // Read from the input bus to feed the converter.
  input_bus_->CopyTo(dest);

  // |input_bus_| should only be provided once.
  waiting_for_input_ = false;
  return 1;
}

// SpeechRecognizerImpl implementation

SpeechRecognizerImpl::SpeechRecognizerImpl(
    SpeechRecognitionEventListener* listener,
    int session_id,
    bool continuous,
    bool provisional_results,
    SpeechRecognitionEngine* engine)
    : SpeechRecognizer(listener, session_id),
      recognition_engine_(engine),
      endpointer_(kAudioSampleRate),
      audio_log_(MediaInternals::GetInstance()->CreateAudioLog(
          media::AudioLogFactory::AUDIO_INPUT_CONTROLLER)),
      is_dispatching_event_(false),
      provisional_results_(provisional_results),
      state_(STATE_IDLE) {
  DCHECK(recognition_engine_ != NULL);
  if (!continuous) {
    // In single-shot (non-continuous) recognition,
    // the session is automatically ended after:
    // - 0.5 seconds of silence if time < 3 seconds
    // - 1 second of silence if time >= 3 seconds
    endpointer_.set_speech_input_complete_silence_length(
        base::Time::kMicrosecondsPerSecond / 2);
    endpointer_.set_long_speech_input_complete_silence_length(
        base::Time::kMicrosecondsPerSecond);
    endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
  } else {
    // In continuous recognition, the session is automatically ended after 15
    // seconds of silence.
    const int64 cont_timeout_us = base::Time::kMicrosecondsPerSecond * 15;
    endpointer_.set_speech_input_complete_silence_length(cont_timeout_us);
    endpointer_.set_long_speech_length(0);  // Use only a single timeout.
  }
  endpointer_.StartSession();
  recognition_engine_->set_delegate(this);
}

// ------- Methods that trigger Finite State Machine (FSM) events ------------

// NOTE: all external events and requests should be enqueued (PostTask), even
// if they come from the same (IO) thread, in order to preserve the causal
// relationship between events and avoid interleaved event processing due to
// synchronous callbacks.

void SpeechRecognizerImpl::StartRecognition(const std::string& device_id) {
  DCHECK(!device_id.empty());
  device_id_ = device_id;

  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, FSMEventArgs(EVENT_START)));
}

void SpeechRecognizerImpl::AbortRecognition() {
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, FSMEventArgs(EVENT_ABORT)));
}

void SpeechRecognizerImpl::StopAudioCapture() {
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, FSMEventArgs(EVENT_STOP_CAPTURE)));
}

bool SpeechRecognizerImpl::IsActive() const {
  // Checking the FSM state from another thread (thus, while the FSM is
  // potentially concurrently evolving) is meaningless.
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  return state_ != STATE_IDLE && state_ != STATE_ENDED;
}

bool SpeechRecognizerImpl::IsCapturingAudio() const {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));  // See IsActive().
  const bool is_capturing_audio = state_ >= STATE_STARTING &&
                                  state_ <= STATE_RECOGNIZING;
  DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) ||
         (!is_capturing_audio && audio_controller_.get() == NULL));
  return is_capturing_audio;
}

const SpeechRecognitionEngine&
SpeechRecognizerImpl::recognition_engine() const {
  return *(recognition_engine_.get());
}

SpeechRecognizerImpl::~SpeechRecognizerImpl() {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  endpointer_.EndSession();
  if (audio_controller_.get()) {
    audio_controller_->Close(
        base::Bind(&KeepAudioControllerRefcountedForDtor, audio_controller_));
    audio_log_->OnClosed(0);
  }
}

// Invoked in the audio thread.
void SpeechRecognizerImpl::OnError(
    AudioInputController* controller,
    media::AudioInputController::ErrorCode error_code) {
  FSMEventArgs event_args(EVENT_AUDIO_ERROR);
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

void SpeechRecognizerImpl::OnData(AudioInputController* controller,
                                  const AudioBus* data) {
  // Convert audio from native format to the fixed format used by WebSpeech.
  FSMEventArgs event_args(EVENT_AUDIO_DATA);
  event_args.audio_data = audio_converter_->Convert(data);

  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}

void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults(
    const SpeechRecognitionResults& results) {
  FSMEventArgs event_args(EVENT_ENGINE_RESULT);
  event_args.engine_results = results;
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(
    const SpeechRecognitionError& error) {
  FSMEventArgs event_args(EVENT_ENGINE_ERROR);
  event_args.engine_error = error;
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

// ----------------------- Core FSM implementation ---------------------------
// TODO(primiano): After the changes in the media package (r129173), this class
// slightly violates the SpeechRecognitionEventListener interface contract. In
// particular, it is no longer true that this class can be freed after the
// OnRecognitionEnd event, since the asynchronous audio_controller_.Close()
// call can still be in progress after the end event. Currently this is not a
// problem for the browser itself, since refcounting protects us against such
// race conditions. However, we should fix this in upcoming CLs. For instance,
// tests currently work only because TestAudioInputController does not close
// asynchronously as the real controller does, and they will become flaky once
// TestAudioInputController is fixed.

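// For reference only, a rough sketch of the typical successful flow through
// the FSM below (not exhaustive; see ExecuteTransitionAndGetNextState()):
//   EVENT_START         : STATE_IDLE -> STATE_STARTING
//   EVENT_AUDIO_DATA    : STATE_STARTING -> STATE_ESTIMATING_ENVIRONMENT
//                         -> STATE_WAITING_FOR_SPEECH -> STATE_RECOGNIZING
//   EVENT_STOP_CAPTURE  : STATE_RECOGNIZING -> STATE_WAITING_FINAL_RESULT
//   EVENT_ENGINE_RESULT : STATE_WAITING_FINAL_RESULT -> STATE_ENDED
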
void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  DCHECK_LE(event_args.event, EVENT_MAX_VALUE);
  DCHECK_LE(state_, STATE_MAX_VALUE);

  // Event dispatching must be sequential, otherwise it will break all the
  // rules and the assumptions of the finite state automata model.
  DCHECK(!is_dispatching_event_);
  is_dispatching_event_ = true;

  // Guard against the delegate freeing us until we finish processing the
  // event.
  scoped_refptr<SpeechRecognizerImpl> me(this);

  if (event_args.event == EVENT_AUDIO_DATA) {
    DCHECK(event_args.audio_data.get() != NULL);
    ProcessAudioPipeline(*event_args.audio_data.get());
  }

  // The audio pipeline must be processed before the event dispatch, otherwise
  // it would take actions according to the future state instead of the
  // current one.
  state_ = ExecuteTransitionAndGetNextState(event_args);
  is_dispatching_event_ = false;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::ExecuteTransitionAndGetNextState(
    const FSMEventArgs& event_args) {
  const FSMEvent event = event_args.event;
  switch (state_) {
    case STATE_IDLE:
      switch (event) {
        // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and
        // EVENT_STOP_CAPTURE below once speech input extensions are fixed.
        case EVENT_ABORT:
          return AbortSilently(event_args);
        case EVENT_START:
          return StartRecording(event_args);
        case EVENT_STOP_CAPTURE:
          return AbortSilently(event_args);
        case EVENT_AUDIO_DATA:     // Corner cases related to queued messages
        case EVENT_ENGINE_RESULT:  // being dispatched late.
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return DoNothing(event_args);
      }
      break;
    case STATE_STARTING:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return AbortSilently(event_args);
        case EVENT_AUDIO_DATA:
          return StartRecognitionEngine(event_args);
        case EVENT_ENGINE_RESULT:
          return NotFeasible(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_ESTIMATING_ENVIRONMENT:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return StopCaptureAndWaitForResult(event_args);
        case EVENT_AUDIO_DATA:
          return WaitEnvironmentEstimationCompletion(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessIntermediateResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return StopCaptureAndWaitForResult(event_args);
        case EVENT_AUDIO_DATA:
          return DetectUserSpeechOrTimeout(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessIntermediateResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_RECOGNIZING:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return StopCaptureAndWaitForResult(event_args);
        case EVENT_AUDIO_DATA:
          return DetectEndOfSpeech(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessIntermediateResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_WAITING_FINAL_RESULT:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
        case EVENT_AUDIO_DATA:
          return DoNothing(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessFinalResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;

    // TODO(primiano): remove this state when speech input extensions support
    // is removed and STATE_IDLE.EVENT_ABORT,EVENT_STOP_CAPTURE are reset to
    // NotFeasible (see TODO above).
    case STATE_ENDED:
      return DoNothing(event_args);
  }
  return NotFeasible(event_args);
}

// ----------- Contract for all the FSM evolution functions below -------------
// - Are guaranteed to be executed in the IO thread;
// - Are guaranteed not to be reentrant (with themselves and with each other);
// - event_args members are guaranteed to be stable during the call;
// - The class won't be freed in the meanwhile due to callbacks;
// - IsCapturingAudio() returns true if and only if audio_controller_ != NULL.

// TODO(primiano): the audio pipeline is currently serial. However, the
// clipper->endpointer->vumeter chain and the sr_engine could be parallelized.
// We should profile the execution to see whether it would be worth it.
void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) {
  const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT &&
                                   state_ <= STATE_RECOGNIZING;
  const bool route_to_sr_engine = route_to_endpointer;
  const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH &&
                                state_ <= STATE_RECOGNIZING;
  const bool clip_detected = DetectClipping(raw_audio);
  float rms = 0.0f;

  num_samples_recorded_ += raw_audio.NumSamples();

  if (route_to_endpointer)
    endpointer_.ProcessAudio(raw_audio, &rms);

  if (route_to_vumeter) {
    DCHECK(route_to_endpointer);  // Depends on endpointer due to |rms|.
    UpdateSignalAndNoiseLevels(rms, clip_detected);
  }
  if (route_to_sr_engine) {
    DCHECK(recognition_engine_.get() != NULL);
    recognition_engine_->TakeAudioChunk(raw_audio);
  }
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) {
  DCHECK(recognition_engine_.get() != NULL);
  DCHECK(!IsCapturingAudio());
  const bool unit_test_is_active = (audio_manager_for_tests_ != NULL);
  AudioManager* audio_manager = unit_test_is_active ?
                                audio_manager_for_tests_ :
                                AudioManager::Get();
  DCHECK(audio_manager != NULL);

  DVLOG(1) << "SpeechRecognizerImpl starting audio capture.";
  num_samples_recorded_ = 0;
  audio_level_ = 0;
  listener()->OnRecognitionStart(session_id());

  // TODO(xians): Check if the OS has the device with |device_id_|, return
  // |SPEECH_AUDIO_ERROR_DETAILS_NO_MIC| if the target device does not exist.
  if (!audio_manager->HasAudioInputDevices()) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO,
                                        SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));
  }

  int chunk_duration_ms = recognition_engine_->GetDesiredAudioChunkDurationMs();

  AudioParameters in_params = audio_manager->GetInputStreamParameters(
      device_id_);
  if (!in_params.IsValid() && !unit_test_is_active) {
    DLOG(ERROR) << "Invalid native audio input parameters";
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  }

  // The audio converter shall provide audio based on these parameters as
  // output. Hard-coded, WebSpeech-specific parameters are used here.
  int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000;
  AudioParameters output_parameters = AudioParameters(
      AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate,
      kNumBitsPerAudioSample, frames_per_buffer);

  // The audio converter will receive audio based on these parameters as input.
  // On Windows we start by verifying that Core Audio is supported. If not,
  // the WaveIn API is used and we might as well avoid all audio conversions
  // since WaveIn does the conversion for us.
  // TODO(henrika): this code should be moved to platform dependent audio
  // managers.
  bool use_native_audio_params = true;
#if defined(OS_WIN)
  use_native_audio_params = media::CoreAudioUtil::IsSupported();
  DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech";
#endif

  AudioParameters input_parameters = output_parameters;
  if (use_native_audio_params && !unit_test_is_active) {
    // Use native audio parameters but avoid opening up at the native buffer
    // size. Instead use the same frame size (in milliseconds) as WebSpeech
    // uses. We rely on internal buffers in the audio back-end to fulfill this
    // request, and the idea is to simplify the audio conversion since each
    // Convert() call will then result in exactly one ProvideInput() call.
    // Due to implementation details in the audio converter, 2 milliseconds
    // are added to the default frame size (100 ms) to ensure there is enough
    // data to generate 100 ms of output when resampling.
    frames_per_buffer =
        ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5;
    input_parameters.Reset(in_params.format(),
                           in_params.channel_layout(),
                           in_params.channels(),
                           in_params.sample_rate(),
                           in_params.bits_per_sample(),
                           frames_per_buffer);
  }

  // Create an audio converter which converts data between the native input
  // format and the WebSpeech-specific output format.
  audio_converter_.reset(
      new OnDataConverter(input_parameters, output_parameters));

  audio_controller_ = AudioInputController::Create(
      audio_manager, this, input_parameters, device_id_, NULL);

  if (!audio_controller_.get()) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  }

  audio_log_->OnCreated(0, input_parameters, device_id_);

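  // For illustration (an assumed, not measured, configuration): with a
  // 44100 Hz native input device and the default 100 ms chunk duration, the
  // block above requests (44100 * 102) / 1000 + 0.5 ~= 4498 input frames per
  // buffer, while the converter output stays at (16000 * 100) / 1000 = 1600
  // frames, i.e. 100 ms at 16 kHz.
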
  // The endpointer needs to estimate the environment/background noise before
  // starting to treat the audio as user input. We wait in the state
  // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching
  // to user input mode.
  endpointer_.SetEnvironmentEstimationMode();
  audio_controller_->Record();
  audio_log_->OnStarted(0);
  return STATE_STARTING;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) {
  // This is the first audio packet captured, so the recognition engine is
  // started and the delegate notified about the event.
  DCHECK(recognition_engine_.get() != NULL);
  recognition_engine_->StartRecognition();
  listener()->OnAudioStart(session_id());

  // This is a little hack, since TakeAudioChunk() is already called by
  // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping
  // the first audio chunk captured after opening the audio device.
  recognition_engine_->TakeAudioChunk(*(event_args.audio_data.get()));
  return STATE_ESTIMATING_ENVIRONMENT;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) {
  DCHECK(endpointer_.IsEstimatingEnvironment());
  if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {
    endpointer_.SetUserInputMode();
    listener()->OnEnvironmentEstimationComplete(session_id());
    return STATE_WAITING_FOR_SPEECH;
  } else {
    return STATE_ESTIMATING_ENVIRONMENT;
  }
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {
  if (endpointer_.DidStartReceivingSpeech()) {
    listener()->OnSoundStart(session_id());
    return STATE_RECOGNIZING;
  } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NO_SPEECH));
  }
  return STATE_WAITING_FOR_SPEECH;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {
  if (endpointer_.speech_input_complete())
    return StopCaptureAndWaitForResult(event_args);
  return STATE_RECOGNIZING;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) {
  DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING);

  DVLOG(1) << "Concluding recognition";
  CloseAudioControllerAsynchronously();
  recognition_engine_->AudioChunksEnded();

  if (state_ > STATE_WAITING_FOR_SPEECH)
    listener()->OnSoundEnd(session_id());

  listener()->OnAudioEnd(session_id());
  return STATE_WAITING_FINAL_RESULT;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::AbortSilently(const FSMEventArgs& event_args) {
  DCHECK_NE(event_args.event, EVENT_AUDIO_ERROR);
  DCHECK_NE(event_args.event, EVENT_ENGINE_ERROR);
  return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NONE));
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::AbortWithError(const FSMEventArgs& event_args) {
  if (event_args.event == EVENT_AUDIO_ERROR) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  } else if (event_args.event == EVENT_ENGINE_ERROR) {
    return Abort(event_args.engine_error);
  }
  return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED));
}

SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort(
    const SpeechRecognitionError& error) {
  if (IsCapturingAudio())
    CloseAudioControllerAsynchronously();

  DVLOG(1) << "SpeechRecognizerImpl canceling recognition.";

  // The recognition engine is initialized only after STATE_STARTING.
  if (state_ > STATE_STARTING) {
    DCHECK(recognition_engine_.get() != NULL);
    recognition_engine_->EndRecognition();
  }

  if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)
    listener()->OnSoundEnd(session_id());

  if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)
    listener()->OnAudioEnd(session_id());

  if (error.code != SPEECH_RECOGNITION_ERROR_NONE)
    listener()->OnRecognitionError(session_id(), error);

  listener()->OnRecognitionEnd(session_id());

  return STATE_ENDED;
}

SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessIntermediateResult(
    const FSMEventArgs& event_args) {
  // Provisional results can occur only if explicitly enabled in the JS API.
  DCHECK(provisional_results_);

  // In continuous recognition, intermediate results can occur even when we are
  // in the ESTIMATING_ENVIRONMENT or WAITING_FOR_SPEECH states (if the
  // recognition engine is "faster" than our endpointer). In these cases we
  // skip the endpointer and fast-forward to the RECOGNIZING state, while still
  // respecting the order in which events are triggered.
  if (state_ == STATE_ESTIMATING_ENVIRONMENT) {
    DCHECK(endpointer_.IsEstimatingEnvironment());
    endpointer_.SetUserInputMode();
    listener()->OnEnvironmentEstimationComplete(session_id());
  } else if (state_ == STATE_WAITING_FOR_SPEECH) {
    listener()->OnSoundStart(session_id());
  } else {
    DCHECK_EQ(STATE_RECOGNIZING, state_);
  }

  listener()->OnRecognitionResults(session_id(), event_args.engine_results);
  return STATE_RECOGNIZING;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {
  const SpeechRecognitionResults& results = event_args.engine_results;
  SpeechRecognitionResults::const_iterator i = results.begin();
  bool provisional_results_pending = false;
  bool results_are_empty = true;
  for (; i != results.end(); ++i) {
    const SpeechRecognitionResult& result = *i;
    if (result.is_provisional) {
      DCHECK(provisional_results_);
      provisional_results_pending = true;
    } else if (results_are_empty) {
      results_are_empty = result.hypotheses.empty();
    }
  }

  if (provisional_results_pending) {
    listener()->OnRecognitionResults(session_id(), results);
    // We don't end the recognition if a provisional result is received in
    // STATE_WAITING_FINAL_RESULT. A definitive result will come next and will
    // end the recognition.
    return state_;
  }

  recognition_engine_->EndRecognition();

  if (!results_are_empty) {
    // We could receive an empty result (which we won't propagate further)
    // in the following (continuous) scenario:
    // 1. The caller starts pushing audio and receives some results;
    // 2. A |StopAudioCapture| is issued later;
    // 3. The final audio frames captured in the interval ]1,2] do not lead to
    //    any result (nor any error);
    // 4. The speech recognition engine therefore emits an empty result to
    //    signal that the recognition ended with no error and no further
    //    result.
    listener()->OnRecognitionResults(session_id(), results);
  }

  listener()->OnRecognitionEnd(session_id());
  return STATE_ENDED;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const {
  return state_;  // Just keep the current state.
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) {
  NOTREACHED() << "Unfeasible event " << event_args.event
               << " in state " << state_;
  return state_;
}

void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() {
  DCHECK(IsCapturingAudio());
  DVLOG(1) << "SpeechRecognizerImpl closing audio controller.";
  // Issues a Close on the audio controller, passing an empty callback. The
  // only purpose of such a callback is to keep the audio controller refcounted
  // until Close has completed (in the audio thread) and automatically destroy
  // it afterwards (upon return from OnAudioClosed).
  audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed,
                                      this, audio_controller_));
  audio_controller_ = NULL;  // The controller is still refcounted by Bind.
  audio_log_->OnClosed(0);
}

int SpeechRecognizerImpl::GetElapsedTimeMs() const {
  return (num_samples_recorded_ * 1000) / kAudioSampleRate;
}

void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms,
                                                      bool clip_detected) {
  // Calculate the input volume to display in the UI, smoothing towards the
  // new level.
  // TODO(primiano): Do we really need all this floating point arithmetic here?
  // Perhaps it might be quite expensive on mobile.
  float level = (rms - kAudioMeterMinDb) /
      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
  const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor :
                                                          kDownSmoothingFactor;
  audio_level_ += (level - audio_level_) * smoothing_factor;

  float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  noise_level = std::min(std::max(0.0f, noise_level),
                         kAudioMeterRangeMaxUnclipped);

  listener()->OnAudioLevelsChange(
      session_id(), clip_detected ? 1.0f : audio_level_, noise_level);
}

void SpeechRecognizerImpl::SetAudioManagerForTesting(
    AudioManager* audio_manager) {
  audio_manager_for_tests_ = audio_manager;
}

SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
    : event(event_value),
      audio_data(NULL),
      engine_error(SPEECH_RECOGNITION_ERROR_NONE) {
}

SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {
}

}  // namespace content