// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/browser/speech/speech_recognizer_impl.h"

#include "base/basictypes.h"
#include "base/bind.h"
#include "base/time/time.h"
#include "content/browser/browser_main_loop.h"
#include "content/browser/speech/audio_buffer.h"
#include "content/browser/speech/google_one_shot_remote_engine.h"
#include "content/public/browser/speech_recognition_event_listener.h"
#include "media/base/audio_converter.h"
#include "net/url_request/url_request_context_getter.h"

#if defined(OS_WIN)
#include "media/audio/win/core_audio_util_win.h"
#endif

using media::AudioBus;
using media::AudioConverter;
using media::AudioInputController;
using media::AudioManager;
using media::AudioParameters;
using media::ChannelLayout;

namespace content {

// Private class which encapsulates the audio converter and the
// AudioConverter::InputCallback. It handles resampling, buffering and
// channel mixing between input and output parameters.
class SpeechRecognizerImpl::OnDataConverter
    : public media::AudioConverter::InputCallback {
 public:
  OnDataConverter(const AudioParameters& input_params,
                  const AudioParameters& output_params);
  virtual ~OnDataConverter();

  // Converts the input |data| buffer into an AudioChunk, where the input
  // format is given by |input_parameters_| and the output format by
  // |output_parameters_|.
  scoped_refptr<AudioChunk> Convert(const uint8* data, size_t size);

 private:
  // media::AudioConverter::InputCallback implementation.
  virtual double ProvideInput(AudioBus* dest,
                              base::TimeDelta buffer_delay) OVERRIDE;

  // Handles resampling, buffering, and channel mixing between input and
  // output parameters.
  AudioConverter audio_converter_;

  scoped_ptr<AudioBus> input_bus_;
  scoped_ptr<AudioBus> output_bus_;
  const AudioParameters input_parameters_;
  const AudioParameters output_parameters_;
  bool waiting_for_input_;
  scoped_ptr<uint8[]> converted_data_;

  DISALLOW_COPY_AND_ASSIGN(OnDataConverter);
};

namespace {

// The following constants are related to the volume level indicator shown in
// the UI for recorded audio.
// Multiplier used when the new volume is greater than the previous level.
const float kUpSmoothingFactor = 1.0f;
// Multiplier used when the new volume is less than the previous level.
const float kDownSmoothingFactor = 0.7f;
// RMS dB value of a maximum (unclipped) sine wave for int16 samples.
const float kAudioMeterMaxDb = 90.31f;
// This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0.
// Values lower than this are displayed as an empty level meter.
const float kAudioMeterMinDb = 30.0f;
const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb;

// Maximum level to draw to display an unclipped meter. (1.0f displays
// clipping.)
const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f;

// Returns true if more than 5% of the samples are at the min or max value.
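// A sample counts as clipped when it sits at the int16 rails (+/-32767); the
// early return below stops scanning as soon as the 5% threshold
// (num_samples / 20) is exceeded, so a few isolated transients do not flag
// the whole chunk.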
bool DetectClipping(const AudioChunk& chunk) {
  const int num_samples = chunk.NumSamples();
  const int16* samples = chunk.SamplesData16();
  const int kThreshold = num_samples / 20;
  int clipping_samples = 0;

  for (int i = 0; i < num_samples; ++i) {
    if (samples[i] <= -32767 || samples[i] >= 32767) {
      if (++clipping_samples > kThreshold)
        return true;
    }
  }
  return false;
}

void KeepAudioControllerRefcountedForDtor(scoped_refptr<AudioInputController>) {
}

}  // namespace

const int SpeechRecognizerImpl::kAudioSampleRate = 16000;
const ChannelLayout SpeechRecognizerImpl::kChannelLayout =
    media::CHANNEL_LAYOUT_MONO;
const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;
const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;
const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;
media::AudioManager* SpeechRecognizerImpl::audio_manager_for_tests_ = NULL;

COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,
               kNumBitsPerAudioSample_must_be_a_multiple_of_8);

// SpeechRecognizerImpl::OnDataConverter implementation

SpeechRecognizerImpl::OnDataConverter::OnDataConverter(
    const AudioParameters& input_params, const AudioParameters& output_params)
    : audio_converter_(input_params, output_params, false),
      input_bus_(AudioBus::Create(input_params)),
      output_bus_(AudioBus::Create(output_params)),
      input_parameters_(input_params),
      output_parameters_(output_params),
      waiting_for_input_(false),
      converted_data_(new uint8[output_parameters_.GetBytesPerBuffer()]) {
  audio_converter_.AddInput(this);
}

SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() {
  // It should now be safe to unregister the converter since no more OnData()
  // callbacks are outstanding at this point.
  audio_converter_.RemoveInput(this);
}

scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert(
    const uint8* data, size_t size) {
  CHECK_EQ(size, static_cast<size_t>(input_parameters_.GetBytesPerBuffer()));

  input_bus_->FromInterleaved(
      data, input_bus_->frames(), input_parameters_.bits_per_sample() / 8);

  waiting_for_input_ = true;
  audio_converter_.Convert(output_bus_.get());

  output_bus_->ToInterleaved(
      output_bus_->frames(), output_parameters_.bits_per_sample() / 8,
      converted_data_.get());

  // TODO(primiano): Refactor AudioChunk to avoid the extra copy here
  // (see http://crbug.com/249316 for details).
  return scoped_refptr<AudioChunk>(new AudioChunk(
      converted_data_.get(),
      output_parameters_.GetBytesPerBuffer(),
      output_parameters_.bits_per_sample() / 8));
}

double SpeechRecognizerImpl::OnDataConverter::ProvideInput(
    AudioBus* dest, base::TimeDelta buffer_delay) {
  // The audio converter should never ask for more than one bus in each call
  // to Convert(). If it does, we have a serious issue in our design, since we
  // might miss recorded chunks of 100 ms audio data.
  CHECK(waiting_for_input_);

  // Read from the input bus to feed the converter.
  input_bus_->CopyTo(dest);

  // |input_bus_| should only be provided once.
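  // Clearing the flag means a second ProvideInput() within the same Convert()
  // call (i.e. the converter requesting more data than one input buffer
  // holds) would trip the CHECK above instead of silently re-feeding stale
  // samples.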
  waiting_for_input_ = false;
  return 1;
}

// SpeechRecognizerImpl implementation

SpeechRecognizerImpl::SpeechRecognizerImpl(
    SpeechRecognitionEventListener* listener,
    int session_id,
    bool is_single_shot,
    SpeechRecognitionEngine* engine)
    : SpeechRecognizer(listener, session_id),
      recognition_engine_(engine),
      endpointer_(kAudioSampleRate),
      is_dispatching_event_(false),
      is_single_shot_(is_single_shot),
      state_(STATE_IDLE) {
  DCHECK(recognition_engine_ != NULL);
  if (is_single_shot) {
    // In single-shot recognition, the session is automatically ended after:
    //  - 0.5 seconds of silence if time < 3 seconds;
    //  - 1 second of silence if time >= 3 seconds.
    endpointer_.set_speech_input_complete_silence_length(
        base::Time::kMicrosecondsPerSecond / 2);
    endpointer_.set_long_speech_input_complete_silence_length(
        base::Time::kMicrosecondsPerSecond);
    endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
  } else {
    // In continuous recognition, the session is automatically ended after 15
    // seconds of silence.
    const int64 cont_timeout_us = base::Time::kMicrosecondsPerSecond * 15;
    endpointer_.set_speech_input_complete_silence_length(cont_timeout_us);
    endpointer_.set_long_speech_length(0);  // Use only a single timeout.
  }
  endpointer_.StartSession();
  recognition_engine_->set_delegate(this);
}

// ------- Methods that trigger Finite State Machine (FSM) events ------------

// NOTE: All external events and requests should be enqueued (PostTask), even
// if they come from the same (IO) thread, in order to preserve the
// relationship of causality between events and avoid interleaved event
// processing due to synchronous callbacks.

void SpeechRecognizerImpl::StartRecognition(const std::string& device_id) {
  DCHECK(!device_id.empty());
  device_id_ = device_id;

  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, FSMEventArgs(EVENT_START)));
}

void SpeechRecognizerImpl::AbortRecognition() {
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, FSMEventArgs(EVENT_ABORT)));
}

void SpeechRecognizerImpl::StopAudioCapture() {
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, FSMEventArgs(EVENT_STOP_CAPTURE)));
}

bool SpeechRecognizerImpl::IsActive() const {
  // Checking the FSM state from another thread (thus, while the FSM is
  // potentially concurrently evolving) is meaningless.
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  return state_ != STATE_IDLE && state_ != STATE_ENDED;
}

bool SpeechRecognizerImpl::IsCapturingAudio() const {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));  // See IsActive().
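  // Capture is active in every state from STATE_STARTING through
  // STATE_RECOGNIZING inclusive; the DCHECK below verifies the invariant that
  // an AudioInputController exists exactly while capturing. Note the range
  // check relies on the declaration order of the FSMState enum.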
  const bool is_capturing_audio = state_ >= STATE_STARTING &&
                                  state_ <= STATE_RECOGNIZING;
  DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) ||
         (!is_capturing_audio && audio_controller_.get() == NULL));
  return is_capturing_audio;
}

const SpeechRecognitionEngine&
SpeechRecognizerImpl::recognition_engine() const {
  return *(recognition_engine_.get());
}

SpeechRecognizerImpl::~SpeechRecognizerImpl() {
  endpointer_.EndSession();
  if (audio_controller_.get()) {
    audio_controller_->Close(
        base::Bind(&KeepAudioControllerRefcountedForDtor, audio_controller_));
  }
}

// Invoked on the audio thread.
void SpeechRecognizerImpl::OnError(AudioInputController* controller) {
  FSMEventArgs event_args(EVENT_AUDIO_ERROR);
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

void SpeechRecognizerImpl::OnData(AudioInputController* controller,
                                  const uint8* data, uint32 size) {
  if (size == 0)  // This can happen when audio capture stops and is normal.
    return;

  // Convert audio from the native format to the fixed format used by
  // WebSpeech.
  FSMEventArgs event_args(EVENT_AUDIO_DATA);
  event_args.audio_data = audio_converter_->Convert(data, size);

  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}

void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults(
    const SpeechRecognitionResults& results) {
  FSMEventArgs event_args(EVENT_ENGINE_RESULT);
  event_args.engine_results = results;
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(
    const SpeechRecognitionError& error) {
  FSMEventArgs event_args(EVENT_ENGINE_ERROR);
  event_args.engine_error = error;
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

// ----------------------- Core FSM implementation ---------------------------
// TODO(primiano): After the changes in the media package (r129173), this class
// slightly violates the SpeechRecognitionEventListener interface contract. In
// particular, it is no longer true that this class can be freed after the
// OnRecognitionEnd event, since the asynchronous audio_controller_.Close()
// call can still be in progress after the end event. Currently this is not a
// problem for the browser itself, since refcounting protects us against such
// race conditions. However, we should fix this in upcoming CLs. For instance,
// tests currently work only because TestAudioInputController does not close
// asynchronously as the real controller does, but they will become flaky if
// TestAudioInputController is fixed.

void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  DCHECK_LE(event_args.event, EVENT_MAX_VALUE);
  DCHECK_LE(state_, STATE_MAX_VALUE);

  // Event dispatching must be sequential; otherwise it would break all the
  // rules and assumptions of the finite state automaton model.
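  // The flag below catches synchronous re-dispatch: if a transition handler
  // invoked DispatchEvent() directly, rather than posting a task as the NOTE
  // above requires, the nested event would observe a half-applied transition.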
  DCHECK(!is_dispatching_event_);
  is_dispatching_event_ = true;

  // Guard against the delegate freeing us until we finish processing the
  // event.
  scoped_refptr<SpeechRecognizerImpl> me(this);

  if (event_args.event == EVENT_AUDIO_DATA) {
    DCHECK(event_args.audio_data.get() != NULL);
    ProcessAudioPipeline(*event_args.audio_data.get());
  }

  // The audio pipeline must be processed before the event dispatch; otherwise
  // it would take actions according to the future state instead of the
  // current one.
  state_ = ExecuteTransitionAndGetNextState(event_args);
  is_dispatching_event_ = false;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::ExecuteTransitionAndGetNextState(
    const FSMEventArgs& event_args) {
  const FSMEvent event = event_args.event;
  switch (state_) {
    case STATE_IDLE:
      switch (event) {
        // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and
        // EVENT_STOP_CAPTURE below once speech input extensions are fixed.
        case EVENT_ABORT:
          return AbortSilently(event_args);
        case EVENT_START:
          return StartRecording(event_args);
        case EVENT_STOP_CAPTURE:
          return AbortSilently(event_args);
        case EVENT_AUDIO_DATA:     // Corner cases related to queued messages
        case EVENT_ENGINE_RESULT:  // being dispatched late.
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return DoNothing(event_args);
      }
      break;
    case STATE_STARTING:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return AbortSilently(event_args);
        case EVENT_AUDIO_DATA:
          return StartRecognitionEngine(event_args);
        case EVENT_ENGINE_RESULT:
          return NotFeasible(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_ESTIMATING_ENVIRONMENT:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return StopCaptureAndWaitForResult(event_args);
        case EVENT_AUDIO_DATA:
          return WaitEnvironmentEstimationCompletion(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessIntermediateResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return StopCaptureAndWaitForResult(event_args);
        case EVENT_AUDIO_DATA:
          return DetectUserSpeechOrTimeout(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessIntermediateResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_RECOGNIZING:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return StopCaptureAndWaitForResult(event_args);
        case EVENT_AUDIO_DATA:
          return DetectEndOfSpeech(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessIntermediateResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_WAITING_FINAL_RESULT:
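      // Audio capture has already been shut down on entry to this state (see
      // StopCaptureAndWaitForResult()), so late EVENT_AUDIO_DATA from queued
      // tasks is ignored and only engine results or errors advance the FSM.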
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
        case EVENT_AUDIO_DATA:
          return DoNothing(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessFinalResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;

    // TODO(primiano): remove this state when speech input extension support
    // is removed and the STATE_IDLE handling of EVENT_ABORT and
    // EVENT_STOP_CAPTURE is reset to NotFeasible (see the TODO above).
    case STATE_ENDED:
      return DoNothing(event_args);
  }
  return NotFeasible(event_args);
}

// ----------- Contract for all the FSM evolution functions below -------------
//  - They are guaranteed to be executed on the IO thread;
//  - They are guaranteed not to be reentrant (themselves and each other);
//  - event_args members are guaranteed to be stable during the call;
//  - The class won't be freed in the meanwhile due to callbacks;
//  - IsCapturingAudio() returns true if and only if audio_controller_ != NULL.

// TODO(primiano): the audio pipeline is currently serial. However, the
// clipper->endpointer->vumeter chain and the sr_engine could be parallelized.
// We should profile the execution to see whether it would be worth it.
void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) {
  const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT &&
                                   state_ <= STATE_RECOGNIZING;
  const bool route_to_sr_engine = route_to_endpointer;
  const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH &&
                                state_ <= STATE_RECOGNIZING;
  const bool clip_detected = DetectClipping(raw_audio);
  float rms = 0.0f;

  num_samples_recorded_ += raw_audio.NumSamples();

  if (route_to_endpointer)
    endpointer_.ProcessAudio(raw_audio, &rms);

  if (route_to_vumeter) {
    DCHECK(route_to_endpointer);  // Depends on the endpointer due to |rms|.
    UpdateSignalAndNoiseLevels(rms, clip_detected);
  }
  if (route_to_sr_engine) {
    DCHECK(recognition_engine_.get() != NULL);
    recognition_engine_->TakeAudioChunk(raw_audio);
  }
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) {
  DCHECK(recognition_engine_.get() != NULL);
  DCHECK(!IsCapturingAudio());
  const bool unit_test_is_active = (audio_manager_for_tests_ != NULL);
  AudioManager* audio_manager = unit_test_is_active ?
                                audio_manager_for_tests_ :
                                AudioManager::Get();
  DCHECK(audio_manager != NULL);

  DVLOG(1) << "SpeechRecognizerImpl starting audio capture.";
  num_samples_recorded_ = 0;
  audio_level_ = 0;
  listener()->OnRecognitionStart(session_id());

  // TODO(xians): Check whether the OS has a device with |device_id_|, and
  // return |SPEECH_AUDIO_ERROR_DETAILS_NO_MIC| if the target device does not
  // exist.
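  // For now, only the presence of at least one input device is verified up
  // front; a missing specific |device_id_| presumably surfaces later as a
  // generic audio error once the controller fails to deliver data.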
  if (!audio_manager->HasAudioInputDevices()) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO,
                                        SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));
  }

  int chunk_duration_ms = recognition_engine_->GetDesiredAudioChunkDurationMs();

  AudioParameters in_params = audio_manager->GetInputStreamParameters(
      device_id_);
  if (!in_params.IsValid() && !unit_test_is_active) {
    DLOG(ERROR) << "Invalid native audio input parameters";
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  }

  // The audio converter shall provide audio based on these parameters as
  // output. Hard-coded, WebSpeech-specific parameters are used here.
  int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000;
  AudioParameters output_parameters = AudioParameters(
      AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate,
      kNumBitsPerAudioSample, frames_per_buffer);

  // The audio converter will receive audio based on these parameters as
  // input. On Windows we start by verifying that Core Audio is supported. If
  // not, the WaveIn API is used and we might as well avoid all audio
  // conversions since WaveIn does the conversion for us.
  // TODO(henrika): this code should be moved to the platform-dependent audio
  // managers.
  bool use_native_audio_params = true;
#if defined(OS_WIN)
  use_native_audio_params = media::CoreAudioUtil::IsSupported();
  DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech";
#endif

  AudioParameters input_parameters = output_parameters;
  if (use_native_audio_params && !unit_test_is_active) {
    // Use the native audio parameters but avoid opening up at the native
    // buffer size. Instead, use the same frame size (in milliseconds) as
    // WebSpeech uses. We rely on internal buffers in the audio back-end to
    // fulfill this request, and the idea is to simplify the audio conversion
    // since each Convert() call will then render exactly one ProvideInput()
    // call. Due to implementation details in the audio converter, 2
    // milliseconds are added to the default frame size (100 ms) to ensure
    // there is enough data to generate 100 ms of output when resampling.
    frames_per_buffer =
        ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5;
    input_parameters.Reset(in_params.format(),
                           in_params.channel_layout(),
                           in_params.channels(),
                           in_params.input_channels(),
                           in_params.sample_rate(),
                           in_params.bits_per_sample(),
                           frames_per_buffer);
  }

  // Create an audio converter which converts data between the native input
  // format and the WebSpeech-specific output format.
  audio_converter_.reset(
      new OnDataConverter(input_parameters, output_parameters));

  audio_controller_ = AudioInputController::Create(
      audio_manager, this, input_parameters, device_id_);

  if (!audio_controller_.get()) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  }

  // The endpointer needs to estimate the environment/background noise before
  // starting to treat the audio as user input. We wait in the
  // ESTIMATING_ENVIRONMENT state until such an interval has elapsed before
  // switching to user input mode.
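  // The estimation interval is kEndpointerEstimationTimeMs (300 ms) of
  // captured audio, measured via GetElapsedTimeMs() in
  // WaitEnvironmentEstimationCompletion().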
  endpointer_.SetEnvironmentEstimationMode();
  audio_controller_->Record();
  return STATE_STARTING;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) {
  // This is the first audio packet captured, so the recognition engine is
  // started and the delegate is notified about the event.
  DCHECK(recognition_engine_.get() != NULL);
  recognition_engine_->StartRecognition();
  listener()->OnAudioStart(session_id());

  // This is a little hack, since TakeAudioChunk() is already called by
  // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping
  // the first audio chunk captured after opening the audio device.
  recognition_engine_->TakeAudioChunk(*(event_args.audio_data.get()));
  return STATE_ESTIMATING_ENVIRONMENT;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) {
  DCHECK(endpointer_.IsEstimatingEnvironment());
  if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {
    endpointer_.SetUserInputMode();
    listener()->OnEnvironmentEstimationComplete(session_id());
    return STATE_WAITING_FOR_SPEECH;
  } else {
    return STATE_ESTIMATING_ENVIRONMENT;
  }
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {
  if (endpointer_.DidStartReceivingSpeech()) {
    listener()->OnSoundStart(session_id());
    return STATE_RECOGNIZING;
  } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NO_SPEECH));
  }
  return STATE_WAITING_FOR_SPEECH;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {
  if (endpointer_.speech_input_complete())
    return StopCaptureAndWaitForResult(event_args);
  return STATE_RECOGNIZING;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) {
  DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING);

  DVLOG(1) << "Concluding recognition";
  CloseAudioControllerAsynchronously();
  recognition_engine_->AudioChunksEnded();

  if (state_ > STATE_WAITING_FOR_SPEECH)
    listener()->OnSoundEnd(session_id());

  listener()->OnAudioEnd(session_id());
  return STATE_WAITING_FINAL_RESULT;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::AbortSilently(const FSMEventArgs& event_args) {
  DCHECK_NE(event_args.event, EVENT_AUDIO_ERROR);
  DCHECK_NE(event_args.event, EVENT_ENGINE_ERROR);
  return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NONE));
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::AbortWithError(const FSMEventArgs& event_args) {
  if (event_args.event == EVENT_AUDIO_ERROR) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  } else if (event_args.event == EVENT_ENGINE_ERROR) {
    return Abort(event_args.engine_error);
  }
  return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED));
}

SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort(
    const SpeechRecognitionError& error) {
  if (IsCapturingAudio())
    CloseAudioControllerAsynchronously();

  DVLOG(1) << "SpeechRecognizerImpl canceling recognition.";

  // The recognition engine is initialized only after STATE_STARTING.
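  // The notifications below mirror the start-up sequence in reverse and are
  // gated on how far the session got: OnSoundEnd only if OnSoundStart was
  // already emitted, OnAudioEnd only if OnAudioStart was, then the error (if
  // any), and finally OnRecognitionEnd.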
  if (state_ > STATE_STARTING) {
    DCHECK(recognition_engine_.get() != NULL);
    recognition_engine_->EndRecognition();
  }

  if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)
    listener()->OnSoundEnd(session_id());

  if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)
    listener()->OnAudioEnd(session_id());

  if (error.code != SPEECH_RECOGNITION_ERROR_NONE)
    listener()->OnRecognitionError(session_id(), error);

  listener()->OnRecognitionEnd(session_id());

  return STATE_ENDED;
}

SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessIntermediateResult(
    const FSMEventArgs& event_args) {
  // Provisional results can occur only during continuous (non one-shot) mode.
  // If this check is reached, it means that a continuous speech recognition
  // engine is being used for a one-shot recognition.
  DCHECK_EQ(false, is_single_shot_);

  // In continuous recognition, intermediate results can occur even when we are
  // in the ESTIMATING_ENVIRONMENT or WAITING_FOR_SPEECH states (if the
  // recognition engine is "faster" than our endpointer). In these cases we
  // skip the endpointer and fast-forward to the RECOGNIZING state, while
  // respecting the order in which events are triggered.
  if (state_ == STATE_ESTIMATING_ENVIRONMENT) {
    DCHECK(endpointer_.IsEstimatingEnvironment());
    endpointer_.SetUserInputMode();
    listener()->OnEnvironmentEstimationComplete(session_id());
  } else if (state_ == STATE_WAITING_FOR_SPEECH) {
    listener()->OnSoundStart(session_id());
  } else {
    DCHECK_EQ(STATE_RECOGNIZING, state_);
  }

  listener()->OnRecognitionResults(session_id(), event_args.engine_results);
  return STATE_RECOGNIZING;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {
  const SpeechRecognitionResults& results = event_args.engine_results;
  SpeechRecognitionResults::const_iterator i = results.begin();
  bool provisional_results_pending = false;
  bool results_are_empty = true;
  for (; i != results.end(); ++i) {
    const SpeechRecognitionResult& result = *i;
    if (result.is_provisional) {
      provisional_results_pending = true;
      DCHECK(!is_single_shot_);
    } else if (results_are_empty) {
      results_are_empty = result.hypotheses.empty();
    }
  }

  if (provisional_results_pending) {
    listener()->OnRecognitionResults(session_id(), results);
    // We don't end the recognition if a provisional result is received in
    // STATE_WAITING_FINAL_RESULT. A definitive result will come next and will
    // end the recognition.
    return state_;
  }

  recognition_engine_->EndRecognition();

  if (!results_are_empty) {
    // We could receive an empty result (which we won't propagate further)
    // in the following (continuous) scenario:
    //  1. The caller starts pushing audio and receives some results;
    //  2. A |StopAudioCapture| is issued later;
    //  3. The final audio frames captured in the interval ]1,2] do not lead
    //     to any result (nor to any error);
    //  4. The speech recognition engine therefore emits an empty result to
    //     notify that the recognition has ended without error, but with no
    //     further results.
    listener()->OnRecognitionResults(session_id(), results);
  }

  listener()->OnRecognitionEnd(session_id());
  return STATE_ENDED;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const {
  return state_;  // Just keep the current state.
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) {
  NOTREACHED() << "Unfeasible event " << event_args.event
               << " in state " << state_;
  return state_;
}

void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() {
  DCHECK(IsCapturingAudio());
  DVLOG(1) << "SpeechRecognizerImpl closing audio controller.";
  // Issue a Close on the audio controller, passing an empty callback. The
  // only purpose of such a callback is to keep the audio controller
  // refcounted until Close has completed (on the audio thread) and to
  // automatically destroy it afterwards (upon return from OnAudioClosed).
  audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed,
                                      this, audio_controller_));
  audio_controller_ = NULL;  // The controller is still refcounted by Bind.
}

int SpeechRecognizerImpl::GetElapsedTimeMs() const {
  return (num_samples_recorded_ * 1000) / kAudioSampleRate;
}

void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms,
                                                      bool clip_detected) {
  // Calculate the input volume to display in the UI, smoothing towards the
  // new level.
  // TODO(primiano): Do we really need all this floating point arithmetic
  // here? Perhaps it might be quite expensive on mobile.
  float level = (rms - kAudioMeterMinDb) /
      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
  const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor :
                                                          kDownSmoothingFactor;
  audio_level_ += (level - audio_level_) * smoothing_factor;

  float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  noise_level = std::min(std::max(0.0f, noise_level),
                         kAudioMeterRangeMaxUnclipped);

  listener()->OnAudioLevelsChange(
      session_id(), clip_detected ? 1.0f : audio_level_, noise_level);
}

void SpeechRecognizerImpl::SetAudioManagerForTests(
    AudioManager* audio_manager) {
  audio_manager_for_tests_ = audio_manager;
}

SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
    : event(event_value),
      audio_data(NULL),
      engine_error(SPEECH_RECOGNITION_ERROR_NONE) {
}

SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {
}

}  // namespace content