Home | History | Annotate | Download | only in endpointer
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "content/browser/speech/endpointer/endpointer.h"
      6 
      7 #include "base/time/time.h"
      8 #include "content/browser/speech/audio_buffer.h"
      9 
     10 using base::Time;
     11 
     12 namespace {
     13 const int kFrameRate = 50;  // 1 frame = 20ms of audio.
     14 }
     15 
     16 namespace content {
     17 
     18 Endpointer::Endpointer(int sample_rate)
     19     : speech_input_possibly_complete_silence_length_us_(-1),
     20       speech_input_complete_silence_length_us_(-1),
     21       audio_frame_time_us_(0),
     22       sample_rate_(sample_rate),
     23       frame_size_(0) {
     24   Reset();
     25 
     26   frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate));
     27 
     28   speech_input_minimum_length_us_ =
     29       static_cast<int64>(1.7 * Time::kMicrosecondsPerSecond);
     30   speech_input_complete_silence_length_us_ =
     31       static_cast<int64>(0.5 * Time::kMicrosecondsPerSecond);
     32   long_speech_input_complete_silence_length_us_ = -1;
     33   long_speech_length_us_ = -1;
     34   speech_input_possibly_complete_silence_length_us_ =
     35       1 * Time::kMicrosecondsPerSecond;
     36 
     37   // Set the default configuration for Push To Talk mode.
     38   EnergyEndpointerParams ep_config;
     39   ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate));
     40   ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate));
     41   ep_config.set_endpoint_margin(0.2f);
     42   ep_config.set_onset_window(0.15f);
     43   ep_config.set_speech_on_window(0.4f);
     44   ep_config.set_offset_window(0.15f);
     45   ep_config.set_onset_detect_dur(0.09f);
     46   ep_config.set_onset_confirm_dur(0.075f);
     47   ep_config.set_on_maintain_dur(0.10f);
     48   ep_config.set_offset_confirm_dur(0.12f);
     49   ep_config.set_decision_threshold(1000.0f);
     50   ep_config.set_min_decision_threshold(50.0f);
     51   ep_config.set_fast_update_dur(0.2f);
     52   ep_config.set_sample_rate(static_cast<float>(sample_rate));
     53   ep_config.set_min_fundamental_frequency(57.143f);
     54   ep_config.set_max_fundamental_frequency(400.0f);
     55   ep_config.set_contamination_rejection_period(0.25f);
     56   energy_endpointer_.Init(ep_config);
     57 }
     58 
     59 void Endpointer::Reset() {
     60   old_ep_status_ = EP_PRE_SPEECH;
     61   waiting_for_speech_possibly_complete_timeout_ = false;
     62   waiting_for_speech_complete_timeout_ = false;
     63   speech_previously_detected_ = false;
     64   speech_input_complete_ = false;
     65   audio_frame_time_us_ = 0; // Reset time for packets sent to endpointer.
     66   speech_end_time_us_ = -1;
     67   speech_start_time_us_ = -1;
     68 }
     69 
     70 void Endpointer::StartSession() {
     71   Reset();
     72   energy_endpointer_.StartSession();
     73 }
     74 
     75 void Endpointer::EndSession() {
     76   energy_endpointer_.EndSession();
     77 }
     78 
     79 void Endpointer::SetEnvironmentEstimationMode() {
     80   Reset();
     81   energy_endpointer_.SetEnvironmentEstimationMode();
     82 }
     83 
     84 void Endpointer::SetUserInputMode() {
     85   energy_endpointer_.SetUserInputMode();
     86 }
     87 
     88 EpStatus Endpointer::Status(int64 *time) {
     89   return energy_endpointer_.Status(time);
     90 }
     91 
     92 EpStatus Endpointer::ProcessAudio(const AudioChunk& raw_audio, float* rms_out) {
     93   const int16* audio_data = raw_audio.SamplesData16();
     94   const int num_samples = raw_audio.NumSamples();
     95   EpStatus ep_status = EP_PRE_SPEECH;
     96 
     97   // Process the input data in blocks of frame_size_, dropping any incomplete
     98   // frames at the end (which is ok since typically the caller will be recording
     99   // audio in multiples of our frame size).
    100   int sample_index = 0;
    101   while (sample_index + frame_size_ <= num_samples) {
    102     // Have the endpointer process the frame.
    103     energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_,
    104                                          audio_data + sample_index,
    105                                          frame_size_,
    106                                          rms_out);
    107     sample_index += frame_size_;
    108     audio_frame_time_us_ += (frame_size_ * Time::kMicrosecondsPerSecond) /
    109                          sample_rate_;
    110 
    111     // Get the status of the endpointer.
    112     int64 ep_time;
    113     ep_status = energy_endpointer_.Status(&ep_time);
    114 
    115     // Handle state changes.
    116     if ((EP_SPEECH_PRESENT == ep_status) &&
    117         (EP_POSSIBLE_ONSET == old_ep_status_)) {
    118       speech_end_time_us_ = -1;
    119       waiting_for_speech_possibly_complete_timeout_ = false;
    120       waiting_for_speech_complete_timeout_ = false;
    121       // Trigger SpeechInputDidStart event on first detection.
    122       if (false == speech_previously_detected_) {
    123         speech_previously_detected_ = true;
    124         speech_start_time_us_ = ep_time;
    125       }
    126     }
    127     if ((EP_PRE_SPEECH == ep_status) &&
    128         (EP_POSSIBLE_OFFSET == old_ep_status_)) {
    129       speech_end_time_us_ = ep_time;
    130       waiting_for_speech_possibly_complete_timeout_ = true;
    131       waiting_for_speech_complete_timeout_ = true;
    132     }
    133     if (ep_time > speech_input_minimum_length_us_) {
    134       // Speech possibly complete timeout.
    135       if ((waiting_for_speech_possibly_complete_timeout_) &&
    136           (ep_time - speech_end_time_us_ >
    137               speech_input_possibly_complete_silence_length_us_)) {
    138         waiting_for_speech_possibly_complete_timeout_ = false;
    139       }
    140       if (waiting_for_speech_complete_timeout_) {
    141         // The length of the silence timeout period can be held constant, or it
    142         // can be changed after a fixed amount of time from the beginning of
    143         // speech.
    144         bool has_stepped_silence =
    145             (long_speech_length_us_ > 0) &&
    146             (long_speech_input_complete_silence_length_us_ > 0);
    147         int64 requested_silence_length;
    148         if (has_stepped_silence &&
    149             (ep_time - speech_start_time_us_) > long_speech_length_us_) {
    150           requested_silence_length =
    151               long_speech_input_complete_silence_length_us_;
    152         } else {
    153           requested_silence_length =
    154               speech_input_complete_silence_length_us_;
    155         }
    156 
    157         // Speech complete timeout.
    158         if ((ep_time - speech_end_time_us_) > requested_silence_length) {
    159           waiting_for_speech_complete_timeout_ = false;
    160           speech_input_complete_ = true;
    161         }
    162       }
    163     }
    164     old_ep_status_ = ep_status;
    165   }
    166   return ep_status;
    167 }
    168 
    169 }  // namespace content
    170