Home | History | Annotate | Download | only in endpointer
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 //
// To know more about the algorithm used and the original code on which this is
// based, see
      7 // https://wiki.corp.google.com/twiki/bin/view/Main/ChromeGoogleCodeXRef
      8 
      9 #include "content/browser/speech/endpointer/energy_endpointer.h"
     10 
     11 #include <math.h>
     12 
     13 #include "base/logging.h"
     14 
     15 namespace {
     16 
     17 // Returns the RMS (quadratic mean) of the input signal.
     18 float RMS(const int16* samples, int num_samples) {
     19   int64 ssq_int64 = 0;
     20   int64 sum_int64 = 0;
     21   for (int i = 0; i < num_samples; ++i) {
     22     sum_int64 += samples[i];
     23     ssq_int64 += samples[i] * samples[i];
     24   }
     25   // now convert to floats.
     26   double sum = static_cast<double>(sum_int64);
     27   sum /= num_samples;
     28   double ssq = static_cast<double>(ssq_int64);
     29   return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum)));
     30 }
     31 
     32 int64 Secs2Usecs(float seconds) {
     33   return static_cast<int64>(0.5 + (1.0e6 * seconds));
     34 }
     35 
     36 float GetDecibel(float value) {
     37   if (value > 1.0e-100)
     38     return 20 * log10(value);
     39   return -2000.0;
     40 }
     41 
     42 }  // namespace
     43 
     44 namespace content {
     45 
     46 // Stores threshold-crossing histories for making decisions about the speech
     47 // state.
     48 class EnergyEndpointer::HistoryRing {
     49  public:
     50   HistoryRing() : insertion_index_(0) {}
     51 
     52   // Resets the ring to |size| elements each with state |initial_state|
     53   void SetRing(int size, bool initial_state);
     54 
     55   // Inserts a new entry into the ring and drops the oldest entry.
     56   void Insert(int64 time_us, bool decision);
     57 
     58   // Returns the time in microseconds of the most recently added entry.
     59   int64 EndTime() const;
     60 
     61   // Returns the sum of all intervals during which 'decision' is true within
     62   // the time in seconds specified by 'duration'. The returned interval is
     63   // in seconds.
     64   float RingSum(float duration_sec);
     65 
     66  private:
     67   struct DecisionPoint {
     68     int64 time_us;
     69     bool decision;
     70   };
     71 
     72   std::vector<DecisionPoint> decision_points_;
     73   int insertion_index_;  // Index at which the next item gets added/inserted.
     74 
     75   DISALLOW_COPY_AND_ASSIGN(HistoryRing);
     76 };
     77 
     78 void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {
     79   insertion_index_ = 0;
     80   decision_points_.clear();
     81   DecisionPoint init = { -1, initial_state };
     82   decision_points_.resize(size, init);
     83 }
     84 
     85 void EnergyEndpointer::HistoryRing::Insert(int64 time_us, bool decision) {
     86   decision_points_[insertion_index_].time_us = time_us;
     87   decision_points_[insertion_index_].decision = decision;
     88   insertion_index_ = (insertion_index_ + 1) % decision_points_.size();
     89 }
     90 
     91 int64 EnergyEndpointer::HistoryRing::EndTime() const {
     92   int ind = insertion_index_ - 1;
     93   if (ind < 0)
     94     ind = decision_points_.size() - 1;
     95   return decision_points_[ind].time_us;
     96 }
     97 
     98 float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {
     99   if (!decision_points_.size())
    100     return 0.0;
    101 
    102   int64 sum_us = 0;
    103   int ind = insertion_index_ - 1;
    104   if (ind < 0)
    105     ind = decision_points_.size() - 1;
    106   int64 end_us = decision_points_[ind].time_us;
    107   bool is_on = decision_points_[ind].decision;
    108   int64 start_us = end_us - static_cast<int64>(0.5 + (1.0e6 * duration_sec));
    109   if (start_us < 0)
    110     start_us = 0;
    111   size_t n_summed = 1;  // n points ==> (n-1) intervals
    112   while ((decision_points_[ind].time_us > start_us) &&
    113          (n_summed < decision_points_.size())) {
    114     --ind;
    115     if (ind < 0)
    116       ind = decision_points_.size() - 1;
    117     if (is_on)
    118       sum_us += end_us - decision_points_[ind].time_us;
    119     is_on = decision_points_[ind].decision;
    120     end_us = decision_points_[ind].time_us;
    121     n_summed++;
    122   }
    123 
    124   return 1.0e-6f * sum_us;  //  Returns total time that was super threshold.
    125 }
    126 
    127 EnergyEndpointer::EnergyEndpointer()
    128     : status_(EP_PRE_SPEECH),
    129       offset_confirm_dur_sec_(0),
    130       endpointer_time_us_(0),
    131       fast_update_frames_(0),
    132       frame_counter_(0),
    133       max_window_dur_(4.0),
    134       sample_rate_(0),
    135       history_(new HistoryRing()),
    136       decision_threshold_(0),
    137       estimating_environment_(false),
    138       noise_level_(0),
    139       rms_adapt_(0),
    140       start_lag_(0),
    141       end_lag_(0),
    142       user_input_start_time_us_(0) {
    143 }
    144 
    145 EnergyEndpointer::~EnergyEndpointer() {
    146 }
    147 
    148 int EnergyEndpointer::TimeToFrame(float time) const {
    149   return static_cast<int32>(0.5 + (time / params_.frame_period()));
    150 }
    151 
    152 void EnergyEndpointer::Restart(bool reset_threshold) {
    153   status_ = EP_PRE_SPEECH;
    154   user_input_start_time_us_ = 0;
    155 
    156   if (reset_threshold) {
    157     decision_threshold_ = params_.decision_threshold();
    158     rms_adapt_ = decision_threshold_;
    159     noise_level_ = params_.decision_threshold() / 2.0f;
    160     frame_counter_ = 0;  // Used for rapid initial update of levels.
    161   }
    162 
    163   // Set up the memories to hold the history windows.
    164   history_->SetRing(TimeToFrame(max_window_dur_), false);
    165 
    166   // Flag that indicates that current input should be used for
    167   // estimating the environment. The user has not yet started input
    168   // by e.g. pressed the push-to-talk button. By default, this is
    169   // false for backward compatibility.
    170   estimating_environment_ = false;
    171 }
    172 
    173 void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {
    174   params_ = params;
    175 
    176   // Find the longest history interval to be used, and make the ring
    177   // large enough to accommodate that number of frames.  NOTE: This
    178   // depends upon ep_frame_period being set correctly in the factory
    179   // that did this instantiation.
    180   max_window_dur_ = params_.onset_window();
    181   if (params_.speech_on_window() > max_window_dur_)
    182     max_window_dur_ = params_.speech_on_window();
    183   if (params_.offset_window() > max_window_dur_)
    184     max_window_dur_ = params_.offset_window();
    185   Restart(true);
    186 
    187   offset_confirm_dur_sec_ = params_.offset_window() -
    188                             params_.offset_confirm_dur();
    189   if (offset_confirm_dur_sec_ < 0.0)
    190     offset_confirm_dur_sec_ = 0.0;
    191 
    192   user_input_start_time_us_ = 0;
    193 
    194   // Flag that indicates that  current input should be used for
    195   // estimating the environment. The user has not yet started input
    196   // by e.g. pressed the push-to-talk button. By default, this is
    197   // false for backward compatibility.
    198   estimating_environment_ = false;
    199   // The initial value of the noise and speech levels is inconsequential.
    200   // The level of the first frame will overwrite these values.
    201   noise_level_ = params_.decision_threshold() / 2.0f;
    202   fast_update_frames_ =
    203       static_cast<int64>(params_.fast_update_dur() / params_.frame_period());
    204 
    205   frame_counter_ = 0;  // Used for rapid initial update of levels.
    206 
    207   sample_rate_ = params_.sample_rate();
    208   start_lag_ = static_cast<int>(sample_rate_ /
    209                                 params_.max_fundamental_frequency());
    210   end_lag_ = static_cast<int>(sample_rate_ /
    211                               params_.min_fundamental_frequency());
    212 }
    213 
    214 void EnergyEndpointer::StartSession() {
    215   Restart(true);
    216 }
    217 
    218 void EnergyEndpointer::EndSession() {
    219   status_ = EP_POST_SPEECH;
    220 }
    221 
    222 void EnergyEndpointer::SetEnvironmentEstimationMode() {
    223   Restart(true);
    224   estimating_environment_ = true;
    225 }
    226 
    227 void EnergyEndpointer::SetUserInputMode() {
    228   estimating_environment_ = false;
    229   user_input_start_time_us_ = endpointer_time_us_;
    230 }
    231 
    232 void EnergyEndpointer::ProcessAudioFrame(int64 time_us,
    233                                          const int16* samples,
    234                                          int num_samples,
    235                                          float* rms_out) {
    236   endpointer_time_us_ = time_us;
    237   float rms = RMS(samples, num_samples);
    238 
    239   // Check that this is user input audio vs. pre-input adaptation audio.
    240   // Input audio starts when the user indicates start of input, by e.g.
    241   // pressing push-to-talk. Audio recieved prior to that is used to update
    242   // noise and speech level estimates.
    243   if (!estimating_environment_) {
    244     bool decision = false;
    245     if ((endpointer_time_us_ - user_input_start_time_us_) <
    246         Secs2Usecs(params_.contamination_rejection_period())) {
    247       decision = false;
    248       DVLOG(1) << "decision: forced to false, time: " << endpointer_time_us_;
    249     } else {
    250       decision = (rms > decision_threshold_);
    251     }
    252 
    253     history_->Insert(endpointer_time_us_, decision);
    254 
    255     switch (status_) {
    256       case EP_PRE_SPEECH:
    257         if (history_->RingSum(params_.onset_window()) >
    258             params_.onset_detect_dur()) {
    259           status_ = EP_POSSIBLE_ONSET;
    260         }
    261         break;
    262 
    263       case EP_POSSIBLE_ONSET: {
    264         float tsum = history_->RingSum(params_.onset_window());
    265         if (tsum > params_.onset_confirm_dur()) {
    266           status_ = EP_SPEECH_PRESENT;
    267         } else {  // If signal is not maintained, drop back to pre-speech.
    268           if (tsum <= params_.onset_detect_dur())
    269             status_ = EP_PRE_SPEECH;
    270         }
    271         break;
    272       }
    273 
    274       case EP_SPEECH_PRESENT: {
    275         // To induce hysteresis in the state residency, we allow a
    276         // smaller residency time in the on_ring, than was required to
    277         // enter the SPEECH_PERSENT state.
    278         float on_time = history_->RingSum(params_.speech_on_window());
    279         if (on_time < params_.on_maintain_dur())
    280           status_ = EP_POSSIBLE_OFFSET;
    281         break;
    282       }
    283 
    284       case EP_POSSIBLE_OFFSET:
    285         if (history_->RingSum(params_.offset_window()) <=
    286             offset_confirm_dur_sec_) {
    287           // Note that this offset time may be beyond the end
    288           // of the input buffer in a real-time system.  It will be up
    289           // to the RecognizerSession to decide what to do.
    290           status_ = EP_PRE_SPEECH;  // Automatically reset for next utterance.
    291         } else {  // If speech picks up again we allow return to SPEECH_PRESENT.
    292           if (history_->RingSum(params_.speech_on_window()) >=
    293               params_.on_maintain_dur())
    294             status_ = EP_SPEECH_PRESENT;
    295         }
    296         break;
    297 
    298       default:
    299         LOG(WARNING) << "Invalid case in switch: " << status_;
    300         break;
    301     }
    302 
    303     // If this is a quiet, non-speech region, slowly adapt the detection
    304     // threshold to be about 6dB above the average RMS.
    305     if ((!decision) && (status_ == EP_PRE_SPEECH)) {
    306       decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
    307       rms_adapt_ = decision_threshold_;
    308     } else {
    309       // If this is in a speech region, adapt the decision threshold to
    310       // be about 10dB below the average RMS. If the noise level is high,
    311       // the threshold is pushed up.
    312       // Adaptation up to a higher level is 5 times faster than decay to
    313       // a lower level.
    314       if ((status_ == EP_SPEECH_PRESENT) && decision) {
    315         if (rms_adapt_ > rms) {
    316           rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
    317         } else {
    318           rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
    319         }
    320         float target_threshold = 0.3f * rms_adapt_ +  noise_level_;
    321         decision_threshold_ = (.90f * decision_threshold_) +
    322                               (0.10f * target_threshold);
    323       }
    324     }
    325 
    326     // Set a floor
    327     if (decision_threshold_ < params_.min_decision_threshold())
    328       decision_threshold_ = params_.min_decision_threshold();
    329   }
    330 
    331   // Update speech and noise levels.
    332   UpdateLevels(rms);
    333   ++frame_counter_;
    334 
    335   if (rms_out)
    336     *rms_out = GetDecibel(rms);
    337 }
    338 
    339 float EnergyEndpointer::GetNoiseLevelDb() const {
    340   return GetDecibel(noise_level_);
    341 }
    342 
    343 void EnergyEndpointer::UpdateLevels(float rms) {
    344   // Update quickly initially. We assume this is noise and that
    345   // speech is 6dB above the noise.
    346   if (frame_counter_ < fast_update_frames_) {
    347     // Alpha increases from 0 to (k-1)/k where k is the number of time
    348     // steps in the initial adaptation period.
    349     float alpha = static_cast<float>(frame_counter_) /
    350         static_cast<float>(fast_update_frames_);
    351     noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
    352     DVLOG(1) << "FAST UPDATE, frame_counter_ " << frame_counter_
    353              << ", fast_update_frames_ " << fast_update_frames_;
    354   } else {
    355     // Update Noise level. The noise level adapts quickly downward, but
    356     // slowly upward. The noise_level_ parameter is not currently used
    357     // for threshold adaptation. It is used for UI feedback.
    358     if (noise_level_ < rms)
    359       noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
    360     else
    361       noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
    362   }
    363   if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
    364     decision_threshold_ = noise_level_ * 2; // 6dB above noise level.
    365     // Set a floor
    366     if (decision_threshold_ < params_.min_decision_threshold())
    367       decision_threshold_ = params_.min_decision_threshold();
    368   }
    369 }
    370 
    371 EpStatus EnergyEndpointer::Status(int64* status_time)  const {
    372   *status_time = history_->EndTime();
    373   return status_;
    374 }
    375 
    376 }  // namespace content
    377