1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 // 5 // To know more about the algorithm used and the original code which this is 6 // based of, see 7 // https://wiki.corp.google.com/twiki/bin/view/Main/ChromeGoogleCodeXRef 8 9 #include "content/browser/speech/endpointer/energy_endpointer.h" 10 11 #include <math.h> 12 13 #include "base/logging.h" 14 15 namespace { 16 17 // Returns the RMS (quadratic mean) of the input signal. 18 float RMS(const int16* samples, int num_samples) { 19 int64 ssq_int64 = 0; 20 int64 sum_int64 = 0; 21 for (int i = 0; i < num_samples; ++i) { 22 sum_int64 += samples[i]; 23 ssq_int64 += samples[i] * samples[i]; 24 } 25 // now convert to floats. 26 double sum = static_cast<double>(sum_int64); 27 sum /= num_samples; 28 double ssq = static_cast<double>(ssq_int64); 29 return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum))); 30 } 31 32 int64 Secs2Usecs(float seconds) { 33 return static_cast<int64>(0.5 + (1.0e6 * seconds)); 34 } 35 36 float GetDecibel(float value) { 37 if (value > 1.0e-100) 38 return 20 * log10(value); 39 return -2000.0; 40 } 41 42 } // namespace 43 44 namespace content { 45 46 // Stores threshold-crossing histories for making decisions about the speech 47 // state. 48 class EnergyEndpointer::HistoryRing { 49 public: 50 HistoryRing() : insertion_index_(0) {} 51 52 // Resets the ring to |size| elements each with state |initial_state| 53 void SetRing(int size, bool initial_state); 54 55 // Inserts a new entry into the ring and drops the oldest entry. 56 void Insert(int64 time_us, bool decision); 57 58 // Returns the time in microseconds of the most recently added entry. 59 int64 EndTime() const; 60 61 // Returns the sum of all intervals during which 'decision' is true within 62 // the time in seconds specified by 'duration'. The returned interval is 63 // in seconds. 64 float RingSum(float duration_sec); 65 66 private: 67 struct DecisionPoint { 68 int64 time_us; 69 bool decision; 70 }; 71 72 std::vector<DecisionPoint> decision_points_; 73 int insertion_index_; // Index at which the next item gets added/inserted. 74 75 DISALLOW_COPY_AND_ASSIGN(HistoryRing); 76 }; 77 78 void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) { 79 insertion_index_ = 0; 80 decision_points_.clear(); 81 DecisionPoint init = { -1, initial_state }; 82 decision_points_.resize(size, init); 83 } 84 85 void EnergyEndpointer::HistoryRing::Insert(int64 time_us, bool decision) { 86 decision_points_[insertion_index_].time_us = time_us; 87 decision_points_[insertion_index_].decision = decision; 88 insertion_index_ = (insertion_index_ + 1) % decision_points_.size(); 89 } 90 91 int64 EnergyEndpointer::HistoryRing::EndTime() const { 92 int ind = insertion_index_ - 1; 93 if (ind < 0) 94 ind = decision_points_.size() - 1; 95 return decision_points_[ind].time_us; 96 } 97 98 float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { 99 if (!decision_points_.size()) 100 return 0.0; 101 102 int64 sum_us = 0; 103 int ind = insertion_index_ - 1; 104 if (ind < 0) 105 ind = decision_points_.size() - 1; 106 int64 end_us = decision_points_[ind].time_us; 107 bool is_on = decision_points_[ind].decision; 108 int64 start_us = end_us - static_cast<int64>(0.5 + (1.0e6 * duration_sec)); 109 if (start_us < 0) 110 start_us = 0; 111 size_t n_summed = 1; // n points ==> (n-1) intervals 112 while ((decision_points_[ind].time_us > start_us) && 113 (n_summed < decision_points_.size())) { 114 --ind; 115 if (ind < 0) 116 ind = decision_points_.size() - 1; 117 if (is_on) 118 sum_us += end_us - decision_points_[ind].time_us; 119 is_on = decision_points_[ind].decision; 120 end_us = decision_points_[ind].time_us; 121 n_summed++; 122 } 123 124 return 1.0e-6f * sum_us; // Returns total time that was super threshold. 125 } 126 127 EnergyEndpointer::EnergyEndpointer() 128 : status_(EP_PRE_SPEECH), 129 offset_confirm_dur_sec_(0), 130 endpointer_time_us_(0), 131 fast_update_frames_(0), 132 frame_counter_(0), 133 max_window_dur_(4.0), 134 sample_rate_(0), 135 history_(new HistoryRing()), 136 decision_threshold_(0), 137 estimating_environment_(false), 138 noise_level_(0), 139 rms_adapt_(0), 140 start_lag_(0), 141 end_lag_(0), 142 user_input_start_time_us_(0) { 143 } 144 145 EnergyEndpointer::~EnergyEndpointer() { 146 } 147 148 int EnergyEndpointer::TimeToFrame(float time) const { 149 return static_cast<int32>(0.5 + (time / params_.frame_period())); 150 } 151 152 void EnergyEndpointer::Restart(bool reset_threshold) { 153 status_ = EP_PRE_SPEECH; 154 user_input_start_time_us_ = 0; 155 156 if (reset_threshold) { 157 decision_threshold_ = params_.decision_threshold(); 158 rms_adapt_ = decision_threshold_; 159 noise_level_ = params_.decision_threshold() / 2.0f; 160 frame_counter_ = 0; // Used for rapid initial update of levels. 161 } 162 163 // Set up the memories to hold the history windows. 164 history_->SetRing(TimeToFrame(max_window_dur_), false); 165 166 // Flag that indicates that current input should be used for 167 // estimating the environment. The user has not yet started input 168 // by e.g. pressed the push-to-talk button. By default, this is 169 // false for backward compatibility. 170 estimating_environment_ = false; 171 } 172 173 void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { 174 params_ = params; 175 176 // Find the longest history interval to be used, and make the ring 177 // large enough to accommodate that number of frames. NOTE: This 178 // depends upon ep_frame_period being set correctly in the factory 179 // that did this instantiation. 180 max_window_dur_ = params_.onset_window(); 181 if (params_.speech_on_window() > max_window_dur_) 182 max_window_dur_ = params_.speech_on_window(); 183 if (params_.offset_window() > max_window_dur_) 184 max_window_dur_ = params_.offset_window(); 185 Restart(true); 186 187 offset_confirm_dur_sec_ = params_.offset_window() - 188 params_.offset_confirm_dur(); 189 if (offset_confirm_dur_sec_ < 0.0) 190 offset_confirm_dur_sec_ = 0.0; 191 192 user_input_start_time_us_ = 0; 193 194 // Flag that indicates that current input should be used for 195 // estimating the environment. The user has not yet started input 196 // by e.g. pressed the push-to-talk button. By default, this is 197 // false for backward compatibility. 198 estimating_environment_ = false; 199 // The initial value of the noise and speech levels is inconsequential. 200 // The level of the first frame will overwrite these values. 201 noise_level_ = params_.decision_threshold() / 2.0f; 202 fast_update_frames_ = 203 static_cast<int64>(params_.fast_update_dur() / params_.frame_period()); 204 205 frame_counter_ = 0; // Used for rapid initial update of levels. 206 207 sample_rate_ = params_.sample_rate(); 208 start_lag_ = static_cast<int>(sample_rate_ / 209 params_.max_fundamental_frequency()); 210 end_lag_ = static_cast<int>(sample_rate_ / 211 params_.min_fundamental_frequency()); 212 } 213 214 void EnergyEndpointer::StartSession() { 215 Restart(true); 216 } 217 218 void EnergyEndpointer::EndSession() { 219 status_ = EP_POST_SPEECH; 220 } 221 222 void EnergyEndpointer::SetEnvironmentEstimationMode() { 223 Restart(true); 224 estimating_environment_ = true; 225 } 226 227 void EnergyEndpointer::SetUserInputMode() { 228 estimating_environment_ = false; 229 user_input_start_time_us_ = endpointer_time_us_; 230 } 231 232 void EnergyEndpointer::ProcessAudioFrame(int64 time_us, 233 const int16* samples, 234 int num_samples, 235 float* rms_out) { 236 endpointer_time_us_ = time_us; 237 float rms = RMS(samples, num_samples); 238 239 // Check that this is user input audio vs. pre-input adaptation audio. 240 // Input audio starts when the user indicates start of input, by e.g. 241 // pressing push-to-talk. Audio received prior to that is used to update 242 // noise and speech level estimates. 243 if (!estimating_environment_) { 244 bool decision = false; 245 if ((endpointer_time_us_ - user_input_start_time_us_) < 246 Secs2Usecs(params_.contamination_rejection_period())) { 247 decision = false; 248 DVLOG(1) << "decision: forced to false, time: " << endpointer_time_us_; 249 } else { 250 decision = (rms > decision_threshold_); 251 } 252 253 history_->Insert(endpointer_time_us_, decision); 254 255 switch (status_) { 256 case EP_PRE_SPEECH: 257 if (history_->RingSum(params_.onset_window()) > 258 params_.onset_detect_dur()) { 259 status_ = EP_POSSIBLE_ONSET; 260 } 261 break; 262 263 case EP_POSSIBLE_ONSET: { 264 float tsum = history_->RingSum(params_.onset_window()); 265 if (tsum > params_.onset_confirm_dur()) { 266 status_ = EP_SPEECH_PRESENT; 267 } else { // If signal is not maintained, drop back to pre-speech. 268 if (tsum <= params_.onset_detect_dur()) 269 status_ = EP_PRE_SPEECH; 270 } 271 break; 272 } 273 274 case EP_SPEECH_PRESENT: { 275 // To induce hysteresis in the state residency, we allow a 276 // smaller residency time in the on_ring, than was required to 277 // enter the SPEECH_PERSENT state. 278 float on_time = history_->RingSum(params_.speech_on_window()); 279 if (on_time < params_.on_maintain_dur()) 280 status_ = EP_POSSIBLE_OFFSET; 281 break; 282 } 283 284 case EP_POSSIBLE_OFFSET: 285 if (history_->RingSum(params_.offset_window()) <= 286 offset_confirm_dur_sec_) { 287 // Note that this offset time may be beyond the end 288 // of the input buffer in a real-time system. It will be up 289 // to the RecognizerSession to decide what to do. 290 status_ = EP_PRE_SPEECH; // Automatically reset for next utterance. 291 } else { // If speech picks up again we allow return to SPEECH_PRESENT. 292 if (history_->RingSum(params_.speech_on_window()) >= 293 params_.on_maintain_dur()) 294 status_ = EP_SPEECH_PRESENT; 295 } 296 break; 297 298 default: 299 LOG(WARNING) << "Invalid case in switch: " << status_; 300 break; 301 } 302 303 // If this is a quiet, non-speech region, slowly adapt the detection 304 // threshold to be about 6dB above the average RMS. 305 if ((!decision) && (status_ == EP_PRE_SPEECH)) { 306 decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms); 307 rms_adapt_ = decision_threshold_; 308 } else { 309 // If this is in a speech region, adapt the decision threshold to 310 // be about 10dB below the average RMS. If the noise level is high, 311 // the threshold is pushed up. 312 // Adaptation up to a higher level is 5 times faster than decay to 313 // a lower level. 314 if ((status_ == EP_SPEECH_PRESENT) && decision) { 315 if (rms_adapt_ > rms) { 316 rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms); 317 } else { 318 rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms); 319 } 320 float target_threshold = 0.3f * rms_adapt_ + noise_level_; 321 decision_threshold_ = (.90f * decision_threshold_) + 322 (0.10f * target_threshold); 323 } 324 } 325 326 // Set a floor 327 if (decision_threshold_ < params_.min_decision_threshold()) 328 decision_threshold_ = params_.min_decision_threshold(); 329 } 330 331 // Update speech and noise levels. 332 UpdateLevels(rms); 333 ++frame_counter_; 334 335 if (rms_out) 336 *rms_out = GetDecibel(rms); 337 } 338 339 float EnergyEndpointer::GetNoiseLevelDb() const { 340 return GetDecibel(noise_level_); 341 } 342 343 void EnergyEndpointer::UpdateLevels(float rms) { 344 // Update quickly initially. We assume this is noise and that 345 // speech is 6dB above the noise. 346 if (frame_counter_ < fast_update_frames_) { 347 // Alpha increases from 0 to (k-1)/k where k is the number of time 348 // steps in the initial adaptation period. 349 float alpha = static_cast<float>(frame_counter_) / 350 static_cast<float>(fast_update_frames_); 351 noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms); 352 DVLOG(1) << "FAST UPDATE, frame_counter_ " << frame_counter_ 353 << ", fast_update_frames_ " << fast_update_frames_; 354 } else { 355 // Update Noise level. The noise level adapts quickly downward, but 356 // slowly upward. The noise_level_ parameter is not currently used 357 // for threshold adaptation. It is used for UI feedback. 358 if (noise_level_ < rms) 359 noise_level_ = (0.999f * noise_level_) + (0.001f * rms); 360 else 361 noise_level_ = (0.95f * noise_level_) + (0.05f * rms); 362 } 363 if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) { 364 decision_threshold_ = noise_level_ * 2; // 6dB above noise level. 365 // Set a floor 366 if (decision_threshold_ < params_.min_decision_threshold()) 367 decision_threshold_ = params_.min_decision_threshold(); 368 } 369 } 370 371 EpStatus EnergyEndpointer::Status(int64* status_time) const { 372 *status_time = history_->EndTime(); 373 return status_; 374 } 375 376 } // namespace content 377