Home | History | Annotate | Download | only in endpointer
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
      6 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
      7 
      8 #include "base/basictypes.h"
      9 #include "content/browser/speech/endpointer/energy_endpointer.h"
     10 #include "content/common/content_export.h"
     11 
     12 class EpStatus;
     13 
     14 namespace content {
     15 
     16 class AudioChunk;
     17 
     18 // A simple interface to the underlying energy-endpointer implementation. This
     19 // class lets callers provide audio as it is being recorded and lets them poll
     20 // to find out when the user has stopped speaking.
     21 //
     22 // There are two events that may trigger the end of speech:
     23 //
     24 // speechInputPossiblyComplete event:
     25 //
     26 // Signals that silence/noise has been detected for a *short* amount of
     27 // time after some speech has been detected. It can be used for low-latency
     28 // UI feedback. To disable it, set its silence length to a large value.
     29 //
     30 // speechInputComplete event:
     31 //
     32 // This event is intended to signal end of input and to stop recording.
     33 // The amount of time to wait after speech is set by
     34 // speech_input_complete_silence_length_us_ and optionally two other
     35 // parameters (see below).
     36 // This time can be held constant, or can change as more speech is detected.
     37 // In the latter case, the time changes after a set amount of time from the
     38 // *beginning* of speech.  This is motivated by the expectation that there
     39 // will be two distinct types of inputs: short search queries and longer
     40 // dictation style input.
     41 //
     42 // Three parameters are used to define the piecewise constant timeout function.
     43 // The timeout length is speech_input_complete_silence_length until
     44 // long_speech_length, when it changes to
     45 // long_speech_input_complete_silence_length.
     46 class CONTENT_EXPORT Endpointer {
     47  public:
     48   explicit Endpointer(int sample_rate);
     49 
     50   // Start the endpointer. This should be called at the beginning of a session.
     51   void StartSession();
     52 
     53   // Stop the endpointer.
     54   void EndSession();
     55 
     56   // Start environment estimation. Audio will be used for environment estimation
     57   // i.e. noise level estimation.
     58   void SetEnvironmentEstimationMode();
     59 
     60   // Start user input. This should be called when the user indicates start of
     61   // input, e.g. by pressing a button.
     62   void SetUserInputMode();
     63 
     64   // Process a segment of audio, which may be more than one frame.
     65   // The status of the last frame will be returned.
     66   EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out);
     67 
     68   // Get the status of the endpointer.
     69   EpStatus Status(int64 *time_us);
     70 
     71   // Returns true if the endpointer detected reasonable audio levels above
     72   // background noise which could be user speech, false if not.
     73   bool DidStartReceivingSpeech() const {
     74     return speech_previously_detected_;
     75   }
     76 
     77   bool IsEstimatingEnvironment() const {
     78     return energy_endpointer_.estimating_environment();
     79   }
     80 
     81   void set_speech_input_complete_silence_length(int64 time_us) {
     82     speech_input_complete_silence_length_us_ = time_us;
     83   }
     84 
     85   void set_long_speech_input_complete_silence_length(int64 time_us) {
     86     long_speech_input_complete_silence_length_us_ = time_us;
     87   }
     88 
     89   void set_speech_input_possibly_complete_silence_length(int64 time_us) {
     90     speech_input_possibly_complete_silence_length_us_ = time_us;
     91   }
     92 
     93   void set_long_speech_length(int64 time_us) {
     94     long_speech_length_us_ = time_us;
     95   }
     96 
     97   bool speech_input_complete() const {
     98     return speech_input_complete_;
     99   }
    100 
    101   // RMS background noise level in dB.
    102   float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); }
    103 
    104  private:
    105   // Reset internal states. Helper method common to initial input utterance
    106   // and following input utternaces.
    107   void Reset();
    108 
    109   // Minimum allowable length of speech input.
    110   int64 speech_input_minimum_length_us_;
    111 
    112   // The speechInputPossiblyComplete event signals that silence/noise has been
    113   // detected for a *short* amount of time after some speech has been detected.
    114   // This proporty specifies the time period.
    115   int64 speech_input_possibly_complete_silence_length_us_;
    116 
    117   // The speechInputComplete event signals that silence/noise has been
    118   // detected for a *long* amount of time after some speech has been detected.
    119   // This property specifies the time period.
    120   int64 speech_input_complete_silence_length_us_;
    121 
    122   // Same as above, this specifies the required silence period after speech
    123   // detection. This period is used instead of
    124   // speech_input_complete_silence_length_ when the utterance is longer than
    125   // long_speech_length_. This parameter is optional.
    126   int64 long_speech_input_complete_silence_length_us_;
    127 
    128   // The period of time after which the endpointer should consider
    129   // long_speech_input_complete_silence_length_ as a valid silence period
    130   // instead of speech_input_complete_silence_length_. This parameter is
    131   // optional.
    132   int64 long_speech_length_us_;
    133 
    134   // First speech onset time, used in determination of speech complete timeout.
    135   int64 speech_start_time_us_;
    136 
    137   // Most recent end time, used in determination of speech complete timeout.
    138   int64 speech_end_time_us_;
    139 
    140   int64 audio_frame_time_us_;
    141   EpStatus old_ep_status_;
    142   bool waiting_for_speech_possibly_complete_timeout_;
    143   bool waiting_for_speech_complete_timeout_;
    144   bool speech_previously_detected_;
    145   bool speech_input_complete_;
    146   EnergyEndpointer energy_endpointer_;
    147   int sample_rate_;
    148   int32 frame_size_;
    149 };
    150 
    151 }  // namespace content
    152 
    153 #endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
    154