Home | History | Annotate | Download | only in speech
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef CHROME_BROWSER_SPEECH_TTS_CONTROLLER_H_
      6 #define CHROME_BROWSER_SPEECH_TTS_CONTROLLER_H_
      7 
      8 #include <queue>
      9 #include <set>
     10 #include <string>
     11 #include <vector>
     12 
     13 #include "base/memory/scoped_ptr.h"
     14 #include "base/memory/singleton.h"
     15 #include "base/memory/weak_ptr.h"
     16 #include "url/gurl.h"
     17 
     18 class Utterance;
     19 class TtsPlatformImpl;
     20 class Profile;
     21 
     22 namespace base {
     23 class Value;
     24 }
     25 
     26 // Events sent back from the TTS engine indicating the progress.
     27 enum TtsEventType {
     28   TTS_EVENT_START,
     29   TTS_EVENT_END,
     30   TTS_EVENT_WORD,
     31   TTS_EVENT_SENTENCE,
     32   TTS_EVENT_MARKER,
     33   TTS_EVENT_INTERRUPTED,
     34   TTS_EVENT_CANCELLED,
     35   TTS_EVENT_ERROR,
     36   TTS_EVENT_PAUSE,
     37   TTS_EVENT_RESUME
     38 };
     39 
     40 enum TtsGenderType {
     41   TTS_GENDER_NONE,
     42   TTS_GENDER_MALE,
     43   TTS_GENDER_FEMALE
     44 };
     45 
     46 // Returns true if this event type is one that indicates an utterance
     47 // is finished and can be destroyed.
     48 bool IsFinalTtsEventType(TtsEventType event_type);
     49 
// The continuous (numeric) parameters that apply to a given utterance.
// Units and valid ranges are not specified in this header.
struct UtteranceContinuousParameters {
  // Defined out of line.
  // NOTE(review): presumably assigns default values to the fields below --
  // confirm against the definition.
  UtteranceContinuousParameters();

  double rate;    // Speaking rate.
  double pitch;   // Voice pitch.
  double volume;  // Speech volume.
};
     58 
// Information about one voice, as reported by TtsController::GetVoices().
struct VoiceData {
  VoiceData();
  ~VoiceData();

  // Name of the voice.
  std::string name;

  // Language of the voice.
  // NOTE(review): the expected string format is not stated here -- confirm.
  std::string lang;

  // Gender of the voice.
  TtsGenderType gender;

  // NOTE(review): presumably the ID of the extension implementing this voice
  // when |native| is false -- confirm against the code that fills it in.
  std::string extension_id;

  // Event types associated with this voice.
  // NOTE(review): presumably the events the voice is able to send -- confirm.
  std::set<TtsEventType> events;

  // If true, the synthesis engine is a remote network resource.
  // It may be higher latency and may incur bandwidth costs.
  bool remote;

  // If true, this is implemented by this platform's subclass of
  // TtsPlatformImpl. If false, this is implemented by an extension.
  bool native;

  // NOTE(review): presumably a platform-specific identifier that is only
  // meaningful when |native| is true -- confirm.
  std::string native_voice_identifier;
};
     79 
// Interface for a class that wants to receive events on utterances.
// An Utterance refers to its delegate through a base::WeakPtr (see
// Utterance::set_event_delegate).
class UtteranceEventDelegate {
 public:
  virtual ~UtteranceEventDelegate() {}

  // Called with an event concerning |utterance|.
  //   utterance:     the utterance the event refers to.
  //   event_type:    the kind of event being reported.
  //   char_index:    character position associated with the event.
  //   error_message: message text accompanying the event.
  //                  NOTE(review): presumably only meaningful for
  //                  TTS_EVENT_ERROR -- confirm.
  virtual void OnTtsEvent(Utterance* utterance,
                          TtsEventType event_type,
                          int char_index,
                          const std::string& error_message) = 0;
};
     89 
// Interface for a class that wants to be notified when the set of
// voices has changed. Delegates are registered and unregistered through
// TtsController::AddVoicesChangedDelegate() and
// TtsController::RemoveVoicesChangedDelegate().
class VoicesChangedDelegate {
 public:
  virtual ~VoicesChangedDelegate() {}

  // Notification that the set of voices has changed.
  virtual void OnVoicesChanged() = 0;
};
     97 
     98 // One speech utterance.
     99 class Utterance {
    100  public:
    101   // Construct an utterance given a profile and a completion task to call
    102   // when the utterance is done speaking. Before speaking this utterance,
    103   // its other parameters like text, rate, pitch, etc. should all be set.
    104   explicit Utterance(Profile* profile);
    105   ~Utterance();
    106 
    107   // Sends an event to the delegate. If the event type is TTS_EVENT_END
    108   // or TTS_EVENT_ERROR, deletes the utterance. If |char_index| is -1,
    109   // uses the last good value.
    110   void OnTtsEvent(TtsEventType event_type,
    111                   int char_index,
    112                   const std::string& error_message);
    113 
    114   // Finish an utterance without sending an event to the delegate.
    115   void Finish();
    116 
    117   // Getters and setters for the text to speak and other speech options.
    118   void set_text(const std::string& text) { text_ = text; }
    119   const std::string& text() const { return text_; }
    120 
    121   void set_options(const base::Value* options);
    122   const base::Value* options() const { return options_.get(); }
    123 
    124   void set_src_extension_id(const std::string& src_extension_id) {
    125     src_extension_id_ = src_extension_id;
    126   }
    127   const std::string& src_extension_id() { return src_extension_id_; }
    128 
    129   void set_src_id(int src_id) { src_id_ = src_id; }
    130   int src_id() { return src_id_; }
    131 
    132   void set_src_url(const GURL& src_url) { src_url_ = src_url; }
    133   const GURL& src_url() { return src_url_; }
    134 
    135   void set_voice_name(const std::string& voice_name) {
    136     voice_name_ = voice_name;
    137   }
    138   const std::string& voice_name() const { return voice_name_; }
    139 
    140   void set_lang(const std::string& lang) {
    141     lang_ = lang;
    142   }
    143   const std::string& lang() const { return lang_; }
    144 
    145   void set_gender(TtsGenderType gender) {
    146     gender_ = gender;
    147   }
    148   TtsGenderType gender() const { return gender_; }
    149 
    150   void set_continuous_parameters(const UtteranceContinuousParameters& params) {
    151     continuous_parameters_ = params;
    152   }
    153   const UtteranceContinuousParameters& continuous_parameters() {
    154     return continuous_parameters_;
    155   }
    156 
    157   void set_can_enqueue(bool can_enqueue) { can_enqueue_ = can_enqueue; }
    158   bool can_enqueue() const { return can_enqueue_; }
    159 
    160   void set_required_event_types(const std::set<TtsEventType>& types) {
    161     required_event_types_ = types;
    162   }
    163   const std::set<TtsEventType>& required_event_types() const {
    164     return required_event_types_;
    165   }
    166 
    167   void set_desired_event_types(const std::set<TtsEventType>& types) {
    168     desired_event_types_ = types;
    169   }
    170   const std::set<TtsEventType>& desired_event_types() const {
    171     return desired_event_types_;
    172   }
    173 
    174   const std::string& extension_id() const { return extension_id_; }
    175   void set_extension_id(const std::string& extension_id) {
    176     extension_id_ = extension_id;
    177   }
    178 
    179   UtteranceEventDelegate* event_delegate() const {
    180     return event_delegate_.get();
    181   }
    182   void set_event_delegate(
    183       base::WeakPtr<UtteranceEventDelegate> event_delegate) {
    184     event_delegate_ = event_delegate;
    185   }
    186 
    187   // Getters and setters for internal state.
    188   Profile* profile() const { return profile_; }
    189   int id() const { return id_; }
    190   bool finished() const { return finished_; }
    191 
    192  private:
    193   // The profile that initiated this utterance.
    194   Profile* profile_;
    195 
    196   // The extension ID of the extension providing TTS for this utterance, or
    197   // empty if native TTS is being used.
    198   std::string extension_id_;
    199 
    200   // The unique ID of this utterance, used to associate callback functions
    201   // with utterances.
    202   int id_;
    203 
    204   // The id of the next utterance, so we can associate requests with
    205   // responses.
    206   static int next_utterance_id_;
    207 
    208   // The text to speak.
    209   std::string text_;
    210 
    211   // The full options arg passed to tts.speak, which may include fields
    212   // other than the ones we explicitly parse, below.
    213   scoped_ptr<base::Value> options_;
    214 
    215   // The extension ID of the extension that called speak() and should
    216   // receive events.
    217   std::string src_extension_id_;
    218 
    219   // The source extension's ID of this utterance, so that it can associate
    220   // events with the appropriate callback.
    221   int src_id_;
    222 
    223   // The URL of the page where the source extension called speak.
    224   GURL src_url_;
    225 
    226   // The delegate to be called when an utterance event is fired.
    227   base::WeakPtr<UtteranceEventDelegate> event_delegate_;
    228 
    229   // The parsed options.
    230   std::string voice_name_;
    231   std::string lang_;
    232   TtsGenderType gender_;
    233   UtteranceContinuousParameters continuous_parameters_;
    234   bool can_enqueue_;
    235   std::set<TtsEventType> required_event_types_;
    236   std::set<TtsEventType> desired_event_types_;
    237 
    238   // The index of the current char being spoken.
    239   int char_index_;
    240 
    241   // True if this utterance received an event indicating it's done.
    242   bool finished_;
    243 };
    244 
// Singleton class that manages text-to-speech for the TTS and TTS engine
// extension APIs, maintaining a queue of pending utterances and keeping
// track of all state. Obtain the instance through GetInstance(); the
// constructor and destructor are not public.
class TtsController {
 public:
  // Get the single instance of this class.
  static TtsController* GetInstance();

  // Returns true if we're currently speaking an utterance.
  bool IsSpeaking();

  // Speak the given utterance. If the utterance's can_enqueue flag is true
  // and another utterance is in progress, adds it to the end of the queue.
  // Otherwise, interrupts any current utterance and speaks this one
  // immediately.
  void SpeakOrEnqueue(Utterance* utterance);

  // Stop all utterances and flush the queue. Implies leaving pause mode
  // as well.
  void Stop();

  // Pause the speech queue. Some engines may support pausing in the middle
  // of an utterance.
  void Pause();

  // Resume speaking.
  void Resume();

  // Handle events received from the speech engine. Events are forwarded to
  // the callback function, and in addition, completion and error events
  // trigger finishing the current utterance and starting the next one, if
  // any.
  //   utterance_id:  ID of the utterance the event refers to (compare
  //                  Utterance::id()).
  //   event_type:    the kind of event being reported.
  //   char_index:    character position associated with the event.
  //   error_message: message text accompanying the event.
  void OnTtsEvent(int utterance_id,
                  TtsEventType event_type,
                  int char_index,
                  const std::string& error_message);

  // Return a list of all available voices, including the native voice,
  // if supported, and all voices registered by extensions. The result is
  // written to |out_voices|.
  void GetVoices(Profile* profile, std::vector<VoiceData>* out_voices);

  // Called by TtsExtensionLoaderChromeOs::LoadTtsExtension when it
  // finishes loading the built-in TTS component extension.
  void RetrySpeakingQueuedUtterances();

  // Called by the extension system or platform implementation when the
  // list of voices may have changed and should be re-queried.
  void VoicesChanged();

  // Add a delegate that wants to be notified when the set of voices changes.
  // The delegate is held as a raw pointer in |voices_changed_delegates_|.
  void AddVoicesChangedDelegate(VoicesChangedDelegate* delegate);

  // Remove delegate that wants to be notified when the set of voices changes.
  void RemoveVoicesChangedDelegate(VoicesChangedDelegate* delegate);

  // For unit testing: inject a platform implementation (see
  // GetPlatformImpl()) and inspect the size of the utterance queue.
  void SetPlatformImpl(TtsPlatformImpl* platform_impl);
  int QueueSize();

 protected:
  TtsController();
  virtual ~TtsController();

 private:
  // Get the platform TTS implementation (or injected mock).
  TtsPlatformImpl* GetPlatformImpl();

  // Start speaking the given utterance. Will either take ownership of
  // |utterance| or delete it if there's an error. This function returns
  // void, so success or failure is not reported through a return value.
  void SpeakNow(Utterance* utterance);

  // Clear the utterance queue. If send_events is true, will send
  // TTS_EVENT_CANCELLED events on each one.
  void ClearUtteranceQueue(bool send_events);

  // Finalize and delete the current utterance.
  void FinishCurrentUtterance();

  // Start speaking the next utterance in the queue.
  void SpeakNextUtterance();

  // Given an utterance and a vector of voices, return the
  // index of the voice that best matches the utterance.
  // NOTE(review): |voices| is taken by non-const reference; whether it is
  // modified is not visible from this header -- confirm.
  int GetMatchingVoice(const Utterance* utterance,
                       std::vector<VoiceData>& voices);

  // Gives the singleton traits type access to the non-public constructor
  // and destructor.
  friend struct DefaultSingletonTraits<TtsController>;

  // The current utterance being spoken.
  Utterance* current_utterance_;

  // Whether the queue is paused or not.
  bool paused_;

  // A queue of utterances to speak after the current one finishes.
  std::queue<Utterance*> utterance_queue_;

  // A set of delegates that want to be notified when the voices change.
  std::set<VoicesChangedDelegate*> voices_changed_delegates_;

  // A pointer to the platform implementation of text-to-speech, for
  // dependency injection.
  TtsPlatformImpl* platform_impl_;

  DISALLOW_COPY_AND_ASSIGN(TtsController);
};
    351 
    352 #endif  // CHROME_BROWSER_SPEECH_TTS_CONTROLLER_H_
    353