// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

/**
 * @fileoverview
 * This is a component extension that implements a text-to-speech (TTS)
 * engine powered by Google's speech synthesis API.
 *
 * This is an "event page", so it's not loaded when the API isn't being used,
 * and doesn't waste resources. When a web page or web app makes a speech
 * request and the parameters match one of the voices in this extension's
 * manifest, it makes a request to Google's API using Chrome's private key
 * and plays the resulting speech using HTML5 audio.
 */

     17 /**
     18  * The main class for this extension. Adds listeners to
     19  * chrome.ttsEngine.onSpeak and chrome.ttsEngine.onStop and implements
     20  * them using Google's speech synthesis API.
     21  * @constructor
     22  */
     23 function TtsExtension() {}
     24 
     25 TtsExtension.prototype = {
     26   /**
     27    * The url prefix of the speech server, including static query
     28    * parameters that don't change.
     29    * @type {string}
     30    * @const
     31    * @private
     32    */
     33   SPEECH_SERVER_URL_:
     34       'https://www.google.com/speech-api/v2/synthesize?' +
     35       'enc=mpeg&client=chromium',
     36 
     37   /**
     38    * A mapping from language and gender to voice name, hardcoded for now
     39    * until the speech synthesis server capabilities response provides this.
     40    * The key of this map is of the form '<lang>-<gender>'.
     41    * @type {Object.<string, string>}
     42    * @private
     43    */
     44   LANG_AND_GENDER_TO_VOICE_NAME_: {
     45     'en-gb-male': 'rjs',
     46     'en-gb-female': 'fis',
     47   },
     48 
     49   /**
     50    * The arguments passed to the onSpeak event handler for the utterance
     51    * that's currently being spoken. Should be null when no object is
     52    * pending.
     53    *
     54    * @type {?{utterance: string, options: Object, callback: Function}}
     55    * @private
     56    */
     57   currentUtterance_: null,
     58 
     59   /**
     60    * The HTML5 audio element we use for playing the sound served by the
     61    * speech server.
     62    * @type {HTMLAudioElement}
     63    * @private
     64    */
     65   audioElement_: null,
     66 
     67   /**
     68    * A mapping from voice name to language and gender, derived from the
     69    * manifest file.  This is used in case the speech synthesis request
     70    * specifies a voice name but doesn't specify a language code or gender.
     71    * @type {Object.<string, {lang: string, gender: string}>}
     72    * @private
     73    */
     74   voiceNameToLangAndGender_: {},
     75 
     76   /**
     77    * This is the main function called to initialize this extension.
     78    * Initializes data structures and adds event listeners.
     79    */
     80   init: function() {
     81     // Get voices from manifest.
     82     var voices = chrome.app.getDetails().tts_engine.voices;
     83     for (var i = 0; i < voices.length; i++) {
     84       this.voiceNameToLangAndGender_[voices[i].voice_name] = {
     85         lang: voices[i].lang,
     86         gender: voices[i].gender
     87       };
     88     }
     89 
     90     // Initialize the audio element and event listeners on it.
     91     this.audioElement_ = document.createElement('audio');
     92     document.body.appendChild(this.audioElement_);
     93     this.audioElement_.addEventListener(
     94         'ended', this.onStop_.bind(this), false);
     95     this.audioElement_.addEventListener(
     96         'canplaythrough', this.onStart_.bind(this), false);
     97 
     98     // Install event listeners for the ttsEngine API.
     99     chrome.ttsEngine.onSpeak.addListener(this.onSpeak_.bind(this));
    100     chrome.ttsEngine.onStop.addListener(this.onStop_.bind(this));
    101     chrome.ttsEngine.onPause.addListener(this.onPause_.bind(this));
    102     chrome.ttsEngine.onResume.addListener(this.onResume_.bind(this));
    103   },
    104 
    105   /**
    106    * Handler for the chrome.ttsEngine.onSpeak interface.
    107    * Gets Chrome's Google API key and then uses it to generate a request
    108    * url for the requested speech utterance. Sets that url as the source
    109    * of the HTML5 audio element.
    110    * @param {string} utterance The text to be spoken.
    111    * @param {Object} options Options to control the speech, as defined
    112    *     in the Chrome ttsEngine extension API.
    113    * @private
    114    */
    115   onSpeak_: function(utterance, options, callback) {
    116     // Truncate the utterance if it's too long. Both Chrome's tts
    117     // extension api and the web speech api specify 32k as the
    118     // maximum limit for an utterance.
    119     if (utterance.length > 32768)
    120       utterance = utterance.substr(0, 32768);
    121 
    122     try {
    123       // First, stop any pending audio.
    124       this.onStop_();
    125 
    126       this.currentUtterance_ = {
    127         utterance: utterance,
    128         options: options,
    129         callback: callback
    130       };
    131 
    132       var lang = options.lang;
    133       var gender = options.gender;
    134       if (options.voiceName) {
    135         lang = this.voiceNameToLangAndGender_[options.voiceName].lang;
    136         gender = this.voiceNameToLangAndGender_[options.voiceName].gender;
    137       }
    138 
    139       if (!lang)
    140         lang = navigator.language;
    141 
    142       // Look up the specific voice name for this language and gender.
    143       // If it's not in the map, it doesn't matter - the language will
    144       // be used directly. This is only used for languages where more
    145       // than one gender is actually available.
    146       var key = lang.toLowerCase() + '-' + gender;
    147       var voiceName = this.LANG_AND_GENDER_TO_VOICE_NAME_[key];
    148 
    149       var url = this.SPEECH_SERVER_URL_;
    150       chrome.systemPrivate.getApiKey((function(key) {
    151         url += '&key=' + key;
    152         url += '&text=' + encodeURIComponent(utterance);
    153         url += '&lang=' + lang.toLowerCase();
    154 
    155         if (voiceName)
    156           url += '&name=' + voiceName;
    157 
    158         if (options.rate) {
    159           // Input rate is between 0.1 and 10.0 with a default of 1.0.
    160           // Output speed is between 0.0 and 1.0 with a default of 0.5.
    161           url += '&speed=' + (options.rate / 2.0);
    162         }
    163 
    164         if (options.pitch) {
    165           // Input pitch is between 0.0 and 2.0 with a default of 1.0.
    166           // Output pitch is between 0.0 and 1.0 with a default of 0.5.
    167           url += '&pitch=' + (options.pitch / 2.0);
    168         }
    169 
    170         // This begins loading the audio but does not play it.
    171         // When enough of the audio has loaded to begin playback,
    172         // the 'canplaythrough' handler will call this.onStart_,
    173         // which sends a start event to the ttsEngine callback and
    174         // then begins playing audio.
    175         this.audioElement_.src = url;
    176       }).bind(this));
    177     } catch (err) {
    178       console.error(String(err));
    179       callback({
    180         'type': 'error',
    181         'errorMessage': String(err)
    182       });
    183       this.currentUtterance_ = null;
    184     }
    185   },
    186 
    187   /**
    188    * Handler for the chrome.ttsEngine.onStop interface.
    189    * Called either when the ttsEngine API requests us to stop, or when
    190    * we reach the end of the audio stream. Pause the audio element to
    191    * silence it, and send a callback to the ttsEngine API to let it know
    192    * that we've completed. Note that the ttsEngine API manages callback
    193    * messages and will automatically replace the 'end' event with a
    194    * more specific callback like 'interrupted' when sending it to the
    195    * TTS client.
    196    * @private
    197    */
    198   onStop_: function() {
    199     if (this.currentUtterance_) {
    200       this.audioElement_.pause();
    201       this.currentUtterance_.callback({
    202         'type': 'end',
    203         'charIndex': this.currentUtterance_.utterance.length
    204       });
    205     }
    206     this.currentUtterance_ = null;
    207   },
    208 
    209   /**
    210    * Handler for the canplaythrough event on the audio element.
    211    * Called when the audio element has buffered enough audio to begin
    212    * playback. Send the 'start' event to the ttsEngine callback and
    213    * then begin playing the audio element.
    214    * @private
    215    */
    216   onStart_: function() {
    217     if (this.currentUtterance_) {
    218       if (this.currentUtterance_.options.volume !== undefined) {
    219         // Both APIs use the same range for volume, between 0.0 and 1.0.
    220         this.audioElement_.volume = this.currentUtterance_.options.volume;
    221       }
    222       this.audioElement_.play();
    223       this.currentUtterance_.callback({
    224           'type': 'start',
    225           'charIndex': 0
    226       });
    227     }
    228   },
    229 
    230   /**
    231    * Handler for the chrome.ttsEngine.onPause interface.
    232    * Pauses audio if we're in the middle of an utterance.
    233    * @private
    234    */
    235   onPause_: function() {
    236     if (this.currentUtterance_) {
    237       this.audioElement_.pause();
    238     }
    239   },
    240 
    241   /**
    242    * Handler for the chrome.ttsEngine.onPause interface.
    243    * Resumes audio if we're in the middle of an utterance.
    244    * @private
    245    */
    246   onResume_: function() {
    247     if (this.currentUtterance_) {
    248       this.audioElement_.play();
    249     }
    250   }
    251 
    252 };
    253 
    254 (new TtsExtension()).init();
    255