/*
 * Copyright (C) 2009 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.android.inputmethod.voice;

import com.android.inputmethod.latin.R;

import android.content.ContentResolver;
import android.content.Context;
import android.content.Intent;
import android.os.Build;
import android.os.Bundle;
import android.os.Handler;
import android.os.Message;
import android.speech.RecognitionListener;
import android.speech.RecognizerIntent;
import android.speech.SpeechRecognizer;
import android.util.Log;
import android.view.View;
import android.view.View.OnClickListener;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/**
 * Speech recognition input, including both user interface and a background
 * process to stream audio to the network recognizer. This class supplies a
 * View (getView()), which it updates as recognition occurs. The user of this
 * class is responsible for making the view visible to the user, as well as
 * handling various events returned through UiListener.
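 *
 * <p>A minimal usage sketch (the {@code imeService}, {@code uiListener}, and
 * {@code fieldContext} names below are illustrative assumptions, not members
 * of this class):
 * <pre>
 *   VoiceInput voiceInput = new VoiceInput(imeService, uiListener);
 *   // Add voiceInput.getView() to the IME's input view, then:
 *   voiceInput.startListening(fieldContext, false);
 *   // Transcripts arrive via uiListener.onVoiceResults(); cancellation is
 *   // reported via uiListener.onCancelVoice().
 * </pre>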
 */
public class VoiceInput implements OnClickListener {
    private static final String TAG = "VoiceInput";
    private static final String EXTRA_RECOGNITION_CONTEXT =
            "android.speech.extras.RECOGNITION_CONTEXT";
    private static final String EXTRA_CALLING_PACKAGE = "calling_package";

    private static final String DEFAULT_RECOMMENDED_PACKAGES =
            "com.android.mms " +
            "com.google.android.gm " +
            "com.google.android.talk " +
            "com.google.android.apps.googlevoice " +
            "com.android.email " +
            "com.android.browser ";

    // WARNING! Before enabling this, fix the problem with calling getExtractedText() in
    // landscape view. It causes extracted text updates to be rejected due to a token mismatch.
    public static boolean ENABLE_WORD_CORRECTIONS = false;

    // Dummy word suggestion which means "delete current word"
    public static final String DELETE_SYMBOL = " \u00D7 ";  // times symbol

    private Whitelist mRecommendedList;
    private Whitelist mBlacklist;

    private VoiceInputLogger mLogger;

    // Names of a few intent extras defined in VoiceSearch's RecognitionService.
    // These let us tweak the endpointer parameters.
    private static final String EXTRA_SPEECH_MINIMUM_LENGTH_MILLIS =
            "android.speech.extras.SPEECH_INPUT_MINIMUM_LENGTH_MILLIS";
    private static final String EXTRA_SPEECH_INPUT_COMPLETE_SILENCE_LENGTH_MILLIS =
            "android.speech.extras.SPEECH_INPUT_COMPLETE_SILENCE_LENGTH_MILLIS";
    private static final String EXTRA_SPEECH_INPUT_POSSIBLY_COMPLETE_SILENCE_LENGTH_MILLIS =
            "android.speech.extras.SPEECH_INPUT_POSSIBLY_COMPLETE_SILENCE_LENGTH_MILLIS";

    // The usual endpointer default value for input complete silence length is 0.5 seconds,
    // but that's used for things like voice search. For dictation-like voice input like this,
    // we go with a more liberal value of 1 second. This value will only be used if a value
    // is not provided from Gservices.
    private static final String INPUT_COMPLETE_SILENCE_LENGTH_DEFAULT_VALUE_MILLIS = "1000";

    // States of the recognition UI; part of this state is recorded for logging purposes.
    public static final int DEFAULT = 0;
    public static final int LISTENING = 1;
    public static final int WORKING = 2;
    public static final int ERROR = 3;

    private int mAfterVoiceInputDeleteCount = 0;
    private int mAfterVoiceInputInsertCount = 0;
    private int mAfterVoiceInputInsertPunctuationCount = 0;
    private int mAfterVoiceInputCursorPos = 0;
    private int mAfterVoiceInputSelectionSpan = 0;

    private int mState = DEFAULT;

    private static final int MSG_CLOSE_ERROR_DIALOG = 1;

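    // Auto-dismisses the error UI after a short delay (scheduled from onError())
    // and tells the client that voice input was cancelled.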
    private final Handler mHandler = new Handler() {
        @Override
        public void handleMessage(Message msg) {
            if (msg.what == MSG_CLOSE_ERROR_DIALOG) {
                mState = DEFAULT;
                mRecognitionView.finish();
                mUiListener.onCancelVoice();
            }
        }
    };

    /**
     * Events relating to the recognition UI. You must implement these.
     */
    public interface UiListener {

        /**
         * @param recognitionResults a set of transcripts for what the user
         *   spoke, sorted by likelihood.
         * @param alternatives for each word of the most likely transcript, a
         *   list of alternative words drawn from the other transcripts.
         */
        public void onVoiceResults(
            List<String> recognitionResults,
            Map<String, List<CharSequence>> alternatives);

        /**
         * Called when the user cancels speech recognition.
         */
        public void onCancelVoice();
    }

    private SpeechRecognizer mSpeechRecognizer;
    private RecognitionListener mRecognitionListener;
    private RecognitionView mRecognitionView;
    private UiListener mUiListener;
    private Context mContext;

    /**
     * @param context the service or activity in which we're running.
     * @param uiHandler object to receive events from VoiceInput.
     */
    public VoiceInput(Context context, UiListener uiHandler) {
        mLogger = VoiceInputLogger.getLogger(context);
        mRecognitionListener = new ImeRecognitionListener();
        mSpeechRecognizer = SpeechRecognizer.createSpeechRecognizer(context);
        mSpeechRecognizer.setRecognitionListener(mRecognitionListener);
        mUiListener = uiHandler;
        mContext = context;
        newView();

        String recommendedPackages = SettingsUtil.getSettingsString(
                context.getContentResolver(),
                SettingsUtil.LATIN_IME_VOICE_INPUT_RECOMMENDED_PACKAGES,
                DEFAULT_RECOMMENDED_PACKAGES);

        mRecommendedList = new Whitelist();
        for (String recommendedPackage : recommendedPackages.split("\\s+")) {
            mRecommendedList.addApp(recommendedPackage);
        }

        mBlacklist = new Whitelist();
        mBlacklist.addApp("com.android.setupwizard");
    }

    public void setCursorPos(int pos) {
        mAfterVoiceInputCursorPos = pos;
    }

    public int getCursorPos() {
        return mAfterVoiceInputCursorPos;
    }

    public void setSelectionSpan(int span) {
        mAfterVoiceInputSelectionSpan = span;
    }

    public int getSelectionSpan() {
        return mAfterVoiceInputSelectionSpan;
    }

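    // The counters below aggregate consecutive text modifications of the same
    // kind made after a voice input session. When a modification of a different
    // kind arrives, the pending counters for the other kinds are logged and
    // reset, so each logged event covers one uninterrupted run.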
    public void incrementTextModificationDeleteCount(int count) {
        mAfterVoiceInputDeleteCount += count;
        // Send up intents for other text modification types
        if (mAfterVoiceInputInsertCount > 0) {
            logTextModifiedByTypingInsertion(mAfterVoiceInputInsertCount);
            mAfterVoiceInputInsertCount = 0;
        }
        if (mAfterVoiceInputInsertPunctuationCount > 0) {
            logTextModifiedByTypingInsertionPunctuation(mAfterVoiceInputInsertPunctuationCount);
            mAfterVoiceInputInsertPunctuationCount = 0;
        }
    }

    public void incrementTextModificationInsertCount(int count) {
        mAfterVoiceInputInsertCount += count;
        if (mAfterVoiceInputSelectionSpan > 0) {
            // If text was highlighted before inserting the char, count this as
            // a delete.
            mAfterVoiceInputDeleteCount += mAfterVoiceInputSelectionSpan;
        }
        // Send up intents for other text modification types
        if (mAfterVoiceInputDeleteCount > 0) {
            logTextModifiedByTypingDeletion(mAfterVoiceInputDeleteCount);
            mAfterVoiceInputDeleteCount = 0;
        }
        if (mAfterVoiceInputInsertPunctuationCount > 0) {
            logTextModifiedByTypingInsertionPunctuation(mAfterVoiceInputInsertPunctuationCount);
            mAfterVoiceInputInsertPunctuationCount = 0;
        }
    }

    public void incrementTextModificationInsertPunctuationCount(int count) {
        mAfterVoiceInputInsertPunctuationCount += count;
        if (mAfterVoiceInputSelectionSpan > 0) {
            // If text was highlighted before inserting the char, count this as
            // a delete.
            mAfterVoiceInputDeleteCount += mAfterVoiceInputSelectionSpan;
        }
        // Send up intents for aggregated non-punctuation insertions
        if (mAfterVoiceInputDeleteCount > 0) {
            logTextModifiedByTypingDeletion(mAfterVoiceInputDeleteCount);
            mAfterVoiceInputDeleteCount = 0;
        }
        if (mAfterVoiceInputInsertCount > 0) {
            logTextModifiedByTypingInsertion(mAfterVoiceInputInsertCount);
            mAfterVoiceInputInsertCount = 0;
        }
    }

    public void flushAllTextModificationCounters() {
        if (mAfterVoiceInputInsertCount > 0) {
            logTextModifiedByTypingInsertion(mAfterVoiceInputInsertCount);
            mAfterVoiceInputInsertCount = 0;
        }
        if (mAfterVoiceInputDeleteCount > 0) {
            logTextModifiedByTypingDeletion(mAfterVoiceInputDeleteCount);
            mAfterVoiceInputDeleteCount = 0;
        }
        if (mAfterVoiceInputInsertPunctuationCount > 0) {
            logTextModifiedByTypingInsertionPunctuation(mAfterVoiceInputInsertPunctuationCount);
            mAfterVoiceInputInsertPunctuationCount = 0;
        }
    }

    /**
     * The configuration of the IME changed and may have caused the views to be laid out
     * again. Restore the state of the recognition view.
     */
    public void onConfigurationChanged() {
        mRecognitionView.restoreState();
    }

    /**
     * @return true if field is blacklisted for voice
     */
    public boolean isBlacklistedField(FieldContext context) {
        return mBlacklist.matches(context);
    }

    /**
     * Used to decide whether to show voice input hints for this field, etc.
     *
     * @return true if field is recommended for voice
     */
    public boolean isRecommendedField(FieldContext context) {
        return mRecommendedList.matches(context);
    }

    /**
     * Start listening for speech from the user. This will grab the microphone
     * and start updating the view provided by getView(). It is the caller's
     * responsibility to ensure that the view is visible to the user at this stage.
     *
     * @param context the same FieldContext supplied to voiceIsEnabled()
     * @param swipe whether this voice input was started by swipe, for logging purposes
     */
    public void startListening(FieldContext context, boolean swipe) {
        mState = DEFAULT;

        Locale locale = Locale.getDefault();
        String localeString = locale.getLanguage() + "-" + locale.getCountry();

        mLogger.start(localeString, swipe);

        mState = LISTENING;

        mRecognitionView.showInitializing();
        startListeningAfterInitialization(context);
    }

    /**
     * Called only after the recognition manager's initialization has completed.
     *
     * @param context context with which {@link #startListening(FieldContext, boolean)} was executed
     */
    private void startListeningAfterInitialization(FieldContext context) {
        Intent intent = makeIntent();
        intent.putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL, "");
        intent.putExtra(EXTRA_RECOGNITION_CONTEXT, context.getBundle());
        intent.putExtra(EXTRA_CALLING_PACKAGE, "VoiceIME");
        intent.putExtra(RecognizerIntent.EXTRA_MAX_RESULTS,
                SettingsUtil.getSettingsInt(
                        mContext.getContentResolver(),
                        SettingsUtil.LATIN_IME_MAX_VOICE_RESULTS,
                        1));

        // Get endpointer params from Gservices.
        // TODO: Consider caching these values for improved performance on slower devices.
        final ContentResolver cr = mContext.getContentResolver();
        putEndpointerExtra(
                cr,
                intent,
                SettingsUtil.LATIN_IME_SPEECH_MINIMUM_LENGTH_MILLIS,
                EXTRA_SPEECH_MINIMUM_LENGTH_MILLIS,
                null  /* rely on endpointer default */);
        putEndpointerExtra(
                cr,
                intent,
                SettingsUtil.LATIN_IME_SPEECH_INPUT_COMPLETE_SILENCE_LENGTH_MILLIS,
                EXTRA_SPEECH_INPUT_COMPLETE_SILENCE_LENGTH_MILLIS,
                INPUT_COMPLETE_SILENCE_LENGTH_DEFAULT_VALUE_MILLIS
                /* our default value is different from the endpointer's */);
        putEndpointerExtra(
                cr,
                intent,
                SettingsUtil.LATIN_IME_SPEECH_INPUT_POSSIBLY_COMPLETE_SILENCE_LENGTH_MILLIS,
                EXTRA_SPEECH_INPUT_POSSIBLY_COMPLETE_SILENCE_LENGTH_MILLIS,
                null  /* rely on endpointer default */);

        mSpeechRecognizer.startListening(intent);
    }

    /**
     * Gets the value of the provided Gservices key, attempts to parse it into a long,
     * and if successful, puts the long value as an extra in the provided intent.
     */
    private void putEndpointerExtra(ContentResolver cr, Intent i,
            String gservicesKey, String intentExtraKey, String defaultValue) {
        long l = -1;
        String s = SettingsUtil.getSettingsString(cr, gservicesKey, defaultValue);
        if (s != null) {
            try {
                l = Long.valueOf(s);
            } catch (NumberFormatException e) {
                Log.e(TAG, "could not parse value for " + gservicesKey + ": " + s);
            }
        }

        if (l != -1) i.putExtra(intentExtraKey, l);
    }

    public void destroy() {
        mSpeechRecognizer.destroy();
    }

    /**
     * Creates a new instance of the view that is returned by {@link #getView()}.
     * Clients should use this when a previously returned view is stuck in a
     * layout that is being thrown away and a new one is needed to show to the
     * user.
     */
    public void newView() {
        mRecognitionView = new RecognitionView(mContext, this);
    }

    /**
     * @return a view that shows the recognition flow, e.g. the "Speak now" and
     * "working" dialogs.
     */
    public View getView() {
        return mRecognitionView.getView();
    }

    /**
     * Handle the cancel button.
     */
    public void onClick(View view) {
        switch (view.getId()) {
            case R.id.button:
                cancel();
                break;
        }
    }

    public void logTextModifiedByTypingInsertion(int length) {
        mLogger.textModifiedByTypingInsertion(length);
    }

    public void logTextModifiedByTypingInsertionPunctuation(int length) {
        mLogger.textModifiedByTypingInsertionPunctuation(length);
    }

    public void logTextModifiedByTypingDeletion(int length) {
        mLogger.textModifiedByTypingDeletion(length);
    }

    public void logTextModifiedByChooseSuggestion(int length) {
        mLogger.textModifiedByChooseSuggestion(length);
    }

    public void logKeyboardWarningDialogShown() {
        mLogger.keyboardWarningDialogShown();
    }

    public void logKeyboardWarningDialogDismissed() {
        mLogger.keyboardWarningDialogDismissed();
    }

    public void logKeyboardWarningDialogOk() {
        mLogger.keyboardWarningDialogOk();
    }

    public void logKeyboardWarningDialogCancel() {
        mLogger.keyboardWarningDialogCancel();
    }

    public void logSwipeHintDisplayed() {
        mLogger.swipeHintDisplayed();
    }

    public void logPunctuationHintDisplayed() {
        mLogger.punctuationHintDisplayed();
    }

    public void logVoiceInputDelivered(int length) {
        mLogger.voiceInputDelivered(length);
    }

    public void logNBestChoose(int index) {
        mLogger.nBestChoose(index);
    }

    public void logInputEnded() {
        mLogger.inputEnded();
    }

    public void flushLogs() {
        mLogger.flush();
    }

    private static Intent makeIntent() {
        Intent intent = new Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH);

        // On Cupcake, use VoiceIMEHelper, since the VoiceSearch there doesn't
        // support this. On Donut, always use VoiceSearch, since VoiceIMEHelper
        // and VoiceSearch may conflict.
        if (Build.VERSION.RELEASE.equals("1.5")) {
            intent = intent.setClassName(
              "com.google.android.voiceservice",
              "com.google.android.voiceservice.IMERecognitionService");
        } else {
            intent = intent.setClassName(
              "com.google.android.voicesearch",
              "com.google.android.voicesearch.RecognitionService");
        }

        return intent;
    }

    /**
     * Cancel in-progress speech recognition.
     */
    public void cancel() {
        switch (mState) {
        case LISTENING:
            mLogger.cancelDuringListening();
            break;
        case WORKING:
            mLogger.cancelDuringWorking();
            break;
        case ERROR:
            mLogger.cancelDuringError();
            break;
        }
        mState = DEFAULT;

        // Remove all pending tasks (e.g., timers to cancel voice input)
        mHandler.removeMessages(MSG_CLOSE_ERROR_DIALOG);

        mSpeechRecognizer.cancel();
        mUiListener.onCancelVoice();
        mRecognitionView.finish();
    }

    private int getErrorStringId(int errorType, boolean endpointed) {
        switch (errorType) {
            // We use ERROR_CLIENT to signify that voice search is not available on the device.
            case SpeechRecognizer.ERROR_CLIENT:
                return R.string.voice_not_installed;
            case SpeechRecognizer.ERROR_NETWORK:
                return R.string.voice_network_error;
            case SpeechRecognizer.ERROR_NETWORK_TIMEOUT:
                return endpointed ?
                        R.string.voice_network_error : R.string.voice_too_much_speech;
            case SpeechRecognizer.ERROR_AUDIO:
                return R.string.voice_audio_error;
            case SpeechRecognizer.ERROR_SERVER:
                return R.string.voice_server_error;
            case SpeechRecognizer.ERROR_SPEECH_TIMEOUT:
                return R.string.voice_speech_timeout;
            case SpeechRecognizer.ERROR_NO_MATCH:
                return R.string.voice_no_match;
            default: return R.string.voice_error;
        }
    }

    private void onError(int errorType, boolean endpointed) {
        Log.i(TAG, "error " + errorType);
        mLogger.error(errorType);
        onError(mContext.getString(getErrorStringId(errorType, endpointed)));
    }

    private void onError(String error) {
        mState = ERROR;
        mRecognitionView.showError(error);
        // Wait a couple of seconds, then automatically dismiss the message.
        mHandler.sendMessageDelayed(Message.obtain(mHandler, MSG_CLOSE_ERROR_DIALOG), 2000);
    }

    private class ImeRecognitionListener implements RecognitionListener {
        // Waveform data
        final ByteArrayOutputStream mWaveBuffer = new ByteArrayOutputStream();
        int mSpeechStart;
        private boolean mEndpointed = false;

        public void onReadyForSpeech(Bundle noiseParams) {
            mRecognitionView.showListening();
        }

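        // Mark where speech starts in mWaveBuffer so that onEndOfSpeech() can
        // pass showWorking() just the spoken portion of the captured audio.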
        public void onBeginningOfSpeech() {
            mEndpointed = false;
            mSpeechStart = mWaveBuffer.size();
        }

        public void onRmsChanged(float rmsdB) {
            mRecognitionView.updateVoiceMeter(rmsdB);
        }

        public void onBufferReceived(byte[] buf) {
            try {
                mWaveBuffer.write(buf);
            } catch (IOException e) {
                // Ignore; losing waveform data only affects the visualization.
            }
        }

        public void onEndOfSpeech() {
            mEndpointed = true;
            mState = WORKING;
            mRecognitionView.showWorking(mWaveBuffer, mSpeechStart, mWaveBuffer.size());
        }

        public void onError(int errorType) {
            mState = ERROR;
            VoiceInput.this.onError(errorType, mEndpointed);
        }

        public void onResults(Bundle resultsBundle) {
            List<String> results = resultsBundle
                    .getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION);
            mState = DEFAULT;

            final Map<String, List<CharSequence>> alternatives =
                    new HashMap<String, List<CharSequence>>();
            if (results.size() >= 2 && ENABLE_WORD_CORRECTIONS) {
                final String[][] words = new String[results.size()][];
                for (int i = 0; i < words.length; i++) {
                    words[i] = results.get(i).split(" ");
                }

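                // Map each word of the top hypothesis onto the proportionally
                // corresponding span of words in every other hypothesis, and
                // collect up to six distinct alternates per word.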
                for (int key = 0; key < words[0].length; key++) {
                    alternatives.put(words[0][key], new ArrayList<CharSequence>());
                    for (int alt = 1; alt < words.length; alt++) {
                        int keyBegin = key * words[alt].length / words[0].length;
                        int keyEnd = (key + 1) * words[alt].length / words[0].length;

                        for (int i = keyBegin; i < Math.min(words[alt].length, keyEnd); i++) {
                            List<CharSequence> altList = alternatives.get(words[0][key]);
                            if (!altList.contains(words[alt][i]) && altList.size() < 6) {
                                altList.add(words[alt][i]);
                            }
                        }
                    }
                }
            }

            if (results.size() > 5) {
                results = results.subList(0, 5);
            }
            mUiListener.onVoiceResults(results, alternatives);
            mRecognitionView.finish();
        }

        public void onPartialResults(final Bundle partialResults) {
            // currently - do nothing
        }

        public void onEvent(int eventType, Bundle params) {
            // do nothing - reserved for events that might be added in the future
        }
    }
}