Home | History | Annotate | Download | only in research
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.research;
     18 
     19 import android.util.Log;
     20 
     21 import com.android.inputmethod.annotations.UsedForTesting;
     22 import com.android.inputmethod.latin.Dictionary;
     23 import com.android.inputmethod.latin.Suggest;
     24 import com.android.inputmethod.latin.define.ProductionFlag;
     25 
     26 import java.io.IOException;
     27 import java.util.ArrayList;
     28 import java.util.LinkedList;
     29 
     30 /**
     31  * MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees.
     32  *
     33  * There are three forms of privacy protection: 1) only words in the main dictionary are allowed to
     34  * be logged in enough detail to determine their contents, 2) only a subset of words are logged
     35  * in detail, such as 10%, and 3) no numbers are logged.
     36  *
     37  * This class maintains a list of LogUnits, each corresponding to a word.  As the user completes
     38  * words, they are added here.  But if the user backs up over their current word to edit a word
     39  * entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of
     40  * the LogUnit, and it is pushed back in here when the user is done.  Because words may be pulled
     41  * back out even after they are pushed in, we must not publish the contents of this LogBuffer too
     42  * quickly.  However, we cannot let the contents pile up either, or it will limit the editing that
     43  * a user can perform.
     44  *
     45  * To balance these requirements (keep history so user can edit, flush history so it does not pile
     46  * up), the LogBuffer is considered "complete" when the user has entered enough words to form an
     47  * n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above).
     48  * Once complete, the n-gram may be published to flash storage (via the ResearchLog class).
     49  * However, the additional non-detailed words are retained, in case the user backspaces to edit
     50  * them.  The MainLogBuffer then continues to add words, publishing individual non-detailed words
     51  * as new words arrive.  After enough non-detailed words have been pushed out to account for the
     52  * 90% between words, the words at the front of the LogBuffer can be published as an n-gram again.
     53  *
     54  * If the words that would form the valid n-gram are not in the dictionary, then words are pushed
     55  * through the LogBuffer one at a time until an n-gram is found that is entirely composed of
     56  * dictionary words.
     57  *
     58  * If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded
     59  * n-gram containing dictionary words.
     60  */
     61 public abstract class MainLogBuffer extends FixedLogBuffer {
     62     private static final String TAG = MainLogBuffer.class.getSimpleName();
     63     private static final boolean DEBUG = false
     64             && ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG;
     65 
     66     // The size of the n-grams logged.  E.g. N_GRAM_SIZE = 2 means to sample bigrams.
     67     public static final int N_GRAM_SIZE = 2;
     68 
     69     // TODO: Remove dependence on Suggest, and pass in Dictionary as a parameter to an appropriate
     70     // method.
     71     private final Suggest mSuggest;
     72     @UsedForTesting
     73     private Dictionary mDictionaryForTesting;
     74     private boolean mIsStopping = false;
     75 
     76     /* package for test */ int mNumWordsBetweenNGrams;
     77 
     78     // Counter for words left to suppress before an n-gram can be sampled.  Reset to mMinWordPeriod
     79     // after a sample is taken.
     80     /* package for test */ int mNumWordsUntilSafeToSample;
     81 
     82     public MainLogBuffer(final int wordsBetweenSamples, final int numInitialWordsToIgnore,
     83             final Suggest suggest) {
     84         super(N_GRAM_SIZE + wordsBetweenSamples);
     85         mNumWordsBetweenNGrams = wordsBetweenSamples;
     86         mNumWordsUntilSafeToSample = DEBUG ? 0 : numInitialWordsToIgnore;
     87         mSuggest = suggest;
     88     }
     89 
     90     @UsedForTesting
     91     /* package for test */ void setDictionaryForTesting(final Dictionary dictionary) {
     92         mDictionaryForTesting = dictionary;
     93     }
     94 
     95     private Dictionary getDictionary() {
     96         if (mDictionaryForTesting != null) {
     97             return mDictionaryForTesting;
     98         }
     99         if (mSuggest == null || !mSuggest.hasMainDictionary()) return null;
    100         return mSuggest.getMainDictionary();
    101     }
    102 
    103     public void setIsStopping() {
    104         mIsStopping = true;
    105     }
    106 
    107     /**
    108      * Determines whether uploading the n words at the front the MainLogBuffer will not violate
    109      * user privacy.
    110      *
    111      * The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
    112      * non-character data that is typed between words.  The decision about privacy is made based on
    113      * the buffer's entire content.  If it is decided that the privacy risks are too great to upload
    114      * the contents of this buffer, a censored version of the LogItems may still be uploaded.  E.g.,
    115      * the screen orientation and other characteristics about the device can be uploaded without
    116      * revealing much about the user.
    117      */
    118     private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) {
    119         // Bypass privacy checks when debugging.
    120         if (ResearchLogger.IS_LOGGING_EVERYTHING) {
    121             if (mIsStopping) {
    122                 return true;
    123             }
    124             // Only check that it is the right length.  If not, wait for later words to make
    125             // complete n-grams.
    126             int numWordsInLogUnitList = 0;
    127             final int length = logUnits.size();
    128             for (int i = 0; i < length; i++) {
    129                 final LogUnit logUnit = logUnits.get(i);
    130                 numWordsInLogUnitList += logUnit.getNumWords();
    131             }
    132             return numWordsInLogUnitList >= minNGramSize;
    133         }
    134 
    135         // Check that we are not sampling too frequently.  Having sampled recently might disclose
    136         // too much of the user's intended meaning.
    137         if (mNumWordsUntilSafeToSample > 0) {
    138             return false;
    139         }
    140         // Reload the dictionary in case it has changed (e.g., because the user has changed
    141         // languages).
    142         final Dictionary dictionary = getDictionary();
    143         if (dictionary == null) {
    144             // Main dictionary is unavailable.  Since we cannot check it, we cannot tell if a
    145             // word is out-of-vocabulary or not.  Therefore, we must judge the entire buffer
    146             // contents to potentially pose a privacy risk.
    147             return false;
    148         }
    149 
    150         // Check each word in the buffer.  If any word poses a privacy threat, we cannot upload
    151         // the complete buffer contents in detail.
    152         int numWordsInLogUnitList = 0;
    153         final int length = logUnits.size();
    154         for (final LogUnit logUnit : logUnits) {
    155             if (!logUnit.hasOneOrMoreWords()) {
    156                 // Digits outside words are a privacy threat.
    157                 if (logUnit.mayContainDigit()) {
    158                     return false;
    159                 }
    160             } else {
    161                 numWordsInLogUnitList += logUnit.getNumWords();
    162                 final String[] words = logUnit.getWordsAsStringArray();
    163                 for (final String word : words) {
    164                     // Words not in the dictionary are a privacy threat.
    165                     if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
    166                         if (DEBUG) {
    167                             Log.d(TAG, "\"" + word + "\" NOT SAFE!: hasLetters: "
    168                                     + ResearchLogger.hasLetters(word)
    169                                     + ", isValid: " + (dictionary.isValidWord(word)));
    170                         }
    171                         return false;
    172                     }
    173                 }
    174             }
    175         }
    176 
    177         // Finally, only return true if the ngram is the right size.
    178         return numWordsInLogUnitList == minNGramSize;
    179     }
    180 
    181     public void shiftAndPublishAll() throws IOException {
    182         final LinkedList<LogUnit> logUnits = getLogUnits();
    183         while (!logUnits.isEmpty()) {
    184             publishLogUnitsAtFrontOfBuffer();
    185         }
    186     }
    187 
    188     @Override
    189     protected final void onBufferFull() {
    190         try {
    191             publishLogUnitsAtFrontOfBuffer();
    192         } catch (final IOException e) {
    193             if (DEBUG) {
    194                 Log.w(TAG, "IOException when publishing front of LogBuffer", e);
    195             }
    196         }
    197     }
    198 
    199     protected final void publishLogUnitsAtFrontOfBuffer() throws IOException {
    200         // TODO: Refactor this method to require fewer passes through the LogUnits.  Should really
    201         // require only one pass.
    202         ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE);
    203         if (isSafeNGram(logUnits, N_GRAM_SIZE)) {
    204             // Good n-gram at the front of the buffer.  Publish it, disclosing details.
    205             publish(logUnits, true /* canIncludePrivateData */);
    206             shiftOutWords(N_GRAM_SIZE);
    207             mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams;
    208             return;
    209         }
    210         // No good n-gram at front, and buffer is full.  Shift out up through the first logUnit
    211         // with associated words (or if there is none, all the existing logUnits).
    212         logUnits.clear();
    213         LogUnit logUnit = shiftOut();
    214         while (logUnit != null) {
    215             logUnits.add(logUnit);
    216             final int numWords = logUnit.getNumWords();
    217             if (numWords > 0) {
    218                 mNumWordsUntilSafeToSample = Math.max(0, mNumWordsUntilSafeToSample - numWords);
    219                 break;
    220             }
    221             logUnit = shiftOut();
    222         }
    223         publish(logUnits, false /* canIncludePrivateData */);
    224     }
    225 
    226     /**
    227      * Called when a list of logUnits should be published.
    228      *
    229      * It is the subclass's responsibility to implement the publication.
    230      *
    231      * @param logUnits The list of logUnits to be published.
    232      * @param canIncludePrivateData Whether the private data in the logUnits can be included in
    233      * publication.
    234      *
    235      * @throws IOException if publication to the log file is not possible
    236      */
    237     protected abstract void publish(final ArrayList<LogUnit> logUnits,
    238             final boolean canIncludePrivateData) throws IOException;
    239 
    240     @Override
    241     protected int shiftOutWords(final int numWords) {
    242         final int numWordsShiftedOut = super.shiftOutWords(numWords);
    243         mNumWordsUntilSafeToSample = Math.max(0, mNumWordsUntilSafeToSample - numWordsShiftedOut);
    244         if (DEBUG) {
    245             Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample);
    246         }
    247         return numWordsShiftedOut;
    248     }
    249 }
    250