Home | History | Annotate | Download | only in research
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.research;
     18 
     19 import android.util.Log;
     20 
     21 import com.android.inputmethod.annotations.UsedForTesting;
     22 import com.android.inputmethod.latin.Dictionary;
     23 import com.android.inputmethod.latin.Suggest;
     24 import com.android.inputmethod.latin.define.ProductionFlag;
     25 
     26 import java.io.IOException;
     27 import java.util.ArrayList;
     28 import java.util.LinkedList;
     29 
     30 /**
     31  * MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees.
     32  *
     33  * There are three forms of privacy protection: 1) only words in the main dictionary are allowed to
     34  * be logged in enough detail to determine their contents, 2) only a subset of words are logged
     35  * in detail, such as 10%, and 3) no numbers are logged.
     36  *
     37  * This class maintains a list of LogUnits, each corresponding to a word.  As the user completes
     38  * words, they are added here.  But if the user backs up over their current word to edit a word
     39  * entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of
     40  * the LogUnit, and it is pushed back in here when the user is done.  Because words may be pulled
     41  * back out even after they are pushed in, we must not publish the contents of this LogBuffer too
     42  * quickly.  However, we cannot let the contents pile up either, or it will limit the editing that
     43  * a user can perform.
     44  *
     45  * To balance these requirements (keep history so user can edit, flush history so it does not pile
     46  * up), the LogBuffer is considered "complete" when the user has entered enough words to form an
     47  * n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above).
     48  * Once complete, the n-gram may be published to flash storage (via the ResearchLog class).
     49  * However, the additional non-detailed words are retained, in case the user backspaces to edit
     50  * them.  The MainLogBuffer then continues to add words, publishing individual non-detailed words
     51  * as new words arrive.  After enough non-detailed words have been pushed out to account for the
     52  * 90% between words, the words at the front of the LogBuffer can be published as an n-gram again.
     53  *
     54  * If the words that would form the valid n-gram are not in the dictionary, then words are pushed
     55  * through the LogBuffer one at a time until an n-gram is found that is entirely composed of
     56  * dictionary words.
     57  *
     58  * If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded
     59  * n-gram containing dictionary words.
     60  */
     61 public abstract class MainLogBuffer extends FixedLogBuffer {
     62     private static final String TAG = MainLogBuffer.class.getSimpleName();
     63     private static final boolean DEBUG = false
     64             && ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG;
     65 
     66     // Keep consistent with switch statement in Statistics.recordPublishabilityResultCode()
     67     public static final int PUBLISHABILITY_PUBLISHABLE = 0;
     68     public static final int PUBLISHABILITY_UNPUBLISHABLE_STOPPING = 1;
     69     public static final int PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT = 2;
     70     public static final int PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY = 3;
     71     public static final int PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE = 4;
     72     public static final int PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT = 5;
     73     public static final int PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY = 6;
     74 
     75     // The size of the n-grams logged.  E.g. N_GRAM_SIZE = 2 means to sample bigrams.
     76     public static final int N_GRAM_SIZE = 2;
     77 
     78     // TODO: Remove dependence on Suggest, and pass in Dictionary as a parameter to an appropriate
     79     // method.
     80     private final Suggest mSuggest;
     81     @UsedForTesting
     82     private Dictionary mDictionaryForTesting;
     83     private boolean mIsStopping = false;
     84 
     85     /* package for test */ int mNumWordsBetweenNGrams;
     86 
     87     // Counter for words left to suppress before an n-gram can be sampled.  Reset to mMinWordPeriod
     88     // after a sample is taken.
     89     /* package for test */ int mNumWordsUntilSafeToSample;
     90 
     91     public MainLogBuffer(final int wordsBetweenSamples, final int numInitialWordsToIgnore,
     92             final Suggest suggest) {
     93         super(N_GRAM_SIZE + wordsBetweenSamples);
     94         mNumWordsBetweenNGrams = wordsBetweenSamples;
     95         mNumWordsUntilSafeToSample = DEBUG ? 0 : numInitialWordsToIgnore;
     96         mSuggest = suggest;
     97     }
     98 
     99     @UsedForTesting
    100     /* package for test */ void setDictionaryForTesting(final Dictionary dictionary) {
    101         mDictionaryForTesting = dictionary;
    102     }
    103 
    104     private Dictionary getDictionary() {
    105         if (mDictionaryForTesting != null) {
    106             return mDictionaryForTesting;
    107         }
    108         if (mSuggest == null || !mSuggest.hasMainDictionary()) return null;
    109         return mSuggest.getMainDictionary();
    110     }
    111 
    112     public void setIsStopping() {
    113         mIsStopping = true;
    114     }
    115 
    116     /**
    117      * Determines whether the string determined by a series of LogUnits will not violate user
    118      * privacy if published.
    119      *
    120      * @param logUnits a LogUnit list to check for publishability
    121      * @param nGramSize the smallest n-gram acceptable to be published.  if
    122      * {@link ResearchLogger#IS_LOGGING_EVERYTHING} is true, then publish if there are more than
    123      * {@code minNGramSize} words in the logUnits, otherwise wait.  if {@link
    124      * ResearchLogger#IS_LOGGING_EVERYTHING} is false, then ensure that there are exactly nGramSize
    125      * words in the LogUnits.
    126      *
    127      * @return one of the {@code PUBLISHABILITY_*} result codes defined in this class.
    128      */
    129     private int getPublishabilityResultCode(final ArrayList<LogUnit> logUnits,
    130             final int nGramSize) {
    131         // Bypass privacy checks when debugging.
    132         if (ResearchLogger.IS_LOGGING_EVERYTHING) {
    133             if (mIsStopping) {
    134                 return PUBLISHABILITY_UNPUBLISHABLE_STOPPING;
    135             }
    136             // Only check that it is the right length.  If not, wait for later words to make
    137             // complete n-grams.
    138             int numWordsInLogUnitList = 0;
    139             final int length = logUnits.size();
    140             for (int i = 0; i < length; i++) {
    141                 final LogUnit logUnit = logUnits.get(i);
    142                 numWordsInLogUnitList += logUnit.getNumWords();
    143             }
    144             if (numWordsInLogUnitList >= nGramSize) {
    145                 return PUBLISHABILITY_PUBLISHABLE;
    146             } else {
    147                 return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT;
    148             }
    149         }
    150 
    151         // Check that we are not sampling too frequently.  Having sampled recently might disclose
    152         // too much of the user's intended meaning.
    153         if (mNumWordsUntilSafeToSample > 0) {
    154             return PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY;
    155         }
    156         // Reload the dictionary in case it has changed (e.g., because the user has changed
    157         // languages).
    158         final Dictionary dictionary = getDictionary();
    159         if (dictionary == null) {
    160             // Main dictionary is unavailable.  Since we cannot check it, we cannot tell if a
    161             // word is out-of-vocabulary or not.  Therefore, we must judge the entire buffer
    162             // contents to potentially pose a privacy risk.
    163             return PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE;
    164         }
    165 
    166         // Check each word in the buffer.  If any word poses a privacy threat, we cannot upload
    167         // the complete buffer contents in detail.
    168         int numWordsInLogUnitList = 0;
    169         final int length = logUnits.size();
    170         for (final LogUnit logUnit : logUnits) {
    171             if (!logUnit.hasOneOrMoreWords()) {
    172                 // Digits outside words are a privacy threat.
    173                 if (logUnit.mayContainDigit()) {
    174                     return PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT;
    175                 }
    176             } else {
    177                 numWordsInLogUnitList += logUnit.getNumWords();
    178                 final String[] words = logUnit.getWordsAsStringArray();
    179                 for (final String word : words) {
    180                     // Words not in the dictionary are a privacy threat.
    181                     if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
    182                         if (DEBUG) {
    183                             Log.d(TAG, "\"" + word + "\" NOT SAFE!: hasLetters: "
    184                                     + ResearchLogger.hasLetters(word)
    185                                     + ", isValid: " + (dictionary.isValidWord(word)));
    186                         }
    187                         return PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY;
    188                     }
    189                 }
    190             }
    191         }
    192 
    193         // Finally, only return true if the ngram is the right size.
    194         if (numWordsInLogUnitList == nGramSize) {
    195             return PUBLISHABILITY_PUBLISHABLE;
    196         } else {
    197             return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT;
    198         }
    199     }
    200 
    201     public void shiftAndPublishAll() throws IOException {
    202         final LinkedList<LogUnit> logUnits = getLogUnits();
    203         while (!logUnits.isEmpty()) {
    204             publishLogUnitsAtFrontOfBuffer();
    205         }
    206     }
    207 
    208     @Override
    209     protected final void onBufferFull() {
    210         try {
    211             publishLogUnitsAtFrontOfBuffer();
    212         } catch (final IOException e) {
    213             if (DEBUG) {
    214                 Log.w(TAG, "IOException when publishing front of LogBuffer", e);
    215             }
    216         }
    217     }
    218 
    219     /**
    220      * If there is a safe n-gram at the front of this log buffer, publish it with all details, and
    221      * remove the LogUnits that constitute it.
    222      *
    223      * An n-gram might not be "safe" if it violates privacy controls.  E.g., it might contain
    224      * numbers, an out-of-vocabulary word, or another n-gram may have been published recently.  If
    225      * there is no safe n-gram, then the LogUnits up through the first word-containing LogUnit are
    226      * published, but without disclosing any privacy-related details, such as the word the LogUnit
    227      * generated, motion data, etc.
    228      *
    229      * Note that a LogUnit can hold more than one word if the user types without explicit spaces.
    230      * In this case, the words may be grouped together in such a way that pulling an n-gram off the
    231      * front would require splitting a LogUnit.  Splitting a LogUnit is not possible, so this case
    232      * is treated just as the unsafe n-gram case.  This may cause n-grams to be sampled at slightly
    233      * less than the target frequency.
    234      */
    235     protected final void publishLogUnitsAtFrontOfBuffer() throws IOException {
    236         // TODO: Refactor this method to require fewer passes through the LogUnits.  Should really
    237         // require only one pass.
    238         ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE);
    239         final int publishabilityResultCode = getPublishabilityResultCode(logUnits, N_GRAM_SIZE);
    240         ResearchLogger.recordPublishabilityResultCode(publishabilityResultCode);
    241         if (publishabilityResultCode == MainLogBuffer.PUBLISHABILITY_PUBLISHABLE) {
    242             // Good n-gram at the front of the buffer.  Publish it, disclosing details.
    243             publish(logUnits, true /* canIncludePrivateData */);
    244             shiftOutWords(N_GRAM_SIZE);
    245             mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams;
    246             return;
    247         }
    248         // No good n-gram at front, and buffer is full.  Shift out up through the first logUnit
    249         // with associated words (or if there is none, all the existing logUnits).
    250         logUnits.clear();
    251         LogUnit logUnit = shiftOut();
    252         while (logUnit != null) {
    253             logUnits.add(logUnit);
    254             final int numWords = logUnit.getNumWords();
    255             if (numWords > 0) {
    256                 mNumWordsUntilSafeToSample = Math.max(0, mNumWordsUntilSafeToSample - numWords);
    257                 break;
    258             }
    259             logUnit = shiftOut();
    260         }
    261         publish(logUnits, false /* canIncludePrivateData */);
    262     }
    263 
    264     /**
    265      * Called when a list of logUnits should be published.
    266      *
    267      * It is the subclass's responsibility to implement the publication.
    268      *
    269      * @param logUnits The list of logUnits to be published.
    270      * @param canIncludePrivateData Whether the private data in the logUnits can be included in
    271      * publication.
    272      *
    273      * @throws IOException if publication to the log file is not possible
    274      */
    275     protected abstract void publish(final ArrayList<LogUnit> logUnits,
    276             final boolean canIncludePrivateData) throws IOException;
    277 
    278     @Override
    279     protected int shiftOutWords(final int numWords) {
    280         final int numWordsShiftedOut = super.shiftOutWords(numWords);
    281         mNumWordsUntilSafeToSample = Math.max(0, mNumWordsUntilSafeToSample - numWordsShiftedOut);
    282         if (DEBUG) {
    283             Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample);
    284         }
    285         return numWordsShiftedOut;
    286     }
    287 }
    288