1 /* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.research; 18 19 import android.util.Log; 20 21 import com.android.inputmethod.annotations.UsedForTesting; 22 import com.android.inputmethod.latin.Dictionary; 23 import com.android.inputmethod.latin.Suggest; 24 import com.android.inputmethod.latin.define.ProductionFlag; 25 26 import java.io.IOException; 27 import java.util.ArrayList; 28 import java.util.LinkedList; 29 30 /** 31 * MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees. 32 * 33 * There are three forms of privacy protection: 1) only words in the main dictionary are allowed to 34 * be logged in enough detail to determine their contents, 2) only a subset of words are logged 35 * in detail, such as 10%, and 3) no numbers are logged. 36 * 37 * This class maintains a list of LogUnits, each corresponding to a word. As the user completes 38 * words, they are added here. But if the user backs up over their current word to edit a word 39 * entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of 40 * the LogUnit, and it is pushed back in here when the user is done. Because words may be pulled 41 * back out even after they are pushed in, we must not publish the contents of this LogBuffer too 42 * quickly. However, we cannot let the contents pile up either, or it will limit the editing that 43 * a user can perform. 44 * 45 * To balance these requirements (keep history so user can edit, flush history so it does not pile 46 * up), the LogBuffer is considered "complete" when the user has entered enough words to form an 47 * n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above). 48 * Once complete, the n-gram may be published to flash storage (via the ResearchLog class). 49 * However, the additional non-detailed words are retained, in case the user backspaces to edit 50 * them. The MainLogBuffer then continues to add words, publishing individual non-detailed words 51 * as new words arrive. After enough non-detailed words have been pushed out to account for the 52 * 90% between words, the words at the front of the LogBuffer can be published as an n-gram again. 53 * 54 * If the words that would form the valid n-gram are not in the dictionary, then words are pushed 55 * through the LogBuffer one at a time until an n-gram is found that is entirely composed of 56 * dictionary words. 57 * 58 * If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded 59 * n-gram containing dictionary words. 60 */ 61 public abstract class MainLogBuffer extends FixedLogBuffer { 62 private static final String TAG = MainLogBuffer.class.getSimpleName(); 63 private static final boolean DEBUG = false 64 && ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG; 65 66 // Keep consistent with switch statement in Statistics.recordPublishabilityResultCode() 67 public static final int PUBLISHABILITY_PUBLISHABLE = 0; 68 public static final int PUBLISHABILITY_UNPUBLISHABLE_STOPPING = 1; 69 public static final int PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT = 2; 70 public static final int PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY = 3; 71 public static final int PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE = 4; 72 public static final int PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT = 5; 73 public static final int PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY = 6; 74 75 // The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams. 76 public static final int N_GRAM_SIZE = 2; 77 78 // TODO: Remove dependence on Suggest, and pass in Dictionary as a parameter to an appropriate 79 // method. 80 private final Suggest mSuggest; 81 @UsedForTesting 82 private Dictionary mDictionaryForTesting; 83 private boolean mIsStopping = false; 84 85 /* package for test */ int mNumWordsBetweenNGrams; 86 87 // Counter for words left to suppress before an n-gram can be sampled. Reset to mMinWordPeriod 88 // after a sample is taken. 89 /* package for test */ int mNumWordsUntilSafeToSample; 90 91 public MainLogBuffer(final int wordsBetweenSamples, final int numInitialWordsToIgnore, 92 final Suggest suggest) { 93 super(N_GRAM_SIZE + wordsBetweenSamples); 94 mNumWordsBetweenNGrams = wordsBetweenSamples; 95 mNumWordsUntilSafeToSample = DEBUG ? 0 : numInitialWordsToIgnore; 96 mSuggest = suggest; 97 } 98 99 @UsedForTesting 100 /* package for test */ void setDictionaryForTesting(final Dictionary dictionary) { 101 mDictionaryForTesting = dictionary; 102 } 103 104 private Dictionary getDictionary() { 105 if (mDictionaryForTesting != null) { 106 return mDictionaryForTesting; 107 } 108 if (mSuggest == null || !mSuggest.hasMainDictionary()) return null; 109 return mSuggest.getMainDictionary(); 110 } 111 112 public void setIsStopping() { 113 mIsStopping = true; 114 } 115 116 /** 117 * Determines whether the string determined by a series of LogUnits will not violate user 118 * privacy if published. 119 * 120 * @param logUnits a LogUnit list to check for publishability 121 * @param nGramSize the smallest n-gram acceptable to be published. if 122 * {@link ResearchLogger#IS_LOGGING_EVERYTHING} is true, then publish if there are more than 123 * {@code minNGramSize} words in the logUnits, otherwise wait. if {@link 124 * ResearchLogger#IS_LOGGING_EVERYTHING} is false, then ensure that there are exactly nGramSize 125 * words in the LogUnits. 126 * 127 * @return one of the {@code PUBLISHABILITY_*} result codes defined in this class. 128 */ 129 private int getPublishabilityResultCode(final ArrayList<LogUnit> logUnits, 130 final int nGramSize) { 131 // Bypass privacy checks when debugging. 132 if (ResearchLogger.IS_LOGGING_EVERYTHING) { 133 if (mIsStopping) { 134 return PUBLISHABILITY_UNPUBLISHABLE_STOPPING; 135 } 136 // Only check that it is the right length. If not, wait for later words to make 137 // complete n-grams. 138 int numWordsInLogUnitList = 0; 139 final int length = logUnits.size(); 140 for (int i = 0; i < length; i++) { 141 final LogUnit logUnit = logUnits.get(i); 142 numWordsInLogUnitList += logUnit.getNumWords(); 143 } 144 if (numWordsInLogUnitList >= nGramSize) { 145 return PUBLISHABILITY_PUBLISHABLE; 146 } else { 147 return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT; 148 } 149 } 150 151 // Check that we are not sampling too frequently. Having sampled recently might disclose 152 // too much of the user's intended meaning. 153 if (mNumWordsUntilSafeToSample > 0) { 154 return PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY; 155 } 156 // Reload the dictionary in case it has changed (e.g., because the user has changed 157 // languages). 158 final Dictionary dictionary = getDictionary(); 159 if (dictionary == null) { 160 // Main dictionary is unavailable. Since we cannot check it, we cannot tell if a 161 // word is out-of-vocabulary or not. Therefore, we must judge the entire buffer 162 // contents to potentially pose a privacy risk. 163 return PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE; 164 } 165 166 // Check each word in the buffer. If any word poses a privacy threat, we cannot upload 167 // the complete buffer contents in detail. 168 int numWordsInLogUnitList = 0; 169 final int length = logUnits.size(); 170 for (final LogUnit logUnit : logUnits) { 171 if (!logUnit.hasOneOrMoreWords()) { 172 // Digits outside words are a privacy threat. 173 if (logUnit.mayContainDigit()) { 174 return PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT; 175 } 176 } else { 177 numWordsInLogUnitList += logUnit.getNumWords(); 178 final String[] words = logUnit.getWordsAsStringArray(); 179 for (final String word : words) { 180 // Words not in the dictionary are a privacy threat. 181 if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) { 182 if (DEBUG) { 183 Log.d(TAG, "\"" + word + "\" NOT SAFE!: hasLetters: " 184 + ResearchLogger.hasLetters(word) 185 + ", isValid: " + (dictionary.isValidWord(word))); 186 } 187 return PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY; 188 } 189 } 190 } 191 } 192 193 // Finally, only return true if the ngram is the right size. 194 if (numWordsInLogUnitList == nGramSize) { 195 return PUBLISHABILITY_PUBLISHABLE; 196 } else { 197 return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT; 198 } 199 } 200 201 public void shiftAndPublishAll() throws IOException { 202 final LinkedList<LogUnit> logUnits = getLogUnits(); 203 while (!logUnits.isEmpty()) { 204 publishLogUnitsAtFrontOfBuffer(); 205 } 206 } 207 208 @Override 209 protected final void onBufferFull() { 210 try { 211 publishLogUnitsAtFrontOfBuffer(); 212 } catch (final IOException e) { 213 if (DEBUG) { 214 Log.w(TAG, "IOException when publishing front of LogBuffer", e); 215 } 216 } 217 } 218 219 /** 220 * If there is a safe n-gram at the front of this log buffer, publish it with all details, and 221 * remove the LogUnits that constitute it. 222 * 223 * An n-gram might not be "safe" if it violates privacy controls. E.g., it might contain 224 * numbers, an out-of-vocabulary word, or another n-gram may have been published recently. If 225 * there is no safe n-gram, then the LogUnits up through the first word-containing LogUnit are 226 * published, but without disclosing any privacy-related details, such as the word the LogUnit 227 * generated, motion data, etc. 228 * 229 * Note that a LogUnit can hold more than one word if the user types without explicit spaces. 230 * In this case, the words may be grouped together in such a way that pulling an n-gram off the 231 * front would require splitting a LogUnit. Splitting a LogUnit is not possible, so this case 232 * is treated just as the unsafe n-gram case. This may cause n-grams to be sampled at slightly 233 * less than the target frequency. 234 */ 235 protected final void publishLogUnitsAtFrontOfBuffer() throws IOException { 236 // TODO: Refactor this method to require fewer passes through the LogUnits. Should really 237 // require only one pass. 238 ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE); 239 final int publishabilityResultCode = getPublishabilityResultCode(logUnits, N_GRAM_SIZE); 240 ResearchLogger.recordPublishabilityResultCode(publishabilityResultCode); 241 if (publishabilityResultCode == MainLogBuffer.PUBLISHABILITY_PUBLISHABLE) { 242 // Good n-gram at the front of the buffer. Publish it, disclosing details. 243 publish(logUnits, true /* canIncludePrivateData */); 244 shiftOutWords(N_GRAM_SIZE); 245 mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams; 246 return; 247 } 248 // No good n-gram at front, and buffer is full. Shift out up through the first logUnit 249 // with associated words (or if there is none, all the existing logUnits). 250 logUnits.clear(); 251 LogUnit logUnit = shiftOut(); 252 while (logUnit != null) { 253 logUnits.add(logUnit); 254 final int numWords = logUnit.getNumWords(); 255 if (numWords > 0) { 256 mNumWordsUntilSafeToSample = Math.max(0, mNumWordsUntilSafeToSample - numWords); 257 break; 258 } 259 logUnit = shiftOut(); 260 } 261 publish(logUnits, false /* canIncludePrivateData */); 262 } 263 264 /** 265 * Called when a list of logUnits should be published. 266 * 267 * It is the subclass's responsibility to implement the publication. 268 * 269 * @param logUnits The list of logUnits to be published. 270 * @param canIncludePrivateData Whether the private data in the logUnits can be included in 271 * publication. 272 * 273 * @throws IOException if publication to the log file is not possible 274 */ 275 protected abstract void publish(final ArrayList<LogUnit> logUnits, 276 final boolean canIncludePrivateData) throws IOException; 277 278 @Override 279 protected int shiftOutWords(final int numWords) { 280 final int numWordsShiftedOut = super.shiftOutWords(numWords); 281 mNumWordsUntilSafeToSample = Math.max(0, mNumWordsUntilSafeToSample - numWordsShiftedOut); 282 if (DEBUG) { 283 Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample); 284 } 285 return numWordsShiftedOut; 286 } 287 } 288