/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.research;

import android.util.Log;

import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.Dictionary;
import com.android.inputmethod.latin.Suggest;
import com.android.inputmethod.latin.define.ProductionFlag;

import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;

/**
 * MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees.
 *
 * There are three forms of privacy protection: 1) only words in the main dictionary are allowed to
 * be logged in enough detail to determine their contents, 2) only a subset of words are logged
 * in detail, such as 10%, and 3) no numbers are logged.
 *
 * This class maintains a list of LogUnits, each corresponding to a word. As the user completes
 * words, they are added here. But if the user backs up over their current word to edit a word
 * entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of
 * the LogUnit, and it is pushed back in here when the user is done. Because words may be pulled
 * back out even after they are pushed in, we must not publish the contents of this LogBuffer too
 * quickly. However, we cannot let the contents pile up either, or it will limit the editing that
 * a user can perform.
 *
 * To balance these requirements (keep history so user can edit, flush history so it does not pile
 * up), the LogBuffer is considered "complete" when the user has entered enough words to form an
 * n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above).
 * Once complete, the n-gram may be published to flash storage (via the ResearchLog class).
 * However, the additional non-detailed words are retained, in case the user backspaces to edit
 * them. The MainLogBuffer then continues to add words, publishing individual non-detailed words
 * as new words arrive. After enough non-detailed words have been pushed out to account for the
 * 90% between words, the words at the front of the LogBuffer can be published as an n-gram again.
 *
 * If the words that would form the valid n-gram are not in the dictionary, then words are pushed
 * through the LogBuffer one at a time until an n-gram is found that is entirely composed of
 * dictionary words.
 *
 * If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded
 * n-gram containing dictionary words.
 */
public abstract class MainLogBuffer extends FixedLogBuffer {
    private static final String TAG = MainLogBuffer.class.getSimpleName();
    private static final boolean DEBUG = false
            && ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG;

    // The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams.
    public static final int N_GRAM_SIZE = 2;

    // TODO: Remove dependence on Suggest, and pass in Dictionary as a parameter to an appropriate
    // method.
    private final Suggest mSuggest;
    @UsedForTesting
    private Dictionary mDictionaryForTesting;
    private boolean mIsStopping = false;

    // Number of words that must pass between two sampled n-grams.
    /* package for test */ int mNumWordsBetweenNGrams;

    // Counter for words left to suppress before an n-gram can be sampled. Reset to
    // mNumWordsBetweenNGrams after a sample is taken.
    /* package for test */ int mNumWordsUntilSafeToSample;

    /**
     * Creates a buffer sized to {@code N_GRAM_SIZE + wordsBetweenSamples}.
     *
     * @param wordsBetweenSamples the number of words to let pass between two sampled n-grams
     * @param numInitialWordsToIgnore the initial value of the countdown that must reach zero
     *        before the first n-gram may be sampled; replaced by 0 when DEBUG is on
     * @param suggest the source of the main dictionary; may be null, in which case no dictionary
     *        is available unless one is injected for testing
     */
    public MainLogBuffer(final int wordsBetweenSamples, final int numInitialWordsToIgnore,
            final Suggest suggest) {
        super(N_GRAM_SIZE + wordsBetweenSamples);
        mNumWordsBetweenNGrams = wordsBetweenSamples;
        mNumWordsUntilSafeToSample = DEBUG ? 0 : numInitialWordsToIgnore;
        mSuggest = suggest;
    }

    /**
     * Installs a dictionary that takes precedence over the one supplied by Suggest.
     *
     * @param dictionary the dictionary to use for validity checks
     */
    @UsedForTesting
    /* package for test */ void setDictionaryForTesting(final Dictionary dictionary) {
        mDictionaryForTesting = dictionary;
    }

    /**
     * Returns the dictionary used to decide whether a word may be logged in detail.
     *
     * @return the testing dictionary if one was set; otherwise the main dictionary of Suggest, or
     *         null if Suggest is null or has no main dictionary
     */
    private Dictionary getDictionary() {
        if (mDictionaryForTesting != null) {
            return mDictionaryForTesting;
        }
        if (mSuggest == null || !mSuggest.hasMainDictionary()) return null;
        return mSuggest.getMainDictionary();
    }

    /**
     * Marks this buffer as stopping.
     *
     * Within this class the flag is only consulted by isSafeNGram() while
     * ResearchLogger.IS_LOGGING_EVERYTHING is set, where it lets n-grams of any length through.
     */
    public void setIsStopping() {
        mIsStopping = true;
    }

    /**
     * Determines whether uploading the n words at the front of the MainLogBuffer will not violate
     * user privacy.
     *
     * The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
     * non-character data that is typed between words. The decision about privacy is made based on
     * the buffer's entire content. If it is decided that the privacy risks are too great to upload
     * the contents of this buffer, a censored version of the LogItems may still be uploaded. E.g.,
     * the screen orientation and other characteristics about the device can be uploaded without
     * revealing much about the user.
     *
     * @param logUnits the LogUnits at the front of the buffer that are candidates for publication
     * @param minNGramSize the number of words the candidate n-gram must contain
     * @return true if the LogUnits may be published with private data included
     */
    private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) {
        // Bypass privacy checks when debugging.
        if (ResearchLogger.IS_LOGGING_EVERYTHING) {
            if (mIsStopping) {
                return true;
            }
            // Only check that it is the right length. If not, wait for later words to make
            // complete n-grams.
            int numWordsInLogUnitList = 0;
            for (final LogUnit logUnit : logUnits) {
                numWordsInLogUnitList += logUnit.getNumWords();
            }
            return numWordsInLogUnitList >= minNGramSize;
        }

        // Check that we are not sampling too frequently. Having sampled recently might disclose
        // too much of the user's intended meaning.
        if (mNumWordsUntilSafeToSample > 0) {
            return false;
        }
        // Reload the dictionary in case it has changed (e.g., because the user has changed
        // languages).
        final Dictionary dictionary = getDictionary();
        if (dictionary == null) {
            // Main dictionary is unavailable. Since we cannot check it, we cannot tell if a
            // word is out-of-vocabulary or not. Therefore, we must judge the entire buffer
            // contents to potentially pose a privacy risk.
            return false;
        }

        // Check each word in the buffer. If any word poses a privacy threat, we cannot upload
        // the complete buffer contents in detail.
        int numWordsInLogUnitList = 0;
        for (final LogUnit logUnit : logUnits) {
            if (!logUnit.hasOneOrMoreWords()) {
                // Digits outside words are a privacy threat.
                if (logUnit.mayContainDigit()) {
                    return false;
                }
            } else {
                numWordsInLogUnitList += logUnit.getNumWords();
                final String[] words = logUnit.getWordsAsStringArray();
                for (final String word : words) {
                    // Words not in the dictionary are a privacy threat.
                    if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
                        if (DEBUG) {
                            Log.d(TAG, "\"" + word + "\" NOT SAFE!: hasLetters: "
                                    + ResearchLogger.hasLetters(word)
                                    + ", isValid: " + (dictionary.isValidWord(word)));
                        }
                        return false;
                    }
                }
            }
        }

        // Finally, only return true if the ngram is the right size.
        return numWordsInLogUnitList == minNGramSize;
    }

    /**
     * Publishes from the front of the buffer repeatedly until no LogUnits remain.
     *
     * @throws IOException if publication to the log file is not possible
     */
    public void shiftAndPublishAll() throws IOException {
        final LinkedList<LogUnit> logUnits = getLogUnits();
        while (!logUnits.isEmpty()) {
            publishLogUnitsAtFrontOfBuffer();
        }
    }

    /**
     * Makes room in a full buffer by publishing whatever is at its front.
     */
    @Override
    protected final void onBufferFull() {
        try {
            publishLogUnitsAtFrontOfBuffer();
        } catch (final IOException e) {
            // This override declares no checked exceptions, so the failure is handled here. It is
            // only reported when DEBUG is on; otherwise it is dropped.
            if (DEBUG) {
                Log.w(TAG, "IOException when publishing front of LogBuffer", e);
            }
        }
    }

    /**
     * Publishes the LogUnits at the front of the buffer and removes them from it.
     *
     * If the first N_GRAM_SIZE words form a safe n-gram, they are published with private data and
     * the sampling countdown is restarted. Otherwise LogUnits are shifted out up through the first
     * one that carries words (or all of them if none does) and published without private data.
     *
     * @throws IOException if publication to the log file is not possible
     */
    protected final void publishLogUnitsAtFrontOfBuffer() throws IOException {
        // TODO: Refactor this method to require fewer passes through the LogUnits. Should really
        // require only one pass.
        ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE);
        if (isSafeNGram(logUnits, N_GRAM_SIZE)) {
            // Good n-gram at the front of the buffer. Publish it, disclosing details.
            publish(logUnits, true /* canIncludePrivateData */);
            shiftOutWords(N_GRAM_SIZE);
            // Must follow shiftOutWords(), which itself decrements the countdown.
            mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams;
            return;
        }
        // No good n-gram at front, and buffer is full. Shift out up through the first logUnit
        // with associated words (or if there is none, all the existing logUnits).
        logUnits.clear();
        LogUnit logUnit = shiftOut();
        while (logUnit != null) {
            logUnits.add(logUnit);
            final int numWords = logUnit.getNumWords();
            if (numWords > 0) {
                // The words just shifted out count toward the gap between sampled n-grams.
                mNumWordsUntilSafeToSample = Math.max(0, mNumWordsUntilSafeToSample - numWords);
                break;
            }
            logUnit = shiftOut();
        }
        publish(logUnits, false /* canIncludePrivateData */);
    }

    /**
     * Called when a list of logUnits should be published.
     *
     * It is the subclass's responsibility to implement the publication.
     *
     * @param logUnits The list of logUnits to be published.
     * @param canIncludePrivateData Whether the private data in the logUnits can be included in
     * publication.
     *
     * @throws IOException if publication to the log file is not possible
     */
    protected abstract void publish(final ArrayList<LogUnit> logUnits,
            final boolean canIncludePrivateData) throws IOException;

    /**
     * Shifts words out via the superclass and counts them against the sampling countdown.
     *
     * @param numWords the number of words requested to be shifted out
     * @return the number of words reported as shifted out by the superclass
     */
    @Override
    protected int shiftOutWords(final int numWords) {
        final int numWordsShiftedOut = super.shiftOutWords(numWords);
        // Never let the countdown go negative.
        mNumWordsUntilSafeToSample = Math.max(0, mNumWordsUntilSafeToSample - numWordsShiftedOut);
        if (DEBUG) {
            Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample);
        }
        return numWordsShiftedOut;
    }
}