1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin; 18 19 import android.test.AndroidTestCase; 20 import android.test.suitebuilder.annotation.LargeTest; 21 import android.text.TextUtils; 22 import android.util.Pair; 23 24 import com.android.inputmethod.latin.NgramContext.WordInfo; 25 import com.android.inputmethod.latin.common.CodePointUtils; 26 import com.android.inputmethod.latin.common.FileUtils; 27 import com.android.inputmethod.latin.makedict.DictionaryHeader; 28 import com.android.inputmethod.latin.makedict.FormatSpec; 29 import com.android.inputmethod.latin.makedict.WeightedString; 30 import com.android.inputmethod.latin.makedict.WordProperty; 31 import com.android.inputmethod.latin.utils.BinaryDictionaryUtils; 32 33 import java.io.File; 34 import java.io.IOException; 35 import java.util.ArrayList; 36 import java.util.HashMap; 37 import java.util.HashSet; 38 import java.util.Locale; 39 import java.util.Random; 40 41 @LargeTest 42 public class BinaryDictionaryTests extends AndroidTestCase { 43 private static final String TEST_DICT_FILE_EXTENSION = ".testDict"; 44 private static final String TEST_LOCALE = "test"; 45 private static final String DICTIONARY_ID = "TestBinaryDictionary"; 46 47 private HashSet<File> mDictFilesToBeDeleted = new HashSet<>(); 48 49 @Override 50 protected void setUp() throws Exception { 51 super.setUp(); 52 mDictFilesToBeDeleted.clear(); 53 } 54 55 @Override 56 protected void tearDown() throws Exception { 57 for (final File dictFile : mDictFilesToBeDeleted) { 58 dictFile.delete(); 59 } 60 mDictFilesToBeDeleted.clear(); 61 super.tearDown(); 62 } 63 64 private File createEmptyDictionaryAndGetFile(final int formatVersion) { 65 return createEmptyDictionaryWithAttributesAndGetFile(formatVersion, 66 new HashMap<String, String>()); 67 } 68 69 private File createEmptyDictionaryWithAttributesAndGetFile(final int formatVersion, 70 final HashMap<String, String> attributeMap) { 71 try { 72 final File dictFile = createEmptyVer4DictionaryAndGetFile(formatVersion, 73 attributeMap); 74 mDictFilesToBeDeleted.add(dictFile); 75 return dictFile; 76 } catch (final IOException e) { 77 fail(e.toString()); 78 } 79 return null; 80 } 81 82 private File createEmptyVer4DictionaryAndGetFile(final int formatVersion, 83 final HashMap<String, String> attributeMap) throws IOException { 84 final File file = File.createTempFile(DICTIONARY_ID, TEST_DICT_FILE_EXTENSION, 85 getContext().getCacheDir()); 86 file.delete(); 87 file.mkdir(); 88 if (BinaryDictionaryUtils.createEmptyDictFile(file.getAbsolutePath(), formatVersion, 89 Locale.ENGLISH, attributeMap)) { 90 return file; 91 } 92 throw new IOException("Empty dictionary " + file.getAbsolutePath() 93 + " cannot be created. Format version: " + formatVersion); 94 } 95 96 private static BinaryDictionary getBinaryDictionary(final File dictFile) { 97 return new BinaryDictionary(dictFile.getAbsolutePath(), 98 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 99 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 100 } 101 102 private BinaryDictionary getEmptyBinaryDictionary(final int formatVersion) { 103 final File dictFile = createEmptyDictionaryAndGetFile(formatVersion); 104 return new BinaryDictionary(dictFile.getAbsolutePath(), 105 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 106 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 107 } 108 109 public void testIsValidDictionary() { 110 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 111 BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 112 assertTrue("binaryDictionary must be valid for existing valid dictionary file.", 113 binaryDictionary.isValidDictionary()); 114 binaryDictionary.close(); 115 assertFalse("binaryDictionary must be invalid after closing.", 116 binaryDictionary.isValidDictionary()); 117 FileUtils.deleteRecursively(dictFile); 118 binaryDictionary = getBinaryDictionary(dictFile); 119 assertFalse("binaryDictionary must be invalid for not existing dictionary file.", 120 binaryDictionary.isValidDictionary()); 121 binaryDictionary.close(); 122 } 123 124 public void testConstructingDictionaryOnMemory() { 125 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 126 FileUtils.deleteRecursively(dictFile); 127 assertFalse(dictFile.exists()); 128 final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 129 true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE, 130 FormatSpec.VERSION403, new HashMap<String, String>()); 131 assertTrue(binaryDictionary.isValidDictionary()); 132 assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion()); 133 final int probability = 100; 134 addUnigramWord(binaryDictionary, "word", probability); 135 assertEquals(probability, binaryDictionary.getFrequency("word")); 136 assertFalse(dictFile.exists()); 137 binaryDictionary.flush(); 138 assertTrue(dictFile.exists()); 139 assertTrue(binaryDictionary.isValidDictionary()); 140 assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion()); 141 assertEquals(probability, binaryDictionary.getFrequency("word")); 142 binaryDictionary.close(); 143 } 144 145 public void testAddTooLongWord() { 146 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 147 final StringBuffer stringBuilder = new StringBuffer(); 148 for (int i = 0; i < BinaryDictionary.DICTIONARY_MAX_WORD_LENGTH; i++) { 149 stringBuilder.append('a'); 150 } 151 final String validLongWord = stringBuilder.toString(); 152 stringBuilder.append('a'); 153 final String invalidLongWord = stringBuilder.toString(); 154 final int probability = 100; 155 addUnigramWord(binaryDictionary, "aaa", probability); 156 addUnigramWord(binaryDictionary, validLongWord, probability); 157 addUnigramWord(binaryDictionary, invalidLongWord, probability); 158 // Too long short cut. 159 binaryDictionary.addUnigramEntry("a", probability, false /* isBeginningOfSentence */, 160 false /* isNotAWord */, false /* isPossiblyOffensive */, 161 BinaryDictionary.NOT_A_VALID_TIMESTAMP); 162 addUnigramWord(binaryDictionary, "abc", probability); 163 final int updatedProbability = 200; 164 // Update. 165 addUnigramWord(binaryDictionary, validLongWord, updatedProbability); 166 addUnigramWord(binaryDictionary, invalidLongWord, updatedProbability); 167 addUnigramWord(binaryDictionary, "abc", updatedProbability); 168 169 assertEquals(probability, binaryDictionary.getFrequency("aaa")); 170 assertEquals(updatedProbability, binaryDictionary.getFrequency(validLongWord)); 171 assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency(invalidLongWord)); 172 assertEquals(updatedProbability, binaryDictionary.getFrequency("abc")); 173 } 174 175 private static void addUnigramWord(final BinaryDictionary binaryDictionary, final String word, 176 final int probability) { 177 binaryDictionary.addUnigramEntry(word, probability, 178 false /* isBeginningOfSentence */, false /* isNotAWord */, 179 false /* isPossiblyOffensive */, 180 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 181 } 182 183 private static void addBigramWords(final BinaryDictionary binaryDictionary, final String word0, 184 final String word1, final int probability) { 185 binaryDictionary.addNgramEntry(new NgramContext(new WordInfo(word0)), word1, probability, 186 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 187 } 188 189 private static void addTrigramEntry(final BinaryDictionary binaryDictionary, final String word0, 190 final String word1, final String word2, final int probability) { 191 binaryDictionary.addNgramEntry( 192 new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2, 193 probability, BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 194 } 195 196 private static boolean isValidBigram(final BinaryDictionary binaryDictionary, 197 final String word0, final String word1) { 198 return binaryDictionary.isValidNgram(new NgramContext(new WordInfo(word0)), word1); 199 } 200 201 private static int getBigramProbability(final BinaryDictionary binaryDictionary, 202 final String word0, final String word1) { 203 return binaryDictionary.getNgramProbability(new NgramContext(new WordInfo(word0)), word1); 204 } 205 206 private static int getTrigramProbability(final BinaryDictionary binaryDictionary, 207 final String word0, final String word1, final String word2) { 208 return binaryDictionary.getNgramProbability( 209 new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2); 210 } 211 212 public void testAddUnigramWord() { 213 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 214 final int probability = 100; 215 addUnigramWord(binaryDictionary, "aaa", probability); 216 // Reallocate and create. 217 addUnigramWord(binaryDictionary, "aab", probability); 218 // Insert into children. 219 addUnigramWord(binaryDictionary, "aac", probability); 220 // Make terminal. 221 addUnigramWord(binaryDictionary, "aa", probability); 222 // Create children. 223 addUnigramWord(binaryDictionary, "aaaa", probability); 224 // Reallocate and make termianl. 225 addUnigramWord(binaryDictionary, "a", probability); 226 227 final int updatedProbability = 200; 228 // Update. 229 addUnigramWord(binaryDictionary, "aaa", updatedProbability); 230 231 assertEquals(probability, binaryDictionary.getFrequency("aab")); 232 assertEquals(probability, binaryDictionary.getFrequency("aac")); 233 assertEquals(probability, binaryDictionary.getFrequency("aa")); 234 assertEquals(probability, binaryDictionary.getFrequency("aaaa")); 235 assertEquals(probability, binaryDictionary.getFrequency("a")); 236 assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa")); 237 } 238 239 public void testRandomlyAddUnigramWord() { 240 final int wordCount = 1000; 241 final int codePointSetSize = 50; 242 final long seed = System.currentTimeMillis(); 243 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 244 245 final HashMap<String, Integer> probabilityMap = new HashMap<>(); 246 // Test a word that isn't contained within the dictionary. 247 final Random random = new Random(seed); 248 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 249 for (int i = 0; i < wordCount; ++i) { 250 final String word = CodePointUtils.generateWord(random, codePointSet); 251 probabilityMap.put(word, random.nextInt(0xFF)); 252 } 253 for (String word : probabilityMap.keySet()) { 254 addUnigramWord(binaryDictionary, word, probabilityMap.get(word)); 255 } 256 for (String word : probabilityMap.keySet()) { 257 assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word)); 258 } 259 } 260 261 public void testAddBigramWords() { 262 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 263 264 final int unigramProbability = 100; 265 final int bigramProbability = 150; 266 final int updatedBigramProbability = 200; 267 addUnigramWord(binaryDictionary, "aaa", unigramProbability); 268 addUnigramWord(binaryDictionary, "abb", unigramProbability); 269 addUnigramWord(binaryDictionary, "bcc", unigramProbability); 270 addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); 271 addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability); 272 addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability); 273 addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability); 274 275 assertTrue(isValidBigram(binaryDictionary, "aaa", "abb")); 276 assertTrue(isValidBigram(binaryDictionary, "aaa", "bcc")); 277 assertTrue(isValidBigram(binaryDictionary, "abb", "aaa")); 278 assertTrue(isValidBigram(binaryDictionary, "abb", "bcc")); 279 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb")); 280 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc")); 281 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa")); 282 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc")); 283 284 addBigramWords(binaryDictionary, "aaa", "abb", updatedBigramProbability); 285 assertEquals(updatedBigramProbability, 286 getBigramProbability(binaryDictionary, "aaa", "abb")); 287 288 assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa")); 289 assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc")); 290 assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa")); 291 assertEquals(Dictionary.NOT_A_PROBABILITY, 292 getBigramProbability(binaryDictionary, "bcc", "aaa")); 293 assertEquals(Dictionary.NOT_A_PROBABILITY, 294 getBigramProbability(binaryDictionary, "bcc", "bbc")); 295 assertEquals(Dictionary.NOT_A_PROBABILITY, 296 getBigramProbability(binaryDictionary, "aaa", "aaa")); 297 298 // Testing bigram link. 299 addUnigramWord(binaryDictionary, "abcde", unigramProbability); 300 addUnigramWord(binaryDictionary, "fghij", unigramProbability); 301 addBigramWords(binaryDictionary, "abcde", "fghij", bigramProbability); 302 addUnigramWord(binaryDictionary, "fgh", unigramProbability); 303 addUnigramWord(binaryDictionary, "abc", unigramProbability); 304 addUnigramWord(binaryDictionary, "f", unigramProbability); 305 306 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abcde", "fghij")); 307 assertEquals(Dictionary.NOT_A_PROBABILITY, 308 getBigramProbability(binaryDictionary, "abcde", "fgh")); 309 addBigramWords(binaryDictionary, "abcde", "fghij", updatedBigramProbability); 310 assertEquals(updatedBigramProbability, 311 getBigramProbability(binaryDictionary, "abcde", "fghij")); 312 } 313 314 public void testRandomlyAddBigramWords() { 315 final int wordCount = 100; 316 final int bigramCount = 1000; 317 final int codePointSetSize = 50; 318 final long seed = System.currentTimeMillis(); 319 final Random random = new Random(seed); 320 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 321 322 final ArrayList<String> words = new ArrayList<>(); 323 final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>(); 324 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 325 final HashMap<String, Integer> unigramProbabilities = new HashMap<>(); 326 final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>(); 327 328 for (int i = 0; i < wordCount; ++i) { 329 final String word = CodePointUtils.generateWord(random, codePointSet); 330 words.add(word); 331 final int unigramProbability = random.nextInt(0xFF); 332 unigramProbabilities.put(word, unigramProbability); 333 addUnigramWord(binaryDictionary, word, unigramProbability); 334 } 335 336 for (int i = 0; i < bigramCount; i++) { 337 final String word0 = words.get(random.nextInt(wordCount)); 338 final String word1 = words.get(random.nextInt(wordCount)); 339 if (TextUtils.equals(word0, word1)) { 340 continue; 341 } 342 final Pair<String, String> bigram = new Pair<>(word0, word1); 343 bigramWords.add(bigram); 344 final int unigramProbability = unigramProbabilities.get(word1); 345 final int bigramProbability = 346 unigramProbability + random.nextInt(0xFF - unigramProbability); 347 bigramProbabilities.put(bigram, bigramProbability); 348 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 349 } 350 351 for (final Pair<String, String> bigram : bigramWords) { 352 final int bigramProbability = bigramProbabilities.get(bigram); 353 assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY, 354 isValidBigram(binaryDictionary, bigram.first, bigram.second)); 355 assertEquals(bigramProbability, 356 getBigramProbability(binaryDictionary, bigram.first, bigram.second)); 357 } 358 } 359 360 public void testAddTrigramWords() { 361 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 362 final int unigramProbability = 100; 363 final int trigramProbability = 150; 364 final int updatedTrigramProbability = 200; 365 addUnigramWord(binaryDictionary, "aaa", unigramProbability); 366 addUnigramWord(binaryDictionary, "abb", unigramProbability); 367 addUnigramWord(binaryDictionary, "bcc", unigramProbability); 368 369 addBigramWords(binaryDictionary, "abb", "bcc", 10); 370 addBigramWords(binaryDictionary, "abb", "aaa", 10); 371 372 addTrigramEntry(binaryDictionary, "aaa", "abb", "bcc", trigramProbability); 373 addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", trigramProbability); 374 375 assertEquals(trigramProbability, 376 getTrigramProbability(binaryDictionary, "aaa", "abb", "bcc")); 377 assertEquals(trigramProbability, 378 getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa")); 379 assertFalse(isValidBigram(binaryDictionary, "aaa", "abb")); 380 381 addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", updatedTrigramProbability); 382 assertEquals(updatedTrigramProbability, 383 getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa")); 384 } 385 386 public void testFlushDictionary() { 387 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 388 BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 389 390 final int probability = 100; 391 addUnigramWord(binaryDictionary, "aaa", probability); 392 addUnigramWord(binaryDictionary, "abcd", probability); 393 // Close without flushing. 394 binaryDictionary.close(); 395 396 binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 397 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 398 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); 399 400 assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("aaa")); 401 assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("abcd")); 402 403 addUnigramWord(binaryDictionary, "aaa", probability); 404 addUnigramWord(binaryDictionary, "abcd", probability); 405 binaryDictionary.flush(); 406 binaryDictionary.close(); 407 408 binaryDictionary = getBinaryDictionary(dictFile); 409 assertEquals(probability, binaryDictionary.getFrequency("aaa")); 410 assertEquals(probability, binaryDictionary.getFrequency("abcd")); 411 addUnigramWord(binaryDictionary, "bcde", probability); 412 binaryDictionary.flush(); 413 binaryDictionary.close(); 414 415 binaryDictionary = getBinaryDictionary(dictFile); 416 assertEquals(probability, binaryDictionary.getFrequency("bcde")); 417 binaryDictionary.close(); 418 } 419 420 public void testFlushWithGCDictionary() { 421 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 422 BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 423 final int unigramProbability = 100; 424 final int bigramProbability = 150; 425 addUnigramWord(binaryDictionary, "aaa", unigramProbability); 426 addUnigramWord(binaryDictionary, "abb", unigramProbability); 427 addUnigramWord(binaryDictionary, "bcc", unigramProbability); 428 addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability); 429 addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability); 430 addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability); 431 addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability); 432 binaryDictionary.flushWithGC(); 433 binaryDictionary.close(); 434 435 binaryDictionary = getBinaryDictionary(dictFile); 436 assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa")); 437 assertEquals(unigramProbability, binaryDictionary.getFrequency("abb")); 438 assertEquals(unigramProbability, binaryDictionary.getFrequency("bcc")); 439 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb")); 440 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc")); 441 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa")); 442 assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc")); 443 assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa")); 444 assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc")); 445 assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa")); 446 binaryDictionary.flushWithGC(); 447 binaryDictionary.close(); 448 } 449 450 public void testAddBigramWordsAndFlashWithGC() { 451 final int wordCount = 100; 452 final int bigramCount = 1000; 453 final int codePointSetSize = 30; 454 final long seed = System.currentTimeMillis(); 455 final Random random = new Random(seed); 456 457 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 458 BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 459 460 final ArrayList<String> words = new ArrayList<>(); 461 final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>(); 462 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 463 final HashMap<String, Integer> unigramProbabilities = new HashMap<>(); 464 final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>(); 465 466 for (int i = 0; i < wordCount; ++i) { 467 final String word = CodePointUtils.generateWord(random, codePointSet); 468 words.add(word); 469 final int unigramProbability = random.nextInt(0xFF); 470 unigramProbabilities.put(word, unigramProbability); 471 addUnigramWord(binaryDictionary, word, unigramProbability); 472 } 473 474 for (int i = 0; i < bigramCount; i++) { 475 final String word0 = words.get(random.nextInt(wordCount)); 476 final String word1 = words.get(random.nextInt(wordCount)); 477 if (TextUtils.equals(word0, word1)) { 478 continue; 479 } 480 final Pair<String, String> bigram = new Pair<>(word0, word1); 481 bigramWords.add(bigram); 482 final int unigramProbability = unigramProbabilities.get(word1); 483 final int bigramProbability = 484 unigramProbability + random.nextInt(0xFF - unigramProbability); 485 bigramProbabilities.put(bigram, bigramProbability); 486 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 487 } 488 489 binaryDictionary.flushWithGC(); 490 binaryDictionary.close(); 491 binaryDictionary = getBinaryDictionary(dictFile); 492 493 for (final Pair<String, String> bigram : bigramWords) { 494 final int bigramProbability = bigramProbabilities.get(bigram); 495 assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY, 496 isValidBigram(binaryDictionary, bigram.first, bigram.second)); 497 assertEquals(bigramProbability, 498 getBigramProbability(binaryDictionary, bigram.first, bigram.second)); 499 } 500 } 501 502 public void testRandomOperationsAndFlashWithGC() { 503 final int maxUnigramCount = 5000; 504 final int maxBigramCount = 10000; 505 final HashMap<String, String> attributeMap = new HashMap<>(); 506 attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount)); 507 attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount)); 508 509 final int flashWithGCIterationCount = 50; 510 final int operationCountInEachIteration = 200; 511 final int initialUnigramCount = 100; 512 final float addUnigramProb = 0.5f; 513 final float addBigramProb = 0.8f; 514 final int codePointSetSize = 30; 515 516 final long seed = System.currentTimeMillis(); 517 final Random random = new Random(seed); 518 final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403, 519 attributeMap); 520 BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 521 522 final ArrayList<String> words = new ArrayList<>(); 523 final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>(); 524 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 525 final HashMap<String, Integer> unigramProbabilities = new HashMap<>(); 526 final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>(); 527 for (int i = 0; i < initialUnigramCount; ++i) { 528 final String word = CodePointUtils.generateWord(random, codePointSet); 529 words.add(word); 530 final int unigramProbability = random.nextInt(0xFF); 531 unigramProbabilities.put(word, unigramProbability); 532 addUnigramWord(binaryDictionary, word, unigramProbability); 533 } 534 binaryDictionary.flushWithGC(); 535 binaryDictionary.close(); 536 537 for (int gcCount = 0; gcCount < flashWithGCIterationCount; gcCount++) { 538 binaryDictionary = getBinaryDictionary(dictFile); 539 for (int opCount = 0; opCount < operationCountInEachIteration; opCount++) { 540 // Add unigram. 541 if (random.nextFloat() < addUnigramProb) { 542 final String word = CodePointUtils.generateWord(random, codePointSet); 543 words.add(word); 544 final int unigramProbability = random.nextInt(0xFF); 545 unigramProbabilities.put(word, unigramProbability); 546 addUnigramWord(binaryDictionary, word, unigramProbability); 547 } 548 // Add bigram. 549 if (random.nextFloat() < addBigramProb && words.size() > 2) { 550 final int word0Index = random.nextInt(words.size()); 551 int word1Index = random.nextInt(words.size() - 1); 552 if (word0Index <= word1Index) { 553 word1Index++; 554 } 555 final String word0 = words.get(word0Index); 556 final String word1 = words.get(word1Index); 557 if (TextUtils.equals(word0, word1)) { 558 continue; 559 } 560 final int unigramProbability = unigramProbabilities.get(word1); 561 final int bigramProbability = 562 unigramProbability + random.nextInt(0xFF - unigramProbability); 563 final Pair<String, String> bigram = new Pair<>(word0, word1); 564 bigramWords.add(bigram); 565 bigramProbabilities.put(bigram, bigramProbability); 566 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 567 } 568 } 569 570 // Test whether the all unigram operations are collectlly handled. 571 for (int i = 0; i < words.size(); i++) { 572 final String word = words.get(i); 573 final int unigramProbability = unigramProbabilities.get(word); 574 assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word)); 575 } 576 // Test whether the all bigram operations are collectlly handled. 577 for (int i = 0; i < bigramWords.size(); i++) { 578 final Pair<String, String> bigram = bigramWords.get(i); 579 final int probability; 580 if (bigramProbabilities.containsKey(bigram)) { 581 probability = bigramProbabilities.get(bigram); 582 } else { 583 probability = Dictionary.NOT_A_PROBABILITY; 584 } 585 586 assertEquals(probability, 587 getBigramProbability(binaryDictionary, bigram.first, bigram.second)); 588 assertEquals(probability != Dictionary.NOT_A_PROBABILITY, 589 isValidBigram(binaryDictionary, bigram.first, bigram.second)); 590 } 591 binaryDictionary.flushWithGC(); 592 binaryDictionary.close(); 593 } 594 } 595 596 public void testAddManyUnigramsAndFlushWithGC() { 597 final int flashWithGCIterationCount = 3; 598 final int codePointSetSize = 50; 599 600 final long seed = System.currentTimeMillis(); 601 final Random random = new Random(seed); 602 603 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 604 605 final ArrayList<String> words = new ArrayList<>(); 606 final HashMap<String, Integer> unigramProbabilities = new HashMap<>(); 607 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 608 609 BinaryDictionary binaryDictionary; 610 for (int i = 0; i < flashWithGCIterationCount; i++) { 611 binaryDictionary = getBinaryDictionary(dictFile); 612 while(!binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) { 613 final String word = CodePointUtils.generateWord(random, codePointSet); 614 words.add(word); 615 final int unigramProbability = random.nextInt(0xFF); 616 unigramProbabilities.put(word, unigramProbability); 617 addUnigramWord(binaryDictionary, word, unigramProbability); 618 } 619 620 for (int j = 0; j < words.size(); j++) { 621 final String word = words.get(j); 622 final int unigramProbability = unigramProbabilities.get(word); 623 assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word)); 624 } 625 626 binaryDictionary.flushWithGC(); 627 binaryDictionary.close(); 628 } 629 } 630 631 public void testUnigramAndBigramCount() { 632 final int maxUnigramCount = 5000; 633 final int maxBigramCount = 10000; 634 final HashMap<String, String> attributeMap = new HashMap<>(); 635 attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount)); 636 attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount)); 637 638 final int flashWithGCIterationCount = 10; 639 final int codePointSetSize = 50; 640 final int unigramCountPerIteration = 1000; 641 final int bigramCountPerIteration = 2000; 642 final long seed = System.currentTimeMillis(); 643 final Random random = new Random(seed); 644 final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403, 645 attributeMap); 646 647 final ArrayList<String> words = new ArrayList<>(); 648 final HashSet<Pair<String, String>> bigrams = new HashSet<>(); 649 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 650 651 BinaryDictionary binaryDictionary; 652 for (int i = 0; i < flashWithGCIterationCount; i++) { 653 binaryDictionary = getBinaryDictionary(dictFile); 654 for (int j = 0; j < unigramCountPerIteration; j++) { 655 final String word = CodePointUtils.generateWord(random, codePointSet); 656 words.add(word); 657 final int unigramProbability = random.nextInt(0xFF); 658 addUnigramWord(binaryDictionary, word, unigramProbability); 659 } 660 for (int j = 0; j < bigramCountPerIteration; j++) { 661 final String word0 = words.get(random.nextInt(words.size())); 662 final String word1 = words.get(random.nextInt(words.size())); 663 if (TextUtils.equals(word0, word1)) { 664 continue; 665 } 666 bigrams.add(new Pair<>(word0, word1)); 667 final int bigramProbability = random.nextInt(0xF); 668 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 669 } 670 assertEquals(new HashSet<>(words).size(), Integer.parseInt( 671 binaryDictionary.getPropertyForGettingStats( 672 BinaryDictionary.UNIGRAM_COUNT_QUERY))); 673 assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt( 674 binaryDictionary.getPropertyForGettingStats( 675 BinaryDictionary.BIGRAM_COUNT_QUERY))); 676 binaryDictionary.flushWithGC(); 677 assertEquals(new HashSet<>(words).size(), Integer.parseInt( 678 binaryDictionary.getPropertyForGettingStats( 679 BinaryDictionary.UNIGRAM_COUNT_QUERY))); 680 assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt( 681 binaryDictionary.getPropertyForGettingStats( 682 BinaryDictionary.BIGRAM_COUNT_QUERY))); 683 binaryDictionary.close(); 684 } 685 } 686 687 public void testGetWordProperties() { 688 final long seed = System.currentTimeMillis(); 689 final Random random = new Random(seed); 690 final int UNIGRAM_COUNT = 1000; 691 final int BIGRAM_COUNT = 1000; 692 final int codePointSetSize = 20; 693 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 694 final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403); 695 final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); 696 697 final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord", 698 false /* isBeginningOfSentence */); 699 assertFalse(invalidWordProperty.isValid()); 700 701 final ArrayList<String> words = new ArrayList<>(); 702 final HashMap<String, Integer> wordProbabilities = new HashMap<>(); 703 final HashMap<String, HashSet<String>> bigrams = new HashMap<>(); 704 final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>(); 705 706 for (int i = 0; i < UNIGRAM_COUNT; i++) { 707 final String word = CodePointUtils.generateWord(random, codePointSet); 708 final int unigramProbability = random.nextInt(0xFF); 709 final boolean isNotAWord = random.nextBoolean(); 710 final boolean isPossiblyOffensive = random.nextBoolean(); 711 // TODO: Add tests for historical info. 712 binaryDictionary.addUnigramEntry(word, unigramProbability, 713 false /* isBeginningOfSentence */, isNotAWord, isPossiblyOffensive, 714 BinaryDictionary.NOT_A_VALID_TIMESTAMP); 715 if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { 716 binaryDictionary.flushWithGC(); 717 } 718 words.add(word); 719 wordProbabilities.put(word, unigramProbability); 720 final WordProperty wordProperty = binaryDictionary.getWordProperty(word, 721 false /* isBeginningOfSentence */); 722 assertEquals(word, wordProperty.mWord); 723 assertTrue(wordProperty.isValid()); 724 assertEquals(isNotAWord, wordProperty.mIsNotAWord); 725 assertEquals(isPossiblyOffensive, wordProperty.mIsPossiblyOffensive); 726 assertEquals(false, wordProperty.mHasNgrams); 727 assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability); 728 } 729 730 for (int i = 0; i < BIGRAM_COUNT; i++) { 731 final int word0Index = random.nextInt(wordProbabilities.size()); 732 final int word1Index = random.nextInt(wordProbabilities.size()); 733 if (word0Index == word1Index) { 734 continue; 735 } 736 final String word0 = words.get(word0Index); 737 final String word1 = words.get(word1Index); 738 final int unigramProbability = wordProbabilities.get(word1); 739 final int bigramProbability = 740 unigramProbability + random.nextInt(0xFF - unigramProbability); 741 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 742 if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { 743 binaryDictionary.flushWithGC(); 744 } 745 if (!bigrams.containsKey(word0)) { 746 final HashSet<String> bigramWord1s = new HashSet<>(); 747 bigrams.put(word0, bigramWord1s); 748 } 749 bigrams.get(word0).add(word1); 750 bigramProbabilities.put(new Pair<>(word0, word1), bigramProbability); 751 } 752 753 for (int i = 0; i < words.size(); i++) { 754 final String word0 = words.get(i); 755 if (!bigrams.containsKey(word0)) { 756 continue; 757 } 758 final HashSet<String> bigramWord1s = bigrams.get(word0); 759 final WordProperty wordProperty = binaryDictionary.getWordProperty(word0, 760 false /* isBeginningOfSentence */); 761 assertEquals(bigramWord1s.size(), wordProperty.mNgrams.size()); 762 // TODO: Support ngram. 763 for (final WeightedString bigramTarget : wordProperty.getBigrams()) { 764 final String word1 = bigramTarget.mWord; 765 assertTrue(bigramWord1s.contains(word1)); 766 final int bigramProbability = bigramProbabilities.get(new Pair<>(word0, word1)); 767 assertEquals(bigramProbability, bigramTarget.getProbability()); 768 } 769 } 770 } 771 772 public void testIterateAllWords() { 773 final long seed = System.currentTimeMillis(); 774 final Random random = new Random(seed); 775 final int UNIGRAM_COUNT = 1000; 776 final int BIGRAM_COUNT = 1000; 777 final int codePointSetSize = 20; 778 final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); 779 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 780 781 final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord", 782 false /* isBeginningOfSentence */); 783 assertFalse(invalidWordProperty.isValid()); 784 785 final ArrayList<String> words = new ArrayList<>(); 786 final HashMap<String, Integer> wordProbabilitiesToCheckLater = new HashMap<>(); 787 final HashMap<String, HashSet<String>> bigrams = new HashMap<>(); 788 final HashMap<Pair<String, String>, Integer> bigramProbabilitiesToCheckLater = 789 new HashMap<>(); 790 791 for (int i = 0; i < UNIGRAM_COUNT; i++) { 792 final String word = CodePointUtils.generateWord(random, codePointSet); 793 final int unigramProbability = random.nextInt(0xFF); 794 addUnigramWord(binaryDictionary, word, unigramProbability); 795 if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { 796 binaryDictionary.flushWithGC(); 797 } 798 words.add(word); 799 wordProbabilitiesToCheckLater.put(word, unigramProbability); 800 } 801 802 for (int i = 0; i < BIGRAM_COUNT; i++) { 803 final int word0Index = random.nextInt(wordProbabilitiesToCheckLater.size()); 804 final int word1Index = random.nextInt(wordProbabilitiesToCheckLater.size()); 805 if (word0Index == word1Index) { 806 continue; 807 } 808 final String word0 = words.get(word0Index); 809 final String word1 = words.get(word1Index); 810 final int unigramProbability = wordProbabilitiesToCheckLater.get(word1); 811 final int bigramProbability = 812 unigramProbability + random.nextInt(0xFF - unigramProbability); 813 addBigramWords(binaryDictionary, word0, word1, bigramProbability); 814 if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { 815 binaryDictionary.flushWithGC(); 816 } 817 if (!bigrams.containsKey(word0)) { 818 final HashSet<String> bigramWord1s = new HashSet<>(); 819 bigrams.put(word0, bigramWord1s); 820 } 821 bigrams.get(word0).add(word1); 822 bigramProbabilitiesToCheckLater.put(new Pair<>(word0, word1), bigramProbability); 823 } 824 825 final HashSet<String> wordSet = new HashSet<>(words); 826 final HashSet<Pair<String, String>> bigramSet = 827 new HashSet<>(bigramProbabilitiesToCheckLater.keySet()); 828 int token = 0; 829 do { 830 final BinaryDictionary.GetNextWordPropertyResult result = 831 binaryDictionary.getNextWordProperty(token); 832 final WordProperty wordProperty = result.mWordProperty; 833 final String word0 = wordProperty.mWord; 834 assertEquals((int)wordProbabilitiesToCheckLater.get(word0), 835 wordProperty.mProbabilityInfo.mProbability); 836 wordSet.remove(word0); 837 final HashSet<String> bigramWord1s = bigrams.get(word0); 838 // TODO: Support ngram. 839 if (wordProperty.mHasNgrams) { 840 for (final WeightedString bigramTarget : wordProperty.getBigrams()) { 841 final String word1 = bigramTarget.mWord; 842 assertTrue(bigramWord1s.contains(word1)); 843 final Pair<String, String> bigram = new Pair<>(word0, word1); 844 final int bigramProbability = bigramProbabilitiesToCheckLater.get(bigram); 845 assertEquals(bigramProbability, bigramTarget.getProbability()); 846 bigramSet.remove(bigram); 847 } 848 } 849 token = result.mNextToken; 850 } while (token != 0); 851 assertTrue(wordSet.isEmpty()); 852 assertTrue(bigramSet.isEmpty()); 853 } 854 855 public void testPossiblyOffensiveAttributeMaintained() { 856 final BinaryDictionary binaryDictionary = 857 getEmptyBinaryDictionary(FormatSpec.VERSION403); 858 binaryDictionary.addUnigramEntry("ddd", 100, false, true, true, 0); 859 WordProperty wordProperty = binaryDictionary.getWordProperty("ddd", false); 860 assertEquals(true, wordProperty.mIsPossiblyOffensive); 861 } 862 863 public void testBeginningOfSentence() { 864 final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403); 865 final int dummyProbability = 0; 866 final NgramContext beginningOfSentenceContext = NgramContext.BEGINNING_OF_SENTENCE; 867 final int bigramProbability = 200; 868 addUnigramWord(binaryDictionary, "aaa", dummyProbability); 869 binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability, 870 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 871 assertEquals(bigramProbability, 872 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa")); 873 binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability, 874 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 875 addUnigramWord(binaryDictionary, "bbb", dummyProbability); 876 binaryDictionary.addNgramEntry(beginningOfSentenceContext, "bbb", bigramProbability, 877 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); 878 binaryDictionary.flushWithGC(); 879 assertEquals(bigramProbability, 880 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa")); 881 assertEquals(bigramProbability, 882 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "bbb")); 883 } 884 } 885