Home | History | Annotate | Download | only in latin
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin;
     18 
     19 import android.test.AndroidTestCase;
     20 import android.test.suitebuilder.annotation.LargeTest;
     21 import android.text.TextUtils;
     22 import android.util.Pair;
     23 
     24 import com.android.inputmethod.latin.NgramContext.WordInfo;
     25 import com.android.inputmethod.latin.common.CodePointUtils;
     26 import com.android.inputmethod.latin.common.FileUtils;
     27 import com.android.inputmethod.latin.makedict.DictionaryHeader;
     28 import com.android.inputmethod.latin.makedict.FormatSpec;
     29 import com.android.inputmethod.latin.makedict.WeightedString;
     30 import com.android.inputmethod.latin.makedict.WordProperty;
     31 import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
     32 
     33 import java.io.File;
     34 import java.io.IOException;
     35 import java.util.ArrayList;
     36 import java.util.HashMap;
     37 import java.util.HashSet;
     38 import java.util.Locale;
     39 import java.util.Random;
     40 
     41 @LargeTest
     42 public class BinaryDictionaryTests extends AndroidTestCase {
     43     private static final String TEST_DICT_FILE_EXTENSION = ".testDict";
     44     private static final String TEST_LOCALE = "test";
     45     private static final String DICTIONARY_ID = "TestBinaryDictionary";
     46 
     47     private HashSet<File> mDictFilesToBeDeleted = new HashSet<>();
     48 
     49     @Override
     50     protected void setUp() throws Exception {
     51         super.setUp();
     52         mDictFilesToBeDeleted.clear();
     53     }
     54 
     55     @Override
     56     protected void tearDown() throws Exception {
     57         for (final File dictFile : mDictFilesToBeDeleted) {
     58             dictFile.delete();
     59         }
     60         mDictFilesToBeDeleted.clear();
     61         super.tearDown();
     62     }
     63 
     64     private File createEmptyDictionaryAndGetFile(final int formatVersion) {
     65         return createEmptyDictionaryWithAttributesAndGetFile(formatVersion,
     66                 new HashMap<String, String>());
     67     }
     68 
     69     private File createEmptyDictionaryWithAttributesAndGetFile(final int formatVersion,
     70             final HashMap<String, String> attributeMap) {
     71         try {
     72             final File dictFile = createEmptyVer4DictionaryAndGetFile(formatVersion,
     73                     attributeMap);
     74             mDictFilesToBeDeleted.add(dictFile);
     75             return dictFile;
     76         } catch (final IOException e) {
     77             fail(e.toString());
     78         }
     79         return null;
     80     }
     81 
     82     private File createEmptyVer4DictionaryAndGetFile(final int formatVersion,
     83             final HashMap<String, String> attributeMap) throws IOException {
     84         final File file = File.createTempFile(DICTIONARY_ID, TEST_DICT_FILE_EXTENSION,
     85                 getContext().getCacheDir());
     86         file.delete();
     87         file.mkdir();
     88         if (BinaryDictionaryUtils.createEmptyDictFile(file.getAbsolutePath(), formatVersion,
     89                 Locale.ENGLISH, attributeMap)) {
     90             return file;
     91         }
     92         throw new IOException("Empty dictionary " + file.getAbsolutePath()
     93                 + " cannot be created. Format version: " + formatVersion);
     94     }
     95 
     96     private static BinaryDictionary getBinaryDictionary(final File dictFile) {
     97         return new BinaryDictionary(dictFile.getAbsolutePath(),
     98                 0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
     99                 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
    100     }
    101 
    102     private BinaryDictionary getEmptyBinaryDictionary(final int formatVersion) {
    103         final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
    104         return new BinaryDictionary(dictFile.getAbsolutePath(),
    105                 0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
    106                 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
    107     }
    108 
    109     public void testIsValidDictionary() {
    110         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
    111         BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
    112         assertTrue("binaryDictionary must be valid for existing valid dictionary file.",
    113                 binaryDictionary.isValidDictionary());
    114         binaryDictionary.close();
    115         assertFalse("binaryDictionary must be invalid after closing.",
    116                 binaryDictionary.isValidDictionary());
    117         FileUtils.deleteRecursively(dictFile);
    118         binaryDictionary = getBinaryDictionary(dictFile);
    119         assertFalse("binaryDictionary must be invalid for not existing dictionary file.",
    120                 binaryDictionary.isValidDictionary());
    121         binaryDictionary.close();
    122     }
    123 
    124     public void testConstructingDictionaryOnMemory() {
    125         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
    126         FileUtils.deleteRecursively(dictFile);
    127         assertFalse(dictFile.exists());
    128         final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
    129                 true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE,
    130                 FormatSpec.VERSION403, new HashMap<String, String>());
    131         assertTrue(binaryDictionary.isValidDictionary());
    132         assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion());
    133         final int probability = 100;
    134         addUnigramWord(binaryDictionary, "word", probability);
    135         assertEquals(probability, binaryDictionary.getFrequency("word"));
    136         assertFalse(dictFile.exists());
    137         binaryDictionary.flush();
    138         assertTrue(dictFile.exists());
    139         assertTrue(binaryDictionary.isValidDictionary());
    140         assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion());
    141         assertEquals(probability, binaryDictionary.getFrequency("word"));
    142         binaryDictionary.close();
    143     }
    144 
    145     public void testAddTooLongWord() {
    146         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
    147         final StringBuffer stringBuilder = new StringBuffer();
    148         for (int i = 0; i < BinaryDictionary.DICTIONARY_MAX_WORD_LENGTH; i++) {
    149             stringBuilder.append('a');
    150         }
    151         final String validLongWord = stringBuilder.toString();
    152         stringBuilder.append('a');
    153         final String invalidLongWord = stringBuilder.toString();
    154         final int probability = 100;
    155         addUnigramWord(binaryDictionary, "aaa", probability);
    156         addUnigramWord(binaryDictionary, validLongWord, probability);
    157         addUnigramWord(binaryDictionary, invalidLongWord, probability);
    158         // Too long short cut.
    159         binaryDictionary.addUnigramEntry("a", probability, false /* isBeginningOfSentence */,
    160                 false /* isNotAWord */, false /* isPossiblyOffensive */,
    161                 BinaryDictionary.NOT_A_VALID_TIMESTAMP);
    162         addUnigramWord(binaryDictionary, "abc", probability);
    163         final int updatedProbability = 200;
    164         // Update.
    165         addUnigramWord(binaryDictionary, validLongWord, updatedProbability);
    166         addUnigramWord(binaryDictionary, invalidLongWord, updatedProbability);
    167         addUnigramWord(binaryDictionary, "abc", updatedProbability);
    168 
    169         assertEquals(probability, binaryDictionary.getFrequency("aaa"));
    170         assertEquals(updatedProbability, binaryDictionary.getFrequency(validLongWord));
    171         assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency(invalidLongWord));
    172         assertEquals(updatedProbability, binaryDictionary.getFrequency("abc"));
    173     }
    174 
    175     private static void addUnigramWord(final BinaryDictionary binaryDictionary, final String word,
    176             final int probability) {
    177         binaryDictionary.addUnigramEntry(word, probability,
    178                 false /* isBeginningOfSentence */, false /* isNotAWord */,
    179                 false /* isPossiblyOffensive */,
    180                 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
    181     }
    182 
    183     private static void addBigramWords(final BinaryDictionary binaryDictionary, final String word0,
    184             final String word1, final int probability) {
    185         binaryDictionary.addNgramEntry(new NgramContext(new WordInfo(word0)), word1, probability,
    186                 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
    187     }
    188 
    189     private static void addTrigramEntry(final BinaryDictionary binaryDictionary, final String word0,
    190             final String word1, final String word2, final int probability) {
    191         binaryDictionary.addNgramEntry(
    192                 new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2,
    193                 probability, BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
    194     }
    195 
    196     private static boolean isValidBigram(final BinaryDictionary binaryDictionary,
    197             final String word0, final String word1) {
    198         return binaryDictionary.isValidNgram(new NgramContext(new WordInfo(word0)), word1);
    199     }
    200 
    201     private static int getBigramProbability(final BinaryDictionary binaryDictionary,
    202             final String word0,  final String word1) {
    203         return binaryDictionary.getNgramProbability(new NgramContext(new WordInfo(word0)), word1);
    204     }
    205 
    206     private static int getTrigramProbability(final BinaryDictionary binaryDictionary,
    207             final String word0, final String word1, final String word2) {
    208         return binaryDictionary.getNgramProbability(
    209                 new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2);
    210     }
    211 
    212     public void testAddUnigramWord() {
    213         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
    214         final int probability = 100;
    215         addUnigramWord(binaryDictionary, "aaa", probability);
    216         // Reallocate and create.
    217         addUnigramWord(binaryDictionary, "aab", probability);
    218         // Insert into children.
    219         addUnigramWord(binaryDictionary, "aac", probability);
    220         // Make terminal.
    221         addUnigramWord(binaryDictionary, "aa", probability);
    222         // Create children.
    223         addUnigramWord(binaryDictionary, "aaaa", probability);
    224         // Reallocate and make termianl.
    225         addUnigramWord(binaryDictionary, "a", probability);
    226 
    227         final int updatedProbability = 200;
    228         // Update.
    229         addUnigramWord(binaryDictionary, "aaa", updatedProbability);
    230 
    231         assertEquals(probability, binaryDictionary.getFrequency("aab"));
    232         assertEquals(probability, binaryDictionary.getFrequency("aac"));
    233         assertEquals(probability, binaryDictionary.getFrequency("aa"));
    234         assertEquals(probability, binaryDictionary.getFrequency("aaaa"));
    235         assertEquals(probability, binaryDictionary.getFrequency("a"));
    236         assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa"));
    237     }
    238 
    239     public void testRandomlyAddUnigramWord() {
    240         final int wordCount = 1000;
    241         final int codePointSetSize = 50;
    242         final long seed = System.currentTimeMillis();
    243         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
    244 
    245         final HashMap<String, Integer> probabilityMap = new HashMap<>();
    246         // Test a word that isn't contained within the dictionary.
    247         final Random random = new Random(seed);
    248         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
    249         for (int i = 0; i < wordCount; ++i) {
    250             final String word = CodePointUtils.generateWord(random, codePointSet);
    251             probabilityMap.put(word, random.nextInt(0xFF));
    252         }
    253         for (String word : probabilityMap.keySet()) {
    254             addUnigramWord(binaryDictionary, word, probabilityMap.get(word));
    255         }
    256         for (String word : probabilityMap.keySet()) {
    257             assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word));
    258         }
    259     }
    260 
    261     public void testAddBigramWords() {
    262         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
    263 
    264         final int unigramProbability = 100;
    265         final int bigramProbability = 150;
    266         final int updatedBigramProbability = 200;
    267         addUnigramWord(binaryDictionary, "aaa", unigramProbability);
    268         addUnigramWord(binaryDictionary, "abb", unigramProbability);
    269         addUnigramWord(binaryDictionary, "bcc", unigramProbability);
    270         addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability);
    271         addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability);
    272         addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability);
    273         addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability);
    274 
    275         assertTrue(isValidBigram(binaryDictionary, "aaa", "abb"));
    276         assertTrue(isValidBigram(binaryDictionary, "aaa", "bcc"));
    277         assertTrue(isValidBigram(binaryDictionary, "abb", "aaa"));
    278         assertTrue(isValidBigram(binaryDictionary, "abb", "bcc"));
    279         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb"));
    280         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc"));
    281         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa"));
    282         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc"));
    283 
    284         addBigramWords(binaryDictionary, "aaa", "abb", updatedBigramProbability);
    285         assertEquals(updatedBigramProbability,
    286                 getBigramProbability(binaryDictionary, "aaa", "abb"));
    287 
    288         assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa"));
    289         assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc"));
    290         assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa"));
    291         assertEquals(Dictionary.NOT_A_PROBABILITY,
    292                 getBigramProbability(binaryDictionary, "bcc", "aaa"));
    293         assertEquals(Dictionary.NOT_A_PROBABILITY,
    294                 getBigramProbability(binaryDictionary, "bcc", "bbc"));
    295         assertEquals(Dictionary.NOT_A_PROBABILITY,
    296                 getBigramProbability(binaryDictionary, "aaa", "aaa"));
    297 
    298         // Testing bigram link.
    299         addUnigramWord(binaryDictionary, "abcde", unigramProbability);
    300         addUnigramWord(binaryDictionary, "fghij", unigramProbability);
    301         addBigramWords(binaryDictionary, "abcde", "fghij", bigramProbability);
    302         addUnigramWord(binaryDictionary, "fgh", unigramProbability);
    303         addUnigramWord(binaryDictionary, "abc", unigramProbability);
    304         addUnigramWord(binaryDictionary, "f", unigramProbability);
    305 
    306         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abcde", "fghij"));
    307         assertEquals(Dictionary.NOT_A_PROBABILITY,
    308                 getBigramProbability(binaryDictionary, "abcde", "fgh"));
    309         addBigramWords(binaryDictionary, "abcde", "fghij", updatedBigramProbability);
    310         assertEquals(updatedBigramProbability,
    311                 getBigramProbability(binaryDictionary, "abcde", "fghij"));
    312     }
    313 
    314     public void testRandomlyAddBigramWords() {
    315         final int wordCount = 100;
    316         final int bigramCount = 1000;
    317         final int codePointSetSize = 50;
    318         final long seed = System.currentTimeMillis();
    319         final Random random = new Random(seed);
    320         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
    321 
    322         final ArrayList<String> words = new ArrayList<>();
    323         final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>();
    324         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
    325         final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
    326         final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
    327 
    328         for (int i = 0; i < wordCount; ++i) {
    329             final String word = CodePointUtils.generateWord(random, codePointSet);
    330             words.add(word);
    331             final int unigramProbability = random.nextInt(0xFF);
    332             unigramProbabilities.put(word, unigramProbability);
    333             addUnigramWord(binaryDictionary, word, unigramProbability);
    334         }
    335 
    336         for (int i = 0; i < bigramCount; i++) {
    337             final String word0 = words.get(random.nextInt(wordCount));
    338             final String word1 = words.get(random.nextInt(wordCount));
    339             if (TextUtils.equals(word0, word1)) {
    340                 continue;
    341             }
    342             final Pair<String, String> bigram = new Pair<>(word0, word1);
    343             bigramWords.add(bigram);
    344             final int unigramProbability = unigramProbabilities.get(word1);
    345             final int bigramProbability =
    346                     unigramProbability + random.nextInt(0xFF - unigramProbability);
    347             bigramProbabilities.put(bigram, bigramProbability);
    348             addBigramWords(binaryDictionary, word0, word1, bigramProbability);
    349         }
    350 
    351         for (final Pair<String, String> bigram : bigramWords) {
    352             final int bigramProbability = bigramProbabilities.get(bigram);
    353             assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY,
    354                     isValidBigram(binaryDictionary, bigram.first, bigram.second));
    355             assertEquals(bigramProbability,
    356                     getBigramProbability(binaryDictionary, bigram.first, bigram.second));
    357         }
    358     }
    359 
    360     public void testAddTrigramWords() {
    361         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
    362         final int unigramProbability = 100;
    363         final int trigramProbability = 150;
    364         final int updatedTrigramProbability = 200;
    365         addUnigramWord(binaryDictionary, "aaa", unigramProbability);
    366         addUnigramWord(binaryDictionary, "abb", unigramProbability);
    367         addUnigramWord(binaryDictionary, "bcc", unigramProbability);
    368 
    369         addBigramWords(binaryDictionary, "abb", "bcc", 10);
    370         addBigramWords(binaryDictionary, "abb", "aaa", 10);
    371 
    372         addTrigramEntry(binaryDictionary, "aaa", "abb", "bcc", trigramProbability);
    373         addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", trigramProbability);
    374 
    375         assertEquals(trigramProbability,
    376                 getTrigramProbability(binaryDictionary, "aaa", "abb", "bcc"));
    377         assertEquals(trigramProbability,
    378                 getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa"));
    379         assertFalse(isValidBigram(binaryDictionary, "aaa", "abb"));
    380 
    381         addTrigramEntry(binaryDictionary, "bcc", "abb", "aaa", updatedTrigramProbability);
    382         assertEquals(updatedTrigramProbability,
    383                 getTrigramProbability(binaryDictionary, "bcc", "abb", "aaa"));
    384     }
    385 
    386     public void testFlushDictionary() {
    387         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
    388         BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
    389 
    390         final int probability = 100;
    391         addUnigramWord(binaryDictionary, "aaa", probability);
    392         addUnigramWord(binaryDictionary, "abcd", probability);
    393         // Close without flushing.
    394         binaryDictionary.close();
    395 
    396         binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
    397                 0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
    398                 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
    399 
    400         assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("aaa"));
    401         assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("abcd"));
    402 
    403         addUnigramWord(binaryDictionary, "aaa", probability);
    404         addUnigramWord(binaryDictionary, "abcd", probability);
    405         binaryDictionary.flush();
    406         binaryDictionary.close();
    407 
    408         binaryDictionary = getBinaryDictionary(dictFile);
    409         assertEquals(probability, binaryDictionary.getFrequency("aaa"));
    410         assertEquals(probability, binaryDictionary.getFrequency("abcd"));
    411         addUnigramWord(binaryDictionary, "bcde", probability);
    412         binaryDictionary.flush();
    413         binaryDictionary.close();
    414 
    415         binaryDictionary = getBinaryDictionary(dictFile);
    416         assertEquals(probability, binaryDictionary.getFrequency("bcde"));
    417         binaryDictionary.close();
    418     }
    419 
    420     public void testFlushWithGCDictionary() {
    421         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
    422         BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
    423         final int unigramProbability = 100;
    424         final int bigramProbability = 150;
    425         addUnigramWord(binaryDictionary, "aaa", unigramProbability);
    426         addUnigramWord(binaryDictionary, "abb", unigramProbability);
    427         addUnigramWord(binaryDictionary, "bcc", unigramProbability);
    428         addBigramWords(binaryDictionary, "aaa", "abb", bigramProbability);
    429         addBigramWords(binaryDictionary, "aaa", "bcc", bigramProbability);
    430         addBigramWords(binaryDictionary, "abb", "aaa", bigramProbability);
    431         addBigramWords(binaryDictionary, "abb", "bcc", bigramProbability);
    432         binaryDictionary.flushWithGC();
    433         binaryDictionary.close();
    434 
    435         binaryDictionary = getBinaryDictionary(dictFile);
    436         assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
    437         assertEquals(unigramProbability, binaryDictionary.getFrequency("abb"));
    438         assertEquals(unigramProbability, binaryDictionary.getFrequency("bcc"));
    439         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "abb"));
    440         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bcc"));
    441         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "aaa"));
    442         assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "abb", "bcc"));
    443         assertFalse(isValidBigram(binaryDictionary, "bcc", "aaa"));
    444         assertFalse(isValidBigram(binaryDictionary, "bcc", "bbc"));
    445         assertFalse(isValidBigram(binaryDictionary, "aaa", "aaa"));
    446         binaryDictionary.flushWithGC();
    447         binaryDictionary.close();
    448     }
    449 
    450     public void testAddBigramWordsAndFlashWithGC() {
    451         final int wordCount = 100;
    452         final int bigramCount = 1000;
    453         final int codePointSetSize = 30;
    454         final long seed = System.currentTimeMillis();
    455         final Random random = new Random(seed);
    456 
    457         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
    458         BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
    459 
    460         final ArrayList<String> words = new ArrayList<>();
    461         final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>();
    462         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
    463         final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
    464         final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
    465 
    466         for (int i = 0; i < wordCount; ++i) {
    467             final String word = CodePointUtils.generateWord(random, codePointSet);
    468             words.add(word);
    469             final int unigramProbability = random.nextInt(0xFF);
    470             unigramProbabilities.put(word, unigramProbability);
    471             addUnigramWord(binaryDictionary, word, unigramProbability);
    472         }
    473 
    474         for (int i = 0; i < bigramCount; i++) {
    475             final String word0 = words.get(random.nextInt(wordCount));
    476             final String word1 = words.get(random.nextInt(wordCount));
    477             if (TextUtils.equals(word0, word1)) {
    478                 continue;
    479             }
    480             final Pair<String, String> bigram = new Pair<>(word0, word1);
    481             bigramWords.add(bigram);
    482             final int unigramProbability = unigramProbabilities.get(word1);
    483             final int bigramProbability =
    484                     unigramProbability + random.nextInt(0xFF - unigramProbability);
    485             bigramProbabilities.put(bigram, bigramProbability);
    486             addBigramWords(binaryDictionary, word0, word1, bigramProbability);
    487         }
    488 
    489         binaryDictionary.flushWithGC();
    490         binaryDictionary.close();
    491         binaryDictionary = getBinaryDictionary(dictFile);
    492 
    493         for (final Pair<String, String> bigram : bigramWords) {
    494             final int bigramProbability = bigramProbabilities.get(bigram);
    495             assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY,
    496                     isValidBigram(binaryDictionary, bigram.first, bigram.second));
    497             assertEquals(bigramProbability,
    498                     getBigramProbability(binaryDictionary, bigram.first, bigram.second));
    499         }
    500     }
    501 
    502     public void testRandomOperationsAndFlashWithGC() {
    503         final int maxUnigramCount = 5000;
    504         final int maxBigramCount = 10000;
    505         final HashMap<String, String> attributeMap = new HashMap<>();
    506         attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount));
    507         attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount));
    508 
    509         final int flashWithGCIterationCount = 50;
    510         final int operationCountInEachIteration = 200;
    511         final int initialUnigramCount = 100;
    512         final float addUnigramProb = 0.5f;
    513         final float addBigramProb = 0.8f;
    514         final int codePointSetSize = 30;
    515 
    516         final long seed = System.currentTimeMillis();
    517         final Random random = new Random(seed);
    518         final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403,
    519                 attributeMap);
    520         BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
    521 
    522         final ArrayList<String> words = new ArrayList<>();
    523         final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>();
    524         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
    525         final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
    526         final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
    527         for (int i = 0; i < initialUnigramCount; ++i) {
    528             final String word = CodePointUtils.generateWord(random, codePointSet);
    529             words.add(word);
    530             final int unigramProbability = random.nextInt(0xFF);
    531             unigramProbabilities.put(word, unigramProbability);
    532             addUnigramWord(binaryDictionary, word, unigramProbability);
    533         }
    534         binaryDictionary.flushWithGC();
    535         binaryDictionary.close();
    536 
    537         for (int gcCount = 0; gcCount < flashWithGCIterationCount; gcCount++) {
    538             binaryDictionary = getBinaryDictionary(dictFile);
    539             for (int opCount = 0; opCount < operationCountInEachIteration; opCount++) {
    540                 // Add unigram.
    541                 if (random.nextFloat() < addUnigramProb) {
    542                     final String word = CodePointUtils.generateWord(random, codePointSet);
    543                     words.add(word);
    544                     final int unigramProbability = random.nextInt(0xFF);
    545                     unigramProbabilities.put(word, unigramProbability);
    546                     addUnigramWord(binaryDictionary, word, unigramProbability);
    547                 }
    548                 // Add bigram.
    549                 if (random.nextFloat() < addBigramProb && words.size() > 2) {
    550                     final int word0Index = random.nextInt(words.size());
    551                     int word1Index = random.nextInt(words.size() - 1);
    552                     if (word0Index <= word1Index) {
    553                         word1Index++;
    554                     }
    555                     final String word0 = words.get(word0Index);
    556                     final String word1 = words.get(word1Index);
    557                     if (TextUtils.equals(word0, word1)) {
    558                         continue;
    559                     }
    560                     final int unigramProbability = unigramProbabilities.get(word1);
    561                     final int bigramProbability =
    562                             unigramProbability + random.nextInt(0xFF - unigramProbability);
    563                     final Pair<String, String> bigram = new Pair<>(word0, word1);
    564                     bigramWords.add(bigram);
    565                     bigramProbabilities.put(bigram, bigramProbability);
    566                     addBigramWords(binaryDictionary, word0, word1, bigramProbability);
    567                 }
    568             }
    569 
    570             // Test whether all the unigram operations are correctly handled.
    571             for (int i = 0; i < words.size(); i++) {
    572                 final String word = words.get(i);
    573                 final int unigramProbability = unigramProbabilities.get(word);
    574                 assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word));
    575             }
    576             // Test whether all the bigram operations are correctly handled.
    577             for (int i = 0; i < bigramWords.size(); i++) {
    578                 final Pair<String, String> bigram = bigramWords.get(i);
    579                 final int probability;
    580                 if (bigramProbabilities.containsKey(bigram)) {
    581                     probability = bigramProbabilities.get(bigram);
    582                 } else {
    583                     probability = Dictionary.NOT_A_PROBABILITY;
    584                 }
    585 
    586                 assertEquals(probability,
    587                         getBigramProbability(binaryDictionary, bigram.first, bigram.second));
    588                 assertEquals(probability != Dictionary.NOT_A_PROBABILITY,
    589                         isValidBigram(binaryDictionary, bigram.first, bigram.second));
    590             }
    591             binaryDictionary.flushWithGC();
    592             binaryDictionary.close();
    593         }
    594     }
    595 
    596     public void testAddManyUnigramsAndFlushWithGC() {
    597         final int flashWithGCIterationCount = 3;
    598         final int codePointSetSize = 50;
    599 
    600         final long seed = System.currentTimeMillis();
    601         final Random random = new Random(seed);
    602 
    603         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
    604 
    605         final ArrayList<String> words = new ArrayList<>();
    606         final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
    607         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
    608 
    609         BinaryDictionary binaryDictionary;
    610         for (int i = 0; i < flashWithGCIterationCount; i++) {
    611             binaryDictionary = getBinaryDictionary(dictFile);
    612             while(!binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
    613                 final String word = CodePointUtils.generateWord(random, codePointSet);
    614                 words.add(word);
    615                 final int unigramProbability = random.nextInt(0xFF);
    616                 unigramProbabilities.put(word, unigramProbability);
    617                 addUnigramWord(binaryDictionary, word, unigramProbability);
    618             }
    619 
    620             for (int j = 0; j < words.size(); j++) {
    621                 final String word = words.get(j);
    622                 final int unigramProbability = unigramProbabilities.get(word);
    623                 assertEquals(word, unigramProbability, binaryDictionary.getFrequency(word));
    624             }
    625 
    626             binaryDictionary.flushWithGC();
    627             binaryDictionary.close();
    628         }
    629     }
    630 
    631     public void testUnigramAndBigramCount() {
    632         final int maxUnigramCount = 5000;
    633         final int maxBigramCount = 10000;
    634         final HashMap<String, String> attributeMap = new HashMap<>();
    635         attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount));
    636         attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount));
    637 
    638         final int flashWithGCIterationCount = 10;
    639         final int codePointSetSize = 50;
    640         final int unigramCountPerIteration = 1000;
    641         final int bigramCountPerIteration = 2000;
    642         final long seed = System.currentTimeMillis();
    643         final Random random = new Random(seed);
    644         final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403,
    645                 attributeMap);
    646 
    647         final ArrayList<String> words = new ArrayList<>();
    648         final HashSet<Pair<String, String>> bigrams = new HashSet<>();
    649         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
    650 
    651         BinaryDictionary binaryDictionary;
    652         for (int i = 0; i < flashWithGCIterationCount; i++) {
    653             binaryDictionary = getBinaryDictionary(dictFile);
    654             for (int j = 0; j < unigramCountPerIteration; j++) {
    655                 final String word = CodePointUtils.generateWord(random, codePointSet);
    656                 words.add(word);
    657                 final int unigramProbability = random.nextInt(0xFF);
    658                 addUnigramWord(binaryDictionary, word, unigramProbability);
    659             }
    660             for (int j = 0; j < bigramCountPerIteration; j++) {
    661                 final String word0 = words.get(random.nextInt(words.size()));
    662                 final String word1 = words.get(random.nextInt(words.size()));
    663                 if (TextUtils.equals(word0, word1)) {
    664                     continue;
    665                 }
    666                 bigrams.add(new Pair<>(word0, word1));
    667                 final int bigramProbability = random.nextInt(0xF);
    668                 addBigramWords(binaryDictionary, word0, word1, bigramProbability);
    669             }
    670             assertEquals(new HashSet<>(words).size(), Integer.parseInt(
    671                     binaryDictionary.getPropertyForGettingStats(
    672                             BinaryDictionary.UNIGRAM_COUNT_QUERY)));
    673             assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt(
    674                     binaryDictionary.getPropertyForGettingStats(
    675                             BinaryDictionary.BIGRAM_COUNT_QUERY)));
    676             binaryDictionary.flushWithGC();
    677             assertEquals(new HashSet<>(words).size(), Integer.parseInt(
    678                     binaryDictionary.getPropertyForGettingStats(
    679                             BinaryDictionary.UNIGRAM_COUNT_QUERY)));
    680             assertEquals(new HashSet<>(bigrams).size(), Integer.parseInt(
    681                     binaryDictionary.getPropertyForGettingStats(
    682                             BinaryDictionary.BIGRAM_COUNT_QUERY)));
    683             binaryDictionary.close();
    684         }
    685     }
    686 
    687     public void testGetWordProperties() {
    688         final long seed = System.currentTimeMillis();
    689         final Random random = new Random(seed);
    690         final int UNIGRAM_COUNT = 1000;
    691         final int BIGRAM_COUNT = 1000;
    692         final int codePointSetSize = 20;
    693         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
    694         final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
    695         final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
    696 
    697         final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord",
    698                 false /* isBeginningOfSentence */);
    699         assertFalse(invalidWordProperty.isValid());
    700 
    701         final ArrayList<String> words = new ArrayList<>();
    702         final HashMap<String, Integer> wordProbabilities = new HashMap<>();
    703         final HashMap<String, HashSet<String>> bigrams = new HashMap<>();
    704         final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
    705 
    706         for (int i = 0; i < UNIGRAM_COUNT; i++) {
    707             final String word = CodePointUtils.generateWord(random, codePointSet);
    708             final int unigramProbability = random.nextInt(0xFF);
    709             final boolean isNotAWord = random.nextBoolean();
    710             final boolean isPossiblyOffensive = random.nextBoolean();
    711             // TODO: Add tests for historical info.
    712             binaryDictionary.addUnigramEntry(word, unigramProbability,
    713                     false /* isBeginningOfSentence */, isNotAWord, isPossiblyOffensive,
    714                     BinaryDictionary.NOT_A_VALID_TIMESTAMP);
    715             if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
    716                 binaryDictionary.flushWithGC();
    717             }
    718             words.add(word);
    719             wordProbabilities.put(word, unigramProbability);
    720             final WordProperty wordProperty = binaryDictionary.getWordProperty(word,
    721                     false /* isBeginningOfSentence */);
    722             assertEquals(word, wordProperty.mWord);
    723             assertTrue(wordProperty.isValid());
    724             assertEquals(isNotAWord, wordProperty.mIsNotAWord);
    725             assertEquals(isPossiblyOffensive, wordProperty.mIsPossiblyOffensive);
    726             assertEquals(false, wordProperty.mHasNgrams);
    727             assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability);
    728         }
    729 
    730         for (int i = 0; i < BIGRAM_COUNT; i++) {
    731             final int word0Index = random.nextInt(wordProbabilities.size());
    732             final int word1Index = random.nextInt(wordProbabilities.size());
    733             if (word0Index == word1Index) {
    734                 continue;
    735             }
    736             final String word0 = words.get(word0Index);
    737             final String word1 = words.get(word1Index);
    738             final int unigramProbability = wordProbabilities.get(word1);
    739             final int bigramProbability =
    740                     unigramProbability + random.nextInt(0xFF - unigramProbability);
    741             addBigramWords(binaryDictionary, word0, word1, bigramProbability);
    742             if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
    743                 binaryDictionary.flushWithGC();
    744             }
    745             if (!bigrams.containsKey(word0)) {
    746                 final HashSet<String> bigramWord1s = new HashSet<>();
    747                 bigrams.put(word0, bigramWord1s);
    748             }
    749             bigrams.get(word0).add(word1);
    750             bigramProbabilities.put(new Pair<>(word0, word1), bigramProbability);
    751         }
    752 
    753         for (int i = 0; i < words.size(); i++) {
    754             final String word0 = words.get(i);
    755             if (!bigrams.containsKey(word0)) {
    756                 continue;
    757             }
    758             final HashSet<String> bigramWord1s = bigrams.get(word0);
    759             final WordProperty wordProperty = binaryDictionary.getWordProperty(word0,
    760                     false /* isBeginningOfSentence */);
    761             assertEquals(bigramWord1s.size(), wordProperty.mNgrams.size());
    762             // TODO: Support ngram.
    763             for (final WeightedString bigramTarget : wordProperty.getBigrams()) {
    764                 final String word1 = bigramTarget.mWord;
    765                 assertTrue(bigramWord1s.contains(word1));
    766                 final int bigramProbability = bigramProbabilities.get(new Pair<>(word0, word1));
    767                 assertEquals(bigramProbability, bigramTarget.getProbability());
    768             }
    769         }
    770     }
    771 
    772     public void testIterateAllWords() {
    773         final long seed = System.currentTimeMillis();
    774         final Random random = new Random(seed);
    775         final int UNIGRAM_COUNT = 1000;
    776         final int BIGRAM_COUNT = 1000;
    777         final int codePointSetSize = 20;
    778         final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
    779         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
    780 
    781         final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord",
    782                 false /* isBeginningOfSentence */);
    783         assertFalse(invalidWordProperty.isValid());
    784 
    785         final ArrayList<String> words = new ArrayList<>();
    786         final HashMap<String, Integer> wordProbabilitiesToCheckLater = new HashMap<>();
    787         final HashMap<String, HashSet<String>> bigrams = new HashMap<>();
    788         final HashMap<Pair<String, String>, Integer> bigramProbabilitiesToCheckLater =
    789                 new HashMap<>();
    790 
    791         for (int i = 0; i < UNIGRAM_COUNT; i++) {
    792             final String word = CodePointUtils.generateWord(random, codePointSet);
    793             final int unigramProbability = random.nextInt(0xFF);
    794             addUnigramWord(binaryDictionary, word, unigramProbability);
    795             if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
    796                 binaryDictionary.flushWithGC();
    797             }
    798             words.add(word);
    799             wordProbabilitiesToCheckLater.put(word, unigramProbability);
    800         }
    801 
    802         for (int i = 0; i < BIGRAM_COUNT; i++) {
    803             final int word0Index = random.nextInt(wordProbabilitiesToCheckLater.size());
    804             final int word1Index = random.nextInt(wordProbabilitiesToCheckLater.size());
    805             if (word0Index == word1Index) {
    806                 continue;
    807             }
    808             final String word0 = words.get(word0Index);
    809             final String word1 = words.get(word1Index);
    810             final int unigramProbability = wordProbabilitiesToCheckLater.get(word1);
    811             final int bigramProbability =
    812                     unigramProbability + random.nextInt(0xFF - unigramProbability);
    813             addBigramWords(binaryDictionary, word0, word1, bigramProbability);
    814             if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
    815                 binaryDictionary.flushWithGC();
    816             }
    817             if (!bigrams.containsKey(word0)) {
    818                 final HashSet<String> bigramWord1s = new HashSet<>();
    819                 bigrams.put(word0, bigramWord1s);
    820             }
    821             bigrams.get(word0).add(word1);
    822             bigramProbabilitiesToCheckLater.put(new Pair<>(word0, word1), bigramProbability);
    823         }
    824 
    825         final HashSet<String> wordSet = new HashSet<>(words);
    826         final HashSet<Pair<String, String>> bigramSet =
    827                 new HashSet<>(bigramProbabilitiesToCheckLater.keySet());
    828         int token = 0;
    829         do {
    830             final BinaryDictionary.GetNextWordPropertyResult result =
    831                     binaryDictionary.getNextWordProperty(token);
    832             final WordProperty wordProperty = result.mWordProperty;
    833             final String word0 = wordProperty.mWord;
    834             assertEquals((int)wordProbabilitiesToCheckLater.get(word0),
    835                     wordProperty.mProbabilityInfo.mProbability);
    836             wordSet.remove(word0);
    837             final HashSet<String> bigramWord1s = bigrams.get(word0);
    838             // TODO: Support ngram.
    839             if (wordProperty.mHasNgrams) {
    840                 for (final WeightedString bigramTarget : wordProperty.getBigrams()) {
    841                     final String word1 = bigramTarget.mWord;
    842                     assertTrue(bigramWord1s.contains(word1));
    843                     final Pair<String, String> bigram = new Pair<>(word0, word1);
    844                     final int bigramProbability = bigramProbabilitiesToCheckLater.get(bigram);
    845                     assertEquals(bigramProbability, bigramTarget.getProbability());
    846                     bigramSet.remove(bigram);
    847                 }
    848             }
    849             token = result.mNextToken;
    850         } while (token != 0);
    851         assertTrue(wordSet.isEmpty());
    852         assertTrue(bigramSet.isEmpty());
    853     }
    854 
    855     public void testPossiblyOffensiveAttributeMaintained() {
    856         final BinaryDictionary binaryDictionary =
    857                 getEmptyBinaryDictionary(FormatSpec.VERSION403);
    858         binaryDictionary.addUnigramEntry("ddd", 100, false, true, true, 0);
    859         WordProperty wordProperty = binaryDictionary.getWordProperty("ddd", false);
    860         assertEquals(true, wordProperty.mIsPossiblyOffensive);
    861     }
    862 
    863     public void testBeginningOfSentence() {
    864         final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
    865         final int dummyProbability = 0;
    866         final NgramContext beginningOfSentenceContext = NgramContext.BEGINNING_OF_SENTENCE;
    867         final int bigramProbability = 200;
    868         addUnigramWord(binaryDictionary, "aaa", dummyProbability);
    869         binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability,
    870                 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
    871         assertEquals(bigramProbability,
    872                 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa"));
    873         binaryDictionary.addNgramEntry(beginningOfSentenceContext, "aaa", bigramProbability,
    874                 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
    875         addUnigramWord(binaryDictionary, "bbb", dummyProbability);
    876         binaryDictionary.addNgramEntry(beginningOfSentenceContext, "bbb", bigramProbability,
    877                 BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
    878         binaryDictionary.flushWithGC();
    879         assertEquals(bigramProbability,
    880                 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "aaa"));
    881         assertEquals(bigramProbability,
    882                 binaryDictionary.getNgramProbability(beginningOfSentenceContext, "bbb"));
    883     }
    884 }
    885