Home | History | Annotate | Download | only in src
      1 /*
      2  * Copyright (C) 2010, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <cstring>
     18 
     19 #define LOG_TAG "LatinIME: bigram_dictionary.cpp"
     20 
     21 #include "bigram_dictionary.h"
     22 #include "binary_format.h"
     23 #include "bloom_filter.h"
     24 #include "char_utils.h"
     25 #include "defines.h"
     26 #include "dictionary.h"
     27 
     28 namespace latinime {
     29 
     30 BigramDictionary::BigramDictionary(const uint8_t *const streamStart) : DICT_ROOT(streamStart) {
     31     if (DEBUG_DICT) {
     32         AKLOGI("BigramDictionary - constructor");
     33     }
     34 }
     35 
     36 BigramDictionary::~BigramDictionary() {
     37 }
     38 
     39 void BigramDictionary::addWordBigram(int *word, int length, int probability, int *bigramProbability,
     40         int *bigramCodePoints, int *outputTypes) const {
     41     word[length] = 0;
     42     if (DEBUG_DICT_FULL) {
     43 #ifdef FLAG_DBG
     44         char s[length + 1];
     45         for (int i = 0; i <= length; i++) s[i] = static_cast<char>(word[i]);
     46         AKLOGI("Bigram: Found word = %s, freq = %d :", s, probability);
     47 #endif
     48     }
     49 
     50     // Find the right insertion point
     51     int insertAt = 0;
     52     while (insertAt < MAX_RESULTS) {
     53         if (probability > bigramProbability[insertAt] || (bigramProbability[insertAt] == probability
     54                 && length < getCodePointCount(MAX_WORD_LENGTH,
     55                         bigramCodePoints + insertAt * MAX_WORD_LENGTH))) {
     56             break;
     57         }
     58         insertAt++;
     59     }
     60     if (DEBUG_DICT_FULL) {
     61         AKLOGI("Bigram: InsertAt -> %d MAX_RESULTS: %d", insertAt, MAX_RESULTS);
     62     }
     63     if (insertAt >= MAX_RESULTS) {
     64         return;
     65     }
     66     memmove(bigramProbability + (insertAt + 1),
     67             bigramProbability + insertAt,
     68             (MAX_RESULTS - insertAt - 1) * sizeof(bigramProbability[0]));
     69     bigramProbability[insertAt] = probability;
     70     outputTypes[insertAt] = Dictionary::KIND_PREDICTION;
     71     memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH,
     72             bigramCodePoints + insertAt * MAX_WORD_LENGTH,
     73             (MAX_RESULTS - insertAt - 1) * sizeof(bigramCodePoints[0]) * MAX_WORD_LENGTH);
     74     int *dest = bigramCodePoints + insertAt * MAX_WORD_LENGTH;
     75     while (length--) {
     76         *dest++ = *word++;
     77     }
     78     *dest = 0; // NULL terminate
     79     if (DEBUG_DICT_FULL) {
     80         AKLOGI("Bigram: Added word at %d", insertAt);
     81     }
     82 }
     83 
     84 /* Parameters :
     85  * prevWord: the word before, the one for which we need to look up bigrams.
     86  * prevWordLength: its length.
     87  * inputCodePoints: what user typed, in the same format as for UnigramDictionary::getSuggestions.
     88  * inputSize: the size of the codes array.
     89  * bigramCodePoints: an array for output, at the same format as outwords for getSuggestions.
     90  * bigramProbability: an array to output frequencies.
     91  * outputTypes: an array to output types.
     92  * This method returns the number of bigrams this word has, for backward compatibility.
     93  * Note: this is not the number of bigrams output in the array, which is the number of
     94  * bigrams this word has WHOSE first letter also matches the letter the user typed.
     95  * TODO: this may not be a sensible thing to do. It makes sense when the bigrams are
     96  * used to match the first letter of the second word, but once the user has typed more
     97  * and the bigrams are used to boost unigram result scores, it makes little sense to
     98  * reduce their scope to the ones that match the first letter.
     99  */
    100 int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodePoints,
    101         int inputSize, int *bigramCodePoints, int *bigramProbability, int *outputTypes) const {
    102     // TODO: remove unused arguments, and refrain from storing stuff in members of this class
    103     // TODO: have "in" arguments before "out" ones, and make out args explicit in the name
    104 
    105     const uint8_t *const root = DICT_ROOT;
    106     int pos = getBigramListPositionForWord(prevWord, prevWordLength,
    107             false /* forceLowerCaseSearch */);
    108     // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
    109     if (0 == pos) {
    110         // If no bigrams for this exact word, search again in lower case.
    111         pos = getBigramListPositionForWord(prevWord, prevWordLength,
    112                 true /* forceLowerCaseSearch */);
    113     }
    114     // If still no bigrams, we really don't have them!
    115     if (0 == pos) return 0;
    116     uint8_t bigramFlags;
    117     int bigramCount = 0;
    118     do {
    119         bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
    120         int bigramBuffer[MAX_WORD_LENGTH];
    121         int unigramProbability = 0;
    122         const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
    123                 &pos);
    124         const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH,
    125                 bigramBuffer, &unigramProbability);
    126 
    127         // inputSize == 0 means we are trying to find bigram predictions.
    128         if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) {
    129             const int bigramProbabilityTemp =
    130                     BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
    131             // Due to space constraints, the probability for bigrams is approximate - the lower the
    132             // unigram probability, the worse the precision. The theoritical maximum error in
    133             // resulting probability is 8 - although in the practice it's never bigger than 3 or 4
    134             // in very bad cases. This means that sometimes, we'll see some bigrams interverted
    135             // here, but it can't get too bad.
    136             const int probability = BinaryFormat::computeProbabilityForBigram(
    137                     unigramProbability, bigramProbabilityTemp);
    138             addWordBigram(bigramBuffer, length, probability, bigramProbability, bigramCodePoints,
    139                     outputTypes);
    140             ++bigramCount;
    141         }
    142     } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
    143     return min(bigramCount, MAX_RESULTS);
    144 }
    145 
    146 // Returns a pointer to the start of the bigram list.
    147 // If the word is not found or has no bigrams, this function returns 0.
    148 int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
    149         const bool forceLowerCaseSearch) const {
    150     if (0 >= prevWordLength) return 0;
    151     const uint8_t *const root = DICT_ROOT;
    152     int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength,
    153             forceLowerCaseSearch);
    154 
    155     if (NOT_VALID_WORD == pos) return 0;
    156     const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
    157     if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) return 0;
    158     if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) {
    159         BinaryFormat::getCodePointAndForwardPointer(root, &pos);
    160     } else {
    161         pos = BinaryFormat::skipOtherCharacters(root, pos);
    162     }
    163     pos = BinaryFormat::skipProbability(flags, pos);
    164     pos = BinaryFormat::skipChildrenPosition(flags, pos);
    165     pos = BinaryFormat::skipShortcuts(root, flags, pos);
    166     return pos;
    167 }
    168 
    169 void BigramDictionary::fillBigramAddressToProbabilityMapAndFilter(const int *prevWord,
    170         const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const {
    171     memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
    172     const uint8_t *const root = DICT_ROOT;
    173     int pos = getBigramListPositionForWord(prevWord, prevWordLength,
    174             false /* forceLowerCaseSearch */);
    175     if (0 == pos) {
    176         // If no bigrams for this exact string, search again in lower case.
    177         pos = getBigramListPositionForWord(prevWord, prevWordLength,
    178                 true /* forceLowerCaseSearch */);
    179     }
    180     if (0 == pos) return;
    181 
    182     uint8_t bigramFlags;
    183     do {
    184         bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
    185         const int probability = BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
    186         const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
    187                 &pos);
    188         (*map)[bigramPos] = probability;
    189         setInFilter(filter, bigramPos);
    190     } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
    191 }
    192 
    193 bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) const {
    194     // Checks whether this word starts with same character or neighboring characters of
    195     // what user typed.
    196 
    197     int maxAlt = MAX_ALTERNATIVES;
    198     const int firstBaseLowerCodePoint = toBaseLowerCase(*word);
    199     while (maxAlt > 0) {
    200         if (toBaseLowerCase(*inputCodePoints) == firstBaseLowerCodePoint) {
    201             return true;
    202         }
    203         inputCodePoints++;
    204         maxAlt--;
    205     }
    206     return false;
    207 }
    208 
    209 bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2,
    210         int length2) const {
    211     const uint8_t *const root = DICT_ROOT;
    212     int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */);
    213     // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
    214     if (0 == pos) return false;
    215     int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2,
    216             false /* forceLowerCaseSearch */);
    217     if (NOT_VALID_WORD == nextWordPos) return false;
    218     uint8_t bigramFlags;
    219     do {
    220         bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
    221         const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
    222                 &pos);
    223         if (bigramPos == nextWordPos) {
    224             return true;
    225         }
    226     } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
    227     return false;
    228 }
    229 
    230 // TODO: Move functions related to bigram to here
    231 } // namespace latinime
    232