Home | History | Annotate | Download | only in src
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LATINIME_BINARY_FORMAT_H
     18 #define LATINIME_BINARY_FORMAT_H
     19 
     20 #include <limits>
     21 #include <map>
     22 #include "bloom_filter.h"
     23 #include "char_utils.h"
     24 
     25 namespace latinime {
     26 
     27 class BinaryFormat {
     28  public:
     29     // Mask and flags for children address type selection.
     30     static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
     31     static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
     32     static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
     33     static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
     34     static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
     35 
     36     // Flag for single/multiple char group
     37     static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
     38 
     39     // Flag for terminal groups
     40     static const int FLAG_IS_TERMINAL = 0x10;
     41 
     42     // Flag for shortcut targets presence
     43     static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
     44     // Flag for bigram presence
     45     static const int FLAG_HAS_BIGRAMS = 0x04;
     46     // Flag for non-words (typically, shortcut only entries)
     47     static const int FLAG_IS_NOT_A_WORD = 0x02;
     48     // Flag for blacklist
     49     static const int FLAG_IS_BLACKLISTED = 0x01;
     50 
     51     // Attribute (bigram/shortcut) related flags:
     52     // Flag for presence of more attributes
     53     static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
     54     // Flag for sign of offset. If this flag is set, the offset value must be negated.
     55     static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
     56 
     57     // Mask for attribute frequency, stored on 4 bits inside the flags byte.
     58     static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
     59     // The numeric value of the shortcut frequency that means 'whitelist'.
     60     static const int WHITELIST_SHORTCUT_FREQUENCY = 15;
     61 
     62     // Mask and flags for attribute address type selection.
     63     static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
     64     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
     65     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
     66     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
     67 
     68     const static int UNKNOWN_FORMAT = -1;
     69     // Originally, format version 1 had a 16-bit magic number, then the version number `01'
     70     // then options that must be 0. Hence the first 32-bits of the format are always as follow
     71     // and it's okay to consider them a magic number as a whole.
     72     const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100;
     73     const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5;
     74     // The versions of Latin IME that only handle format version 1 only test for the magic
     75     // number, so we had to change it so that version 2 files would be rejected by older
     76     // implementations. On this occasion, we made the magic number 32 bits long.
     77     const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
     78 
     79     const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1;
     80     const static int SHORTCUT_LIST_SIZE_SIZE = 2;
     81 
     82     static int detectFormat(const uint8_t *const dict);
     83     static unsigned int getHeaderSize(const uint8_t *const dict);
     84     static unsigned int getFlags(const uint8_t *const dict);
     85     static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos);
     86     static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos);
     87     static int32_t getCodePointAndForwardPointer(const uint8_t *const dict, int *pos);
     88     static int readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos);
     89     static int skipOtherCharacters(const uint8_t *const dict, const int pos);
     90     static int skipChildrenPosition(const uint8_t flags, const int pos);
     91     static int skipFrequency(const uint8_t flags, const int pos);
     92     static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos);
     93     static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos);
     94     static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags,
     95             const int pos);
     96     static int readChildrenPosition(const uint8_t *const dict, const uint8_t flags, const int pos);
     97     static bool hasChildrenInFlags(const uint8_t flags);
     98     static int getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags,
     99             int *pos);
    100     static int getAttributeFrequencyFromFlags(const int flags);
    101     static int getTerminalPosition(const uint8_t *const root, const int32_t *const inWord,
    102             const int length, const bool forceLowerCaseSearch);
    103     static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth,
    104             uint16_t *outWord, int *outUnigramFrequency);
    105     static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq);
    106     static int getProbability(const int position, const std::map<int, int> *bigramMap,
    107             const uint8_t *bigramFilter, const int unigramFreq);
    108 
    109     // Flags for special processing
    110     // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or
    111     // something very bad (like, the apocalypse) will happen. Please update both at the same time.
    112     enum {
    113         REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1,
    114         REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4
    115     };
    116     const static unsigned int NO_FLAGS = 0;
    117 
    118  private:
    119     DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
    120     const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
    121     const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
    122     const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
    123     static int skipAllAttributes(const uint8_t *const dict, const uint8_t flags, const int pos);
    124 };
    125 
    126 inline int BinaryFormat::detectFormat(const uint8_t *const dict) {
    127     // The magic number is stored big-endian.
    128     const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3];
    129     switch (magicNumber) {
    130     case FORMAT_VERSION_1_MAGIC_NUMBER:
    131         // Format 1 header is exactly 5 bytes long and looks like:
    132         // Magic number (2 bytes) 0x78 0xB1
    133         // Version number (1 byte) 0x01
    134         // Options (2 bytes) must be 0x00 0x00
    135         return 1;
    136     case FORMAT_VERSION_2_MAGIC_NUMBER:
    137         // Format 2 header is as follows:
    138         // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE
    139         // Version number (2 bytes) 0x00 0x02
    140         // Options (2 bytes)
    141         // Header size (4 bytes) : integer, big endian
    142         return (dict[4] << 8) + dict[5];
    143     default:
    144         return UNKNOWN_FORMAT;
    145     }
    146 }
    147 
    148 inline unsigned int BinaryFormat::getFlags(const uint8_t *const dict) {
    149     switch (detectFormat(dict)) {
    150     case 1:
    151         return NO_FLAGS;
    152     default:
    153         return (dict[6] << 8) + dict[7];
    154     }
    155 }
    156 
    157 inline unsigned int BinaryFormat::getHeaderSize(const uint8_t *const dict) {
    158     switch (detectFormat(dict)) {
    159     case 1:
    160         return FORMAT_VERSION_1_HEADER_SIZE;
    161     case 2:
    162         // See the format of the header in the comment in detectFormat() above
    163         return (dict[8] << 24) + (dict[9] << 16) + (dict[10] << 8) + dict[11];
    164     default:
    165         return std::numeric_limits<unsigned int>::max();
    166     }
    167 }
    168 
    169 inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos) {
    170     const int msb = dict[(*pos)++];
    171     if (msb < 0x80) return msb;
    172     return ((msb & 0x7F) << 8) | dict[(*pos)++];
    173 }
    174 
    175 inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t *const dict, int *pos) {
    176     return dict[(*pos)++];
    177 }
    178 
    179 inline int32_t BinaryFormat::getCodePointAndForwardPointer(const uint8_t *const dict, int *pos) {
    180     const int origin = *pos;
    181     const int32_t codePoint = dict[origin];
    182     if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
    183         if (codePoint == CHARACTER_ARRAY_TERMINATOR) {
    184             *pos = origin + 1;
    185             return NOT_A_CODE_POINT;
    186         } else {
    187             *pos = origin + 3;
    188             const int32_t char_1 = codePoint << 16;
    189             const int32_t char_2 = char_1 + (dict[origin + 1] << 8);
    190             return char_2 + dict[origin + 2];
    191         }
    192     } else {
    193         *pos = origin + 1;
    194         return codePoint;
    195     }
    196 }
    197 
    198 inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t *const dict,
    199         const int pos) {
    200     return dict[pos];
    201 }
    202 
    203 inline int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const int pos) {
    204     int currentPos = pos;
    205     int32_t character = dict[currentPos++];
    206     while (CHARACTER_ARRAY_TERMINATOR != character) {
    207         if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
    208             currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE;
    209         }
    210         character = dict[currentPos++];
    211     }
    212     return currentPos;
    213 }
    214 
    215 static inline int attributeAddressSize(const uint8_t flags) {
    216     static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
    217     return (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
    218     /* Note: this is a value-dependant optimization of what may probably be
    219        more readably written this way:
    220        switch (flags * BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) {
    221        case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
    222        case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
    223        case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
    224        default: return 0;
    225        }
    226     */
    227 }
    228 
    229 static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) {
    230     int currentPos = pos;
    231     uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
    232     while (flags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT) {
    233         currentPos += attributeAddressSize(flags);
    234         flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
    235     }
    236     currentPos += attributeAddressSize(flags);
    237     return currentPos;
    238 }
    239 
    240 static inline int childrenAddressSize(const uint8_t flags) {
    241     static const int CHILDREN_ADDRESS_SHIFT = 6;
    242     return (BinaryFormat::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
    243     /* See the note in attributeAddressSize. The same applies here */
    244 }
    245 
    246 static inline int shortcutByteSize(const uint8_t *const dict, const int pos) {
    247     return ((int)(dict[pos] << 8)) + (dict[pos + 1]);
    248 }
    249 
    250 inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
    251     return pos + childrenAddressSize(flags);
    252 }
    253 
    254 inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
    255     return FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
    256 }
    257 
    258 inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags,
    259         const int pos) {
    260     if (FLAG_HAS_SHORTCUT_TARGETS & flags) {
    261         return pos + shortcutByteSize(dict, pos);
    262     } else {
    263         return pos;
    264     }
    265 }
    266 
    267 inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags,
    268         const int pos) {
    269     if (FLAG_HAS_BIGRAMS & flags) {
    270         return skipExistingBigrams(dict, pos);
    271     } else {
    272         return pos;
    273     }
    274 }
    275 
    276 inline int BinaryFormat::skipAllAttributes(const uint8_t *const dict, const uint8_t flags,
    277         const int pos) {
    278     // This function skips all attributes: shortcuts and bigrams.
    279     int newPos = pos;
    280     newPos = skipShortcuts(dict, flags, newPos);
    281     newPos = skipBigrams(dict, flags, newPos);
    282     return newPos;
    283 }
    284 
    285 inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict,
    286         const uint8_t flags, const int pos) {
    287     int currentPos = pos;
    288     currentPos = skipChildrenPosition(flags, currentPos);
    289     currentPos = skipAllAttributes(dict, flags, currentPos);
    290     return currentPos;
    291 }
    292 
    293 inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags,
    294         const int pos) {
    295     int offset = 0;
    296     switch (MASK_GROUP_ADDRESS_TYPE & flags) {
    297         case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
    298             offset = dict[pos];
    299             break;
    300         case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
    301             offset = dict[pos] << 8;
    302             offset += dict[pos + 1];
    303             break;
    304         case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
    305             offset = dict[pos] << 16;
    306             offset += dict[pos + 1] << 8;
    307             offset += dict[pos + 2];
    308             break;
    309         default:
    310             // If we come here, it means we asked for the children of a word with
    311             // no children.
    312             return -1;
    313     }
    314     return pos + offset;
    315 }
    316 
    317 inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
    318     return (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags));
    319 }
    320 
    321 inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict,
    322         const uint8_t flags, int *pos) {
    323     int offset = 0;
    324     const int origin = *pos;
    325     switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
    326         case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
    327             offset = dict[origin];
    328             *pos = origin + 1;
    329             break;
    330         case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
    331             offset = dict[origin] << 8;
    332             offset += dict[origin + 1];
    333             *pos = origin + 2;
    334             break;
    335         case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
    336             offset = dict[origin] << 16;
    337             offset += dict[origin + 1] << 8;
    338             offset += dict[origin + 2];
    339             *pos = origin + 3;
    340             break;
    341     }
    342     if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
    343         return origin - offset;
    344     } else {
    345         return origin + offset;
    346     }
    347 }
    348 
    349 inline int BinaryFormat::getAttributeFrequencyFromFlags(const int flags) {
    350     return flags & MASK_ATTRIBUTE_FREQUENCY;
    351 }
    352 
    353 // This function gets the byte position of the last chargroup of the exact matching word in the
    354 // dictionary. If no match is found, it returns NOT_VALID_WORD.
    355 inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
    356         const int32_t *const inWord, const int length, const bool forceLowerCaseSearch) {
    357     int pos = 0;
    358     int wordPos = 0;
    359 
    360     while (true) {
    361         // If we already traversed the tree further than the word is long, there means
    362         // there was no match (or we would have found it).
    363         if (wordPos >= length) return NOT_VALID_WORD;
    364         int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
    365         const int32_t wChar = forceLowerCaseSearch ? toLowerCase(inWord[wordPos]) : inWord[wordPos];
    366         while (true) {
    367             // If there are no more character groups in this node, it means we could not
    368             // find a matching character for this depth, therefore there is no match.
    369             if (0 >= charGroupCount) return NOT_VALID_WORD;
    370             const int charGroupPos = pos;
    371             const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
    372             int32_t character = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
    373             if (character == wChar) {
    374                 // This is the correct node. Only one character group may start with the same
    375                 // char within a node, so either we found our match in this node, or there is
    376                 // no match and we can return NOT_VALID_WORD. So we will check all the characters
    377                 // in this character group indeed does match.
    378                 if (FLAG_HAS_MULTIPLE_CHARS & flags) {
    379                     character = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
    380                     while (NOT_A_CODE_POINT != character) {
    381                         ++wordPos;
    382                         // If we shoot the length of the word we search for, or if we find a single
    383                         // character that does not match, as explained above, it means the word is
    384                         // not in the dictionary (by virtue of this chargroup being the only one to
    385                         // match the word on the first character, but not matching the whole word).
    386                         if (wordPos >= length) return NOT_VALID_WORD;
    387                         if (inWord[wordPos] != character) return NOT_VALID_WORD;
    388                         character = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
    389                     }
    390                 }
    391                 // If we come here we know that so far, we do match. Either we are on a terminal
    392                 // and we match the length, in which case we found it, or we traverse children.
    393                 // If we don't match the length AND don't have children, then a word in the
    394                 // dictionary fully matches a prefix of the searched word but not the full word.
    395                 ++wordPos;
    396                 if (FLAG_IS_TERMINAL & flags) {
    397                     if (wordPos == length) {
    398                         return charGroupPos;
    399                     }
    400                     pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos);
    401                 }
    402                 if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) {
    403                     return NOT_VALID_WORD;
    404                 }
    405                 // We have children and we are still shorter than the word we are searching for, so
    406                 // we need to traverse children. Put the pointer on the children position, and
    407                 // break
    408                 pos = BinaryFormat::readChildrenPosition(root, flags, pos);
    409                 break;
    410             } else {
    411                 // This chargroup does not match, so skip the remaining part and go to the next.
    412                 if (FLAG_HAS_MULTIPLE_CHARS & flags) {
    413                     pos = BinaryFormat::skipOtherCharacters(root, pos);
    414                 }
    415                 pos = BinaryFormat::skipFrequency(flags, pos);
    416                 pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
    417             }
    418             --charGroupCount;
    419         }
    420     }
    421 }
    422 
    423 // This function searches for a terminal in the dictionary by its address.
    424 // Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
    425 // it is possible to check for this with advantageous complexity. For each node, we search
    426 // for groups with children and compare the children address with the address we look for.
    427 // When we shoot the address we look for, it means the word we look for is in the children
    428 // of the previous group. The only tricky part is the fact that if we arrive at the end of a
    429 // node with the last group's children address still less than what we are searching for, we
    430 // must descend the last group's children (for example, if the word we are searching for starts
    431 // with a z, it's the last group of the root node, so all children addresses will be smaller
    432 // than the address we look for, and we have to descend the z node).
    433 /* Parameters :
    434  * root: the dictionary buffer
    435  * address: the byte position of the last chargroup of the word we are searching for (this is
    436  *   what is stored as the "bigram address" in each bigram)
    437  * outword: an array to write the found word, with MAX_WORD_LENGTH size.
    438  * outUnigramFrequency: a pointer to an int to write the frequency into.
    439  * Return value : the length of the word, of 0 if the word was not found.
    440  */
    441 inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address,
    442         const int maxDepth, uint16_t *outWord, int *outUnigramFrequency) {
    443     int pos = 0;
    444     int wordPos = 0;
    445 
    446     // One iteration of the outer loop iterates through nodes. As stated above, we will only
    447     // traverse nodes that are actually a part of the terminal we are searching, so each time
    448     // we enter this loop we are one depth level further than last time.
    449     // The only reason we count nodes is because we want to reduce the probability of infinite
    450     // looping in case there is a bug. Since we know there is an upper bound to the depth we are
    451     // supposed to traverse, it does not hurt to count iterations.
    452     for (int loopCount = maxDepth; loopCount > 0; --loopCount) {
    453         int lastCandidateGroupPos = 0;
    454         // Let's loop through char groups in this node searching for either the terminal
    455         // or one of its ascendants.
    456         for (int charGroupCount = getGroupCountAndForwardPointer(root, &pos); charGroupCount > 0;
    457                  --charGroupCount) {
    458             const int startPos = pos;
    459             const uint8_t flags = getFlagsAndForwardPointer(root, &pos);
    460             const int32_t character = getCodePointAndForwardPointer(root, &pos);
    461             if (address == startPos) {
    462                 // We found the address. Copy the rest of the word in the buffer and return
    463                 // the length.
    464                 outWord[wordPos] = character;
    465                 if (FLAG_HAS_MULTIPLE_CHARS & flags) {
    466                     int32_t nextChar = getCodePointAndForwardPointer(root, &pos);
    467                     // We count chars in order to avoid infinite loops if the file is broken or
    468                     // if there is some other bug
    469                     int charCount = maxDepth;
    470                     while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
    471                         outWord[++wordPos] = nextChar;
    472                         nextChar = getCodePointAndForwardPointer(root, &pos);
    473                     }
    474                 }
    475                 *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos);
    476                 return ++wordPos;
    477             }
    478             // We need to skip past this char group, so skip any remaining chars after the
    479             // first and possibly the frequency.
    480             if (FLAG_HAS_MULTIPLE_CHARS & flags) {
    481                 pos = skipOtherCharacters(root, pos);
    482             }
    483             pos = skipFrequency(flags, pos);
    484 
    485             // The fact that this group has children is very important. Since we already know
    486             // that this group does not match, if it has no children we know it is irrelevant
    487             // to what we are searching for.
    488             const bool hasChildren = (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
    489                     (MASK_GROUP_ADDRESS_TYPE & flags));
    490             // We will write in `found' whether we have passed the children address we are
    491             // searching for. For example if we search for "beer", the children of b are less
    492             // than the address we are searching for and the children of c are greater. When we
    493             // come here for c, we realize this is too big, and that we should descend b.
    494             bool found;
    495             if (hasChildren) {
    496                 // Here comes the tricky part. First, read the children position.
    497                 const int childrenPos = readChildrenPosition(root, flags, pos);
    498                 if (childrenPos > address) {
    499                     // If the children pos is greater than address, it means the previous chargroup,
    500                     // which address is stored in lastCandidateGroupPos, was the right one.
    501                     found = true;
    502                 } else if (1 >= charGroupCount) {
    503                     // However if we are on the LAST group of this node, and we have NOT shot the
    504                     // address we should descend THIS node. So we trick the lastCandidateGroupPos
    505                     // so that we will descend this node, not the previous one.
    506                     lastCandidateGroupPos = startPos;
    507                     found = true;
    508                 } else {
    509                     // Else, we should continue looking.
    510                     found = false;
    511                 }
    512             } else {
    513                 // Even if we don't have children here, we could still be on the last group of this
    514                 // node. If this is the case, we should descend the last group that had children,
    515                 // and their address is already in lastCandidateGroup.
    516                 found = (1 >= charGroupCount);
    517             }
    518 
    519             if (found) {
    520                 // Okay, we found the group we should descend. Its address is in
    521                 // the lastCandidateGroupPos variable, so we just re-read it.
    522                 if (0 != lastCandidateGroupPos) {
    523                     const uint8_t lastFlags =
    524                             getFlagsAndForwardPointer(root, &lastCandidateGroupPos);
    525                     const int32_t lastChar =
    526                             getCodePointAndForwardPointer(root, &lastCandidateGroupPos);
    527                     // We copy all the characters in this group to the buffer
    528                     outWord[wordPos] = lastChar;
    529                     if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
    530                         int32_t nextChar =
    531                                 getCodePointAndForwardPointer(root, &lastCandidateGroupPos);
    532                         int charCount = maxDepth;
    533                         while (-1 != nextChar && --charCount > 0) {
    534                             outWord[++wordPos] = nextChar;
    535                             nextChar = getCodePointAndForwardPointer(root, &lastCandidateGroupPos);
    536                         }
    537                     }
    538                     ++wordPos;
    539                     // Now we only need to branch to the children address. Skip the frequency if
    540                     // it's there, read pos, and break to resume the search at pos.
    541                     lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos);
    542                     pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos);
    543                     break;
    544                 } else {
    545                     // Here is a little tricky part: we come here if we found out that all children
    546                     // addresses in this group are bigger than the address we are searching for.
    547                     // Should we conclude the word is not in the dictionary? No! It could still be
    548                     // one of the remaining chargroups in this node, so we have to keep looking in
    549                     // this node until we find it (or we realize it's not there either, in which
    550                     // case it's actually not in the dictionary). Pass the end of this group, ready
    551                     // to start the next one.
    552                     pos = skipChildrenPosAndAttributes(root, flags, pos);
    553                 }
    554             } else {
    555                 // If we did not find it, we should record the last children address for the next
    556                 // iteration.
    557                 if (hasChildren) lastCandidateGroupPos = startPos;
    558                 // Now skip the end of this group (children pos and the attributes if any) so that
    559                 // our pos is after the end of this char group, at the start of the next one.
    560                 pos = skipChildrenPosAndAttributes(root, flags, pos);
    561             }
    562 
    563         }
    564     }
    565     // If we have looked through all the chargroups and found no match, the address is
    566     // not the address of a terminal in this dictionary.
    567     return 0;
    568 }
    569 
    570 static inline int backoff(const int unigramFreq) {
    571     return unigramFreq;
    572     // For some reason, applying the backoff weight gives bad results in tests. To apply the
    573     // backoff weight, we divide the probability by 2, which in our storing format means
    574     // decreasing the score by 8.
    575     // TODO: figure out what's wrong with this.
    576     // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8);
    577 }
    578 
    579 inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) {
    580     // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the
    581     // unigram frequency to be the median value of the 17th step from the top. A value of
    582     // 0 for the bigram frequency represents the middle of the 16th step from the top,
    583     // while a value of 15 represents the middle of the top step.
    584     // See makedict.BinaryDictInputOutput for details.
    585     const float stepSize = static_cast<float>(MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
    586     return unigramFreq + static_cast<int>(static_cast<float>(bigramFreq + 1) * stepSize);
    587 }
    588 
    589 // This returns a probability in log space.
    590 inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
    591         const uint8_t *bigramFilter, const int unigramFreq) {
    592     if (!bigramMap || !bigramFilter) return backoff(unigramFreq);
    593     if (!isInFilter(bigramFilter, position)) return backoff(unigramFreq);
    594     const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position);
    595     if (bigramFreqIt != bigramMap->end()) {
    596         const int bigramFreq = bigramFreqIt->second;
    597         return computeFrequencyForBigram(unigramFreq, bigramFreq);
    598     } else {
    599         return backoff(unigramFreq);
    600     }
    601 }
    602 } // namespace latinime
    603 #endif // LATINIME_BINARY_FORMAT_H
    604