Home | History | Annotate | Download | only in src
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LATINIME_BINARY_FORMAT_H
     18 #define LATINIME_BINARY_FORMAT_H
     19 
     20 #include <limits>
     21 #include "bloom_filter.h"
     22 #include "unigram_dictionary.h"
     23 
     24 namespace latinime {
     25 
     26 class BinaryFormat {
     27  private:
     28     const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
     29     const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
     30     const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
     31 
     32  public:
     33     const static int UNKNOWN_FORMAT = -1;
     34     // Originally, format version 1 had a 16-bit magic number, then the version number `01'
     35     // then options that must be 0. Hence the first 32-bits of the format are always as follow
     36     // and it's okay to consider them a magic number as a whole.
     37     const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100;
     38     const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5;
     39     // The versions of Latin IME that only handle format version 1 only test for the magic
     40     // number, so we had to change it so that version 2 files would be rejected by older
     41     // implementations. On this occasion, we made the magic number 32 bits long.
     42     const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
     43 
     44     const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1;
     45     const static int SHORTCUT_LIST_SIZE_SIZE = 2;
     46 
     47     static int detectFormat(const uint8_t* const dict);
     48     static unsigned int getHeaderSize(const uint8_t* const dict);
     49     static unsigned int getFlags(const uint8_t* const dict);
     50     static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
     51     static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);
     52     static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
     53     static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
     54     static int skipOtherCharacters(const uint8_t* const dict, const int pos);
     55     static int skipChildrenPosition(const uint8_t flags, const int pos);
     56     static int skipFrequency(const uint8_t flags, const int pos);
     57     static int skipShortcuts(const uint8_t* const dict, const uint8_t flags, const int pos);
     58     static int skipBigrams(const uint8_t* const dict, const uint8_t flags, const int pos);
     59     static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
     60     static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
     61             const int pos);
     62     static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos);
     63     static bool hasChildrenInFlags(const uint8_t flags);
     64     static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
     65             int *pos);
     66     static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord,
     67             const int length);
     68     static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
     69             uint16_t* outWord, int* outUnigramFrequency);
     70     static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq);
     71     static int getProbability(const int position, const std::map<int, int> *bigramMap,
     72             const uint8_t *bigramFilter, const int unigramFreq);
     73 
     74     // Flags for special processing
     75     // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or
     76     // something very bad (like, the apocalypse) will happen. Please update both at the same time.
     77     enum {
     78         REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1,
     79         REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4
     80     };
     81     const static unsigned int NO_FLAGS = 0;
     82 };
     83 
     84 inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
     85     // The magic number is stored big-endian.
     86     const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3];
     87     switch (magicNumber) {
     88     case FORMAT_VERSION_1_MAGIC_NUMBER:
     89         // Format 1 header is exactly 5 bytes long and looks like:
     90         // Magic number (2 bytes) 0x78 0xB1
     91         // Version number (1 byte) 0x01
     92         // Options (2 bytes) must be 0x00 0x00
     93         return 1;
     94     case FORMAT_VERSION_2_MAGIC_NUMBER:
     95         // Format 2 header is as follows:
     96         // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE
     97         // Version number (2 bytes) 0x00 0x02
     98         // Options (2 bytes)
     99         // Header size (4 bytes) : integer, big endian
    100         return (dict[4] << 8) + dict[5];
    101     default:
    102         return UNKNOWN_FORMAT;
    103     }
    104 }
    105 
    106 inline unsigned int BinaryFormat::getFlags(const uint8_t* const dict) {
    107     switch (detectFormat(dict)) {
    108     case 1:
    109         return NO_FLAGS;
    110     default:
    111         return (dict[6] << 8) + dict[7];
    112     }
    113 }
    114 
    115 inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) {
    116     switch (detectFormat(dict)) {
    117     case 1:
    118         return FORMAT_VERSION_1_HEADER_SIZE;
    119     case 2:
    120         // See the format of the header in the comment in detectFormat() above
    121         return (dict[8] << 24) + (dict[9] << 16) + (dict[10] << 8) + dict[11];
    122     default:
    123         return std::numeric_limits<unsigned int>::max();
    124     }
    125 }
    126 
    127 inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {
    128     const int msb = dict[(*pos)++];
    129     if (msb < 0x80) return msb;
    130     return ((msb & 0x7F) << 8) | dict[(*pos)++];
    131 }
    132 
    133 inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) {
    134     return dict[(*pos)++];
    135 }
    136 
    137 inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) {
    138     const int origin = *pos;
    139     const int32_t character = dict[origin];
    140     if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
    141         if (character == CHARACTER_ARRAY_TERMINATOR) {
    142             *pos = origin + 1;
    143             return NOT_A_CHARACTER;
    144         } else {
    145             *pos = origin + 3;
    146             const int32_t char_1 = character << 16;
    147             const int32_t char_2 = char_1 + (dict[origin + 1] << 8);
    148             return char_2 + dict[origin + 2];
    149         }
    150     } else {
    151         *pos = origin + 1;
    152         return character;
    153     }
    154 }
    155 
    156 inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict,
    157         const int pos) {
    158     return dict[pos];
    159 }
    160 
    161 inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) {
    162     int currentPos = pos;
    163     int32_t character = dict[currentPos++];
    164     while (CHARACTER_ARRAY_TERMINATOR != character) {
    165         if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
    166             currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE;
    167         }
    168         character = dict[currentPos++];
    169     }
    170     return currentPos;
    171 }
    172 
    173 static inline int attributeAddressSize(const uint8_t flags) {
    174     static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
    175     return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
    176     /* Note: this is a value-dependant optimization of what may probably be
    177        more readably written this way:
    178        switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
    179        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
    180        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
    181        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
    182        default: return 0;
    183        }
    184     */
    185 }
    186 
    187 static inline int skipExistingBigrams(const uint8_t* const dict, const int pos) {
    188     int currentPos = pos;
    189     uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
    190     while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
    191         currentPos += attributeAddressSize(flags);
    192         flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
    193     }
    194     currentPos += attributeAddressSize(flags);
    195     return currentPos;
    196 }
    197 
    198 static inline int childrenAddressSize(const uint8_t flags) {
    199     static const int CHILDREN_ADDRESS_SHIFT = 6;
    200     return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
    201     /* See the note in attributeAddressSize. The same applies here */
    202 }
    203 
    204 static inline int shortcutByteSize(const uint8_t* const dict, const int pos) {
    205     return ((int)(dict[pos] << 8)) + (dict[pos + 1]);
    206 }
    207 
    208 inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
    209     return pos + childrenAddressSize(flags);
    210 }
    211 
    212 inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
    213     return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
    214 }
    215 
    216 inline int BinaryFormat::skipShortcuts(const uint8_t* const dict, const uint8_t flags,
    217         const int pos) {
    218     if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
    219         return pos + shortcutByteSize(dict, pos);
    220     } else {
    221         return pos;
    222     }
    223 }
    224 
    225 inline int BinaryFormat::skipBigrams(const uint8_t* const dict, const uint8_t flags,
    226         const int pos) {
    227     if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
    228         return skipExistingBigrams(dict, pos);
    229     } else {
    230         return pos;
    231     }
    232 }
    233 
    234 inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
    235         const int pos) {
    236     // This function skips all attributes: shortcuts and bigrams.
    237     int newPos = pos;
    238     newPos = skipShortcuts(dict, flags, newPos);
    239     newPos = skipBigrams(dict, flags, newPos);
    240     return newPos;
    241 }
    242 
    243 inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict,
    244         const uint8_t flags, const int pos) {
    245     int currentPos = pos;
    246     currentPos = skipChildrenPosition(flags, currentPos);
    247     currentPos = skipAllAttributes(dict, flags, currentPos);
    248     return currentPos;
    249 }
    250 
    251 inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags,
    252         const int pos) {
    253     int offset = 0;
    254     switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
    255         case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
    256             offset = dict[pos];
    257             break;
    258         case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
    259             offset = dict[pos] << 8;
    260             offset += dict[pos + 1];
    261             break;
    262         case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
    263             offset = dict[pos] << 16;
    264             offset += dict[pos + 1] << 8;
    265             offset += dict[pos + 2];
    266             break;
    267         default:
    268             // If we come here, it means we asked for the children of a word with
    269             // no children.
    270             return -1;
    271     }
    272     return pos + offset;
    273 }
    274 
    275 inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
    276     return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
    277             != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
    278 }
    279 
    280 inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict,
    281         const uint8_t flags, int *pos) {
    282     int offset = 0;
    283     const int origin = *pos;
    284     switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
    285         case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
    286             offset = dict[origin];
    287             *pos = origin + 1;
    288             break;
    289         case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
    290             offset = dict[origin] << 8;
    291             offset += dict[origin + 1];
    292             *pos = origin + 2;
    293             break;
    294         case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
    295             offset = dict[origin] << 16;
    296             offset += dict[origin + 1] << 8;
    297             offset += dict[origin + 2];
    298             *pos = origin + 3;
    299             break;
    300     }
    301     if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
    302         return origin - offset;
    303     } else {
    304         return origin + offset;
    305     }
    306 }
    307 
    308 // This function gets the byte position of the last chargroup of the exact matching word in the
    309 // dictionary. If no match is found, it returns NOT_VALID_WORD.
    310 inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
    311         const int32_t* const inWord, const int length) {
    312     int pos = 0;
    313     int wordPos = 0;
    314 
    315     while (true) {
    316         // If we already traversed the tree further than the word is long, there means
    317         // there was no match (or we would have found it).
    318         if (wordPos > length) return NOT_VALID_WORD;
    319         int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
    320         const int32_t wChar = inWord[wordPos];
    321         while (true) {
    322             // If there are no more character groups in this node, it means we could not
    323             // find a matching character for this depth, therefore there is no match.
    324             if (0 >= charGroupCount) return NOT_VALID_WORD;
    325             const int charGroupPos = pos;
    326             const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
    327             int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
    328             if (character == wChar) {
    329                 // This is the correct node. Only one character group may start with the same
    330                 // char within a node, so either we found our match in this node, or there is
    331                 // no match and we can return NOT_VALID_WORD. So we will check all the characters
    332                 // in this character group indeed does match.
    333                 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
    334                     character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
    335                     while (NOT_A_CHARACTER != character) {
    336                         ++wordPos;
    337                         // If we shoot the length of the word we search for, or if we find a single
    338                         // character that does not match, as explained above, it means the word is
    339                         // not in the dictionary (by virtue of this chargroup being the only one to
    340                         // match the word on the first character, but not matching the whole word).
    341                         if (wordPos > length) return NOT_VALID_WORD;
    342                         if (inWord[wordPos] != character) return NOT_VALID_WORD;
    343                         character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
    344                     }
    345                 }
    346                 // If we come here we know that so far, we do match. Either we are on a terminal
    347                 // and we match the length, in which case we found it, or we traverse children.
    348                 // If we don't match the length AND don't have children, then a word in the
    349                 // dictionary fully matches a prefix of the searched word but not the full word.
    350                 ++wordPos;
    351                 if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
    352                     if (wordPos == length) {
    353                         return charGroupPos;
    354                     }
    355                     pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
    356                 }
    357                 if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
    358                         == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
    359                     return NOT_VALID_WORD;
    360                 }
    361                 // We have children and we are still shorter than the word we are searching for, so
    362                 // we need to traverse children. Put the pointer on the children position, and
    363                 // break
    364                 pos = BinaryFormat::readChildrenPosition(root, flags, pos);
    365                 break;
    366             } else {
    367                 // This chargroup does not match, so skip the remaining part and go to the next.
    368                 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
    369                     pos = BinaryFormat::skipOtherCharacters(root, pos);
    370                 }
    371                 pos = BinaryFormat::skipFrequency(flags, pos);
    372                 pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
    373             }
    374             --charGroupCount;
    375         }
    376     }
    377 }
    378 
    379 // This function searches for a terminal in the dictionary by its address.
    380 // Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
    381 // it is possible to check for this with advantageous complexity. For each node, we search
    382 // for groups with children and compare the children address with the address we look for.
    383 // When we shoot the address we look for, it means the word we look for is in the children
    384 // of the previous group. The only tricky part is the fact that if we arrive at the end of a
    385 // node with the last group's children address still less than what we are searching for, we
    386 // must descend the last group's children (for example, if the word we are searching for starts
    387 // with a z, it's the last group of the root node, so all children addresses will be smaller
    388 // than the address we look for, and we have to descend the z node).
    389 /* Parameters :
    390  * root: the dictionary buffer
    391  * address: the byte position of the last chargroup of the word we are searching for (this is
    392  *   what is stored as the "bigram address" in each bigram)
    393  * outword: an array to write the found word, with MAX_WORD_LENGTH size.
    394  * outUnigramFrequency: a pointer to an int to write the frequency into.
    395  * Return value : the length of the word, of 0 if the word was not found.
    396  */
    397 inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address,
    398         const int maxDepth, uint16_t* outWord, int* outUnigramFrequency) {
    399     int pos = 0;
    400     int wordPos = 0;
    401 
    402     // One iteration of the outer loop iterates through nodes. As stated above, we will only
    403     // traverse nodes that are actually a part of the terminal we are searching, so each time
    404     // we enter this loop we are one depth level further than last time.
    405     // The only reason we count nodes is because we want to reduce the probability of infinite
    406     // looping in case there is a bug. Since we know there is an upper bound to the depth we are
    407     // supposed to traverse, it does not hurt to count iterations.
    408     for (int loopCount = maxDepth; loopCount > 0; --loopCount) {
    409         int lastCandidateGroupPos = 0;
    410         // Let's loop through char groups in this node searching for either the terminal
    411         // or one of its ascendants.
    412         for (int charGroupCount = getGroupCountAndForwardPointer(root, &pos); charGroupCount > 0;
    413                  --charGroupCount) {
    414             const int startPos = pos;
    415             const uint8_t flags = getFlagsAndForwardPointer(root, &pos);
    416             const int32_t character = getCharCodeAndForwardPointer(root, &pos);
    417             if (address == startPos) {
    418                 // We found the address. Copy the rest of the word in the buffer and return
    419                 // the length.
    420                 outWord[wordPos] = character;
    421                 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
    422                     int32_t nextChar = getCharCodeAndForwardPointer(root, &pos);
    423                     // We count chars in order to avoid infinite loops if the file is broken or
    424                     // if there is some other bug
    425                     int charCount = maxDepth;
    426                     while (NOT_A_CHARACTER != nextChar && --charCount > 0) {
    427                         outWord[++wordPos] = nextChar;
    428                         nextChar = getCharCodeAndForwardPointer(root, &pos);
    429                     }
    430                 }
    431                 *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos);
    432                 return ++wordPos;
    433             }
    434             // We need to skip past this char group, so skip any remaining chars after the
    435             // first and possibly the frequency.
    436             if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
    437                 pos = skipOtherCharacters(root, pos);
    438             }
    439             pos = skipFrequency(flags, pos);
    440 
    441             // The fact that this group has children is very important. Since we already know
    442             // that this group does not match, if it has no children we know it is irrelevant
    443             // to what we are searching for.
    444             const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
    445                     (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
    446             // We will write in `found' whether we have passed the children address we are
    447             // searching for. For example if we search for "beer", the children of b are less
    448             // than the address we are searching for and the children of c are greater. When we
    449             // come here for c, we realize this is too big, and that we should descend b.
    450             bool found;
    451             if (hasChildren) {
    452                 // Here comes the tricky part. First, read the children position.
    453                 const int childrenPos = readChildrenPosition(root, flags, pos);
    454                 if (childrenPos > address) {
    455                     // If the children pos is greater than address, it means the previous chargroup,
    456                     // which address is stored in lastCandidateGroupPos, was the right one.
    457                     found = true;
    458                 } else if (1 >= charGroupCount) {
    459                     // However if we are on the LAST group of this node, and we have NOT shot the
    460                     // address we should descend THIS node. So we trick the lastCandidateGroupPos
    461                     // so that we will descend this node, not the previous one.
    462                     lastCandidateGroupPos = startPos;
    463                     found = true;
    464                 } else {
    465                     // Else, we should continue looking.
    466                     found = false;
    467                 }
    468             } else {
    469                 // Even if we don't have children here, we could still be on the last group of this
    470                 // node. If this is the case, we should descend the last group that had children,
    471                 // and their address is already in lastCandidateGroup.
    472                 found = (1 >= charGroupCount);
    473             }
    474 
    475             if (found) {
    476                 // Okay, we found the group we should descend. Its address is in
    477                 // the lastCandidateGroupPos variable, so we just re-read it.
    478                 if (0 != lastCandidateGroupPos) {
    479                     const uint8_t lastFlags =
    480                             getFlagsAndForwardPointer(root, &lastCandidateGroupPos);
    481                     const int32_t lastChar =
    482                             getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
    483                     // We copy all the characters in this group to the buffer
    484                     outWord[wordPos] = lastChar;
    485                     if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
    486                         int32_t nextChar =
    487                                 getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
    488                         int charCount = maxDepth;
    489                         while (-1 != nextChar && --charCount > 0) {
    490                             outWord[++wordPos] = nextChar;
    491                             nextChar = getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
    492                         }
    493                     }
    494                     ++wordPos;
    495                     // Now we only need to branch to the children address. Skip the frequency if
    496                     // it's there, read pos, and break to resume the search at pos.
    497                     lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos);
    498                     pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos);
    499                     break;
    500                 } else {
    501                     // Here is a little tricky part: we come here if we found out that all children
    502                     // addresses in this group are bigger than the address we are searching for.
    503                     // Should we conclude the word is not in the dictionary? No! It could still be
    504                     // one of the remaining chargroups in this node, so we have to keep looking in
    505                     // this node until we find it (or we realize it's not there either, in which
    506                     // case it's actually not in the dictionary). Pass the end of this group, ready
    507                     // to start the next one.
    508                     pos = skipChildrenPosAndAttributes(root, flags, pos);
    509                 }
    510             } else {
    511                 // If we did not find it, we should record the last children address for the next
    512                 // iteration.
    513                 if (hasChildren) lastCandidateGroupPos = startPos;
    514                 // Now skip the end of this group (children pos and the attributes if any) so that
    515                 // our pos is after the end of this char group, at the start of the next one.
    516                 pos = skipChildrenPosAndAttributes(root, flags, pos);
    517             }
    518 
    519         }
    520     }
    521     // If we have looked through all the chargroups and found no match, the address is
    522     // not the address of a terminal in this dictionary.
    523     return 0;
    524 }
    525 
    526 static inline int backoff(const int unigramFreq) {
    527     return unigramFreq;
    528     // For some reason, applying the backoff weight gives bad results in tests. To apply the
    529     // backoff weight, we divide the probability by 2, which in our storing format means
    530     // decreasing the score by 8.
    531     // TODO: figure out what's wrong with this.
    532     // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8);
    533 }
    534 
    535 inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) {
    536     // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the
    537     // unigram frequency to be the median value of the 17th step from the top. A value of
    538     // 0 for the bigram frequency represents the middle of the 16th step from the top,
    539     // while a value of 15 represents the middle of the top step.
    540     // See makedict.BinaryDictInputOutput for details.
    541     const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
    542     return (int)(unigramFreq + (bigramFreq + 1) * stepSize);
    543 }
    544 
    545 // This returns a probability in log space.
    546 inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
    547         const uint8_t *bigramFilter, const int unigramFreq) {
    548     if (!bigramMap || !bigramFilter) return backoff(unigramFreq);
    549     if (!isInFilter(bigramFilter, position)) return backoff(unigramFreq);
    550     const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position);
    551     if (bigramFreqIt != bigramMap->end()) {
    552         const int bigramFreq = bigramFreqIt->second;
    553         return computeFrequencyForBigram(unigramFreq, bigramFreq);
    554     } else {
    555         return backoff(unigramFreq);
    556     }
    557 }
    558 
    559 } // namespace latinime
    560 
    561 #endif // LATINIME_BINARY_FORMAT_H
    562