Home | History | Annotate | Download | only in src
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LATINIME_BINARY_FORMAT_H
     18 #define LATINIME_BINARY_FORMAT_H
     19 
     20 #include "unigram_dictionary.h"
     21 
     22 namespace latinime {
     23 
     24 class BinaryFormat {
     25 private:
     26     const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
     27     const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
     28     const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
     29 
     30 public:
     31     const static int UNKNOWN_FORMAT = -1;
     32     const static int FORMAT_VERSION_1 = 1;
     33     const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1;
     34 
     35     static int detectFormat(const uint8_t* const dict);
     36     static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
     37     static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);
     38     static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
     39     static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
     40     static int skipOtherCharacters(const uint8_t* const dict, const int pos);
     41     static int skipAttributes(const uint8_t* const dict, const int pos);
     42     static int skipChildrenPosition(const uint8_t flags, const int pos);
     43     static int skipFrequency(const uint8_t flags, const int pos);
     44     static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
     45     static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
     46             const int pos);
     47     static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos);
     48     static bool hasChildrenInFlags(const uint8_t flags);
     49     static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
     50             int *pos);
     51     static int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord,
     52             const int length);
     53     static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
     54             uint16_t* outWord);
     55 };
     56 
     57 inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
     58     const uint16_t magicNumber = (dict[0] << 8) + dict[1]; // big endian
     59     if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) return FORMAT_VERSION_1;
     60     return UNKNOWN_FORMAT;
     61 }
     62 
     63 inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {
     64     return dict[(*pos)++];
     65 }
     66 
     67 inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) {
     68     return dict[(*pos)++];
     69 }
     70 
     71 inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) {
     72     const int origin = *pos;
     73     const int32_t character = dict[origin];
     74     if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
     75         if (character == CHARACTER_ARRAY_TERMINATOR) {
     76             *pos = origin + 1;
     77             return NOT_A_CHARACTER;
     78         } else {
     79             *pos = origin + 3;
     80             const int32_t char_1 = character << 16;
     81             const int32_t char_2 = char_1 + (dict[origin + 1] << 8);
     82             return char_2 + dict[origin + 2];
     83         }
     84     } else {
     85         *pos = origin + 1;
     86         return character;
     87     }
     88 }
     89 
     90 inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict,
     91         const int pos) {
     92     return dict[pos];
     93 }
     94 
     95 inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) {
     96     int currentPos = pos;
     97     int32_t character = dict[currentPos++];
     98     while (CHARACTER_ARRAY_TERMINATOR != character) {
     99         if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
    100             currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE;
    101         }
    102         character = dict[currentPos++];
    103     }
    104     return currentPos;
    105 }
    106 
    107 static inline int attributeAddressSize(const uint8_t flags) {
    108     static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
    109     return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
    110     /* Note: this is a value-dependant optimization of what may probably be
    111        more readably written this way:
    112        switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
    113        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
    114        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
    115        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
    116        default: return 0;
    117        }
    118     */
    119 }
    120 
    121 inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {
    122     int currentPos = pos;
    123     uint8_t flags = getFlagsAndForwardPointer(dict, &currentPos);
    124     while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
    125         currentPos += attributeAddressSize(flags);
    126         flags = getFlagsAndForwardPointer(dict, &currentPos);
    127     }
    128     currentPos += attributeAddressSize(flags);
    129     return currentPos;
    130 }
    131 
    132 static inline int childrenAddressSize(const uint8_t flags) {
    133     static const int CHILDREN_ADDRESS_SHIFT = 6;
    134     return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
    135     /* See the note in attributeAddressSize. The same applies here */
    136 }
    137 
    138 inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
    139     return pos + childrenAddressSize(flags);
    140 }
    141 
    142 inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
    143     return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
    144 }
    145 
    146 inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
    147         const int pos) {
    148     // This function skips all attributes. The format makes provision for future extension
    149     // with other attributes (notably shortcuts) but for the time being, bigrams are the
    150     // only attributes that may be found in a character group, so we only look at bigrams
    151     // in this version.
    152     if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
    153         return skipAttributes(dict, pos);
    154     } else {
    155         return pos;
    156     }
    157 }
    158 
    159 inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict,
    160         const uint8_t flags, const int pos) {
    161     int currentPos = pos;
    162     currentPos = skipChildrenPosition(flags, currentPos);
    163     currentPos = skipAllAttributes(dict, flags, currentPos);
    164     return currentPos;
    165 }
    166 
    167 inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags,
    168         const int pos) {
    169     int offset = 0;
    170     switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
    171         case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
    172             offset = dict[pos];
    173             break;
    174         case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
    175             offset = dict[pos] << 8;
    176             offset += dict[pos + 1];
    177             break;
    178         case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
    179             offset = dict[pos] << 16;
    180             offset += dict[pos + 1] << 8;
    181             offset += dict[pos + 2];
    182             break;
    183         default:
    184             // If we come here, it means we asked for the children of a word with
    185             // no children.
    186             return -1;
    187     }
    188     return pos + offset;
    189 }
    190 
    191 inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
    192     return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
    193             != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
    194 }
    195 
    196 inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict,
    197         const uint8_t flags, int *pos) {
    198     int offset = 0;
    199     const int origin = *pos;
    200     switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
    201         case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
    202             offset = dict[origin];
    203             *pos = origin + 1;
    204             break;
    205         case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
    206             offset = dict[origin] << 8;
    207             offset += dict[origin + 1];
    208             *pos = origin + 2;
    209             break;
    210         case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
    211             offset = dict[origin] << 16;
    212             offset += dict[origin + 1] << 8;
    213             offset += dict[origin + 2];
    214             *pos = origin + 3;
    215             break;
    216     }
    217     if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
    218         return origin - offset;
    219     } else {
    220         return origin + offset;
    221     }
    222 }
    223 
    224 // This function gets the byte position of the last chargroup of the exact matching word in the
    225 // dictionary. If no match is found, it returns NOT_VALID_WORD.
    226 inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
    227         const uint16_t* const inWord, const int length) {
    228     int pos = 0;
    229     int wordPos = 0;
    230 
    231     while (true) {
    232         // If we already traversed the tree further than the word is long, there means
    233         // there was no match (or we would have found it).
    234         if (wordPos > length) return NOT_VALID_WORD;
    235         int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
    236         const uint16_t wChar = inWord[wordPos];
    237         while (true) {
    238             // If there are no more character groups in this node, it means we could not
    239             // find a matching character for this depth, therefore there is no match.
    240             if (0 >= charGroupCount) return NOT_VALID_WORD;
    241             const int charGroupPos = pos;
    242             const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
    243             int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
    244             if (character == wChar) {
    245                 // This is the correct node. Only one character group may start with the same
    246                 // char within a node, so either we found our match in this node, or there is
    247                 // no match and we can return NOT_VALID_WORD. So we will check all the characters
    248                 // in this character group indeed does match.
    249                 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
    250                     character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
    251                     while (NOT_A_CHARACTER != character) {
    252                         ++wordPos;
    253                         // If we shoot the length of the word we search for, or if we find a single
    254                         // character that does not match, as explained above, it means the word is
    255                         // not in the dictionary (by virtue of this chargroup being the only one to
    256                         // match the word on the first character, but not matching the whole word).
    257                         if (wordPos > length) return NOT_VALID_WORD;
    258                         if (inWord[wordPos] != character) return NOT_VALID_WORD;
    259                         character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
    260                     }
    261                 }
    262                 // If we come here we know that so far, we do match. Either we are on a terminal
    263                 // and we match the length, in which case we found it, or we traverse children.
    264                 // If we don't match the length AND don't have children, then a word in the
    265                 // dictionary fully matches a prefix of the searched word but not the full word.
    266                 ++wordPos;
    267                 if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
    268                     if (wordPos == length) {
    269                         return charGroupPos;
    270                     }
    271                     pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
    272                 }
    273                 if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
    274                         == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
    275                     return NOT_VALID_WORD;
    276                 }
    277                 // We have children and we are still shorter than the word we are searching for, so
    278                 // we need to traverse children. Put the pointer on the children position, and
    279                 // break
    280                 pos = BinaryFormat::readChildrenPosition(root, flags, pos);
    281                 break;
    282             } else {
    283                 // This chargroup does not match, so skip the remaining part and go to the next.
    284                 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
    285                     pos = BinaryFormat::skipOtherCharacters(root, pos);
    286                 }
    287                 pos = BinaryFormat::skipFrequency(flags, pos);
    288                 pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
    289             }
    290             --charGroupCount;
    291         }
    292     }
    293 }
    294 
    295 // This function searches for a terminal in the dictionary by its address.
    296 // Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
    297 // it is possible to check for this with advantageous complexity. For each node, we search
    298 // for groups with children and compare the children address with the address we look for.
    299 // When we shoot the address we look for, it means the word we look for is in the children
    300 // of the previous group. The only tricky part is the fact that if we arrive at the end of a
    301 // node with the last group's children address still less than what we are searching for, we
    302 // must descend the last group's children (for example, if the word we are searching for starts
    303 // with a z, it's the last group of the root node, so all children addresses will be smaller
    304 // than the address we look for, and we have to descend the z node).
    305 /* Parameters :
    306  * root: the dictionary buffer
    307  * address: the byte position of the last chargroup of the word we are searching for (this is
    308  *   what is stored as the "bigram address" in each bigram)
    309  * outword: an array to write the found word, with MAX_WORD_LENGTH size.
    310  * Return value : the length of the word, of 0 if the word was not found.
    311  */
    312 inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address,
    313         const int maxDepth, uint16_t* outWord) {
    314     int pos = 0;
    315     int wordPos = 0;
    316 
    317     // One iteration of the outer loop iterates through nodes. As stated above, we will only
    318     // traverse nodes that are actually a part of the terminal we are searching, so each time
    319     // we enter this loop we are one depth level further than last time.
    320     // The only reason we count nodes is because we want to reduce the probability of infinite
    321     // looping in case there is a bug. Since we know there is an upper bound to the depth we are
    322     // supposed to traverse, it does not hurt to count iterations.
    323     for (int loopCount = maxDepth; loopCount > 0; --loopCount) {
    324         int lastCandidateGroupPos = 0;
    325         // Let's loop through char groups in this node searching for either the terminal
    326         // or one of its ascendants.
    327         for (int charGroupCount = getGroupCountAndForwardPointer(root, &pos); charGroupCount > 0;
    328                  --charGroupCount) {
    329             const int startPos = pos;
    330             const uint8_t flags = getFlagsAndForwardPointer(root, &pos);
    331             const int32_t character = getCharCodeAndForwardPointer(root, &pos);
    332             if (address == startPos) {
    333                 // We found the address. Copy the rest of the word in the buffer and return
    334                 // the length.
    335                 outWord[wordPos] = character;
    336                 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
    337                     int32_t nextChar = getCharCodeAndForwardPointer(root, &pos);
    338                     // We count chars in order to avoid infinite loops if the file is broken or
    339                     // if there is some other bug
    340                     int charCount = maxDepth;
    341                     while (-1 != nextChar && --charCount > 0) {
    342                         outWord[++wordPos] = nextChar;
    343                         nextChar = getCharCodeAndForwardPointer(root, &pos);
    344                     }
    345                 }
    346                 return ++wordPos;
    347             }
    348             // We need to skip past this char group, so skip any remaining chars after the
    349             // first and possibly the frequency.
    350             if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
    351                 pos = skipOtherCharacters(root, pos);
    352             }
    353             pos = skipFrequency(flags, pos);
    354 
    355             // The fact that this group has children is very important. Since we already know
    356             // that this group does not match, if it has no children we know it is irrelevant
    357             // to what we are searching for.
    358             const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
    359                     (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
    360             // We will write in `found' whether we have passed the children address we are
    361             // searching for. For example if we search for "beer", the children of b are less
    362             // than the address we are searching for and the children of c are greater. When we
    363             // come here for c, we realize this is too big, and that we should descend b.
    364             bool found;
    365             if (hasChildren) {
    366                 // Here comes the tricky part. First, read the children position.
    367                 const int childrenPos = readChildrenPosition(root, flags, pos);
    368                 if (childrenPos > address) {
    369                     // If the children pos is greater than address, it means the previous chargroup,
    370                     // which address is stored in lastCandidateGroupPos, was the right one.
    371                     found = true;
    372                 } else if (1 >= charGroupCount) {
    373                     // However if we are on the LAST group of this node, and we have NOT shot the
    374                     // address we should descend THIS node. So we trick the lastCandidateGroupPos
    375                     // so that we will descend this node, not the previous one.
    376                     lastCandidateGroupPos = startPos;
    377                     found = true;
    378                 } else {
    379                     // Else, we should continue looking.
    380                     found = false;
    381                 }
    382             } else {
    383                 // Even if we don't have children here, we could still be on the last group of this
    384                 // node. If this is the case, we should descend the last group that had children,
    385                 // and their address is already in lastCandidateGroup.
    386                 found = (1 >= charGroupCount);
    387             }
    388 
    389             if (found) {
    390                 // Okay, we found the group we should descend. Its address is in
    391                 // the lastCandidateGroupPos variable, so we just re-read it.
    392                 if (0 != lastCandidateGroupPos) {
    393                     const uint8_t lastFlags =
    394                             getFlagsAndForwardPointer(root, &lastCandidateGroupPos);
    395                     const int32_t lastChar =
    396                             getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
    397                     // We copy all the characters in this group to the buffer
    398                     outWord[wordPos] = lastChar;
    399                     if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
    400                         int32_t nextChar =
    401                                 getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
    402                         int charCount = maxDepth;
    403                         while (-1 != nextChar && --charCount > 0) {
    404                             outWord[++wordPos] = nextChar;
    405                             nextChar = getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
    406                         }
    407                     }
    408                     ++wordPos;
    409                     // Now we only need to branch to the children address. Skip the frequency if
    410                     // it's there, read pos, and break to resume the search at pos.
    411                     lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos);
    412                     pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos);
    413                     break;
    414                 } else {
    415                     // Here is a little tricky part: we come here if we found out that all children
    416                     // addresses in this group are bigger than the address we are searching for.
    417                     // Should we conclude the word is not in the dictionary? No! It could still be
    418                     // one of the remaining chargroups in this node, so we have to keep looking in
    419                     // this node until we find it (or we realize it's not there either, in which
    420                     // case it's actually not in the dictionary). Pass the end of this group, ready
    421                     // to start the next one.
    422                     pos = skipChildrenPosAndAttributes(root, flags, pos);
    423                 }
    424             } else {
    425                 // If we did not find it, we should record the last children address for the next
    426                 // iteration.
    427                 if (hasChildren) lastCandidateGroupPos = startPos;
    428                 // Now skip the end of this group (children pos and the attributes if any) so that
    429                 // our pos is after the end of this char group, at the start of the next one.
    430                 pos = skipChildrenPosAndAttributes(root, flags, pos);
    431             }
    432 
    433         }
    434     }
    435     // If we have looked through all the chargroups and found no match, the address is
    436     // not the address of a terminal in this dictionary.
    437     return 0;
    438 }
    439 
    440 } // namespace latinime
    441 
    442 #endif // LATINIME_BINARY_FORMAT_H
    443