1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LATINIME_BINARY_FORMAT_H 18 #define LATINIME_BINARY_FORMAT_H 19 20 #include "unigram_dictionary.h" 21 22 namespace latinime { 23 24 class BinaryFormat { 25 private: 26 const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; 27 const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F; 28 const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2; 29 30 public: 31 const static int UNKNOWN_FORMAT = -1; 32 const static int FORMAT_VERSION_1 = 1; 33 const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1; 34 35 static int detectFormat(const uint8_t* const dict); 36 static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos); 37 static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos); 38 static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos); 39 static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos); 40 static int skipOtherCharacters(const uint8_t* const dict, const int pos); 41 static int skipAttributes(const uint8_t* const dict, const int pos); 42 static int skipChildrenPosition(const uint8_t flags, const int pos); 43 static int skipFrequency(const uint8_t flags, const int pos); 44 static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos); 45 static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags, 46 const int pos); 47 static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos); 48 static bool hasChildrenInFlags(const uint8_t flags); 49 static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags, 50 int *pos); 51 static int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord, 52 const int length); 53 static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth, 54 uint16_t* outWord); 55 }; 56 57 inline int BinaryFormat::detectFormat(const uint8_t* const dict) { 58 const uint16_t magicNumber = (dict[0] << 8) + dict[1]; // big endian 59 if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) return FORMAT_VERSION_1; 60 return UNKNOWN_FORMAT; 61 } 62 63 inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) { 64 return dict[(*pos)++]; 65 } 66 67 inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) { 68 return dict[(*pos)++]; 69 } 70 71 inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) { 72 const int origin = *pos; 73 const int32_t character = dict[origin]; 74 if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { 75 if (character == CHARACTER_ARRAY_TERMINATOR) { 76 *pos = origin + 1; 77 return NOT_A_CHARACTER; 78 } else { 79 *pos = origin + 3; 80 const int32_t char_1 = character << 16; 81 const int32_t char_2 = char_1 + (dict[origin + 1] << 8); 82 return char_2 + dict[origin + 2]; 83 } 84 } else { 85 *pos = origin + 1; 86 return character; 87 } 88 } 89 90 inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict, 91 const int pos) { 92 return dict[pos]; 93 } 94 95 inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) { 96 int currentPos = pos; 97 int32_t character = dict[currentPos++]; 98 while (CHARACTER_ARRAY_TERMINATOR != character) { 99 if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { 100 currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE; 101 } 102 character = dict[currentPos++]; 103 } 104 return currentPos; 105 } 106 107 static inline int attributeAddressSize(const uint8_t flags) { 108 static const int ATTRIBUTE_ADDRESS_SHIFT = 4; 109 return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; 110 /* Note: this is a value-dependant optimization of what may probably be 111 more readably written this way: 112 switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) { 113 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; 114 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; 115 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; 116 default: return 0; 117 } 118 */ 119 } 120 121 inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) { 122 int currentPos = pos; 123 uint8_t flags = getFlagsAndForwardPointer(dict, ¤tPos); 124 while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) { 125 currentPos += attributeAddressSize(flags); 126 flags = getFlagsAndForwardPointer(dict, ¤tPos); 127 } 128 currentPos += attributeAddressSize(flags); 129 return currentPos; 130 } 131 132 static inline int childrenAddressSize(const uint8_t flags) { 133 static const int CHILDREN_ADDRESS_SHIFT = 6; 134 return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT; 135 /* See the note in attributeAddressSize. The same applies here */ 136 } 137 138 inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) { 139 return pos + childrenAddressSize(flags); 140 } 141 142 inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { 143 return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos; 144 } 145 146 inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags, 147 const int pos) { 148 // This function skips all attributes. The format makes provision for future extension 149 // with other attributes (notably shortcuts) but for the time being, bigrams are the 150 // only attributes that may be found in a character group, so we only look at bigrams 151 // in this version. 152 if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) { 153 return skipAttributes(dict, pos); 154 } else { 155 return pos; 156 } 157 } 158 159 inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict, 160 const uint8_t flags, const int pos) { 161 int currentPos = pos; 162 currentPos = skipChildrenPosition(flags, currentPos); 163 currentPos = skipAllAttributes(dict, flags, currentPos); 164 return currentPos; 165 } 166 167 inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags, 168 const int pos) { 169 int offset = 0; 170 switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) { 171 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: 172 offset = dict[pos]; 173 break; 174 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: 175 offset = dict[pos] << 8; 176 offset += dict[pos + 1]; 177 break; 178 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: 179 offset = dict[pos] << 16; 180 offset += dict[pos + 1] << 8; 181 offset += dict[pos + 2]; 182 break; 183 default: 184 // If we come here, it means we asked for the children of a word with 185 // no children. 186 return -1; 187 } 188 return pos + offset; 189 } 190 191 inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) { 192 return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS 193 != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); 194 } 195 196 inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict, 197 const uint8_t flags, int *pos) { 198 int offset = 0; 199 const int origin = *pos; 200 switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { 201 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: 202 offset = dict[origin]; 203 *pos = origin + 1; 204 break; 205 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: 206 offset = dict[origin] << 8; 207 offset += dict[origin + 1]; 208 *pos = origin + 2; 209 break; 210 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: 211 offset = dict[origin] << 16; 212 offset += dict[origin + 1] << 8; 213 offset += dict[origin + 2]; 214 *pos = origin + 3; 215 break; 216 } 217 if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) { 218 return origin - offset; 219 } else { 220 return origin + offset; 221 } 222 } 223 224 // This function gets the byte position of the last chargroup of the exact matching word in the 225 // dictionary. If no match is found, it returns NOT_VALID_WORD. 226 inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, 227 const uint16_t* const inWord, const int length) { 228 int pos = 0; 229 int wordPos = 0; 230 231 while (true) { 232 // If we already traversed the tree further than the word is long, there means 233 // there was no match (or we would have found it). 234 if (wordPos > length) return NOT_VALID_WORD; 235 int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos); 236 const uint16_t wChar = inWord[wordPos]; 237 while (true) { 238 // If there are no more character groups in this node, it means we could not 239 // find a matching character for this depth, therefore there is no match. 240 if (0 >= charGroupCount) return NOT_VALID_WORD; 241 const int charGroupPos = pos; 242 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); 243 int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); 244 if (character == wChar) { 245 // This is the correct node. Only one character group may start with the same 246 // char within a node, so either we found our match in this node, or there is 247 // no match and we can return NOT_VALID_WORD. So we will check all the characters 248 // in this character group indeed does match. 249 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { 250 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); 251 while (NOT_A_CHARACTER != character) { 252 ++wordPos; 253 // If we shoot the length of the word we search for, or if we find a single 254 // character that does not match, as explained above, it means the word is 255 // not in the dictionary (by virtue of this chargroup being the only one to 256 // match the word on the first character, but not matching the whole word). 257 if (wordPos > length) return NOT_VALID_WORD; 258 if (inWord[wordPos] != character) return NOT_VALID_WORD; 259 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); 260 } 261 } 262 // If we come here we know that so far, we do match. Either we are on a terminal 263 // and we match the length, in which case we found it, or we traverse children. 264 // If we don't match the length AND don't have children, then a word in the 265 // dictionary fully matches a prefix of the searched word but not the full word. 266 ++wordPos; 267 if (UnigramDictionary::FLAG_IS_TERMINAL & flags) { 268 if (wordPos == length) { 269 return charGroupPos; 270 } 271 pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos); 272 } 273 if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS 274 == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) { 275 return NOT_VALID_WORD; 276 } 277 // We have children and we are still shorter than the word we are searching for, so 278 // we need to traverse children. Put the pointer on the children position, and 279 // break 280 pos = BinaryFormat::readChildrenPosition(root, flags, pos); 281 break; 282 } else { 283 // This chargroup does not match, so skip the remaining part and go to the next. 284 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { 285 pos = BinaryFormat::skipOtherCharacters(root, pos); 286 } 287 pos = BinaryFormat::skipFrequency(flags, pos); 288 pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); 289 } 290 --charGroupCount; 291 } 292 } 293 } 294 295 // This function searches for a terminal in the dictionary by its address. 296 // Due to the fact that words are ordered in the dictionary in a strict breadth-first order, 297 // it is possible to check for this with advantageous complexity. For each node, we search 298 // for groups with children and compare the children address with the address we look for. 299 // When we shoot the address we look for, it means the word we look for is in the children 300 // of the previous group. The only tricky part is the fact that if we arrive at the end of a 301 // node with the last group's children address still less than what we are searching for, we 302 // must descend the last group's children (for example, if the word we are searching for starts 303 // with a z, it's the last group of the root node, so all children addresses will be smaller 304 // than the address we look for, and we have to descend the z node). 305 /* Parameters : 306 * root: the dictionary buffer 307 * address: the byte position of the last chargroup of the word we are searching for (this is 308 * what is stored as the "bigram address" in each bigram) 309 * outword: an array to write the found word, with MAX_WORD_LENGTH size. 310 * Return value : the length of the word, of 0 if the word was not found. 311 */ 312 inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address, 313 const int maxDepth, uint16_t* outWord) { 314 int pos = 0; 315 int wordPos = 0; 316 317 // One iteration of the outer loop iterates through nodes. As stated above, we will only 318 // traverse nodes that are actually a part of the terminal we are searching, so each time 319 // we enter this loop we are one depth level further than last time. 320 // The only reason we count nodes is because we want to reduce the probability of infinite 321 // looping in case there is a bug. Since we know there is an upper bound to the depth we are 322 // supposed to traverse, it does not hurt to count iterations. 323 for (int loopCount = maxDepth; loopCount > 0; --loopCount) { 324 int lastCandidateGroupPos = 0; 325 // Let's loop through char groups in this node searching for either the terminal 326 // or one of its ascendants. 327 for (int charGroupCount = getGroupCountAndForwardPointer(root, &pos); charGroupCount > 0; 328 --charGroupCount) { 329 const int startPos = pos; 330 const uint8_t flags = getFlagsAndForwardPointer(root, &pos); 331 const int32_t character = getCharCodeAndForwardPointer(root, &pos); 332 if (address == startPos) { 333 // We found the address. Copy the rest of the word in the buffer and return 334 // the length. 335 outWord[wordPos] = character; 336 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { 337 int32_t nextChar = getCharCodeAndForwardPointer(root, &pos); 338 // We count chars in order to avoid infinite loops if the file is broken or 339 // if there is some other bug 340 int charCount = maxDepth; 341 while (-1 != nextChar && --charCount > 0) { 342 outWord[++wordPos] = nextChar; 343 nextChar = getCharCodeAndForwardPointer(root, &pos); 344 } 345 } 346 return ++wordPos; 347 } 348 // We need to skip past this char group, so skip any remaining chars after the 349 // first and possibly the frequency. 350 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { 351 pos = skipOtherCharacters(root, pos); 352 } 353 pos = skipFrequency(flags, pos); 354 355 // The fact that this group has children is very important. Since we already know 356 // that this group does not match, if it has no children we know it is irrelevant 357 // to what we are searching for. 358 const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != 359 (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); 360 // We will write in `found' whether we have passed the children address we are 361 // searching for. For example if we search for "beer", the children of b are less 362 // than the address we are searching for and the children of c are greater. When we 363 // come here for c, we realize this is too big, and that we should descend b. 364 bool found; 365 if (hasChildren) { 366 // Here comes the tricky part. First, read the children position. 367 const int childrenPos = readChildrenPosition(root, flags, pos); 368 if (childrenPos > address) { 369 // If the children pos is greater than address, it means the previous chargroup, 370 // which address is stored in lastCandidateGroupPos, was the right one. 371 found = true; 372 } else if (1 >= charGroupCount) { 373 // However if we are on the LAST group of this node, and we have NOT shot the 374 // address we should descend THIS node. So we trick the lastCandidateGroupPos 375 // so that we will descend this node, not the previous one. 376 lastCandidateGroupPos = startPos; 377 found = true; 378 } else { 379 // Else, we should continue looking. 380 found = false; 381 } 382 } else { 383 // Even if we don't have children here, we could still be on the last group of this 384 // node. If this is the case, we should descend the last group that had children, 385 // and their address is already in lastCandidateGroup. 386 found = (1 >= charGroupCount); 387 } 388 389 if (found) { 390 // Okay, we found the group we should descend. Its address is in 391 // the lastCandidateGroupPos variable, so we just re-read it. 392 if (0 != lastCandidateGroupPos) { 393 const uint8_t lastFlags = 394 getFlagsAndForwardPointer(root, &lastCandidateGroupPos); 395 const int32_t lastChar = 396 getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); 397 // We copy all the characters in this group to the buffer 398 outWord[wordPos] = lastChar; 399 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) { 400 int32_t nextChar = 401 getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); 402 int charCount = maxDepth; 403 while (-1 != nextChar && --charCount > 0) { 404 outWord[++wordPos] = nextChar; 405 nextChar = getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); 406 } 407 } 408 ++wordPos; 409 // Now we only need to branch to the children address. Skip the frequency if 410 // it's there, read pos, and break to resume the search at pos. 411 lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos); 412 pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos); 413 break; 414 } else { 415 // Here is a little tricky part: we come here if we found out that all children 416 // addresses in this group are bigger than the address we are searching for. 417 // Should we conclude the word is not in the dictionary? No! It could still be 418 // one of the remaining chargroups in this node, so we have to keep looking in 419 // this node until we find it (or we realize it's not there either, in which 420 // case it's actually not in the dictionary). Pass the end of this group, ready 421 // to start the next one. 422 pos = skipChildrenPosAndAttributes(root, flags, pos); 423 } 424 } else { 425 // If we did not find it, we should record the last children address for the next 426 // iteration. 427 if (hasChildren) lastCandidateGroupPos = startPos; 428 // Now skip the end of this group (children pos and the attributes if any) so that 429 // our pos is after the end of this char group, at the start of the next one. 430 pos = skipChildrenPosAndAttributes(root, flags, pos); 431 } 432 433 } 434 } 435 // If we have looked through all the chargroups and found no match, the address is 436 // not the address of a terminal in this dictionary. 437 return 0; 438 } 439 440 } // namespace latinime 441 442 #endif // LATINIME_BINARY_FORMAT_H 443