1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LATINIME_BINARY_FORMAT_H 18 #define LATINIME_BINARY_FORMAT_H 19 20 #include <limits> 21 #include <map> 22 #include "bloom_filter.h" 23 #include "char_utils.h" 24 25 namespace latinime { 26 27 class BinaryFormat { 28 public: 29 // Mask and flags for children address type selection. 30 static const int MASK_GROUP_ADDRESS_TYPE = 0xC0; 31 static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; 32 static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; 33 static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; 34 static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; 35 36 // Flag for single/multiple char group 37 static const int FLAG_HAS_MULTIPLE_CHARS = 0x20; 38 39 // Flag for terminal groups 40 static const int FLAG_IS_TERMINAL = 0x10; 41 42 // Flag for shortcut targets presence 43 static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08; 44 // Flag for bigram presence 45 static const int FLAG_HAS_BIGRAMS = 0x04; 46 // Flag for non-words (typically, shortcut only entries) 47 static const int FLAG_IS_NOT_A_WORD = 0x02; 48 // Flag for blacklist 49 static const int FLAG_IS_BLACKLISTED = 0x01; 50 51 // Attribute (bigram/shortcut) related flags: 52 // Flag for presence of more attributes 53 static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; 54 // Flag for sign of offset. If this flag is set, the offset value must be negated. 55 static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; 56 57 // Mask for attribute frequency, stored on 4 bits inside the flags byte. 58 static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F; 59 // The numeric value of the shortcut frequency that means 'whitelist'. 60 static const int WHITELIST_SHORTCUT_FREQUENCY = 15; 61 62 // Mask and flags for attribute address type selection. 63 static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; 64 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; 65 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; 66 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; 67 68 const static int UNKNOWN_FORMAT = -1; 69 // Originally, format version 1 had a 16-bit magic number, then the version number `01' 70 // then options that must be 0. Hence the first 32-bits of the format are always as follow 71 // and it's okay to consider them a magic number as a whole. 72 const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100; 73 const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5; 74 // The versions of Latin IME that only handle format version 1 only test for the magic 75 // number, so we had to change it so that version 2 files would be rejected by older 76 // implementations. On this occasion, we made the magic number 32 bits long. 77 const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; 78 79 const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1; 80 const static int SHORTCUT_LIST_SIZE_SIZE = 2; 81 82 static int detectFormat(const uint8_t *const dict); 83 static unsigned int getHeaderSize(const uint8_t *const dict); 84 static unsigned int getFlags(const uint8_t *const dict); 85 static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos); 86 static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos); 87 static int32_t getCodePointAndForwardPointer(const uint8_t *const dict, int *pos); 88 static int readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos); 89 static int skipOtherCharacters(const uint8_t *const dict, const int pos); 90 static int skipChildrenPosition(const uint8_t flags, const int pos); 91 static int skipFrequency(const uint8_t flags, const int pos); 92 static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos); 93 static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos); 94 static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags, 95 const int pos); 96 static int readChildrenPosition(const uint8_t *const dict, const uint8_t flags, const int pos); 97 static bool hasChildrenInFlags(const uint8_t flags); 98 static int getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags, 99 int *pos); 100 static int getAttributeFrequencyFromFlags(const int flags); 101 static int getTerminalPosition(const uint8_t *const root, const int32_t *const inWord, 102 const int length, const bool forceLowerCaseSearch); 103 static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, 104 uint16_t *outWord, int *outUnigramFrequency); 105 static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq); 106 static int getProbability(const int position, const std::map<int, int> *bigramMap, 107 const uint8_t *bigramFilter, const int unigramFreq); 108 109 // Flags for special processing 110 // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or 111 // something very bad (like, the apocalypse) will happen. Please update both at the same time. 112 enum { 113 REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1, 114 REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4 115 }; 116 const static unsigned int NO_FLAGS = 0; 117 118 private: 119 DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat); 120 const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; 121 const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F; 122 const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2; 123 static int skipAllAttributes(const uint8_t *const dict, const uint8_t flags, const int pos); 124 }; 125 126 inline int BinaryFormat::detectFormat(const uint8_t *const dict) { 127 // The magic number is stored big-endian. 128 const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3]; 129 switch (magicNumber) { 130 case FORMAT_VERSION_1_MAGIC_NUMBER: 131 // Format 1 header is exactly 5 bytes long and looks like: 132 // Magic number (2 bytes) 0x78 0xB1 133 // Version number (1 byte) 0x01 134 // Options (2 bytes) must be 0x00 0x00 135 return 1; 136 case FORMAT_VERSION_2_MAGIC_NUMBER: 137 // Format 2 header is as follows: 138 // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE 139 // Version number (2 bytes) 0x00 0x02 140 // Options (2 bytes) 141 // Header size (4 bytes) : integer, big endian 142 return (dict[4] << 8) + dict[5]; 143 default: 144 return UNKNOWN_FORMAT; 145 } 146 } 147 148 inline unsigned int BinaryFormat::getFlags(const uint8_t *const dict) { 149 switch (detectFormat(dict)) { 150 case 1: 151 return NO_FLAGS; 152 default: 153 return (dict[6] << 8) + dict[7]; 154 } 155 } 156 157 inline unsigned int BinaryFormat::getHeaderSize(const uint8_t *const dict) { 158 switch (detectFormat(dict)) { 159 case 1: 160 return FORMAT_VERSION_1_HEADER_SIZE; 161 case 2: 162 // See the format of the header in the comment in detectFormat() above 163 return (dict[8] << 24) + (dict[9] << 16) + (dict[10] << 8) + dict[11]; 164 default: 165 return std::numeric_limits<unsigned int>::max(); 166 } 167 } 168 169 inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos) { 170 const int msb = dict[(*pos)++]; 171 if (msb < 0x80) return msb; 172 return ((msb & 0x7F) << 8) | dict[(*pos)++]; 173 } 174 175 inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t *const dict, int *pos) { 176 return dict[(*pos)++]; 177 } 178 179 inline int32_t BinaryFormat::getCodePointAndForwardPointer(const uint8_t *const dict, int *pos) { 180 const int origin = *pos; 181 const int32_t codePoint = dict[origin]; 182 if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { 183 if (codePoint == CHARACTER_ARRAY_TERMINATOR) { 184 *pos = origin + 1; 185 return NOT_A_CODE_POINT; 186 } else { 187 *pos = origin + 3; 188 const int32_t char_1 = codePoint << 16; 189 const int32_t char_2 = char_1 + (dict[origin + 1] << 8); 190 return char_2 + dict[origin + 2]; 191 } 192 } else { 193 *pos = origin + 1; 194 return codePoint; 195 } 196 } 197 198 inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t *const dict, 199 const int pos) { 200 return dict[pos]; 201 } 202 203 inline int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const int pos) { 204 int currentPos = pos; 205 int32_t character = dict[currentPos++]; 206 while (CHARACTER_ARRAY_TERMINATOR != character) { 207 if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { 208 currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE; 209 } 210 character = dict[currentPos++]; 211 } 212 return currentPos; 213 } 214 215 static inline int attributeAddressSize(const uint8_t flags) { 216 static const int ATTRIBUTE_ADDRESS_SHIFT = 4; 217 return (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; 218 /* Note: this is a value-dependant optimization of what may probably be 219 more readably written this way: 220 switch (flags * BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) { 221 case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; 222 case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; 223 case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; 224 default: return 0; 225 } 226 */ 227 } 228 229 static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) { 230 int currentPos = pos; 231 uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos); 232 while (flags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT) { 233 currentPos += attributeAddressSize(flags); 234 flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos); 235 } 236 currentPos += attributeAddressSize(flags); 237 return currentPos; 238 } 239 240 static inline int childrenAddressSize(const uint8_t flags) { 241 static const int CHILDREN_ADDRESS_SHIFT = 6; 242 return (BinaryFormat::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT; 243 /* See the note in attributeAddressSize. The same applies here */ 244 } 245 246 static inline int shortcutByteSize(const uint8_t *const dict, const int pos) { 247 return ((int)(dict[pos] << 8)) + (dict[pos + 1]); 248 } 249 250 inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) { 251 return pos + childrenAddressSize(flags); 252 } 253 254 inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { 255 return FLAG_IS_TERMINAL & flags ? pos + 1 : pos; 256 } 257 258 inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags, 259 const int pos) { 260 if (FLAG_HAS_SHORTCUT_TARGETS & flags) { 261 return pos + shortcutByteSize(dict, pos); 262 } else { 263 return pos; 264 } 265 } 266 267 inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags, 268 const int pos) { 269 if (FLAG_HAS_BIGRAMS & flags) { 270 return skipExistingBigrams(dict, pos); 271 } else { 272 return pos; 273 } 274 } 275 276 inline int BinaryFormat::skipAllAttributes(const uint8_t *const dict, const uint8_t flags, 277 const int pos) { 278 // This function skips all attributes: shortcuts and bigrams. 279 int newPos = pos; 280 newPos = skipShortcuts(dict, flags, newPos); 281 newPos = skipBigrams(dict, flags, newPos); 282 return newPos; 283 } 284 285 inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict, 286 const uint8_t flags, const int pos) { 287 int currentPos = pos; 288 currentPos = skipChildrenPosition(flags, currentPos); 289 currentPos = skipAllAttributes(dict, flags, currentPos); 290 return currentPos; 291 } 292 293 inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags, 294 const int pos) { 295 int offset = 0; 296 switch (MASK_GROUP_ADDRESS_TYPE & flags) { 297 case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: 298 offset = dict[pos]; 299 break; 300 case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: 301 offset = dict[pos] << 8; 302 offset += dict[pos + 1]; 303 break; 304 case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: 305 offset = dict[pos] << 16; 306 offset += dict[pos + 1] << 8; 307 offset += dict[pos + 2]; 308 break; 309 default: 310 // If we come here, it means we asked for the children of a word with 311 // no children. 312 return -1; 313 } 314 return pos + offset; 315 } 316 317 inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) { 318 return (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags)); 319 } 320 321 inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict, 322 const uint8_t flags, int *pos) { 323 int offset = 0; 324 const int origin = *pos; 325 switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { 326 case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: 327 offset = dict[origin]; 328 *pos = origin + 1; 329 break; 330 case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: 331 offset = dict[origin] << 8; 332 offset += dict[origin + 1]; 333 *pos = origin + 2; 334 break; 335 case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: 336 offset = dict[origin] << 16; 337 offset += dict[origin + 1] << 8; 338 offset += dict[origin + 2]; 339 *pos = origin + 3; 340 break; 341 } 342 if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) { 343 return origin - offset; 344 } else { 345 return origin + offset; 346 } 347 } 348 349 inline int BinaryFormat::getAttributeFrequencyFromFlags(const int flags) { 350 return flags & MASK_ATTRIBUTE_FREQUENCY; 351 } 352 353 // This function gets the byte position of the last chargroup of the exact matching word in the 354 // dictionary. If no match is found, it returns NOT_VALID_WORD. 355 inline int BinaryFormat::getTerminalPosition(const uint8_t *const root, 356 const int32_t *const inWord, const int length, const bool forceLowerCaseSearch) { 357 int pos = 0; 358 int wordPos = 0; 359 360 while (true) { 361 // If we already traversed the tree further than the word is long, there means 362 // there was no match (or we would have found it). 363 if (wordPos >= length) return NOT_VALID_WORD; 364 int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos); 365 const int32_t wChar = forceLowerCaseSearch ? toLowerCase(inWord[wordPos]) : inWord[wordPos]; 366 while (true) { 367 // If there are no more character groups in this node, it means we could not 368 // find a matching character for this depth, therefore there is no match. 369 if (0 >= charGroupCount) return NOT_VALID_WORD; 370 const int charGroupPos = pos; 371 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); 372 int32_t character = BinaryFormat::getCodePointAndForwardPointer(root, &pos); 373 if (character == wChar) { 374 // This is the correct node. Only one character group may start with the same 375 // char within a node, so either we found our match in this node, or there is 376 // no match and we can return NOT_VALID_WORD. So we will check all the characters 377 // in this character group indeed does match. 378 if (FLAG_HAS_MULTIPLE_CHARS & flags) { 379 character = BinaryFormat::getCodePointAndForwardPointer(root, &pos); 380 while (NOT_A_CODE_POINT != character) { 381 ++wordPos; 382 // If we shoot the length of the word we search for, or if we find a single 383 // character that does not match, as explained above, it means the word is 384 // not in the dictionary (by virtue of this chargroup being the only one to 385 // match the word on the first character, but not matching the whole word). 386 if (wordPos >= length) return NOT_VALID_WORD; 387 if (inWord[wordPos] != character) return NOT_VALID_WORD; 388 character = BinaryFormat::getCodePointAndForwardPointer(root, &pos); 389 } 390 } 391 // If we come here we know that so far, we do match. Either we are on a terminal 392 // and we match the length, in which case we found it, or we traverse children. 393 // If we don't match the length AND don't have children, then a word in the 394 // dictionary fully matches a prefix of the searched word but not the full word. 395 ++wordPos; 396 if (FLAG_IS_TERMINAL & flags) { 397 if (wordPos == length) { 398 return charGroupPos; 399 } 400 pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos); 401 } 402 if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) { 403 return NOT_VALID_WORD; 404 } 405 // We have children and we are still shorter than the word we are searching for, so 406 // we need to traverse children. Put the pointer on the children position, and 407 // break 408 pos = BinaryFormat::readChildrenPosition(root, flags, pos); 409 break; 410 } else { 411 // This chargroup does not match, so skip the remaining part and go to the next. 412 if (FLAG_HAS_MULTIPLE_CHARS & flags) { 413 pos = BinaryFormat::skipOtherCharacters(root, pos); 414 } 415 pos = BinaryFormat::skipFrequency(flags, pos); 416 pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); 417 } 418 --charGroupCount; 419 } 420 } 421 } 422 423 // This function searches for a terminal in the dictionary by its address. 424 // Due to the fact that words are ordered in the dictionary in a strict breadth-first order, 425 // it is possible to check for this with advantageous complexity. For each node, we search 426 // for groups with children and compare the children address with the address we look for. 427 // When we shoot the address we look for, it means the word we look for is in the children 428 // of the previous group. The only tricky part is the fact that if we arrive at the end of a 429 // node with the last group's children address still less than what we are searching for, we 430 // must descend the last group's children (for example, if the word we are searching for starts 431 // with a z, it's the last group of the root node, so all children addresses will be smaller 432 // than the address we look for, and we have to descend the z node). 433 /* Parameters : 434 * root: the dictionary buffer 435 * address: the byte position of the last chargroup of the word we are searching for (this is 436 * what is stored as the "bigram address" in each bigram) 437 * outword: an array to write the found word, with MAX_WORD_LENGTH size. 438 * outUnigramFrequency: a pointer to an int to write the frequency into. 439 * Return value : the length of the word, of 0 if the word was not found. 440 */ 441 inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address, 442 const int maxDepth, uint16_t *outWord, int *outUnigramFrequency) { 443 int pos = 0; 444 int wordPos = 0; 445 446 // One iteration of the outer loop iterates through nodes. As stated above, we will only 447 // traverse nodes that are actually a part of the terminal we are searching, so each time 448 // we enter this loop we are one depth level further than last time. 449 // The only reason we count nodes is because we want to reduce the probability of infinite 450 // looping in case there is a bug. Since we know there is an upper bound to the depth we are 451 // supposed to traverse, it does not hurt to count iterations. 452 for (int loopCount = maxDepth; loopCount > 0; --loopCount) { 453 int lastCandidateGroupPos = 0; 454 // Let's loop through char groups in this node searching for either the terminal 455 // or one of its ascendants. 456 for (int charGroupCount = getGroupCountAndForwardPointer(root, &pos); charGroupCount > 0; 457 --charGroupCount) { 458 const int startPos = pos; 459 const uint8_t flags = getFlagsAndForwardPointer(root, &pos); 460 const int32_t character = getCodePointAndForwardPointer(root, &pos); 461 if (address == startPos) { 462 // We found the address. Copy the rest of the word in the buffer and return 463 // the length. 464 outWord[wordPos] = character; 465 if (FLAG_HAS_MULTIPLE_CHARS & flags) { 466 int32_t nextChar = getCodePointAndForwardPointer(root, &pos); 467 // We count chars in order to avoid infinite loops if the file is broken or 468 // if there is some other bug 469 int charCount = maxDepth; 470 while (NOT_A_CODE_POINT != nextChar && --charCount > 0) { 471 outWord[++wordPos] = nextChar; 472 nextChar = getCodePointAndForwardPointer(root, &pos); 473 } 474 } 475 *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos); 476 return ++wordPos; 477 } 478 // We need to skip past this char group, so skip any remaining chars after the 479 // first and possibly the frequency. 480 if (FLAG_HAS_MULTIPLE_CHARS & flags) { 481 pos = skipOtherCharacters(root, pos); 482 } 483 pos = skipFrequency(flags, pos); 484 485 // The fact that this group has children is very important. Since we already know 486 // that this group does not match, if it has no children we know it is irrelevant 487 // to what we are searching for. 488 const bool hasChildren = (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != 489 (MASK_GROUP_ADDRESS_TYPE & flags)); 490 // We will write in `found' whether we have passed the children address we are 491 // searching for. For example if we search for "beer", the children of b are less 492 // than the address we are searching for and the children of c are greater. When we 493 // come here for c, we realize this is too big, and that we should descend b. 494 bool found; 495 if (hasChildren) { 496 // Here comes the tricky part. First, read the children position. 497 const int childrenPos = readChildrenPosition(root, flags, pos); 498 if (childrenPos > address) { 499 // If the children pos is greater than address, it means the previous chargroup, 500 // which address is stored in lastCandidateGroupPos, was the right one. 501 found = true; 502 } else if (1 >= charGroupCount) { 503 // However if we are on the LAST group of this node, and we have NOT shot the 504 // address we should descend THIS node. So we trick the lastCandidateGroupPos 505 // so that we will descend this node, not the previous one. 506 lastCandidateGroupPos = startPos; 507 found = true; 508 } else { 509 // Else, we should continue looking. 510 found = false; 511 } 512 } else { 513 // Even if we don't have children here, we could still be on the last group of this 514 // node. If this is the case, we should descend the last group that had children, 515 // and their address is already in lastCandidateGroup. 516 found = (1 >= charGroupCount); 517 } 518 519 if (found) { 520 // Okay, we found the group we should descend. Its address is in 521 // the lastCandidateGroupPos variable, so we just re-read it. 522 if (0 != lastCandidateGroupPos) { 523 const uint8_t lastFlags = 524 getFlagsAndForwardPointer(root, &lastCandidateGroupPos); 525 const int32_t lastChar = 526 getCodePointAndForwardPointer(root, &lastCandidateGroupPos); 527 // We copy all the characters in this group to the buffer 528 outWord[wordPos] = lastChar; 529 if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) { 530 int32_t nextChar = 531 getCodePointAndForwardPointer(root, &lastCandidateGroupPos); 532 int charCount = maxDepth; 533 while (-1 != nextChar && --charCount > 0) { 534 outWord[++wordPos] = nextChar; 535 nextChar = getCodePointAndForwardPointer(root, &lastCandidateGroupPos); 536 } 537 } 538 ++wordPos; 539 // Now we only need to branch to the children address. Skip the frequency if 540 // it's there, read pos, and break to resume the search at pos. 541 lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos); 542 pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos); 543 break; 544 } else { 545 // Here is a little tricky part: we come here if we found out that all children 546 // addresses in this group are bigger than the address we are searching for. 547 // Should we conclude the word is not in the dictionary? No! It could still be 548 // one of the remaining chargroups in this node, so we have to keep looking in 549 // this node until we find it (or we realize it's not there either, in which 550 // case it's actually not in the dictionary). Pass the end of this group, ready 551 // to start the next one. 552 pos = skipChildrenPosAndAttributes(root, flags, pos); 553 } 554 } else { 555 // If we did not find it, we should record the last children address for the next 556 // iteration. 557 if (hasChildren) lastCandidateGroupPos = startPos; 558 // Now skip the end of this group (children pos and the attributes if any) so that 559 // our pos is after the end of this char group, at the start of the next one. 560 pos = skipChildrenPosAndAttributes(root, flags, pos); 561 } 562 563 } 564 } 565 // If we have looked through all the chargroups and found no match, the address is 566 // not the address of a terminal in this dictionary. 567 return 0; 568 } 569 570 static inline int backoff(const int unigramFreq) { 571 return unigramFreq; 572 // For some reason, applying the backoff weight gives bad results in tests. To apply the 573 // backoff weight, we divide the probability by 2, which in our storing format means 574 // decreasing the score by 8. 575 // TODO: figure out what's wrong with this. 576 // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8); 577 } 578 579 inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) { 580 // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the 581 // unigram frequency to be the median value of the 17th step from the top. A value of 582 // 0 for the bigram frequency represents the middle of the 16th step from the top, 583 // while a value of 15 represents the middle of the top step. 584 // See makedict.BinaryDictInputOutput for details. 585 const float stepSize = static_cast<float>(MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); 586 return unigramFreq + static_cast<int>(static_cast<float>(bigramFreq + 1) * stepSize); 587 } 588 589 // This returns a probability in log space. 590 inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap, 591 const uint8_t *bigramFilter, const int unigramFreq) { 592 if (!bigramMap || !bigramFilter) return backoff(unigramFreq); 593 if (!isInFilter(bigramFilter, position)) return backoff(unigramFreq); 594 const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position); 595 if (bigramFreqIt != bigramMap->end()) { 596 const int bigramFreq = bigramFreqIt->second; 597 return computeFrequencyForBigram(unigramFreq, bigramFreq); 598 } else { 599 return backoff(unigramFreq); 600 } 601 } 602 } // namespace latinime 603 #endif // LATINIME_BINARY_FORMAT_H 604