1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LATINIME_BINARY_FORMAT_H 18 #define LATINIME_BINARY_FORMAT_H 19 20 #include <limits> 21 #include "bloom_filter.h" 22 #include "unigram_dictionary.h" 23 24 namespace latinime { 25 26 class BinaryFormat { 27 private: 28 const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; 29 const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F; 30 const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2; 31 32 public: 33 const static int UNKNOWN_FORMAT = -1; 34 // Originally, format version 1 had a 16-bit magic number, then the version number `01' 35 // then options that must be 0. Hence the first 32-bits of the format are always as follow 36 // and it's okay to consider them a magic number as a whole. 37 const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100; 38 const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5; 39 // The versions of Latin IME that only handle format version 1 only test for the magic 40 // number, so we had to change it so that version 2 files would be rejected by older 41 // implementations. On this occasion, we made the magic number 32 bits long. 42 const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; 43 44 const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1; 45 const static int SHORTCUT_LIST_SIZE_SIZE = 2; 46 47 static int detectFormat(const uint8_t* const dict); 48 static unsigned int getHeaderSize(const uint8_t* const dict); 49 static unsigned int getFlags(const uint8_t* const dict); 50 static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos); 51 static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos); 52 static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos); 53 static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos); 54 static int skipOtherCharacters(const uint8_t* const dict, const int pos); 55 static int skipChildrenPosition(const uint8_t flags, const int pos); 56 static int skipFrequency(const uint8_t flags, const int pos); 57 static int skipShortcuts(const uint8_t* const dict, const uint8_t flags, const int pos); 58 static int skipBigrams(const uint8_t* const dict, const uint8_t flags, const int pos); 59 static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos); 60 static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags, 61 const int pos); 62 static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos); 63 static bool hasChildrenInFlags(const uint8_t flags); 64 static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags, 65 int *pos); 66 static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord, 67 const int length); 68 static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth, 69 uint16_t* outWord, int* outUnigramFrequency); 70 static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq); 71 static int getProbability(const int position, const std::map<int, int> *bigramMap, 72 const uint8_t *bigramFilter, const int unigramFreq); 73 74 // Flags for special processing 75 // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or 76 // something very bad (like, the apocalypse) will happen. Please update both at the same time. 77 enum { 78 REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1, 79 REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4 80 }; 81 const static unsigned int NO_FLAGS = 0; 82 }; 83 84 inline int BinaryFormat::detectFormat(const uint8_t* const dict) { 85 // The magic number is stored big-endian. 86 const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3]; 87 switch (magicNumber) { 88 case FORMAT_VERSION_1_MAGIC_NUMBER: 89 // Format 1 header is exactly 5 bytes long and looks like: 90 // Magic number (2 bytes) 0x78 0xB1 91 // Version number (1 byte) 0x01 92 // Options (2 bytes) must be 0x00 0x00 93 return 1; 94 case FORMAT_VERSION_2_MAGIC_NUMBER: 95 // Format 2 header is as follows: 96 // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE 97 // Version number (2 bytes) 0x00 0x02 98 // Options (2 bytes) 99 // Header size (4 bytes) : integer, big endian 100 return (dict[4] << 8) + dict[5]; 101 default: 102 return UNKNOWN_FORMAT; 103 } 104 } 105 106 inline unsigned int BinaryFormat::getFlags(const uint8_t* const dict) { 107 switch (detectFormat(dict)) { 108 case 1: 109 return NO_FLAGS; 110 default: 111 return (dict[6] << 8) + dict[7]; 112 } 113 } 114 115 inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) { 116 switch (detectFormat(dict)) { 117 case 1: 118 return FORMAT_VERSION_1_HEADER_SIZE; 119 case 2: 120 // See the format of the header in the comment in detectFormat() above 121 return (dict[8] << 24) + (dict[9] << 16) + (dict[10] << 8) + dict[11]; 122 default: 123 return std::numeric_limits<unsigned int>::max(); 124 } 125 } 126 127 inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) { 128 const int msb = dict[(*pos)++]; 129 if (msb < 0x80) return msb; 130 return ((msb & 0x7F) << 8) | dict[(*pos)++]; 131 } 132 133 inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) { 134 return dict[(*pos)++]; 135 } 136 137 inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) { 138 const int origin = *pos; 139 const int32_t character = dict[origin]; 140 if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { 141 if (character == CHARACTER_ARRAY_TERMINATOR) { 142 *pos = origin + 1; 143 return NOT_A_CHARACTER; 144 } else { 145 *pos = origin + 3; 146 const int32_t char_1 = character << 16; 147 const int32_t char_2 = char_1 + (dict[origin + 1] << 8); 148 return char_2 + dict[origin + 2]; 149 } 150 } else { 151 *pos = origin + 1; 152 return character; 153 } 154 } 155 156 inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict, 157 const int pos) { 158 return dict[pos]; 159 } 160 161 inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) { 162 int currentPos = pos; 163 int32_t character = dict[currentPos++]; 164 while (CHARACTER_ARRAY_TERMINATOR != character) { 165 if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { 166 currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE; 167 } 168 character = dict[currentPos++]; 169 } 170 return currentPos; 171 } 172 173 static inline int attributeAddressSize(const uint8_t flags) { 174 static const int ATTRIBUTE_ADDRESS_SHIFT = 4; 175 return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; 176 /* Note: this is a value-dependant optimization of what may probably be 177 more readably written this way: 178 switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) { 179 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; 180 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; 181 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; 182 default: return 0; 183 } 184 */ 185 } 186 187 static inline int skipExistingBigrams(const uint8_t* const dict, const int pos) { 188 int currentPos = pos; 189 uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos); 190 while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) { 191 currentPos += attributeAddressSize(flags); 192 flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos); 193 } 194 currentPos += attributeAddressSize(flags); 195 return currentPos; 196 } 197 198 static inline int childrenAddressSize(const uint8_t flags) { 199 static const int CHILDREN_ADDRESS_SHIFT = 6; 200 return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT; 201 /* See the note in attributeAddressSize. The same applies here */ 202 } 203 204 static inline int shortcutByteSize(const uint8_t* const dict, const int pos) { 205 return ((int)(dict[pos] << 8)) + (dict[pos + 1]); 206 } 207 208 inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) { 209 return pos + childrenAddressSize(flags); 210 } 211 212 inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { 213 return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos; 214 } 215 216 inline int BinaryFormat::skipShortcuts(const uint8_t* const dict, const uint8_t flags, 217 const int pos) { 218 if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) { 219 return pos + shortcutByteSize(dict, pos); 220 } else { 221 return pos; 222 } 223 } 224 225 inline int BinaryFormat::skipBigrams(const uint8_t* const dict, const uint8_t flags, 226 const int pos) { 227 if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) { 228 return skipExistingBigrams(dict, pos); 229 } else { 230 return pos; 231 } 232 } 233 234 inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags, 235 const int pos) { 236 // This function skips all attributes: shortcuts and bigrams. 237 int newPos = pos; 238 newPos = skipShortcuts(dict, flags, newPos); 239 newPos = skipBigrams(dict, flags, newPos); 240 return newPos; 241 } 242 243 inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict, 244 const uint8_t flags, const int pos) { 245 int currentPos = pos; 246 currentPos = skipChildrenPosition(flags, currentPos); 247 currentPos = skipAllAttributes(dict, flags, currentPos); 248 return currentPos; 249 } 250 251 inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags, 252 const int pos) { 253 int offset = 0; 254 switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) { 255 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: 256 offset = dict[pos]; 257 break; 258 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: 259 offset = dict[pos] << 8; 260 offset += dict[pos + 1]; 261 break; 262 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: 263 offset = dict[pos] << 16; 264 offset += dict[pos + 1] << 8; 265 offset += dict[pos + 2]; 266 break; 267 default: 268 // If we come here, it means we asked for the children of a word with 269 // no children. 270 return -1; 271 } 272 return pos + offset; 273 } 274 275 inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) { 276 return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS 277 != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); 278 } 279 280 inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict, 281 const uint8_t flags, int *pos) { 282 int offset = 0; 283 const int origin = *pos; 284 switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { 285 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: 286 offset = dict[origin]; 287 *pos = origin + 1; 288 break; 289 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: 290 offset = dict[origin] << 8; 291 offset += dict[origin + 1]; 292 *pos = origin + 2; 293 break; 294 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: 295 offset = dict[origin] << 16; 296 offset += dict[origin + 1] << 8; 297 offset += dict[origin + 2]; 298 *pos = origin + 3; 299 break; 300 } 301 if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) { 302 return origin - offset; 303 } else { 304 return origin + offset; 305 } 306 } 307 308 // This function gets the byte position of the last chargroup of the exact matching word in the 309 // dictionary. If no match is found, it returns NOT_VALID_WORD. 310 inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, 311 const int32_t* const inWord, const int length) { 312 int pos = 0; 313 int wordPos = 0; 314 315 while (true) { 316 // If we already traversed the tree further than the word is long, there means 317 // there was no match (or we would have found it). 318 if (wordPos > length) return NOT_VALID_WORD; 319 int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos); 320 const int32_t wChar = inWord[wordPos]; 321 while (true) { 322 // If there are no more character groups in this node, it means we could not 323 // find a matching character for this depth, therefore there is no match. 324 if (0 >= charGroupCount) return NOT_VALID_WORD; 325 const int charGroupPos = pos; 326 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); 327 int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); 328 if (character == wChar) { 329 // This is the correct node. Only one character group may start with the same 330 // char within a node, so either we found our match in this node, or there is 331 // no match and we can return NOT_VALID_WORD. So we will check all the characters 332 // in this character group indeed does match. 333 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { 334 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); 335 while (NOT_A_CHARACTER != character) { 336 ++wordPos; 337 // If we shoot the length of the word we search for, or if we find a single 338 // character that does not match, as explained above, it means the word is 339 // not in the dictionary (by virtue of this chargroup being the only one to 340 // match the word on the first character, but not matching the whole word). 341 if (wordPos > length) return NOT_VALID_WORD; 342 if (inWord[wordPos] != character) return NOT_VALID_WORD; 343 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); 344 } 345 } 346 // If we come here we know that so far, we do match. Either we are on a terminal 347 // and we match the length, in which case we found it, or we traverse children. 348 // If we don't match the length AND don't have children, then a word in the 349 // dictionary fully matches a prefix of the searched word but not the full word. 350 ++wordPos; 351 if (UnigramDictionary::FLAG_IS_TERMINAL & flags) { 352 if (wordPos == length) { 353 return charGroupPos; 354 } 355 pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos); 356 } 357 if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS 358 == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) { 359 return NOT_VALID_WORD; 360 } 361 // We have children and we are still shorter than the word we are searching for, so 362 // we need to traverse children. Put the pointer on the children position, and 363 // break 364 pos = BinaryFormat::readChildrenPosition(root, flags, pos); 365 break; 366 } else { 367 // This chargroup does not match, so skip the remaining part and go to the next. 368 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { 369 pos = BinaryFormat::skipOtherCharacters(root, pos); 370 } 371 pos = BinaryFormat::skipFrequency(flags, pos); 372 pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); 373 } 374 --charGroupCount; 375 } 376 } 377 } 378 379 // This function searches for a terminal in the dictionary by its address. 380 // Due to the fact that words are ordered in the dictionary in a strict breadth-first order, 381 // it is possible to check for this with advantageous complexity. For each node, we search 382 // for groups with children and compare the children address with the address we look for. 383 // When we shoot the address we look for, it means the word we look for is in the children 384 // of the previous group. The only tricky part is the fact that if we arrive at the end of a 385 // node with the last group's children address still less than what we are searching for, we 386 // must descend the last group's children (for example, if the word we are searching for starts 387 // with a z, it's the last group of the root node, so all children addresses will be smaller 388 // than the address we look for, and we have to descend the z node). 389 /* Parameters : 390 * root: the dictionary buffer 391 * address: the byte position of the last chargroup of the word we are searching for (this is 392 * what is stored as the "bigram address" in each bigram) 393 * outword: an array to write the found word, with MAX_WORD_LENGTH size. 394 * outUnigramFrequency: a pointer to an int to write the frequency into. 395 * Return value : the length of the word, of 0 if the word was not found. 396 */ 397 inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address, 398 const int maxDepth, uint16_t* outWord, int* outUnigramFrequency) { 399 int pos = 0; 400 int wordPos = 0; 401 402 // One iteration of the outer loop iterates through nodes. As stated above, we will only 403 // traverse nodes that are actually a part of the terminal we are searching, so each time 404 // we enter this loop we are one depth level further than last time. 405 // The only reason we count nodes is because we want to reduce the probability of infinite 406 // looping in case there is a bug. Since we know there is an upper bound to the depth we are 407 // supposed to traverse, it does not hurt to count iterations. 408 for (int loopCount = maxDepth; loopCount > 0; --loopCount) { 409 int lastCandidateGroupPos = 0; 410 // Let's loop through char groups in this node searching for either the terminal 411 // or one of its ascendants. 412 for (int charGroupCount = getGroupCountAndForwardPointer(root, &pos); charGroupCount > 0; 413 --charGroupCount) { 414 const int startPos = pos; 415 const uint8_t flags = getFlagsAndForwardPointer(root, &pos); 416 const int32_t character = getCharCodeAndForwardPointer(root, &pos); 417 if (address == startPos) { 418 // We found the address. Copy the rest of the word in the buffer and return 419 // the length. 420 outWord[wordPos] = character; 421 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { 422 int32_t nextChar = getCharCodeAndForwardPointer(root, &pos); 423 // We count chars in order to avoid infinite loops if the file is broken or 424 // if there is some other bug 425 int charCount = maxDepth; 426 while (NOT_A_CHARACTER != nextChar && --charCount > 0) { 427 outWord[++wordPos] = nextChar; 428 nextChar = getCharCodeAndForwardPointer(root, &pos); 429 } 430 } 431 *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos); 432 return ++wordPos; 433 } 434 // We need to skip past this char group, so skip any remaining chars after the 435 // first and possibly the frequency. 436 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { 437 pos = skipOtherCharacters(root, pos); 438 } 439 pos = skipFrequency(flags, pos); 440 441 // The fact that this group has children is very important. Since we already know 442 // that this group does not match, if it has no children we know it is irrelevant 443 // to what we are searching for. 444 const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != 445 (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); 446 // We will write in `found' whether we have passed the children address we are 447 // searching for. For example if we search for "beer", the children of b are less 448 // than the address we are searching for and the children of c are greater. When we 449 // come here for c, we realize this is too big, and that we should descend b. 450 bool found; 451 if (hasChildren) { 452 // Here comes the tricky part. First, read the children position. 453 const int childrenPos = readChildrenPosition(root, flags, pos); 454 if (childrenPos > address) { 455 // If the children pos is greater than address, it means the previous chargroup, 456 // which address is stored in lastCandidateGroupPos, was the right one. 457 found = true; 458 } else if (1 >= charGroupCount) { 459 // However if we are on the LAST group of this node, and we have NOT shot the 460 // address we should descend THIS node. So we trick the lastCandidateGroupPos 461 // so that we will descend this node, not the previous one. 462 lastCandidateGroupPos = startPos; 463 found = true; 464 } else { 465 // Else, we should continue looking. 466 found = false; 467 } 468 } else { 469 // Even if we don't have children here, we could still be on the last group of this 470 // node. If this is the case, we should descend the last group that had children, 471 // and their address is already in lastCandidateGroup. 472 found = (1 >= charGroupCount); 473 } 474 475 if (found) { 476 // Okay, we found the group we should descend. Its address is in 477 // the lastCandidateGroupPos variable, so we just re-read it. 478 if (0 != lastCandidateGroupPos) { 479 const uint8_t lastFlags = 480 getFlagsAndForwardPointer(root, &lastCandidateGroupPos); 481 const int32_t lastChar = 482 getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); 483 // We copy all the characters in this group to the buffer 484 outWord[wordPos] = lastChar; 485 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) { 486 int32_t nextChar = 487 getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); 488 int charCount = maxDepth; 489 while (-1 != nextChar && --charCount > 0) { 490 outWord[++wordPos] = nextChar; 491 nextChar = getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); 492 } 493 } 494 ++wordPos; 495 // Now we only need to branch to the children address. Skip the frequency if 496 // it's there, read pos, and break to resume the search at pos. 497 lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos); 498 pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos); 499 break; 500 } else { 501 // Here is a little tricky part: we come here if we found out that all children 502 // addresses in this group are bigger than the address we are searching for. 503 // Should we conclude the word is not in the dictionary? No! It could still be 504 // one of the remaining chargroups in this node, so we have to keep looking in 505 // this node until we find it (or we realize it's not there either, in which 506 // case it's actually not in the dictionary). Pass the end of this group, ready 507 // to start the next one. 508 pos = skipChildrenPosAndAttributes(root, flags, pos); 509 } 510 } else { 511 // If we did not find it, we should record the last children address for the next 512 // iteration. 513 if (hasChildren) lastCandidateGroupPos = startPos; 514 // Now skip the end of this group (children pos and the attributes if any) so that 515 // our pos is after the end of this char group, at the start of the next one. 516 pos = skipChildrenPosAndAttributes(root, flags, pos); 517 } 518 519 } 520 } 521 // If we have looked through all the chargroups and found no match, the address is 522 // not the address of a terminal in this dictionary. 523 return 0; 524 } 525 526 static inline int backoff(const int unigramFreq) { 527 return unigramFreq; 528 // For some reason, applying the backoff weight gives bad results in tests. To apply the 529 // backoff weight, we divide the probability by 2, which in our storing format means 530 // decreasing the score by 8. 531 // TODO: figure out what's wrong with this. 532 // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8); 533 } 534 535 inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) { 536 // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the 537 // unigram frequency to be the median value of the 17th step from the top. A value of 538 // 0 for the bigram frequency represents the middle of the 16th step from the top, 539 // while a value of 15 represents the middle of the top step. 540 // See makedict.BinaryDictInputOutput for details. 541 const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); 542 return (int)(unigramFreq + (bigramFreq + 1) * stepSize); 543 } 544 545 // This returns a probability in log space. 546 inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap, 547 const uint8_t *bigramFilter, const int unigramFreq) { 548 if (!bigramMap || !bigramFilter) return backoff(unigramFreq); 549 if (!isInFilter(bigramFilter, position)) return backoff(unigramFreq); 550 const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position); 551 if (bigramFreqIt != bigramMap->end()) { 552 const int bigramFreq = bigramFreqIt->second; 553 return computeFrequencyForBigram(unigramFreq, bigramFreq); 554 } else { 555 return backoff(unigramFreq); 556 } 557 } 558 559 } // namespace latinime 560 561 #endif // LATINIME_BINARY_FORMAT_H 562