1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <assert.h> 18 #include <ctype.h> 19 #include <stdio.h> 20 #include <string.h> 21 22 #define LOG_TAG "LatinIME: correction.cpp" 23 24 #include "correction.h" 25 #include "dictionary.h" 26 #include "proximity_info.h" 27 28 namespace latinime { 29 30 ////////////////////// 31 // inline functions // 32 ////////////////////// 33 static const char QUOTE = '\''; 34 35 inline bool Correction::isQuote(const unsigned short c) { 36 const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(mInputIndex); 37 return (c == QUOTE && userTypedChar != QUOTE); 38 } 39 40 //////////////// 41 // Correction // 42 //////////////// 43 44 Correction::Correction(const int typedLetterMultiplier, const int fullWordMultiplier) 45 : TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier) { 46 } 47 48 void Correction::initCorrection(const ProximityInfo *pi, const int inputLength, 49 const int maxDepth) { 50 mProximityInfo = pi; 51 mInputLength = inputLength; 52 mMaxDepth = maxDepth; 53 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2; 54 } 55 56 void Correction::initCorrectionState( 57 const int rootPos, const int childCount, const bool traverseAll) { 58 latinime::initCorrectionState(mCorrectionStates, rootPos, childCount, traverseAll); 59 // TODO: remove 60 mCorrectionStates[0].mTransposedPos = mTransposedPos; 61 mCorrectionStates[0].mExcessivePos = mExcessivePos; 62 mCorrectionStates[0].mSkipPos = mSkipPos; 63 } 64 65 void Correction::setCorrectionParams(const int skipPos, const int excessivePos, 66 const int transposedPos, const int spaceProximityPos, const int missingSpacePos, 67 const bool useFullEditDistance) { 68 // TODO: remove 69 mTransposedPos = transposedPos; 70 mExcessivePos = excessivePos; 71 mSkipPos = skipPos; 72 // TODO: remove 73 mCorrectionStates[0].mTransposedPos = transposedPos; 74 mCorrectionStates[0].mExcessivePos = excessivePos; 75 mCorrectionStates[0].mSkipPos = skipPos; 76 77 mSpaceProximityPos = spaceProximityPos; 78 mMissingSpacePos = missingSpacePos; 79 mUseFullEditDistance = useFullEditDistance; 80 } 81 82 void Correction::checkState() { 83 if (DEBUG_DICT) { 84 int inputCount = 0; 85 if (mSkipPos >= 0) ++inputCount; 86 if (mExcessivePos >= 0) ++inputCount; 87 if (mTransposedPos >= 0) ++inputCount; 88 // TODO: remove this assert 89 assert(inputCount <= 1); 90 } 91 } 92 93 int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq, 94 const unsigned short *word) { 95 return Correction::RankingAlgorithm::calcFreqForSplitTwoWords( 96 firstFreq, secondFreq, this, word); 97 } 98 99 int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) { 100 const int outputIndex = mTerminalOutputIndex; 101 const int inputIndex = mTerminalInputIndex; 102 *wordLength = outputIndex + 1; 103 if (mProximityInfo->sameAsTyped(mWord, outputIndex + 1) || outputIndex < MIN_SUGGEST_DEPTH) { 104 return -1; 105 } 106 107 *word = mWord; 108 return Correction::RankingAlgorithm::calculateFinalFreq( 109 inputIndex, outputIndex, freq, mEditDistanceTable, this); 110 } 111 112 bool Correction::initProcessState(const int outputIndex) { 113 if (mCorrectionStates[outputIndex].mChildCount <= 0) { 114 return false; 115 } 116 mOutputIndex = outputIndex; 117 --(mCorrectionStates[outputIndex].mChildCount); 118 mInputIndex = mCorrectionStates[outputIndex].mInputIndex; 119 mNeedsToTraverseAllNodes = mCorrectionStates[outputIndex].mNeedsToTraverseAllNodes; 120 121 mEquivalentCharCount = mCorrectionStates[outputIndex].mEquivalentCharCount; 122 mProximityCount = mCorrectionStates[outputIndex].mProximityCount; 123 mTransposedCount = mCorrectionStates[outputIndex].mTransposedCount; 124 mExcessiveCount = mCorrectionStates[outputIndex].mExcessiveCount; 125 mSkippedCount = mCorrectionStates[outputIndex].mSkippedCount; 126 mLastCharExceeded = mCorrectionStates[outputIndex].mLastCharExceeded; 127 128 mTransposedPos = mCorrectionStates[outputIndex].mTransposedPos; 129 mExcessivePos = mCorrectionStates[outputIndex].mExcessivePos; 130 mSkipPos = mCorrectionStates[outputIndex].mSkipPos; 131 132 mMatching = false; 133 mProximityMatching = false; 134 mTransposing = false; 135 mExceeding = false; 136 mSkipping = false; 137 138 return true; 139 } 140 141 int Correction::goDownTree( 142 const int parentIndex, const int childCount, const int firstChildPos) { 143 mCorrectionStates[mOutputIndex].mParentIndex = parentIndex; 144 mCorrectionStates[mOutputIndex].mChildCount = childCount; 145 mCorrectionStates[mOutputIndex].mSiblingPos = firstChildPos; 146 return mOutputIndex; 147 } 148 149 // TODO: remove 150 int Correction::getOutputIndex() { 151 return mOutputIndex; 152 } 153 154 // TODO: remove 155 int Correction::getInputIndex() { 156 return mInputIndex; 157 } 158 159 // TODO: remove 160 bool Correction::needsToTraverseAllNodes() { 161 return mNeedsToTraverseAllNodes; 162 } 163 164 void Correction::incrementInputIndex() { 165 ++mInputIndex; 166 } 167 168 void Correction::incrementOutputIndex() { 169 ++mOutputIndex; 170 mCorrectionStates[mOutputIndex].mParentIndex = mCorrectionStates[mOutputIndex - 1].mParentIndex; 171 mCorrectionStates[mOutputIndex].mChildCount = mCorrectionStates[mOutputIndex - 1].mChildCount; 172 mCorrectionStates[mOutputIndex].mSiblingPos = mCorrectionStates[mOutputIndex - 1].mSiblingPos; 173 mCorrectionStates[mOutputIndex].mInputIndex = mInputIndex; 174 mCorrectionStates[mOutputIndex].mNeedsToTraverseAllNodes = mNeedsToTraverseAllNodes; 175 176 mCorrectionStates[mOutputIndex].mEquivalentCharCount = mEquivalentCharCount; 177 mCorrectionStates[mOutputIndex].mProximityCount = mProximityCount; 178 mCorrectionStates[mOutputIndex].mTransposedCount = mTransposedCount; 179 mCorrectionStates[mOutputIndex].mExcessiveCount = mExcessiveCount; 180 mCorrectionStates[mOutputIndex].mSkippedCount = mSkippedCount; 181 182 mCorrectionStates[mOutputIndex].mSkipPos = mSkipPos; 183 mCorrectionStates[mOutputIndex].mTransposedPos = mTransposedPos; 184 mCorrectionStates[mOutputIndex].mExcessivePos = mExcessivePos; 185 186 mCorrectionStates[mOutputIndex].mLastCharExceeded = mLastCharExceeded; 187 188 mCorrectionStates[mOutputIndex].mMatching = mMatching; 189 mCorrectionStates[mOutputIndex].mProximityMatching = mProximityMatching; 190 mCorrectionStates[mOutputIndex].mTransposing = mTransposing; 191 mCorrectionStates[mOutputIndex].mExceeding = mExceeding; 192 mCorrectionStates[mOutputIndex].mSkipping = mSkipping; 193 } 194 195 void Correction::startToTraverseAllNodes() { 196 mNeedsToTraverseAllNodes = true; 197 } 198 199 bool Correction::needsToPrune() const { 200 return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance; 201 } 202 203 // TODO: inline? 204 Correction::CorrectionType Correction::processSkipChar( 205 const int32_t c, const bool isTerminal, const bool inputIndexIncremented) { 206 mWord[mOutputIndex] = c; 207 if (needsToTraverseAllNodes() && isTerminal) { 208 mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0); 209 mTerminalOutputIndex = mOutputIndex; 210 incrementOutputIndex(); 211 return TRAVERSE_ALL_ON_TERMINAL; 212 } else { 213 incrementOutputIndex(); 214 return TRAVERSE_ALL_NOT_ON_TERMINAL; 215 } 216 } 217 218 inline bool isEquivalentChar(ProximityInfo::ProximityType type) { 219 return type == ProximityInfo::EQUIVALENT_CHAR; 220 } 221 222 Correction::CorrectionType Correction::processCharAndCalcState( 223 const int32_t c, const bool isTerminal) { 224 const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount); 225 // TODO: Change the limit if we'll allow two or more corrections 226 const bool noCorrectionsHappenedSoFar = correctionCount == 0; 227 const bool canTryCorrection = noCorrectionsHappenedSoFar; 228 int proximityIndex = 0; 229 mDistances[mOutputIndex] = NOT_A_DISTANCE; 230 231 if (mNeedsToTraverseAllNodes || isQuote(c)) { 232 bool incremented = false; 233 if (mLastCharExceeded && mInputIndex == mInputLength - 1) { 234 // TODO: Do not check the proximity if EditDistance exceeds the threshold 235 const ProximityInfo::ProximityType matchId = 236 mProximityInfo->getMatchedProximityId(mInputIndex, c, true, &proximityIndex); 237 if (isEquivalentChar(matchId)) { 238 mLastCharExceeded = false; 239 --mExcessiveCount; 240 mDistances[mOutputIndex] = 241 mProximityInfo->getNormalizedSquaredDistance(mInputIndex, 0); 242 } else if (matchId == ProximityInfo::NEAR_PROXIMITY_CHAR) { 243 mLastCharExceeded = false; 244 --mExcessiveCount; 245 ++mProximityCount; 246 mDistances[mOutputIndex] = 247 mProximityInfo->getNormalizedSquaredDistance(mInputIndex, proximityIndex); 248 } 249 incrementInputIndex(); 250 incremented = true; 251 } 252 return processSkipChar(c, isTerminal, incremented); 253 } 254 255 if (mExcessivePos >= 0) { 256 if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) { 257 mExcessivePos = mOutputIndex; 258 } 259 if (mExcessivePos < mInputLength - 1) { 260 mExceeding = mExcessivePos == mInputIndex && canTryCorrection; 261 } 262 } 263 264 if (mSkipPos >= 0) { 265 if (mSkippedCount == 0 && mSkipPos < mOutputIndex) { 266 if (DEBUG_DICT) { 267 assert(mSkipPos == mOutputIndex - 1); 268 } 269 mSkipPos = mOutputIndex; 270 } 271 mSkipping = mSkipPos == mOutputIndex && canTryCorrection; 272 } 273 274 if (mTransposedPos >= 0) { 275 if (mTransposedCount == 0 && mTransposedPos < mOutputIndex) { 276 mTransposedPos = mOutputIndex; 277 } 278 if (mTransposedPos < mInputLength - 1) { 279 mTransposing = mInputIndex == mTransposedPos && canTryCorrection; 280 } 281 } 282 283 bool secondTransposing = false; 284 if (mTransposedCount % 2 == 1) { 285 if (isEquivalentChar(mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) { 286 ++mTransposedCount; 287 secondTransposing = true; 288 } else if (mCorrectionStates[mOutputIndex].mExceeding) { 289 --mTransposedCount; 290 ++mExcessiveCount; 291 --mExcessivePos; 292 incrementInputIndex(); 293 } else { 294 --mTransposedCount; 295 if (DEBUG_CORRECTION) { 296 DUMP_WORD(mWord, mOutputIndex); 297 LOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 298 mTransposedCount, mExcessiveCount, c); 299 } 300 return UNRELATED; 301 } 302 } 303 304 // TODO: Change the limit if we'll allow two or more proximity chars with corrections 305 const bool checkProximityChars = noCorrectionsHappenedSoFar || mProximityCount == 0; 306 ProximityInfo::ProximityType matchedProximityCharId = secondTransposing 307 ? ProximityInfo::EQUIVALENT_CHAR 308 : mProximityInfo->getMatchedProximityId( 309 mInputIndex, c, checkProximityChars, &proximityIndex); 310 311 if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { 312 if (canTryCorrection && mOutputIndex > 0 313 && mCorrectionStates[mOutputIndex].mProximityMatching 314 && mCorrectionStates[mOutputIndex].mExceeding 315 && isEquivalentChar(mProximityInfo->getMatchedProximityId( 316 mInputIndex, mWord[mOutputIndex - 1], false))) { 317 if (DEBUG_CORRECTION) { 318 LOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]); 319 } 320 // Conversion p->e 321 // Example: 322 // wearth -> earth 323 // px -> (E)mmmmm 324 ++mExcessiveCount; 325 --mProximityCount; 326 mExcessivePos = mOutputIndex - 1; 327 ++mInputIndex; 328 // Here, we are doing something equivalent to matchedProximityCharId, 329 // but we already know that "excessive char correction" just happened 330 // so that we just need to check "mProximityCount == 0". 331 matchedProximityCharId = mProximityInfo->getMatchedProximityId( 332 mInputIndex, c, mProximityCount == 0, &proximityIndex); 333 } 334 } 335 336 if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { 337 // TODO: Optimize 338 // As the current char turned out to be an unrelated char, 339 // we will try other correction-types. Please note that mCorrectionStates[mOutputIndex] 340 // here refers to the previous state. 341 if (mInputIndex < mInputLength - 1 && mOutputIndex > 0 && mTransposedCount > 0 342 && !mCorrectionStates[mOutputIndex].mTransposing 343 && mCorrectionStates[mOutputIndex - 1].mTransposing 344 && isEquivalentChar(mProximityInfo->getMatchedProximityId( 345 mInputIndex, mWord[mOutputIndex - 1], false)) 346 && isEquivalentChar( 347 mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) { 348 // Conversion t->e 349 // Example: 350 // occaisional -> occa sional 351 // mmmmttx -> mmmm(E)mmmmmm 352 mTransposedCount -= 2; 353 ++mExcessiveCount; 354 ++mInputIndex; 355 } else if (mOutputIndex > 0 && mInputIndex > 0 && mTransposedCount > 0 356 && !mCorrectionStates[mOutputIndex].mTransposing 357 && mCorrectionStates[mOutputIndex - 1].mTransposing 358 && isEquivalentChar( 359 mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) { 360 // Conversion t->s 361 // Example: 362 // chcolate -> chocolate 363 // mmttx -> mmsmmmmmm 364 mTransposedCount -= 2; 365 ++mSkippedCount; 366 --mInputIndex; 367 } else if (canTryCorrection && mInputIndex > 0 368 && mCorrectionStates[mOutputIndex].mProximityMatching 369 && mCorrectionStates[mOutputIndex].mSkipping 370 && isEquivalentChar( 371 mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) { 372 // Conversion p->s 373 // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of 374 // proximity chars of "s", but it should rather be handled as a skipped char. 375 ++mSkippedCount; 376 --mProximityCount; 377 return processSkipChar(c, isTerminal, false); 378 } else if ((mExceeding || mTransposing) && mInputIndex - 1 < mInputLength 379 && isEquivalentChar( 380 mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) { 381 // 1.2. Excessive or transpose correction 382 if (mTransposing) { 383 ++mTransposedCount; 384 } else { 385 ++mExcessiveCount; 386 incrementInputIndex(); 387 } 388 } else if (mSkipping) { 389 // 3. Skip correction 390 ++mSkippedCount; 391 return processSkipChar(c, isTerminal, false); 392 } else { 393 if (DEBUG_CORRECTION) { 394 DUMP_WORD(mWord, mOutputIndex); 395 LOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 396 mTransposedCount, mExcessiveCount, c); 397 } 398 return UNRELATED; 399 } 400 } else if (secondTransposing) { 401 // If inputIndex is greater than mInputLength, that means there is no 402 // proximity chars. So, we don't need to check proximity. 403 mMatching = true; 404 } else if (isEquivalentChar(matchedProximityCharId)) { 405 mMatching = true; 406 ++mEquivalentCharCount; 407 mDistances[mOutputIndex] = mProximityInfo->getNormalizedSquaredDistance(mInputIndex, 0); 408 } else if (ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) { 409 mProximityMatching = true; 410 ++mProximityCount; 411 mDistances[mOutputIndex] = 412 mProximityInfo->getNormalizedSquaredDistance(mInputIndex, proximityIndex); 413 } 414 415 mWord[mOutputIndex] = c; 416 417 // 4. Last char excessive correction 418 mLastCharExceeded = mExcessiveCount == 0 && mSkippedCount == 0 && mTransposedCount == 0 419 && mProximityCount == 0 && (mInputIndex == mInputLength - 2); 420 const bool isSameAsUserTypedLength = (mInputLength == mInputIndex + 1) || mLastCharExceeded; 421 if (mLastCharExceeded) { 422 ++mExcessiveCount; 423 } 424 425 // Start traversing all nodes after the index exceeds the user typed length 426 if (isSameAsUserTypedLength) { 427 startToTraverseAllNodes(); 428 } 429 430 const bool needsToTryOnTerminalForTheLastPossibleExcessiveChar = 431 mExceeding && mInputIndex == mInputLength - 2; 432 433 // Finally, we are ready to go to the next character, the next "virtual node". 434 // We should advance the input index. 435 // We do this in this branch of the 'if traverseAllNodes' because we are still matching 436 // characters to input; the other branch is not matching them but searching for 437 // completions, this is why it does not have to do it. 438 incrementInputIndex(); 439 // Also, the next char is one "virtual node" depth more than this char. 440 incrementOutputIndex(); 441 442 if ((needsToTryOnTerminalForTheLastPossibleExcessiveChar 443 || isSameAsUserTypedLength) && isTerminal) { 444 mTerminalInputIndex = mInputIndex - 1; 445 mTerminalOutputIndex = mOutputIndex - 1; 446 if (DEBUG_CORRECTION) { 447 DUMP_WORD(mWord, mOutputIndex); 448 LOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 449 mTransposedCount, mExcessiveCount, c); 450 } 451 return ON_TERMINAL; 452 } else { 453 return NOT_ON_TERMINAL; 454 } 455 } 456 457 Correction::~Correction() { 458 } 459 460 ///////////////////////// 461 // static inline utils // 462 ///////////////////////// 463 464 static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; 465 static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { 466 return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); 467 } 468 469 static const int TWO_31ST_DIV_2 = S_INT_MAX / 2; 470 inline static void multiplyIntCapped(const int multiplier, int *base) { 471 const int temp = *base; 472 if (temp != S_INT_MAX) { 473 // Branch if multiplier == 2 for the optimization 474 if (multiplier == 2) { 475 *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX; 476 } else { 477 // TODO: This overflow check gives a wrong answer when, for example, 478 // temp = 2^16 + 1 and multiplier = 2^17 + 1. 479 // Fix this behavior. 480 const int tempRetval = temp * multiplier; 481 *base = tempRetval >= temp ? tempRetval : S_INT_MAX; 482 } 483 } 484 } 485 486 inline static int powerIntCapped(const int base, const int n) { 487 if (n <= 0) return 1; 488 if (base == 2) { 489 return n < 31 ? 1 << n : S_INT_MAX; 490 } else { 491 int ret = base; 492 for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret); 493 return ret; 494 } 495 } 496 497 inline static void multiplyRate(const int rate, int *freq) { 498 if (*freq != S_INT_MAX) { 499 if (*freq > 1000000) { 500 *freq /= 100; 501 multiplyIntCapped(rate, freq); 502 } else { 503 multiplyIntCapped(rate, freq); 504 *freq /= 100; 505 } 506 } 507 } 508 509 inline static int getQuoteCount(const unsigned short* word, const int length) { 510 int quoteCount = 0; 511 for (int i = 0; i < length; ++i) { 512 if(word[i] == '\'') { 513 ++quoteCount; 514 } 515 } 516 return quoteCount; 517 } 518 519 inline static bool isUpperCase(unsigned short c) { 520 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { 521 c = BASE_CHARS[c]; 522 } 523 if (isupper(c)) { 524 return true; 525 } 526 return false; 527 } 528 529 /* static */ 530 inline static int editDistance( 531 int* editDistanceTable, const unsigned short* input, 532 const int inputLength, const unsigned short* output, const int outputLength) { 533 // dp[li][lo] dp[a][b] = dp[ a * lo + b] 534 int* dp = editDistanceTable; 535 const int li = inputLength + 1; 536 const int lo = outputLength + 1; 537 for (int i = 0; i < li; ++i) { 538 dp[lo * i] = i; 539 } 540 for (int i = 0; i < lo; ++i) { 541 dp[i] = i; 542 } 543 544 for (int i = 0; i < li - 1; ++i) { 545 for (int j = 0; j < lo - 1; ++j) { 546 const uint32_t ci = Dictionary::toBaseLowerCase(input[i]); 547 const uint32_t co = Dictionary::toBaseLowerCase(output[j]); 548 const uint16_t cost = (ci == co) ? 0 : 1; 549 dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1, 550 min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost)); 551 if (i > 0 && j > 0 && ci == Dictionary::toBaseLowerCase(output[j - 1]) 552 && co == Dictionary::toBaseLowerCase(input[i - 1])) { 553 dp[(i + 1) * lo + (j + 1)] = min( 554 dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost); 555 } 556 } 557 } 558 559 if (DEBUG_EDIT_DISTANCE) { 560 LOGI("IN = %d, OUT = %d", inputLength, outputLength); 561 for (int i = 0; i < li; ++i) { 562 for (int j = 0; j < lo; ++j) { 563 LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); 564 } 565 } 566 } 567 return dp[li * lo - 1]; 568 } 569 570 ////////////////////// 571 // RankingAlgorithm // 572 ////////////////////// 573 574 /* static */ 575 int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex, 576 const int freq, int* editDistanceTable, const Correction* correction) { 577 const int excessivePos = correction->getExcessivePos(); 578 const int inputLength = correction->mInputLength; 579 const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; 580 const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER; 581 const ProximityInfo *proximityInfo = correction->mProximityInfo; 582 const int skippedCount = correction->mSkippedCount; 583 const int transposedCount = correction->mTransposedCount / 2; 584 const int excessiveCount = correction->mExcessiveCount + correction->mTransposedCount % 2; 585 const int proximityMatchedCount = correction->mProximityCount; 586 const bool lastCharExceeded = correction->mLastCharExceeded; 587 const bool useFullEditDistance = correction->mUseFullEditDistance; 588 const int outputLength = outputIndex + 1; 589 if (skippedCount >= inputLength || inputLength == 0) { 590 return -1; 591 } 592 593 // TODO: find more robust way 594 bool sameLength = lastCharExceeded ? (inputLength == inputIndex + 2) 595 : (inputLength == inputIndex + 1); 596 597 // TODO: use mExcessiveCount 598 const int matchCount = inputLength - correction->mProximityCount - excessiveCount; 599 600 const unsigned short* word = correction->mWord; 601 const bool skipped = skippedCount > 0; 602 603 const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1) 604 - getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength)); 605 606 // TODO: Calculate edit distance for transposed and excessive 607 int ed = 0; 608 int adjustedProximityMatchedCount = proximityMatchedCount; 609 610 int finalFreq = freq; 611 612 // TODO: Optimize this. 613 // TODO: Ignoring edit distance for transposed char, for now 614 if (transposedCount == 0 && (proximityMatchedCount > 0 || skipped || excessiveCount > 0)) { 615 const unsigned short* primaryInputWord = proximityInfo->getPrimaryInputWord(); 616 ed = editDistance(editDistanceTable, primaryInputWord, 617 inputLength, word, outputIndex + 1); 618 const int matchWeight = powerIntCapped(typedLetterMultiplier, 619 max(inputLength, outputIndex + 1) - ed); 620 multiplyIntCapped(matchWeight, &finalFreq); 621 622 // TODO: Demote further if there are two or more excessive chars with longer user input? 623 if (inputLength > outputIndex + 1) { 624 multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq); 625 } 626 627 ed = max(0, ed - quoteDiffCount); 628 629 if (ed == 1 && (inputLength == outputIndex || inputLength == outputIndex + 2)) { 630 // Promote a word with just one skipped or excessive char 631 if (sameLength) { 632 multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &finalFreq); 633 } else { 634 multiplyIntCapped(typedLetterMultiplier, &finalFreq); 635 } 636 } else if (ed == 0) { 637 multiplyIntCapped(typedLetterMultiplier, &finalFreq); 638 sameLength = true; 639 } 640 adjustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)), 641 proximityMatchedCount); 642 } else { 643 // TODO: Calculate the edit distance for transposed char 644 const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount); 645 multiplyIntCapped(matchWeight, &finalFreq); 646 } 647 648 if (proximityInfo->getMatchedProximityId(0, word[0], true) 649 == ProximityInfo::UNRELATED_CHAR) { 650 multiplyRate(FIRST_CHAR_DIFFERENT_DEMOTION_RATE, &finalFreq); 651 } 652 653 /////////////////////////////////////////////// 654 // Promotion and Demotion for each correction 655 656 // Demotion for a word with missing character 657 if (skipped) { 658 const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 659 * (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X) 660 / (10 * inputLength 661 - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); 662 if (DEBUG_DICT_FULL) { 663 LOGI("Demotion rate for missing character is %d.", demotionRate); 664 } 665 multiplyRate(demotionRate, &finalFreq); 666 } 667 668 // Demotion for a word with transposed character 669 if (transposedCount > 0) multiplyRate( 670 WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq); 671 672 // Demotion for a word with excessive character 673 if (excessiveCount > 0) { 674 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq); 675 if (!lastCharExceeded && !proximityInfo->existsAdjacentProximityChars(excessivePos)) { 676 if (DEBUG_CORRECTION_FREQ) { 677 LOGI("Double excessive demotion"); 678 } 679 // If an excessive character is not adjacent to the left char or the right char, 680 // we will demote this word. 681 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq); 682 } 683 } 684 685 // Score calibration by touch coordinates is being done only for pure-fat finger typing error 686 // cases. 687 // TODO: Remove this constraint. 688 if (CALIBRATE_SCORE_BY_TOUCH_COORDINATES && proximityInfo->touchPositionCorrectionEnabled() 689 && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0) { 690 for (int i = 0; i < outputLength; ++i) { 691 const int squaredDistance = correction->mDistances[i]; 692 if (i < adjustedProximityMatchedCount) { 693 multiplyIntCapped(typedLetterMultiplier, &finalFreq); 694 } 695 if (squaredDistance >= 0) { 696 // Promote or demote the score according to the distance from the sweet spot 697 static const float A = ZERO_DISTANCE_PROMOTION_RATE / 100.0f; 698 static const float B = 1.0f; 699 static const float C = 0.5f; 700 static const float R1 = NEUTRAL_SCORE_SQUARED_RADIUS; 701 static const float R2 = HALF_SCORE_SQUARED_RADIUS; 702 const float x = (float)squaredDistance 703 / ProximityInfo::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR; 704 const float factor = (x < R1) 705 ? (A * (R1 - x) + B * x) / R1 706 : (B * (R2 - x) + C * (x - R1)) / (R2 - R1); 707 // factor is piecewise linear function like: 708 // A -_ . 709 // ^-_ . 710 // B \ . 711 // \ . 712 // C \ . 713 // 0 R1 R2 714 multiplyRate((int)(factor * 100), &finalFreq); 715 } else if (squaredDistance == PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO) { 716 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); 717 } 718 } 719 } else { 720 // Promotion for a word with proximity characters 721 for (int i = 0; i < adjustedProximityMatchedCount; ++i) { 722 // A word with proximity corrections 723 if (DEBUG_DICT_FULL) { 724 LOGI("Found a proximity correction."); 725 } 726 multiplyIntCapped(typedLetterMultiplier, &finalFreq); 727 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); 728 } 729 } 730 731 const int errorCount = adjustedProximityMatchedCount > 0 732 ? adjustedProximityMatchedCount 733 : (proximityMatchedCount + transposedCount); 734 multiplyRate( 735 100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputLength, &finalFreq); 736 737 // Promotion for an exactly matched word 738 if (ed == 0) { 739 // Full exact match 740 if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0) { 741 finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); 742 } 743 } 744 745 // Promote a word with no correction 746 if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0) { 747 multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); 748 } 749 750 // TODO: Check excessive count and transposed count 751 // TODO: Remove this if possible 752 /* 753 If the last character of the user input word is the same as the next character 754 of the output word, and also all of characters of the user input are matched 755 to the output word, we'll promote that word a bit because 756 that word can be considered the combination of skipped and matched characters. 757 This means that the 'sm' pattern wins over the 'ma' pattern. 758 e.g.) 759 shel -> shell [mmmma] or [mmmsm] 760 hel -> hello [mmmaa] or [mmsma] 761 m ... matching 762 s ... skipping 763 a ... traversing all 764 t ... transposing 765 e ... exceeding 766 p ... proximity matching 767 */ 768 if (matchCount == inputLength && matchCount >= 2 && !skipped 769 && word[matchCount] == word[matchCount - 1]) { 770 multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq); 771 } 772 773 // TODO: Do not use sameLength? 774 if (sameLength) { 775 multiplyIntCapped(fullWordMultiplier, &finalFreq); 776 } 777 778 if (useFullEditDistance && outputLength > inputLength + 1) { 779 const int diff = outputLength - inputLength - 1; 780 const int divider = diff < 31 ? 1 << diff : S_INT_MAX; 781 finalFreq = divider > finalFreq ? 1 : finalFreq / divider; 782 } 783 784 if (DEBUG_DICT_FULL) { 785 LOGI("calc: %d, %d", outputIndex, sameLength); 786 } 787 788 if (DEBUG_CORRECTION_FREQ) { 789 DUMP_WORD(correction->mWord, outputIndex + 1); 790 LOGI("FinalFreq: [P%d, S%d, T%d, E%d] %d, %d, %d, %d, %d", proximityMatchedCount, 791 skippedCount, transposedCount, excessiveCount, lastCharExceeded, sameLength, 792 quoteDiffCount, ed, finalFreq); 793 } 794 795 return finalFreq; 796 } 797 798 /* static */ 799 int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( 800 const int firstFreq, const int secondFreq, const Correction* correction, 801 const unsigned short *word) { 802 const int spaceProximityPos = correction->mSpaceProximityPos; 803 const int missingSpacePos = correction->mMissingSpacePos; 804 if (DEBUG_DICT) { 805 int inputCount = 0; 806 if (spaceProximityPos >= 0) ++inputCount; 807 if (missingSpacePos >= 0) ++inputCount; 808 assert(inputCount <= 1); 809 } 810 const bool isSpaceProximity = spaceProximityPos >= 0; 811 const int inputLength = correction->mInputLength; 812 const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; 813 const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) 814 : (inputLength - missingSpacePos); 815 const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; 816 817 bool firstCapitalizedWordDemotion = false; 818 if (firstWordLength >= 2) { 819 firstCapitalizedWordDemotion = isUpperCase(word[0]); 820 } 821 822 bool secondCapitalizedWordDemotion = false; 823 if (secondWordLength >= 2) { 824 secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]); 825 } 826 827 const bool capitalizedWordDemotion = 828 firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion; 829 830 if (DEBUG_DICT_FULL) { 831 LOGI("Two words: %c, %c, %d", word[0], word[firstWordLength + 1], capitalizedWordDemotion); 832 } 833 834 if (firstWordLength == 0 || secondWordLength == 0) { 835 return 0; 836 } 837 const int firstDemotionRate = 100 - 100 / (firstWordLength + 1); 838 int tempFirstFreq = firstFreq; 839 multiplyRate(firstDemotionRate, &tempFirstFreq); 840 841 const int secondDemotionRate = 100 - 100 / (secondWordLength + 1); 842 int tempSecondFreq = secondFreq; 843 multiplyRate(secondDemotionRate, &tempSecondFreq); 844 845 const int totalLength = firstWordLength + secondWordLength; 846 847 // Promote pairFreq with multiplying by 2, because the word length is the same as the typed 848 // length. 849 int totalFreq = tempFirstFreq + tempSecondFreq; 850 851 // This is a workaround to try offsetting the not-enough-demotion which will be done in 852 // calcNormalizedScore in Utils.java. 853 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) 854 // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by 855 // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length)) 856 const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength); 857 multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq); 858 859 // At this moment, totalFreq is calculated by the following formula: 860 // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1))) 861 // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1)) 862 863 multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq); 864 865 // This is another workaround to offset the demotion which will be done in 866 // calcNormalizedScore in Utils.java. 867 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote 868 // the same amount because we already have adjusted the synthetic freq of this "missing or 869 // mistyped space" suggestion candidate above in this method. 870 const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength); 871 multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq); 872 873 if (isSpaceProximity) { 874 // A word pair with one space proximity correction 875 if (DEBUG_DICT) { 876 LOGI("Found a word pair with space proximity correction."); 877 } 878 multiplyIntCapped(typedLetterMultiplier, &totalFreq); 879 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); 880 } 881 882 multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); 883 884 if (capitalizedWordDemotion) { 885 multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq); 886 } 887 888 return totalFreq; 889 } 890 891 } // namespace latinime 892