1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #define LOG_TAG "LatinIME: correction.cpp" 18 19 #include <cmath> 20 21 #include "char_utils.h" 22 #include "correction.h" 23 #include "defines.h" 24 #include "proximity_info_state.h" 25 #include "suggest_utils.h" 26 #include "suggest/policyimpl/utils/edit_distance.h" 27 #include "suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h" 28 29 namespace latinime { 30 31 class ProximityInfo; 32 33 ///////////////////////////// 34 // edit distance funcitons // 35 ///////////////////////////// 36 37 inline static void initEditDistance(int *editDistanceTable) { 38 for (int i = 0; i <= MAX_WORD_LENGTH; ++i) { 39 editDistanceTable[i] = i; 40 } 41 } 42 43 inline static void dumpEditDistance10ForDebug(int *editDistanceTable, 44 const int editDistanceTableWidth, const int outputLength) { 45 if (DEBUG_DICT) { 46 AKLOGI("EditDistanceTable"); 47 for (int i = 0; i <= 10; ++i) { 48 int c[11]; 49 for (int j = 0; j <= 10; ++j) { 50 if (j < editDistanceTableWidth + 1 && i < outputLength + 1) { 51 c[j] = (editDistanceTable + i * (editDistanceTableWidth + 1))[j]; 52 } else { 53 c[j] = -1; 54 } 55 } 56 AKLOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]", 57 c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]); 58 (void)c; // To suppress compiler warning 59 } 60 } 61 } 62 63 inline static int getCurrentEditDistance(int *editDistanceTable, const int editDistanceTableWidth, 64 const int outputLength, const int inputSize) { 65 if (DEBUG_EDIT_DISTANCE) { 66 AKLOGI("getCurrentEditDistance %d, %d", inputSize, outputLength); 67 } 68 return editDistanceTable[(editDistanceTableWidth + 1) * (outputLength) + inputSize]; 69 } 70 71 //////////////// 72 // Correction // 73 //////////////// 74 75 void Correction::resetCorrection() { 76 mTotalTraverseCount = 0; 77 } 78 79 void Correction::initCorrection(const ProximityInfo *pi, const int inputSize, const int maxDepth) { 80 mProximityInfo = pi; 81 mInputSize = inputSize; 82 mMaxDepth = maxDepth; 83 mMaxEditDistance = mInputSize < 5 ? 2 : mInputSize / 2; 84 // TODO: This is not supposed to be required. Check what's going wrong with 85 // editDistance[0 ~ MAX_WORD_LENGTH] 86 initEditDistance(mEditDistanceTable); 87 } 88 89 void Correction::initCorrectionState( 90 const int rootPos, const int childCount, const bool traverseAll) { 91 latinime::initCorrectionState(mCorrectionStates, rootPos, childCount, traverseAll); 92 // TODO: remove 93 mCorrectionStates[0].mTransposedPos = mTransposedPos; 94 mCorrectionStates[0].mExcessivePos = mExcessivePos; 95 mCorrectionStates[0].mSkipPos = mSkipPos; 96 } 97 98 void Correction::setCorrectionParams(const int skipPos, const int excessivePos, 99 const int transposedPos, const int spaceProximityPos, const int missingSpacePos, 100 const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) { 101 // TODO: remove 102 mTransposedPos = transposedPos; 103 mExcessivePos = excessivePos; 104 mSkipPos = skipPos; 105 // TODO: remove 106 mCorrectionStates[0].mTransposedPos = transposedPos; 107 mCorrectionStates[0].mExcessivePos = excessivePos; 108 mCorrectionStates[0].mSkipPos = skipPos; 109 110 mSpaceProximityPos = spaceProximityPos; 111 mMissingSpacePos = missingSpacePos; 112 mUseFullEditDistance = useFullEditDistance; 113 mDoAutoCompletion = doAutoCompletion; 114 mMaxErrors = maxErrors; 115 } 116 117 void Correction::checkState() const { 118 if (DEBUG_DICT) { 119 int inputCount = 0; 120 if (mSkipPos >= 0) ++inputCount; 121 if (mExcessivePos >= 0) ++inputCount; 122 if (mTransposedPos >= 0) ++inputCount; 123 } 124 } 125 126 bool Correction::sameAsTyped() const { 127 return mProximityInfoState.sameAsTyped(mWord, mOutputIndex); 128 } 129 130 int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray, 131 const int wordCount, const bool isSpaceProximity, const int *word) const { 132 return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray, 133 wordCount, this, isSpaceProximity, word); 134 } 135 136 int Correction::getFinalProbability(const int probability, int **word, int *wordLength) { 137 return getFinalProbabilityInternal(probability, word, wordLength, mInputSize); 138 } 139 140 int Correction::getFinalProbabilityForSubQueue(const int probability, int **word, int *wordLength, 141 const int inputSize) { 142 return getFinalProbabilityInternal(probability, word, wordLength, inputSize); 143 } 144 145 bool Correction::initProcessState(const int outputIndex) { 146 if (mCorrectionStates[outputIndex].mChildCount <= 0) { 147 return false; 148 } 149 mOutputIndex = outputIndex; 150 --(mCorrectionStates[outputIndex].mChildCount); 151 mInputIndex = mCorrectionStates[outputIndex].mInputIndex; 152 mNeedsToTraverseAllNodes = mCorrectionStates[outputIndex].mNeedsToTraverseAllNodes; 153 154 mEquivalentCharCount = mCorrectionStates[outputIndex].mEquivalentCharCount; 155 mProximityCount = mCorrectionStates[outputIndex].mProximityCount; 156 mTransposedCount = mCorrectionStates[outputIndex].mTransposedCount; 157 mExcessiveCount = mCorrectionStates[outputIndex].mExcessiveCount; 158 mSkippedCount = mCorrectionStates[outputIndex].mSkippedCount; 159 mLastCharExceeded = mCorrectionStates[outputIndex].mLastCharExceeded; 160 161 mTransposedPos = mCorrectionStates[outputIndex].mTransposedPos; 162 mExcessivePos = mCorrectionStates[outputIndex].mExcessivePos; 163 mSkipPos = mCorrectionStates[outputIndex].mSkipPos; 164 165 mMatching = false; 166 mProximityMatching = false; 167 mAdditionalProximityMatching = false; 168 mTransposing = false; 169 mExceeding = false; 170 mSkipping = false; 171 172 return true; 173 } 174 175 int Correction::goDownTree(const int parentIndex, const int childCount, const int firstChildPos) { 176 mCorrectionStates[mOutputIndex].mParentIndex = parentIndex; 177 mCorrectionStates[mOutputIndex].mChildCount = childCount; 178 mCorrectionStates[mOutputIndex].mSiblingPos = firstChildPos; 179 return mOutputIndex; 180 } 181 182 // TODO: remove 183 int Correction::getInputIndex() const { 184 return mInputIndex; 185 } 186 187 bool Correction::needsToPrune() const { 188 // TODO: use edit distance here 189 return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance 190 // Allow one char longer word for missing character 191 || (!mDoAutoCompletion && (mOutputIndex > mInputSize)); 192 } 193 194 inline static bool isEquivalentChar(ProximityType type) { 195 return type == MATCH_CHAR; 196 } 197 198 inline static bool isProximityCharOrEquivalentChar(ProximityType type) { 199 return type == MATCH_CHAR || type == PROXIMITY_CHAR; 200 } 201 202 Correction::CorrectionType Correction::processCharAndCalcState(const int c, const bool isTerminal) { 203 const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount); 204 if (correctionCount > mMaxErrors) { 205 return processUnrelatedCorrectionType(); 206 } 207 208 // TODO: Change the limit if we'll allow two or more corrections 209 const bool noCorrectionsHappenedSoFar = correctionCount == 0; 210 const bool canTryCorrection = noCorrectionsHappenedSoFar; 211 int proximityIndex = 0; 212 mDistances[mOutputIndex] = NOT_A_DISTANCE; 213 214 // Skip checking this node 215 if (mNeedsToTraverseAllNodes || isSingleQuote(c)) { 216 bool incremented = false; 217 if (mLastCharExceeded && mInputIndex == mInputSize - 1) { 218 // TODO: Do not check the proximity if EditDistance exceeds the threshold 219 const ProximityType matchId = mProximityInfoState.getProximityType( 220 mInputIndex, c, true, &proximityIndex); 221 if (isEquivalentChar(matchId)) { 222 mLastCharExceeded = false; 223 --mExcessiveCount; 224 mDistances[mOutputIndex] = 225 mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, 0); 226 } else if (matchId == PROXIMITY_CHAR) { 227 mLastCharExceeded = false; 228 --mExcessiveCount; 229 ++mProximityCount; 230 mDistances[mOutputIndex] = mProximityInfoState.getNormalizedSquaredDistance( 231 mInputIndex, proximityIndex); 232 } 233 if (!isSingleQuote(c)) { 234 incrementInputIndex(); 235 incremented = true; 236 } 237 } 238 return processSkipChar(c, isTerminal, incremented); 239 } 240 241 // Check possible corrections. 242 if (mExcessivePos >= 0) { 243 if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) { 244 mExcessivePos = mOutputIndex; 245 } 246 if (mExcessivePos < mInputSize - 1) { 247 mExceeding = mExcessivePos == mInputIndex && canTryCorrection; 248 } 249 } 250 251 if (mSkipPos >= 0) { 252 if (mSkippedCount == 0 && mSkipPos < mOutputIndex) { 253 if (DEBUG_DICT) { 254 // TODO: Enable this assertion. 255 //ASSERT(mSkipPos == mOutputIndex - 1); 256 } 257 mSkipPos = mOutputIndex; 258 } 259 mSkipping = mSkipPos == mOutputIndex && canTryCorrection; 260 } 261 262 if (mTransposedPos >= 0) { 263 if (mTransposedCount == 0 && mTransposedPos < mOutputIndex) { 264 mTransposedPos = mOutputIndex; 265 } 266 if (mTransposedPos < mInputSize - 1) { 267 mTransposing = mInputIndex == mTransposedPos && canTryCorrection; 268 } 269 } 270 271 bool secondTransposing = false; 272 if (mTransposedCount % 2 == 1) { 273 if (isEquivalentChar(mProximityInfoState.getProximityType( 274 mInputIndex - 1, c, false))) { 275 ++mTransposedCount; 276 secondTransposing = true; 277 } else if (mCorrectionStates[mOutputIndex].mExceeding) { 278 --mTransposedCount; 279 ++mExcessiveCount; 280 --mExcessivePos; 281 incrementInputIndex(); 282 } else { 283 --mTransposedCount; 284 if (DEBUG_CORRECTION 285 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) 286 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 287 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 288 DUMP_WORD(mWord, mOutputIndex); 289 AKLOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 290 mTransposedCount, mExcessiveCount, c); 291 } 292 return processUnrelatedCorrectionType(); 293 } 294 } 295 296 // TODO: Change the limit if we'll allow two or more proximity chars with corrections 297 // Work around: When the mMaxErrors is 1, we only allow just one error 298 // including proximity correction. 299 const bool checkProximityChars = (mMaxErrors > 1) 300 ? (noCorrectionsHappenedSoFar || mProximityCount == 0) 301 : (noCorrectionsHappenedSoFar && mProximityCount == 0); 302 303 ProximityType matchedProximityCharId = secondTransposing 304 ? MATCH_CHAR 305 : mProximityInfoState.getProximityType( 306 mInputIndex, c, checkProximityChars, &proximityIndex); 307 308 if (SUBSTITUTION_CHAR == matchedProximityCharId 309 || ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { 310 if (canTryCorrection && mOutputIndex > 0 311 && mCorrectionStates[mOutputIndex].mProximityMatching 312 && mCorrectionStates[mOutputIndex].mExceeding 313 && isEquivalentChar(mProximityInfoState.getProximityType( 314 mInputIndex, mWord[mOutputIndex - 1], false))) { 315 if (DEBUG_CORRECTION 316 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) 317 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 318 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 319 AKLOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]); 320 } 321 // Conversion p->e 322 // Example: 323 // wearth -> earth 324 // px -> (E)mmmmm 325 ++mExcessiveCount; 326 --mProximityCount; 327 mExcessivePos = mOutputIndex - 1; 328 ++mInputIndex; 329 // Here, we are doing something equivalent to matchedProximityCharId, 330 // but we already know that "excessive char correction" just happened 331 // so that we just need to check "mProximityCount == 0". 332 matchedProximityCharId = mProximityInfoState.getProximityType( 333 mInputIndex, c, mProximityCount == 0, &proximityIndex); 334 } 335 } 336 337 if (SUBSTITUTION_CHAR == matchedProximityCharId 338 || ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { 339 if (ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { 340 mAdditionalProximityMatching = true; 341 } 342 // TODO: Optimize 343 // As the current char turned out to be an unrelated char, 344 // we will try other correction-types. Please note that mCorrectionStates[mOutputIndex] 345 // here refers to the previous state. 346 if (mInputIndex < mInputSize - 1 && mOutputIndex > 0 && mTransposedCount > 0 347 && !mCorrectionStates[mOutputIndex].mTransposing 348 && mCorrectionStates[mOutputIndex - 1].mTransposing 349 && isEquivalentChar(mProximityInfoState.getProximityType( 350 mInputIndex, mWord[mOutputIndex - 1], false)) 351 && isEquivalentChar( 352 mProximityInfoState.getProximityType(mInputIndex + 1, c, false))) { 353 // Conversion t->e 354 // Example: 355 // occaisional -> occa sional 356 // mmmmttx -> mmmm(E)mmmmmm 357 mTransposedCount -= 2; 358 ++mExcessiveCount; 359 ++mInputIndex; 360 } else if (mOutputIndex > 0 && mInputIndex > 0 && mTransposedCount > 0 361 && !mCorrectionStates[mOutputIndex].mTransposing 362 && mCorrectionStates[mOutputIndex - 1].mTransposing 363 && isEquivalentChar( 364 mProximityInfoState.getProximityType(mInputIndex - 1, c, false))) { 365 // Conversion t->s 366 // Example: 367 // chcolate -> chocolate 368 // mmttx -> mmsmmmmmm 369 mTransposedCount -= 2; 370 ++mSkippedCount; 371 --mInputIndex; 372 } else if (canTryCorrection && mInputIndex > 0 373 && mCorrectionStates[mOutputIndex].mProximityMatching 374 && mCorrectionStates[mOutputIndex].mSkipping 375 && isEquivalentChar( 376 mProximityInfoState.getProximityType(mInputIndex - 1, c, false))) { 377 // Conversion p->s 378 // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of 379 // proximity chars of "s", but it should rather be handled as a skipped char. 380 ++mSkippedCount; 381 --mProximityCount; 382 return processSkipChar(c, isTerminal, false); 383 } else if (mInputIndex - 1 < mInputSize 384 && mSkippedCount > 0 385 && mCorrectionStates[mOutputIndex].mSkipping 386 && mCorrectionStates[mOutputIndex].mAdditionalProximityMatching 387 && isProximityCharOrEquivalentChar( 388 mProximityInfoState.getProximityType(mInputIndex + 1, c, false))) { 389 // Conversion s->a 390 incrementInputIndex(); 391 --mSkippedCount; 392 mProximityMatching = true; 393 ++mProximityCount; 394 mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO; 395 } else if ((mExceeding || mTransposing) && mInputIndex - 1 < mInputSize 396 && isEquivalentChar( 397 mProximityInfoState.getProximityType(mInputIndex + 1, c, false))) { 398 // 1.2. Excessive or transpose correction 399 if (mTransposing) { 400 ++mTransposedCount; 401 } else { 402 ++mExcessiveCount; 403 incrementInputIndex(); 404 } 405 if (DEBUG_CORRECTION 406 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) 407 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 408 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 409 DUMP_WORD(mWord, mOutputIndex); 410 if (mTransposing) { 411 AKLOGI("TRANSPOSE: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 412 mTransposedCount, mExcessiveCount, c); 413 } else { 414 AKLOGI("EXCEED: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 415 mTransposedCount, mExcessiveCount, c); 416 } 417 } 418 } else if (mSkipping) { 419 // 3. Skip correction 420 ++mSkippedCount; 421 if (DEBUG_CORRECTION 422 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) 423 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 424 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 425 AKLOGI("SKIP: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 426 mTransposedCount, mExcessiveCount, c); 427 } 428 return processSkipChar(c, isTerminal, false); 429 } else if (ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { 430 // As a last resort, use additional proximity characters 431 mProximityMatching = true; 432 ++mProximityCount; 433 mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO; 434 if (DEBUG_CORRECTION 435 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) 436 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 437 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 438 AKLOGI("ADDITIONALPROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 439 mTransposedCount, mExcessiveCount, c); 440 } 441 } else { 442 if (DEBUG_CORRECTION 443 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) 444 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 445 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 446 DUMP_WORD(mWord, mOutputIndex); 447 AKLOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 448 mTransposedCount, mExcessiveCount, c); 449 } 450 return processUnrelatedCorrectionType(); 451 } 452 } else if (secondTransposing) { 453 // If inputIndex is greater than mInputSize, that means there is no 454 // proximity chars. So, we don't need to check proximity. 455 mMatching = true; 456 } else if (isEquivalentChar(matchedProximityCharId)) { 457 mMatching = true; 458 ++mEquivalentCharCount; 459 mDistances[mOutputIndex] = mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, 0); 460 } else if (PROXIMITY_CHAR == matchedProximityCharId) { 461 mProximityMatching = true; 462 ++mProximityCount; 463 mDistances[mOutputIndex] = 464 mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, proximityIndex); 465 if (DEBUG_CORRECTION 466 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) 467 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 468 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 469 AKLOGI("PROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 470 mTransposedCount, mExcessiveCount, c); 471 } 472 } 473 474 addCharToCurrentWord(c); 475 476 // 4. Last char excessive correction 477 mLastCharExceeded = mExcessiveCount == 0 && mSkippedCount == 0 && mTransposedCount == 0 478 && mProximityCount == 0 && (mInputIndex == mInputSize - 2); 479 const bool isSameAsUserTypedLength = (mInputSize == mInputIndex + 1) || mLastCharExceeded; 480 if (mLastCharExceeded) { 481 ++mExcessiveCount; 482 } 483 484 // Start traversing all nodes after the index exceeds the user typed length 485 if (isSameAsUserTypedLength) { 486 startToTraverseAllNodes(); 487 } 488 489 const bool needsToTryOnTerminalForTheLastPossibleExcessiveChar = 490 mExceeding && mInputIndex == mInputSize - 2; 491 492 // Finally, we are ready to go to the next character, the next "virtual node". 493 // We should advance the input index. 494 // We do this in this branch of the 'if traverseAllNodes' because we are still matching 495 // characters to input; the other branch is not matching them but searching for 496 // completions, this is why it does not have to do it. 497 incrementInputIndex(); 498 // Also, the next char is one "virtual node" depth more than this char. 499 incrementOutputIndex(); 500 501 if ((needsToTryOnTerminalForTheLastPossibleExcessiveChar 502 || isSameAsUserTypedLength) && isTerminal) { 503 mTerminalInputIndex = mInputIndex - 1; 504 mTerminalOutputIndex = mOutputIndex - 1; 505 if (DEBUG_CORRECTION 506 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) 507 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 508 DUMP_WORD(mWord, mOutputIndex); 509 AKLOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 510 mTransposedCount, mExcessiveCount, c); 511 } 512 return ON_TERMINAL; 513 } else { 514 mTerminalInputIndex = mInputIndex - 1; 515 mTerminalOutputIndex = mOutputIndex - 1; 516 return NOT_ON_TERMINAL; 517 } 518 } 519 520 inline static int getQuoteCount(const int *word, const int length) { 521 int quoteCount = 0; 522 for (int i = 0; i < length; ++i) { 523 if (word[i] == KEYCODE_SINGLE_QUOTE) { 524 ++quoteCount; 525 } 526 } 527 return quoteCount; 528 } 529 530 inline static bool isUpperCase(unsigned short c) { 531 return isAsciiUpper(toBaseCodePoint(c)); 532 } 533 534 ////////////////////// 535 // RankingAlgorithm // 536 ////////////////////// 537 538 /* static */ int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex, 539 const int outputIndex, const int freq, int *editDistanceTable, const Correction *correction, 540 const int inputSize) { 541 const int excessivePos = correction->getExcessivePos(); 542 const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; 543 const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER; 544 const ProximityInfoState *proximityInfoState = &correction->mProximityInfoState; 545 const int skippedCount = correction->mSkippedCount; 546 const int transposedCount = correction->mTransposedCount / 2; 547 const int excessiveCount = correction->mExcessiveCount + correction->mTransposedCount % 2; 548 const int proximityMatchedCount = correction->mProximityCount; 549 const bool lastCharExceeded = correction->mLastCharExceeded; 550 const bool useFullEditDistance = correction->mUseFullEditDistance; 551 const int outputLength = outputIndex + 1; 552 if (skippedCount >= inputSize || inputSize == 0) { 553 return -1; 554 } 555 556 // TODO: find more robust way 557 bool sameLength = lastCharExceeded ? (inputSize == inputIndex + 2) 558 : (inputSize == inputIndex + 1); 559 560 // TODO: use mExcessiveCount 561 const int matchCount = inputSize - correction->mProximityCount - excessiveCount; 562 563 const int *word = correction->mWord; 564 const bool skipped = skippedCount > 0; 565 566 const int quoteDiffCount = max(0, getQuoteCount(word, outputLength) 567 - getQuoteCount(proximityInfoState->getPrimaryInputWord(), inputSize)); 568 569 // TODO: Calculate edit distance for transposed and excessive 570 int ed = 0; 571 if (DEBUG_DICT_FULL) { 572 dumpEditDistance10ForDebug(editDistanceTable, correction->mInputSize, outputLength); 573 } 574 int adjustedProximityMatchedCount = proximityMatchedCount; 575 576 int finalFreq = freq; 577 578 if (DEBUG_CORRECTION_FREQ 579 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputSize)) { 580 AKLOGI("FinalFreq0: %d", finalFreq); 581 } 582 // TODO: Optimize this. 583 if (transposedCount > 0 || proximityMatchedCount > 0 || skipped || excessiveCount > 0) { 584 ed = getCurrentEditDistance(editDistanceTable, correction->mInputSize, outputLength, 585 inputSize) - transposedCount; 586 587 const int matchWeight = powerIntCapped(typedLetterMultiplier, 588 max(inputSize, outputLength) - ed); 589 multiplyIntCapped(matchWeight, &finalFreq); 590 591 // TODO: Demote further if there are two or more excessive chars with longer user input? 592 if (inputSize > outputLength) { 593 multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq); 594 } 595 596 ed = max(0, ed - quoteDiffCount); 597 adjustedProximityMatchedCount = min(max(0, ed - (outputLength - inputSize)), 598 proximityMatchedCount); 599 if (transposedCount <= 0) { 600 if (ed == 1 && (inputSize == outputLength - 1 || inputSize == outputLength + 1)) { 601 // Promote a word with just one skipped or excessive char 602 if (sameLength) { 603 multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE 604 + WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_MULTIPLIER * outputLength, 605 &finalFreq); 606 } else { 607 multiplyIntCapped(typedLetterMultiplier, &finalFreq); 608 } 609 } else if (ed == 0) { 610 multiplyIntCapped(typedLetterMultiplier, &finalFreq); 611 sameLength = true; 612 } 613 } 614 } else { 615 const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount); 616 multiplyIntCapped(matchWeight, &finalFreq); 617 } 618 619 if (proximityInfoState->getProximityType(0, word[0], true) == SUBSTITUTION_CHAR) { 620 multiplyRate(FIRST_CHAR_DIFFERENT_DEMOTION_RATE, &finalFreq); 621 } 622 623 /////////////////////////////////////////////// 624 // Promotion and Demotion for each correction 625 626 // Demotion for a word with missing character 627 if (skipped) { 628 const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 629 * (10 * inputSize - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X) 630 / (10 * inputSize 631 - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); 632 if (DEBUG_DICT_FULL) { 633 AKLOGI("Demotion rate for missing character is %d.", demotionRate); 634 } 635 multiplyRate(demotionRate, &finalFreq); 636 } 637 638 // Demotion for a word with transposed character 639 if (transposedCount > 0) multiplyRate( 640 WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq); 641 642 // Demotion for a word with excessive character 643 if (excessiveCount > 0) { 644 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq); 645 if (!lastCharExceeded && !proximityInfoState->existsAdjacentProximityChars(excessivePos)) { 646 if (DEBUG_DICT_FULL) { 647 AKLOGI("Double excessive demotion"); 648 } 649 // If an excessive character is not adjacent to the left char or the right char, 650 // we will demote this word. 651 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq); 652 } 653 } 654 655 int additionalProximityCount = 0; 656 // Demote additional proximity characters 657 for (int i = 0; i < outputLength; ++i) { 658 const int squaredDistance = correction->mDistances[i]; 659 if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) { 660 ++additionalProximityCount; 661 } 662 } 663 664 const bool performTouchPositionCorrection = 665 CALIBRATE_SCORE_BY_TOUCH_COORDINATES 666 && proximityInfoState->touchPositionCorrectionEnabled() 667 && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0 668 && additionalProximityCount == 0; 669 670 // Score calibration by touch coordinates is being done only for pure-fat finger typing error 671 // cases. 672 // TODO: Remove this constraint. 673 if (performTouchPositionCorrection) { 674 for (int i = 0; i < outputLength; ++i) { 675 const int squaredDistance = correction->mDistances[i]; 676 if (i < adjustedProximityMatchedCount) { 677 multiplyIntCapped(typedLetterMultiplier, &finalFreq); 678 } 679 const float factor = 680 SuggestUtils::getLengthScalingFactor(static_cast<float>(squaredDistance)); 681 if (factor > 0.0f) { 682 multiplyRate(static_cast<int>(factor * 100.0f), &finalFreq); 683 } else if (squaredDistance == PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO) { 684 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); 685 } 686 } 687 } else { 688 // Promotion for a word with proximity characters 689 for (int i = 0; i < adjustedProximityMatchedCount; ++i) { 690 // A word with proximity corrections 691 if (DEBUG_DICT_FULL) { 692 AKLOGI("Found a proximity correction."); 693 } 694 multiplyIntCapped(typedLetterMultiplier, &finalFreq); 695 if (i < additionalProximityCount) { 696 multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); 697 } else { 698 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); 699 } 700 } 701 } 702 703 // If the user types too many(three or more) proximity characters with additional proximity 704 // character,do not treat as the same length word. 705 if (sameLength && additionalProximityCount > 0 && (adjustedProximityMatchedCount >= 3 706 || transposedCount > 0 || skipped || excessiveCount > 0)) { 707 sameLength = false; 708 } 709 710 const int errorCount = adjustedProximityMatchedCount > 0 711 ? adjustedProximityMatchedCount 712 : (proximityMatchedCount + transposedCount); 713 multiplyRate( 714 100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputSize, &finalFreq); 715 716 // Promotion for an exactly matched word 717 if (ed == 0) { 718 // Full exact match 719 if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0 720 && quoteDiffCount == 0 && additionalProximityCount == 0) { 721 finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); 722 } 723 } 724 725 // Promote a word with no correction 726 if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0 727 && additionalProximityCount == 0) { 728 multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); 729 } 730 731 // TODO: Check excessive count and transposed count 732 // TODO: Remove this if possible 733 /* 734 If the last character of the user input word is the same as the next character 735 of the output word, and also all of characters of the user input are matched 736 to the output word, we'll promote that word a bit because 737 that word can be considered the combination of skipped and matched characters. 738 This means that the 'sm' pattern wins over the 'ma' pattern. 739 e.g.) 740 shel -> shell [mmmma] or [mmmsm] 741 hel -> hello [mmmaa] or [mmsma] 742 m ... matching 743 s ... skipping 744 a ... traversing all 745 t ... transposing 746 e ... exceeding 747 p ... proximity matching 748 */ 749 if (matchCount == inputSize && matchCount >= 2 && !skipped 750 && word[matchCount] == word[matchCount - 1]) { 751 multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq); 752 } 753 754 // TODO: Do not use sameLength? 755 if (sameLength) { 756 multiplyIntCapped(fullWordMultiplier, &finalFreq); 757 } 758 759 if (useFullEditDistance && outputLength > inputSize + 1) { 760 const int diff = outputLength - inputSize - 1; 761 const int divider = diff < 31 ? 1 << diff : S_INT_MAX; 762 finalFreq = divider > finalFreq ? 1 : finalFreq / divider; 763 } 764 765 if (DEBUG_DICT_FULL) { 766 AKLOGI("calc: %d, %d", outputLength, sameLength); 767 } 768 769 if (DEBUG_CORRECTION_FREQ 770 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputSize)) { 771 DUMP_WORD(correction->getPrimaryInputWord(), inputSize); 772 DUMP_WORD(correction->mWord, outputLength); 773 AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d, A%d] %d, %d, %d, %d, %d, %d", proximityMatchedCount, 774 skippedCount, transposedCount, excessiveCount, additionalProximityCount, 775 outputLength, lastCharExceeded, sameLength, quoteDiffCount, ed, finalFreq); 776 } 777 778 return finalFreq; 779 } 780 781 /* static */ int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(const int *freqArray, 782 const int *wordLengthArray, const int wordCount, const Correction *correction, 783 const bool isSpaceProximity, const int *word) { 784 const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; 785 786 bool firstCapitalizedWordDemotion = false; 787 bool secondCapitalizedWordDemotion = false; 788 789 { 790 // TODO: Handle multiple capitalized word demotion properly 791 const int firstWordLength = wordLengthArray[0]; 792 const int secondWordLength = wordLengthArray[1]; 793 if (firstWordLength >= 2) { 794 firstCapitalizedWordDemotion = isUpperCase(word[0]); 795 } 796 797 if (secondWordLength >= 2) { 798 // FIXME: word[firstWordLength + 1] is incorrect. 799 secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]); 800 } 801 } 802 803 804 const bool capitalizedWordDemotion = 805 firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion; 806 807 int totalLength = 0; 808 int totalFreq = 0; 809 for (int i = 0; i < wordCount; ++i) { 810 const int wordLength = wordLengthArray[i]; 811 if (wordLength <= 0) { 812 return 0; 813 } 814 totalLength += wordLength; 815 const int demotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (wordLength + 1); 816 int tempFirstFreq = freqArray[i]; 817 multiplyRate(demotionRate, &tempFirstFreq); 818 totalFreq += tempFirstFreq; 819 } 820 821 if (totalLength <= 0 || totalFreq <= 0) { 822 return 0; 823 } 824 825 // TODO: Currently totalFreq is adjusted to two word metrix. 826 // Promote pairFreq with multiplying by 2, because the word length is the same as the typed 827 // length. 828 totalFreq = totalFreq * 2 / wordCount; 829 if (wordCount > 2) { 830 // Safety net for 3+ words -- Caveats: many heuristics and workarounds here. 831 int oneLengthCounter = 0; 832 int twoLengthCounter = 0; 833 for (int i = 0; i < wordCount; ++i) { 834 const int wordLength = wordLengthArray[i]; 835 // TODO: Use bigram instead of this safety net 836 if (i < wordCount - 1) { 837 const int nextWordLength = wordLengthArray[i + 1]; 838 if (wordLength == 1 && nextWordLength == 2) { 839 // Safety net to filter 1 length and 2 length sequential words 840 return 0; 841 } 842 } 843 const int freq = freqArray[i]; 844 // Demote too short weak words 845 if (wordLength <= 4 && freq <= SUPPRESS_SHORT_MULTIPLE_WORDS_THRESHOLD_FREQ) { 846 multiplyRate(100 * freq / MAX_PROBABILITY, &totalFreq); 847 } 848 if (wordLength == 1) { 849 ++oneLengthCounter; 850 } else if (wordLength == 2) { 851 ++twoLengthCounter; 852 } 853 if (oneLengthCounter >= 2 || (oneLengthCounter + twoLengthCounter) >= 4) { 854 // Safety net to filter too many short words 855 return 0; 856 } 857 } 858 multiplyRate(MULTIPLE_WORDS_DEMOTION_RATE, &totalFreq); 859 } 860 861 // This is a workaround to try offsetting the not-enough-demotion which will be done in 862 // calcNormalizedScore in Utils.java. 863 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) 864 // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by 865 // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length)) 866 const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength); 867 multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq); 868 869 // At this moment, totalFreq is calculated by the following formula: 870 // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1))) 871 // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1)) 872 873 multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq); 874 875 // This is another workaround to offset the demotion which will be done in 876 // calcNormalizedScore in Utils.java. 877 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote 878 // the same amount because we already have adjusted the synthetic freq of this "missing or 879 // mistyped space" suggestion candidate above in this method. 880 const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength); 881 multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq); 882 883 if (isSpaceProximity) { 884 // A word pair with one space proximity correction 885 if (DEBUG_DICT) { 886 AKLOGI("Found a word pair with space proximity correction."); 887 } 888 multiplyIntCapped(typedLetterMultiplier, &totalFreq); 889 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); 890 } 891 892 if (isSpaceProximity) { 893 multiplyRate(WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE, &totalFreq); 894 } else { 895 multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); 896 } 897 898 if (capitalizedWordDemotion) { 899 multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq); 900 } 901 902 if (DEBUG_CORRECTION_FREQ) { 903 AKLOGI("Multiple words (%d, %d) (%d, %d) %d, %d", freqArray[0], freqArray[1], 904 wordLengthArray[0], wordLengthArray[1], capitalizedWordDemotion, totalFreq); 905 DUMP_WORD(word, wordLengthArray[0]); 906 } 907 908 return totalFreq; 909 } 910 911 /* static */ int Correction::RankingAlgorithm::editDistance(const int *before, 912 const int beforeLength, const int *after, const int afterLength) { 913 const DamerauLevenshteinEditDistancePolicy daemaruLevenshtein( 914 before, beforeLength, after, afterLength); 915 return static_cast<int>(EditDistance::getEditDistance(&daemaruLevenshtein)); 916 } 917 918 919 // In dictionary.cpp, getSuggestion() method, 920 // When USE_SUGGEST_INTERFACE_FOR_TYPING is true: 921 // SUGGEST_INTERFACE_OUTPUT_SCALE was multiplied to the original suggestion scores to convert 922 // them to integers. 923 // score = (int)((original score) * SUGGEST_INTERFACE_OUTPUT_SCALE) 924 // Undo the scaling here to recover the original score. 925 // normalizedScore = ((float)score) / SUGGEST_INTERFACE_OUTPUT_SCALE 926 // Otherwise: suggestion scores are computed using the below formula. 927 // original score 928 // := powf(mTypedLetterMultiplier (this is defined 2), 929 // (the number of matched characters between typed word and suggested word)) 930 // * (individual word's score which defined in the unigram dictionary, 931 // and this score is defined in range [0, 255].) 932 // Then, the following processing is applied. 933 // - If the dictionary word is matched up to the point of the user entry 934 // (full match up to min(before.length(), after.length()) 935 // => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2) 936 // - If the word is a true full match except for differences in accents or 937 // capitalization, then treat it as if the score was 255. 938 // - If before.length() == after.length() 939 // => multiply by mFullWordMultiplier (this is defined 2)) 940 // So, maximum original score is powf(2, min(before.length(), after.length())) * 255 * 2 * 1.2 941 // For historical reasons we ignore the 1.2 modifier (because the measure for a good 942 // autocorrection threshold was done at a time when it didn't exist). This doesn't change 943 // the result. 944 // So, we can normalize original score by dividing powf(2, min(b.l(),a.l())) * 255 * 2. 945 946 /* static */ float Correction::RankingAlgorithm::calcNormalizedScore(const int *before, 947 const int beforeLength, const int *after, const int afterLength, const int score) { 948 if (0 == beforeLength || 0 == afterLength) { 949 return 0.0f; 950 } 951 const int distance = editDistance(before, beforeLength, after, afterLength); 952 int spaceCount = 0; 953 for (int i = 0; i < afterLength; ++i) { 954 if (after[i] == KEYCODE_SPACE) { 955 ++spaceCount; 956 } 957 } 958 959 if (spaceCount == afterLength) { 960 return 0.0f; 961 } 962 963 // add a weight based on edit distance. 964 // distance <= max(afterLength, beforeLength) == afterLength, 965 // so, 0 <= distance / afterLength <= 1 966 const float weight = 1.0f - static_cast<float>(distance) / static_cast<float>(afterLength); 967 968 if (USE_SUGGEST_INTERFACE_FOR_TYPING) { 969 return (static_cast<float>(score) / SUGGEST_INTERFACE_OUTPUT_SCALE) * weight; 970 } 971 const float maxScore = score >= S_INT_MAX ? static_cast<float>(S_INT_MAX) 972 : static_cast<float>(MAX_INITIAL_SCORE) 973 * powf(static_cast<float>(TYPED_LETTER_MULTIPLIER), 974 static_cast<float>(min(beforeLength, afterLength - spaceCount))) 975 * static_cast<float>(FULL_WORD_MULTIPLIER); 976 977 return (static_cast<float>(score) / maxScore) * weight; 978 } 979 } // namespace latinime 980