1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <assert.h> 18 #include <ctype.h> 19 #include <math.h> 20 #include <stdio.h> 21 #include <string.h> 22 23 #define LOG_TAG "LatinIME: correction.cpp" 24 25 #include "char_utils.h" 26 #include "correction.h" 27 #include "defines.h" 28 #include "dictionary.h" 29 #include "proximity_info.h" 30 31 namespace latinime { 32 33 ///////////////////////////// 34 // edit distance funcitons // 35 ///////////////////////////// 36 37 inline static void initEditDistance(int *editDistanceTable) { 38 for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) { 39 editDistanceTable[i] = i; 40 } 41 } 42 43 inline static void dumpEditDistance10ForDebug(int *editDistanceTable, 44 const int editDistanceTableWidth, const int outputLength) { 45 if (DEBUG_DICT) { 46 AKLOGI("EditDistanceTable"); 47 for (int i = 0; i <= 10; ++i) { 48 int c[11]; 49 for (int j = 0; j <= 10; ++j) { 50 if (j < editDistanceTableWidth + 1 && i < outputLength + 1) { 51 c[j] = (editDistanceTable + i * (editDistanceTableWidth + 1))[j]; 52 } else { 53 c[j] = -1; 54 } 55 } 56 AKLOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]", 57 c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]); 58 } 59 } 60 } 61 62 inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigned short *input, 63 const int inputLength, const unsigned short *output, const int outputLength) { 64 // TODO: Make sure that editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL] is not touched. 65 // Let dp[i][j] be editDistanceTable[i * (inputLength + 1) + j]. 66 // Assuming that dp[0][0] ... dp[outputLength - 1][inputLength] are already calculated, 67 // and calculate dp[ouputLength][0] ... dp[outputLength][inputLength]. 68 int *const current = editDistanceTable + outputLength * (inputLength + 1); 69 const int *const prev = editDistanceTable + (outputLength - 1) * (inputLength + 1); 70 const int *const prevprev = 71 outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : 0; 72 current[0] = outputLength; 73 const uint32_t co = toBaseLowerCase(output[outputLength - 1]); 74 const uint32_t prevCO = outputLength >= 2 ? toBaseLowerCase(output[outputLength - 2]) : 0; 75 for (int i = 1; i <= inputLength; ++i) { 76 const uint32_t ci = toBaseLowerCase(input[i - 1]); 77 const uint16_t cost = (ci == co) ? 0 : 1; 78 current[i] = min(current[i - 1] + 1, min(prev[i] + 1, prev[i - 1] + cost)); 79 if (i >= 2 && prevprev && ci == prevCO && co == toBaseLowerCase(input[i - 2])) { 80 current[i] = min(current[i], prevprev[i - 2] + 1); 81 } 82 } 83 } 84 85 inline static int getCurrentEditDistance(int *editDistanceTable, const int editDistanceTableWidth, 86 const int outputLength, const int inputLength) { 87 if (DEBUG_EDIT_DISTANCE) { 88 AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength); 89 } 90 return editDistanceTable[(editDistanceTableWidth + 1) * (outputLength) + inputLength]; 91 } 92 93 ////////////////////// 94 // inline functions // 95 ////////////////////// 96 static const char QUOTE = '\''; 97 98 inline bool Correction::isQuote(const unsigned short c) { 99 const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(mInputIndex); 100 return (c == QUOTE && userTypedChar != QUOTE); 101 } 102 103 //////////////// 104 // Correction // 105 //////////////// 106 107 Correction::Correction(const int typedLetterMultiplier, const int fullWordMultiplier) 108 : TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier) { 109 initEditDistance(mEditDistanceTable); 110 } 111 112 void Correction::resetCorrection() { 113 mTotalTraverseCount = 0; 114 } 115 116 void Correction::initCorrection(const ProximityInfo *pi, const int inputLength, 117 const int maxDepth) { 118 mProximityInfo = pi; 119 mInputLength = inputLength; 120 mMaxDepth = maxDepth; 121 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2; 122 // TODO: This is not supposed to be required. Check what's going wrong with 123 // editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL] 124 initEditDistance(mEditDistanceTable); 125 } 126 127 void Correction::initCorrectionState( 128 const int rootPos, const int childCount, const bool traverseAll) { 129 latinime::initCorrectionState(mCorrectionStates, rootPos, childCount, traverseAll); 130 // TODO: remove 131 mCorrectionStates[0].mTransposedPos = mTransposedPos; 132 mCorrectionStates[0].mExcessivePos = mExcessivePos; 133 mCorrectionStates[0].mSkipPos = mSkipPos; 134 } 135 136 void Correction::setCorrectionParams(const int skipPos, const int excessivePos, 137 const int transposedPos, const int spaceProximityPos, const int missingSpacePos, 138 const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) { 139 // TODO: remove 140 mTransposedPos = transposedPos; 141 mExcessivePos = excessivePos; 142 mSkipPos = skipPos; 143 // TODO: remove 144 mCorrectionStates[0].mTransposedPos = transposedPos; 145 mCorrectionStates[0].mExcessivePos = excessivePos; 146 mCorrectionStates[0].mSkipPos = skipPos; 147 148 mSpaceProximityPos = spaceProximityPos; 149 mMissingSpacePos = missingSpacePos; 150 mUseFullEditDistance = useFullEditDistance; 151 mDoAutoCompletion = doAutoCompletion; 152 mMaxErrors = maxErrors; 153 } 154 155 void Correction::checkState() { 156 if (DEBUG_DICT) { 157 int inputCount = 0; 158 if (mSkipPos >= 0) ++inputCount; 159 if (mExcessivePos >= 0) ++inputCount; 160 if (mTransposedPos >= 0) ++inputCount; 161 // TODO: remove this assert 162 assert(inputCount <= 1); 163 } 164 } 165 166 int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray, 167 const int wordCount, const bool isSpaceProximity, const unsigned short *word) { 168 return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray, 169 wordCount, this, isSpaceProximity, word); 170 } 171 172 int Correction::getFinalProbability(const int probability, unsigned short **word, int *wordLength) { 173 return getFinalProbabilityInternal(probability, word, wordLength, mInputLength); 174 } 175 176 int Correction::getFinalProbabilityForSubQueue(const int probability, unsigned short **word, 177 int *wordLength, const int inputLength) { 178 return getFinalProbabilityInternal(probability, word, wordLength, inputLength); 179 } 180 181 int Correction::getFinalProbabilityInternal(const int probability, unsigned short **word, 182 int *wordLength, const int inputLength) { 183 const int outputIndex = mTerminalOutputIndex; 184 const int inputIndex = mTerminalInputIndex; 185 *wordLength = outputIndex + 1; 186 if (outputIndex < MIN_SUGGEST_DEPTH) { 187 return NOT_A_PROBABILITY; 188 } 189 190 *word = mWord; 191 int finalProbability= Correction::RankingAlgorithm::calculateFinalProbability( 192 inputIndex, outputIndex, probability, mEditDistanceTable, this, inputLength); 193 return finalProbability; 194 } 195 196 bool Correction::initProcessState(const int outputIndex) { 197 if (mCorrectionStates[outputIndex].mChildCount <= 0) { 198 return false; 199 } 200 mOutputIndex = outputIndex; 201 --(mCorrectionStates[outputIndex].mChildCount); 202 mInputIndex = mCorrectionStates[outputIndex].mInputIndex; 203 mNeedsToTraverseAllNodes = mCorrectionStates[outputIndex].mNeedsToTraverseAllNodes; 204 205 mEquivalentCharCount = mCorrectionStates[outputIndex].mEquivalentCharCount; 206 mProximityCount = mCorrectionStates[outputIndex].mProximityCount; 207 mTransposedCount = mCorrectionStates[outputIndex].mTransposedCount; 208 mExcessiveCount = mCorrectionStates[outputIndex].mExcessiveCount; 209 mSkippedCount = mCorrectionStates[outputIndex].mSkippedCount; 210 mLastCharExceeded = mCorrectionStates[outputIndex].mLastCharExceeded; 211 212 mTransposedPos = mCorrectionStates[outputIndex].mTransposedPos; 213 mExcessivePos = mCorrectionStates[outputIndex].mExcessivePos; 214 mSkipPos = mCorrectionStates[outputIndex].mSkipPos; 215 216 mMatching = false; 217 mProximityMatching = false; 218 mAdditionalProximityMatching = false; 219 mTransposing = false; 220 mExceeding = false; 221 mSkipping = false; 222 223 return true; 224 } 225 226 int Correction::goDownTree( 227 const int parentIndex, const int childCount, const int firstChildPos) { 228 mCorrectionStates[mOutputIndex].mParentIndex = parentIndex; 229 mCorrectionStates[mOutputIndex].mChildCount = childCount; 230 mCorrectionStates[mOutputIndex].mSiblingPos = firstChildPos; 231 return mOutputIndex; 232 } 233 234 // TODO: remove 235 int Correction::getInputIndex() { 236 return mInputIndex; 237 } 238 239 void Correction::incrementInputIndex() { 240 ++mInputIndex; 241 } 242 243 void Correction::incrementOutputIndex() { 244 ++mOutputIndex; 245 mCorrectionStates[mOutputIndex].mParentIndex = mCorrectionStates[mOutputIndex - 1].mParentIndex; 246 mCorrectionStates[mOutputIndex].mChildCount = mCorrectionStates[mOutputIndex - 1].mChildCount; 247 mCorrectionStates[mOutputIndex].mSiblingPos = mCorrectionStates[mOutputIndex - 1].mSiblingPos; 248 mCorrectionStates[mOutputIndex].mInputIndex = mInputIndex; 249 mCorrectionStates[mOutputIndex].mNeedsToTraverseAllNodes = mNeedsToTraverseAllNodes; 250 251 mCorrectionStates[mOutputIndex].mEquivalentCharCount = mEquivalentCharCount; 252 mCorrectionStates[mOutputIndex].mProximityCount = mProximityCount; 253 mCorrectionStates[mOutputIndex].mTransposedCount = mTransposedCount; 254 mCorrectionStates[mOutputIndex].mExcessiveCount = mExcessiveCount; 255 mCorrectionStates[mOutputIndex].mSkippedCount = mSkippedCount; 256 257 mCorrectionStates[mOutputIndex].mSkipPos = mSkipPos; 258 mCorrectionStates[mOutputIndex].mTransposedPos = mTransposedPos; 259 mCorrectionStates[mOutputIndex].mExcessivePos = mExcessivePos; 260 261 mCorrectionStates[mOutputIndex].mLastCharExceeded = mLastCharExceeded; 262 263 mCorrectionStates[mOutputIndex].mMatching = mMatching; 264 mCorrectionStates[mOutputIndex].mProximityMatching = mProximityMatching; 265 mCorrectionStates[mOutputIndex].mAdditionalProximityMatching = mAdditionalProximityMatching; 266 mCorrectionStates[mOutputIndex].mTransposing = mTransposing; 267 mCorrectionStates[mOutputIndex].mExceeding = mExceeding; 268 mCorrectionStates[mOutputIndex].mSkipping = mSkipping; 269 } 270 271 void Correction::startToTraverseAllNodes() { 272 mNeedsToTraverseAllNodes = true; 273 } 274 275 bool Correction::needsToPrune() const { 276 // TODO: use edit distance here 277 return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance 278 // Allow one char longer word for missing character 279 || (!mDoAutoCompletion && (mOutputIndex > mInputLength)); 280 } 281 282 void Correction::addCharToCurrentWord(const int32_t c) { 283 mWord[mOutputIndex] = c; 284 const unsigned short *primaryInputWord = mProximityInfo->getPrimaryInputWord(); 285 calcEditDistanceOneStep(mEditDistanceTable, primaryInputWord, mInputLength, 286 mWord, mOutputIndex + 1); 287 } 288 289 Correction::CorrectionType Correction::processSkipChar( 290 const int32_t c, const bool isTerminal, const bool inputIndexIncremented) { 291 addCharToCurrentWord(c); 292 mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0); 293 mTerminalOutputIndex = mOutputIndex; 294 if (mNeedsToTraverseAllNodes && isTerminal) { 295 incrementOutputIndex(); 296 return TRAVERSE_ALL_ON_TERMINAL; 297 } else { 298 incrementOutputIndex(); 299 return TRAVERSE_ALL_NOT_ON_TERMINAL; 300 } 301 } 302 303 Correction::CorrectionType Correction::processUnrelatedCorrectionType() { 304 // Needs to set mTerminalInputIndex and mTerminalOutputIndex before returning any CorrectionType 305 mTerminalInputIndex = mInputIndex; 306 mTerminalOutputIndex = mOutputIndex; 307 return UNRELATED; 308 } 309 310 inline bool isEquivalentChar(ProximityInfo::ProximityType type) { 311 return type == ProximityInfo::EQUIVALENT_CHAR; 312 } 313 314 inline bool isProximityCharOrEquivalentChar(ProximityInfo::ProximityType type) { 315 return type == ProximityInfo::EQUIVALENT_CHAR 316 || type == ProximityInfo::NEAR_PROXIMITY_CHAR; 317 } 318 319 Correction::CorrectionType Correction::processCharAndCalcState( 320 const int32_t c, const bool isTerminal) { 321 const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount); 322 if (correctionCount > mMaxErrors) { 323 return processUnrelatedCorrectionType(); 324 } 325 326 // TODO: Change the limit if we'll allow two or more corrections 327 const bool noCorrectionsHappenedSoFar = correctionCount == 0; 328 const bool canTryCorrection = noCorrectionsHappenedSoFar; 329 int proximityIndex = 0; 330 mDistances[mOutputIndex] = NOT_A_DISTANCE; 331 332 // Skip checking this node 333 if (mNeedsToTraverseAllNodes || isQuote(c)) { 334 bool incremented = false; 335 if (mLastCharExceeded && mInputIndex == mInputLength - 1) { 336 // TODO: Do not check the proximity if EditDistance exceeds the threshold 337 const ProximityInfo::ProximityType matchId = 338 mProximityInfo->getMatchedProximityId(mInputIndex, c, true, &proximityIndex); 339 if (isEquivalentChar(matchId)) { 340 mLastCharExceeded = false; 341 --mExcessiveCount; 342 mDistances[mOutputIndex] = 343 mProximityInfo->getNormalizedSquaredDistance(mInputIndex, 0); 344 } else if (matchId == ProximityInfo::NEAR_PROXIMITY_CHAR) { 345 mLastCharExceeded = false; 346 --mExcessiveCount; 347 ++mProximityCount; 348 mDistances[mOutputIndex] = 349 mProximityInfo->getNormalizedSquaredDistance(mInputIndex, proximityIndex); 350 } 351 if (!isQuote(c)) { 352 incrementInputIndex(); 353 incremented = true; 354 } 355 } 356 return processSkipChar(c, isTerminal, incremented); 357 } 358 359 // Check possible corrections. 360 if (mExcessivePos >= 0) { 361 if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) { 362 mExcessivePos = mOutputIndex; 363 } 364 if (mExcessivePos < mInputLength - 1) { 365 mExceeding = mExcessivePos == mInputIndex && canTryCorrection; 366 } 367 } 368 369 if (mSkipPos >= 0) { 370 if (mSkippedCount == 0 && mSkipPos < mOutputIndex) { 371 if (DEBUG_DICT) { 372 assert(mSkipPos == mOutputIndex - 1); 373 } 374 mSkipPos = mOutputIndex; 375 } 376 mSkipping = mSkipPos == mOutputIndex && canTryCorrection; 377 } 378 379 if (mTransposedPos >= 0) { 380 if (mTransposedCount == 0 && mTransposedPos < mOutputIndex) { 381 mTransposedPos = mOutputIndex; 382 } 383 if (mTransposedPos < mInputLength - 1) { 384 mTransposing = mInputIndex == mTransposedPos && canTryCorrection; 385 } 386 } 387 388 bool secondTransposing = false; 389 if (mTransposedCount % 2 == 1) { 390 if (isEquivalentChar(mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) { 391 ++mTransposedCount; 392 secondTransposing = true; 393 } else if (mCorrectionStates[mOutputIndex].mExceeding) { 394 --mTransposedCount; 395 ++mExcessiveCount; 396 --mExcessivePos; 397 incrementInputIndex(); 398 } else { 399 --mTransposedCount; 400 if (DEBUG_CORRECTION 401 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) 402 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 403 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 404 DUMP_WORD(mWord, mOutputIndex); 405 AKLOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 406 mTransposedCount, mExcessiveCount, c); 407 } 408 return processUnrelatedCorrectionType(); 409 } 410 } 411 412 // TODO: Change the limit if we'll allow two or more proximity chars with corrections 413 // Work around: When the mMaxErrors is 1, we only allow just one error 414 // including proximity correction. 415 const bool checkProximityChars = (mMaxErrors > 1) 416 ? (noCorrectionsHappenedSoFar || mProximityCount == 0) 417 : (noCorrectionsHappenedSoFar && mProximityCount == 0); 418 419 ProximityInfo::ProximityType matchedProximityCharId = secondTransposing 420 ? ProximityInfo::EQUIVALENT_CHAR 421 : mProximityInfo->getMatchedProximityId( 422 mInputIndex, c, checkProximityChars, &proximityIndex); 423 424 if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId 425 || ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { 426 if (canTryCorrection && mOutputIndex > 0 427 && mCorrectionStates[mOutputIndex].mProximityMatching 428 && mCorrectionStates[mOutputIndex].mExceeding 429 && isEquivalentChar(mProximityInfo->getMatchedProximityId( 430 mInputIndex, mWord[mOutputIndex - 1], false))) { 431 if (DEBUG_CORRECTION 432 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) 433 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 434 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 435 AKLOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]); 436 } 437 // Conversion p->e 438 // Example: 439 // wearth -> earth 440 // px -> (E)mmmmm 441 ++mExcessiveCount; 442 --mProximityCount; 443 mExcessivePos = mOutputIndex - 1; 444 ++mInputIndex; 445 // Here, we are doing something equivalent to matchedProximityCharId, 446 // but we already know that "excessive char correction" just happened 447 // so that we just need to check "mProximityCount == 0". 448 matchedProximityCharId = mProximityInfo->getMatchedProximityId( 449 mInputIndex, c, mProximityCount == 0, &proximityIndex); 450 } 451 } 452 453 if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId 454 || ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { 455 if (ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { 456 mAdditionalProximityMatching = true; 457 } 458 // TODO: Optimize 459 // As the current char turned out to be an unrelated char, 460 // we will try other correction-types. Please note that mCorrectionStates[mOutputIndex] 461 // here refers to the previous state. 462 if (mInputIndex < mInputLength - 1 && mOutputIndex > 0 && mTransposedCount > 0 463 && !mCorrectionStates[mOutputIndex].mTransposing 464 && mCorrectionStates[mOutputIndex - 1].mTransposing 465 && isEquivalentChar(mProximityInfo->getMatchedProximityId( 466 mInputIndex, mWord[mOutputIndex - 1], false)) 467 && isEquivalentChar( 468 mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) { 469 // Conversion t->e 470 // Example: 471 // occaisional -> occa sional 472 // mmmmttx -> mmmm(E)mmmmmm 473 mTransposedCount -= 2; 474 ++mExcessiveCount; 475 ++mInputIndex; 476 } else if (mOutputIndex > 0 && mInputIndex > 0 && mTransposedCount > 0 477 && !mCorrectionStates[mOutputIndex].mTransposing 478 && mCorrectionStates[mOutputIndex - 1].mTransposing 479 && isEquivalentChar( 480 mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) { 481 // Conversion t->s 482 // Example: 483 // chcolate -> chocolate 484 // mmttx -> mmsmmmmmm 485 mTransposedCount -= 2; 486 ++mSkippedCount; 487 --mInputIndex; 488 } else if (canTryCorrection && mInputIndex > 0 489 && mCorrectionStates[mOutputIndex].mProximityMatching 490 && mCorrectionStates[mOutputIndex].mSkipping 491 && isEquivalentChar( 492 mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) { 493 // Conversion p->s 494 // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of 495 // proximity chars of "s", but it should rather be handled as a skipped char. 496 ++mSkippedCount; 497 --mProximityCount; 498 return processSkipChar(c, isTerminal, false); 499 } else if (mInputIndex - 1 < mInputLength 500 && mSkippedCount > 0 501 && mCorrectionStates[mOutputIndex].mSkipping 502 && mCorrectionStates[mOutputIndex].mAdditionalProximityMatching 503 && isProximityCharOrEquivalentChar( 504 mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) { 505 // Conversion s->a 506 incrementInputIndex(); 507 --mSkippedCount; 508 mProximityMatching = true; 509 ++mProximityCount; 510 mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO; 511 } else if ((mExceeding || mTransposing) && mInputIndex - 1 < mInputLength 512 && isEquivalentChar( 513 mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) { 514 // 1.2. Excessive or transpose correction 515 if (mTransposing) { 516 ++mTransposedCount; 517 } else { 518 ++mExcessiveCount; 519 incrementInputIndex(); 520 } 521 if (DEBUG_CORRECTION 522 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) 523 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 524 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 525 DUMP_WORD(mWord, mOutputIndex); 526 if (mTransposing) { 527 AKLOGI("TRANSPOSE: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 528 mTransposedCount, mExcessiveCount, c); 529 } else { 530 AKLOGI("EXCEED: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 531 mTransposedCount, mExcessiveCount, c); 532 } 533 } 534 } else if (mSkipping) { 535 // 3. Skip correction 536 ++mSkippedCount; 537 if (DEBUG_CORRECTION 538 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) 539 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 540 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 541 AKLOGI("SKIP: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 542 mTransposedCount, mExcessiveCount, c); 543 } 544 return processSkipChar(c, isTerminal, false); 545 } else if (ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { 546 // As a last resort, use additional proximity characters 547 mProximityMatching = true; 548 ++mProximityCount; 549 mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO; 550 if (DEBUG_CORRECTION 551 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) 552 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 553 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 554 AKLOGI("ADDITIONALPROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 555 mTransposedCount, mExcessiveCount, c); 556 } 557 } else { 558 if (DEBUG_CORRECTION 559 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) 560 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 561 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 562 DUMP_WORD(mWord, mOutputIndex); 563 AKLOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 564 mTransposedCount, mExcessiveCount, c); 565 } 566 return processUnrelatedCorrectionType(); 567 } 568 } else if (secondTransposing) { 569 // If inputIndex is greater than mInputLength, that means there is no 570 // proximity chars. So, we don't need to check proximity. 571 mMatching = true; 572 } else if (isEquivalentChar(matchedProximityCharId)) { 573 mMatching = true; 574 ++mEquivalentCharCount; 575 mDistances[mOutputIndex] = mProximityInfo->getNormalizedSquaredDistance(mInputIndex, 0); 576 } else if (ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) { 577 mProximityMatching = true; 578 ++mProximityCount; 579 mDistances[mOutputIndex] = 580 mProximityInfo->getNormalizedSquaredDistance(mInputIndex, proximityIndex); 581 if (DEBUG_CORRECTION 582 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) 583 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 584 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 585 AKLOGI("PROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 586 mTransposedCount, mExcessiveCount, c); 587 } 588 } 589 590 addCharToCurrentWord(c); 591 592 // 4. Last char excessive correction 593 mLastCharExceeded = mExcessiveCount == 0 && mSkippedCount == 0 && mTransposedCount == 0 594 && mProximityCount == 0 && (mInputIndex == mInputLength - 2); 595 const bool isSameAsUserTypedLength = (mInputLength == mInputIndex + 1) || mLastCharExceeded; 596 if (mLastCharExceeded) { 597 ++mExcessiveCount; 598 } 599 600 // Start traversing all nodes after the index exceeds the user typed length 601 if (isSameAsUserTypedLength) { 602 startToTraverseAllNodes(); 603 } 604 605 const bool needsToTryOnTerminalForTheLastPossibleExcessiveChar = 606 mExceeding && mInputIndex == mInputLength - 2; 607 608 // Finally, we are ready to go to the next character, the next "virtual node". 609 // We should advance the input index. 610 // We do this in this branch of the 'if traverseAllNodes' because we are still matching 611 // characters to input; the other branch is not matching them but searching for 612 // completions, this is why it does not have to do it. 613 incrementInputIndex(); 614 // Also, the next char is one "virtual node" depth more than this char. 615 incrementOutputIndex(); 616 617 if ((needsToTryOnTerminalForTheLastPossibleExcessiveChar 618 || isSameAsUserTypedLength) && isTerminal) { 619 mTerminalInputIndex = mInputIndex - 1; 620 mTerminalOutputIndex = mOutputIndex - 1; 621 if (DEBUG_CORRECTION 622 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) 623 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 624 DUMP_WORD(mWord, mOutputIndex); 625 AKLOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 626 mTransposedCount, mExcessiveCount, c); 627 } 628 return ON_TERMINAL; 629 } else { 630 mTerminalInputIndex = mInputIndex - 1; 631 mTerminalOutputIndex = mOutputIndex - 1; 632 return NOT_ON_TERMINAL; 633 } 634 } 635 636 Correction::~Correction() { 637 } 638 639 inline static int getQuoteCount(const unsigned short* word, const int length) { 640 int quoteCount = 0; 641 for (int i = 0; i < length; ++i) { 642 if(word[i] == '\'') { 643 ++quoteCount; 644 } 645 } 646 return quoteCount; 647 } 648 649 inline static bool isUpperCase(unsigned short c) { 650 return isAsciiUpper(toBaseChar(c)); 651 } 652 653 ////////////////////// 654 // RankingAlgorithm // 655 ////////////////////// 656 657 /* static */ 658 int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex, 659 const int outputIndex, const int freq, int* editDistanceTable, const Correction* correction, 660 const int inputLength) { 661 const int excessivePos = correction->getExcessivePos(); 662 const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; 663 const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER; 664 const ProximityInfo *proximityInfo = correction->mProximityInfo; 665 const int skippedCount = correction->mSkippedCount; 666 const int transposedCount = correction->mTransposedCount / 2; 667 const int excessiveCount = correction->mExcessiveCount + correction->mTransposedCount % 2; 668 const int proximityMatchedCount = correction->mProximityCount; 669 const bool lastCharExceeded = correction->mLastCharExceeded; 670 const bool useFullEditDistance = correction->mUseFullEditDistance; 671 const int outputLength = outputIndex + 1; 672 if (skippedCount >= inputLength || inputLength == 0) { 673 return -1; 674 } 675 676 // TODO: find more robust way 677 bool sameLength = lastCharExceeded ? (inputLength == inputIndex + 2) 678 : (inputLength == inputIndex + 1); 679 680 // TODO: use mExcessiveCount 681 const int matchCount = inputLength - correction->mProximityCount - excessiveCount; 682 683 const unsigned short* word = correction->mWord; 684 const bool skipped = skippedCount > 0; 685 686 const int quoteDiffCount = max(0, getQuoteCount(word, outputLength) 687 - getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength)); 688 689 // TODO: Calculate edit distance for transposed and excessive 690 int ed = 0; 691 if (DEBUG_DICT_FULL) { 692 dumpEditDistance10ForDebug(editDistanceTable, correction->mInputLength, outputLength); 693 } 694 int adjustedProximityMatchedCount = proximityMatchedCount; 695 696 int finalFreq = freq; 697 698 if (DEBUG_CORRECTION_FREQ 699 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputLength)) { 700 AKLOGI("FinalFreq0: %d", finalFreq); 701 } 702 // TODO: Optimize this. 703 if (transposedCount > 0 || proximityMatchedCount > 0 || skipped || excessiveCount > 0) { 704 ed = getCurrentEditDistance(editDistanceTable, correction->mInputLength, outputLength, 705 inputLength) - transposedCount; 706 707 const int matchWeight = powerIntCapped(typedLetterMultiplier, 708 max(inputLength, outputLength) - ed); 709 multiplyIntCapped(matchWeight, &finalFreq); 710 711 // TODO: Demote further if there are two or more excessive chars with longer user input? 712 if (inputLength > outputLength) { 713 multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq); 714 } 715 716 ed = max(0, ed - quoteDiffCount); 717 adjustedProximityMatchedCount = min(max(0, ed - (outputLength - inputLength)), 718 proximityMatchedCount); 719 if (transposedCount <= 0) { 720 if (ed == 1 && (inputLength == outputLength - 1 || inputLength == outputLength + 1)) { 721 // Promote a word with just one skipped or excessive char 722 if (sameLength) { 723 multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE 724 + WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_MULTIPLIER * outputLength, 725 &finalFreq); 726 } else { 727 multiplyIntCapped(typedLetterMultiplier, &finalFreq); 728 } 729 } else if (ed == 0) { 730 multiplyIntCapped(typedLetterMultiplier, &finalFreq); 731 sameLength = true; 732 } 733 } 734 } else { 735 const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount); 736 multiplyIntCapped(matchWeight, &finalFreq); 737 } 738 739 if (proximityInfo->getMatchedProximityId(0, word[0], true) 740 == ProximityInfo::UNRELATED_CHAR) { 741 multiplyRate(FIRST_CHAR_DIFFERENT_DEMOTION_RATE, &finalFreq); 742 } 743 744 /////////////////////////////////////////////// 745 // Promotion and Demotion for each correction 746 747 // Demotion for a word with missing character 748 if (skipped) { 749 const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 750 * (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X) 751 / (10 * inputLength 752 - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); 753 if (DEBUG_DICT_FULL) { 754 AKLOGI("Demotion rate for missing character is %d.", demotionRate); 755 } 756 multiplyRate(demotionRate, &finalFreq); 757 } 758 759 // Demotion for a word with transposed character 760 if (transposedCount > 0) multiplyRate( 761 WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq); 762 763 // Demotion for a word with excessive character 764 if (excessiveCount > 0) { 765 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq); 766 if (!lastCharExceeded && !proximityInfo->existsAdjacentProximityChars(excessivePos)) { 767 if (DEBUG_DICT_FULL) { 768 AKLOGI("Double excessive demotion"); 769 } 770 // If an excessive character is not adjacent to the left char or the right char, 771 // we will demote this word. 772 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq); 773 } 774 } 775 776 const bool performTouchPositionCorrection = 777 CALIBRATE_SCORE_BY_TOUCH_COORDINATES && proximityInfo->touchPositionCorrectionEnabled() 778 && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0; 779 // Score calibration by touch coordinates is being done only for pure-fat finger typing error 780 // cases. 781 int additionalProximityCount = 0; 782 // TODO: Remove this constraint. 783 if (performTouchPositionCorrection) { 784 for (int i = 0; i < outputLength; ++i) { 785 const int squaredDistance = correction->mDistances[i]; 786 if (i < adjustedProximityMatchedCount) { 787 multiplyIntCapped(typedLetterMultiplier, &finalFreq); 788 } 789 if (squaredDistance >= 0) { 790 // Promote or demote the score according to the distance from the sweet spot 791 static const float A = ZERO_DISTANCE_PROMOTION_RATE / 100.0f; 792 static const float B = 1.0f; 793 static const float C = 0.5f; 794 static const float MIN = 0.3f; 795 static const float R1 = NEUTRAL_SCORE_SQUARED_RADIUS; 796 static const float R2 = HALF_SCORE_SQUARED_RADIUS; 797 const float x = (float)squaredDistance 798 / ProximityInfo::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR; 799 const float factor = max((x < R1) 800 ? (A * (R1 - x) + B * x) / R1 801 : (B * (R2 - x) + C * (x - R1)) / (R2 - R1), MIN); 802 // factor is piecewise linear function like: 803 // A -_ . 804 // ^-_ . 805 // B \ . 806 // \_ . 807 // C ------------. 808 // . 809 // 0 R1 R2 . 810 multiplyRate((int)(factor * 100), &finalFreq); 811 } else if (squaredDistance == PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO) { 812 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); 813 } else if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) { 814 ++additionalProximityCount; 815 multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); 816 } 817 } 818 } else { 819 // Demote additional proximity characters 820 for (int i = 0; i < outputLength; ++i) { 821 const int squaredDistance = correction->mDistances[i]; 822 if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) { 823 ++additionalProximityCount; 824 } 825 } 826 // Promotion for a word with proximity characters 827 for (int i = 0; i < adjustedProximityMatchedCount; ++i) { 828 // A word with proximity corrections 829 if (DEBUG_DICT_FULL) { 830 AKLOGI("Found a proximity correction."); 831 } 832 multiplyIntCapped(typedLetterMultiplier, &finalFreq); 833 if (i < additionalProximityCount) { 834 multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); 835 } else { 836 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); 837 } 838 } 839 } 840 841 // If the user types too many(three or more) proximity characters with additional proximity 842 // character,do not treat as the same length word. 843 if (sameLength && additionalProximityCount > 0 && (adjustedProximityMatchedCount >= 3 844 || transposedCount > 0 || skipped || excessiveCount > 0)) { 845 sameLength = false; 846 } 847 848 const int errorCount = adjustedProximityMatchedCount > 0 849 ? adjustedProximityMatchedCount 850 : (proximityMatchedCount + transposedCount); 851 multiplyRate( 852 100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputLength, &finalFreq); 853 854 // Promotion for an exactly matched word 855 if (ed == 0) { 856 // Full exact match 857 if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0 858 && quoteDiffCount == 0 && additionalProximityCount == 0) { 859 finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); 860 } 861 } 862 863 // Promote a word with no correction 864 if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0 865 && additionalProximityCount == 0) { 866 multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); 867 } 868 869 // TODO: Check excessive count and transposed count 870 // TODO: Remove this if possible 871 /* 872 If the last character of the user input word is the same as the next character 873 of the output word, and also all of characters of the user input are matched 874 to the output word, we'll promote that word a bit because 875 that word can be considered the combination of skipped and matched characters. 876 This means that the 'sm' pattern wins over the 'ma' pattern. 877 e.g.) 878 shel -> shell [mmmma] or [mmmsm] 879 hel -> hello [mmmaa] or [mmsma] 880 m ... matching 881 s ... skipping 882 a ... traversing all 883 t ... transposing 884 e ... exceeding 885 p ... proximity matching 886 */ 887 if (matchCount == inputLength && matchCount >= 2 && !skipped 888 && word[matchCount] == word[matchCount - 1]) { 889 multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq); 890 } 891 892 // TODO: Do not use sameLength? 893 if (sameLength) { 894 multiplyIntCapped(fullWordMultiplier, &finalFreq); 895 } 896 897 if (useFullEditDistance && outputLength > inputLength + 1) { 898 const int diff = outputLength - inputLength - 1; 899 const int divider = diff < 31 ? 1 << diff : S_INT_MAX; 900 finalFreq = divider > finalFreq ? 1 : finalFreq / divider; 901 } 902 903 if (DEBUG_DICT_FULL) { 904 AKLOGI("calc: %d, %d", outputLength, sameLength); 905 } 906 907 if (DEBUG_CORRECTION_FREQ 908 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputLength)) { 909 DUMP_WORD(proximityInfo->getPrimaryInputWord(), inputLength); 910 DUMP_WORD(correction->mWord, outputLength); 911 AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d, A%d] %d, %d, %d, %d, %d, %d", proximityMatchedCount, 912 skippedCount, transposedCount, excessiveCount, additionalProximityCount, 913 outputLength, lastCharExceeded, sameLength, quoteDiffCount, ed, finalFreq); 914 } 915 916 return finalFreq; 917 } 918 919 /* static */ 920 int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords( 921 const int *freqArray, const int *wordLengthArray, const int wordCount, 922 const Correction* correction, const bool isSpaceProximity, const unsigned short *word) { 923 const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; 924 925 bool firstCapitalizedWordDemotion = false; 926 bool secondCapitalizedWordDemotion = false; 927 928 { 929 // TODO: Handle multiple capitalized word demotion properly 930 const int firstWordLength = wordLengthArray[0]; 931 const int secondWordLength = wordLengthArray[1]; 932 if (firstWordLength >= 2) { 933 firstCapitalizedWordDemotion = isUpperCase(word[0]); 934 } 935 936 if (secondWordLength >= 2) { 937 // FIXME: word[firstWordLength + 1] is incorrect. 938 secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]); 939 } 940 } 941 942 943 const bool capitalizedWordDemotion = 944 firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion; 945 946 int totalLength = 0; 947 int totalFreq = 0; 948 for (int i = 0; i < wordCount; ++i){ 949 const int wordLength = wordLengthArray[i]; 950 if (wordLength <= 0) { 951 return 0; 952 } 953 totalLength += wordLength; 954 const int demotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (wordLength + 1); 955 int tempFirstFreq = freqArray[i]; 956 multiplyRate(demotionRate, &tempFirstFreq); 957 totalFreq += tempFirstFreq; 958 } 959 960 if (totalLength <= 0 || totalFreq <= 0) { 961 return 0; 962 } 963 964 // TODO: Currently totalFreq is adjusted to two word metrix. 965 // Promote pairFreq with multiplying by 2, because the word length is the same as the typed 966 // length. 967 totalFreq = totalFreq * 2 / wordCount; 968 if (wordCount > 2) { 969 // Safety net for 3+ words -- Caveats: many heuristics and workarounds here. 970 int oneLengthCounter = 0; 971 int twoLengthCounter = 0; 972 for (int i = 0; i < wordCount; ++i) { 973 const int wordLength = wordLengthArray[i]; 974 // TODO: Use bigram instead of this safety net 975 if (i < wordCount - 1) { 976 const int nextWordLength = wordLengthArray[i + 1]; 977 if (wordLength == 1 && nextWordLength == 2) { 978 // Safety net to filter 1 length and 2 length sequential words 979 return 0; 980 } 981 } 982 const int freq = freqArray[i]; 983 // Demote too short weak words 984 if (wordLength <= 4 && freq <= SUPPRESS_SHORT_MULTIPLE_WORDS_THRESHOLD_FREQ) { 985 multiplyRate(100 * freq / MAX_FREQ, &totalFreq); 986 } 987 if (wordLength == 1) { 988 ++oneLengthCounter; 989 } else if (wordLength == 2) { 990 ++twoLengthCounter; 991 } 992 if (oneLengthCounter >= 2 || (oneLengthCounter + twoLengthCounter) >= 4) { 993 // Safety net to filter too many short words 994 return 0; 995 } 996 } 997 multiplyRate(MULTIPLE_WORDS_DEMOTION_RATE, &totalFreq); 998 } 999 1000 // This is a workaround to try offsetting the not-enough-demotion which will be done in 1001 // calcNormalizedScore in Utils.java. 1002 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) 1003 // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by 1004 // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length)) 1005 const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength); 1006 multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq); 1007 1008 // At this moment, totalFreq is calculated by the following formula: 1009 // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1))) 1010 // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1)) 1011 1012 multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq); 1013 1014 // This is another workaround to offset the demotion which will be done in 1015 // calcNormalizedScore in Utils.java. 1016 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote 1017 // the same amount because we already have adjusted the synthetic freq of this "missing or 1018 // mistyped space" suggestion candidate above in this method. 1019 const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength); 1020 multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq); 1021 1022 if (isSpaceProximity) { 1023 // A word pair with one space proximity correction 1024 if (DEBUG_DICT) { 1025 AKLOGI("Found a word pair with space proximity correction."); 1026 } 1027 multiplyIntCapped(typedLetterMultiplier, &totalFreq); 1028 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); 1029 } 1030 1031 if (isSpaceProximity) { 1032 multiplyRate(WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE, &totalFreq); 1033 } else { 1034 multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); 1035 } 1036 1037 if (capitalizedWordDemotion) { 1038 multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq); 1039 } 1040 1041 if (DEBUG_CORRECTION_FREQ) { 1042 AKLOGI("Multiple words (%d, %d) (%d, %d) %d, %d", freqArray[0], freqArray[1], 1043 wordLengthArray[0], wordLengthArray[1], capitalizedWordDemotion, totalFreq); 1044 DUMP_WORD(word, wordLengthArray[0]); 1045 } 1046 1047 return totalFreq; 1048 } 1049 1050 /* Damerau-Levenshtein distance */ 1051 inline static int editDistanceInternal( 1052 int* editDistanceTable, const unsigned short* before, 1053 const int beforeLength, const unsigned short* after, const int afterLength) { 1054 // dp[li][lo] dp[a][b] = dp[ a * lo + b] 1055 int* dp = editDistanceTable; 1056 const int li = beforeLength + 1; 1057 const int lo = afterLength + 1; 1058 for (int i = 0; i < li; ++i) { 1059 dp[lo * i] = i; 1060 } 1061 for (int i = 0; i < lo; ++i) { 1062 dp[i] = i; 1063 } 1064 1065 for (int i = 0; i < li - 1; ++i) { 1066 for (int j = 0; j < lo - 1; ++j) { 1067 const uint32_t ci = toBaseLowerCase(before[i]); 1068 const uint32_t co = toBaseLowerCase(after[j]); 1069 const uint16_t cost = (ci == co) ? 0 : 1; 1070 dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1, 1071 min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost)); 1072 if (i > 0 && j > 0 && ci == toBaseLowerCase(after[j - 1]) 1073 && co == toBaseLowerCase(before[i - 1])) { 1074 dp[(i + 1) * lo + (j + 1)] = min( 1075 dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost); 1076 } 1077 } 1078 } 1079 1080 if (DEBUG_EDIT_DISTANCE) { 1081 AKLOGI("IN = %d, OUT = %d", beforeLength, afterLength); 1082 for (int i = 0; i < li; ++i) { 1083 for (int j = 0; j < lo; ++j) { 1084 AKLOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); 1085 } 1086 } 1087 } 1088 return dp[li * lo - 1]; 1089 } 1090 1091 int Correction::RankingAlgorithm::editDistance(const unsigned short* before, 1092 const int beforeLength, const unsigned short* after, const int afterLength) { 1093 int table[(beforeLength + 1) * (afterLength + 1)]; 1094 return editDistanceInternal(table, before, beforeLength, after, afterLength); 1095 } 1096 1097 1098 // In dictionary.cpp, getSuggestion() method, 1099 // suggestion scores are computed using the below formula. 1100 // original score 1101 // := pow(mTypedLetterMultiplier (this is defined 2), 1102 // (the number of matched characters between typed word and suggested word)) 1103 // * (individual word's score which defined in the unigram dictionary, 1104 // and this score is defined in range [0, 255].) 1105 // Then, the following processing is applied. 1106 // - If the dictionary word is matched up to the point of the user entry 1107 // (full match up to min(before.length(), after.length()) 1108 // => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2) 1109 // - If the word is a true full match except for differences in accents or 1110 // capitalization, then treat it as if the score was 255. 1111 // - If before.length() == after.length() 1112 // => multiply by mFullWordMultiplier (this is defined 2)) 1113 // So, maximum original score is pow(2, min(before.length(), after.length())) * 255 * 2 * 1.2 1114 // For historical reasons we ignore the 1.2 modifier (because the measure for a good 1115 // autocorrection threshold was done at a time when it didn't exist). This doesn't change 1116 // the result. 1117 // So, we can normalize original score by dividing pow(2, min(b.l(),a.l())) * 255 * 2. 1118 1119 /* static */ 1120 float Correction::RankingAlgorithm::calcNormalizedScore(const unsigned short* before, 1121 const int beforeLength, const unsigned short* after, const int afterLength, 1122 const int score) { 1123 if (0 == beforeLength || 0 == afterLength) { 1124 return 0; 1125 } 1126 const int distance = editDistance(before, beforeLength, after, afterLength); 1127 int spaceCount = 0; 1128 for (int i = 0; i < afterLength; ++i) { 1129 if (after[i] == CODE_SPACE) { 1130 ++spaceCount; 1131 } 1132 } 1133 1134 if (spaceCount == afterLength) { 1135 return 0; 1136 } 1137 1138 const float maxScore = score >= S_INT_MAX ? S_INT_MAX : MAX_INITIAL_SCORE 1139 * pow((float)TYPED_LETTER_MULTIPLIER, 1140 (float)min(beforeLength, afterLength - spaceCount)) * FULL_WORD_MULTIPLIER; 1141 1142 // add a weight based on edit distance. 1143 // distance <= max(afterLength, beforeLength) == afterLength, 1144 // so, 0 <= distance / afterLength <= 1 1145 const float weight = 1.0 - (float) distance / afterLength; 1146 return (score / maxScore) * weight; 1147 } 1148 1149 } // namespace latinime 1150