1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <cassert> 18 #include <cctype> 19 #include <cmath> 20 #include <cstring> 21 22 #define LOG_TAG "LatinIME: correction.cpp" 23 24 #include "char_utils.h" 25 #include "correction.h" 26 #include "defines.h" 27 #include "proximity_info_state.h" 28 29 namespace latinime { 30 31 class ProximityInfo; 32 33 ///////////////////////////// 34 // edit distance funcitons // 35 ///////////////////////////// 36 37 inline static void initEditDistance(int *editDistanceTable) { 38 for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) { 39 editDistanceTable[i] = i; 40 } 41 } 42 43 inline static void dumpEditDistance10ForDebug(int *editDistanceTable, 44 const int editDistanceTableWidth, const int outputLength) { 45 if (DEBUG_DICT) { 46 AKLOGI("EditDistanceTable"); 47 for (int i = 0; i <= 10; ++i) { 48 int c[11]; 49 for (int j = 0; j <= 10; ++j) { 50 if (j < editDistanceTableWidth + 1 && i < outputLength + 1) { 51 c[j] = (editDistanceTable + i * (editDistanceTableWidth + 1))[j]; 52 } else { 53 c[j] = -1; 54 } 55 } 56 AKLOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]", 57 c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]); 58 (void)c; // To suppress compiler warning 59 } 60 } 61 } 62 63 inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigned short *input, 64 const int inputSize, const unsigned short *output, const int outputLength) { 65 // TODO: Make sure that editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL] is not touched. 66 // Let dp[i][j] be editDistanceTable[i * (inputSize + 1) + j]. 67 // Assuming that dp[0][0] ... dp[outputLength - 1][inputSize] are already calculated, 68 // and calculate dp[ouputLength][0] ... dp[outputLength][inputSize]. 69 int *const current = editDistanceTable + outputLength * (inputSize + 1); 70 const int *const prev = editDistanceTable + (outputLength - 1) * (inputSize + 1); 71 const int *const prevprev = 72 outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputSize + 1) : 0; 73 current[0] = outputLength; 74 const uint32_t co = toBaseLowerCase(output[outputLength - 1]); 75 const uint32_t prevCO = outputLength >= 2 ? toBaseLowerCase(output[outputLength - 2]) : 0; 76 for (int i = 1; i <= inputSize; ++i) { 77 const uint32_t ci = toBaseLowerCase(input[i - 1]); 78 const uint16_t cost = (ci == co) ? 0 : 1; 79 current[i] = min(current[i - 1] + 1, min(prev[i] + 1, prev[i - 1] + cost)); 80 if (i >= 2 && prevprev && ci == prevCO && co == toBaseLowerCase(input[i - 2])) { 81 current[i] = min(current[i], prevprev[i - 2] + 1); 82 } 83 } 84 } 85 86 inline static int getCurrentEditDistance(int *editDistanceTable, const int editDistanceTableWidth, 87 const int outputLength, const int inputSize) { 88 if (DEBUG_EDIT_DISTANCE) { 89 AKLOGI("getCurrentEditDistance %d, %d", inputSize, outputLength); 90 } 91 return editDistanceTable[(editDistanceTableWidth + 1) * (outputLength) + inputSize]; 92 } 93 94 ////////////////////// 95 // inline functions // 96 ////////////////////// 97 static const char SINGLE_QUOTE = '\''; 98 99 inline bool Correction::isSingleQuote(const unsigned short c) { 100 const unsigned short userTypedChar = mProximityInfoState.getPrimaryCharAt(mInputIndex); 101 return (c == SINGLE_QUOTE && userTypedChar != SINGLE_QUOTE); 102 } 103 104 //////////////// 105 // Correction // 106 //////////////// 107 108 void Correction::resetCorrection() { 109 mTotalTraverseCount = 0; 110 } 111 112 void Correction::initCorrection(const ProximityInfo *pi, const int inputSize, 113 const int maxDepth) { 114 mProximityInfo = pi; 115 mInputSize = inputSize; 116 mMaxDepth = maxDepth; 117 mMaxEditDistance = mInputSize < 5 ? 2 : mInputSize / 2; 118 // TODO: This is not supposed to be required. Check what's going wrong with 119 // editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL] 120 initEditDistance(mEditDistanceTable); 121 } 122 123 void Correction::initCorrectionState( 124 const int rootPos, const int childCount, const bool traverseAll) { 125 latinime::initCorrectionState(mCorrectionStates, rootPos, childCount, traverseAll); 126 // TODO: remove 127 mCorrectionStates[0].mTransposedPos = mTransposedPos; 128 mCorrectionStates[0].mExcessivePos = mExcessivePos; 129 mCorrectionStates[0].mSkipPos = mSkipPos; 130 } 131 132 void Correction::setCorrectionParams(const int skipPos, const int excessivePos, 133 const int transposedPos, const int spaceProximityPos, const int missingSpacePos, 134 const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) { 135 // TODO: remove 136 mTransposedPos = transposedPos; 137 mExcessivePos = excessivePos; 138 mSkipPos = skipPos; 139 // TODO: remove 140 mCorrectionStates[0].mTransposedPos = transposedPos; 141 mCorrectionStates[0].mExcessivePos = excessivePos; 142 mCorrectionStates[0].mSkipPos = skipPos; 143 144 mSpaceProximityPos = spaceProximityPos; 145 mMissingSpacePos = missingSpacePos; 146 mUseFullEditDistance = useFullEditDistance; 147 mDoAutoCompletion = doAutoCompletion; 148 mMaxErrors = maxErrors; 149 } 150 151 void Correction::checkState() { 152 if (DEBUG_DICT) { 153 int inputCount = 0; 154 if (mSkipPos >= 0) ++inputCount; 155 if (mExcessivePos >= 0) ++inputCount; 156 if (mTransposedPos >= 0) ++inputCount; 157 } 158 } 159 160 bool Correction::sameAsTyped() { 161 return mProximityInfoState.sameAsTyped(mWord, mOutputIndex); 162 } 163 164 int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray, 165 const int wordCount, const bool isSpaceProximity, const unsigned short *word) { 166 return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray, 167 wordCount, this, isSpaceProximity, word); 168 } 169 170 int Correction::getFinalProbability(const int probability, unsigned short **word, int *wordLength) { 171 return getFinalProbabilityInternal(probability, word, wordLength, mInputSize); 172 } 173 174 int Correction::getFinalProbabilityForSubQueue(const int probability, unsigned short **word, 175 int *wordLength, const int inputSize) { 176 return getFinalProbabilityInternal(probability, word, wordLength, inputSize); 177 } 178 179 int Correction::getFinalProbabilityInternal(const int probability, unsigned short **word, 180 int *wordLength, const int inputSize) { 181 const int outputIndex = mTerminalOutputIndex; 182 const int inputIndex = mTerminalInputIndex; 183 *wordLength = outputIndex + 1; 184 *word = mWord; 185 int finalProbability= Correction::RankingAlgorithm::calculateFinalProbability( 186 inputIndex, outputIndex, probability, mEditDistanceTable, this, inputSize); 187 return finalProbability; 188 } 189 190 bool Correction::initProcessState(const int outputIndex) { 191 if (mCorrectionStates[outputIndex].mChildCount <= 0) { 192 return false; 193 } 194 mOutputIndex = outputIndex; 195 --(mCorrectionStates[outputIndex].mChildCount); 196 mInputIndex = mCorrectionStates[outputIndex].mInputIndex; 197 mNeedsToTraverseAllNodes = mCorrectionStates[outputIndex].mNeedsToTraverseAllNodes; 198 199 mEquivalentCharCount = mCorrectionStates[outputIndex].mEquivalentCharCount; 200 mProximityCount = mCorrectionStates[outputIndex].mProximityCount; 201 mTransposedCount = mCorrectionStates[outputIndex].mTransposedCount; 202 mExcessiveCount = mCorrectionStates[outputIndex].mExcessiveCount; 203 mSkippedCount = mCorrectionStates[outputIndex].mSkippedCount; 204 mLastCharExceeded = mCorrectionStates[outputIndex].mLastCharExceeded; 205 206 mTransposedPos = mCorrectionStates[outputIndex].mTransposedPos; 207 mExcessivePos = mCorrectionStates[outputIndex].mExcessivePos; 208 mSkipPos = mCorrectionStates[outputIndex].mSkipPos; 209 210 mMatching = false; 211 mProximityMatching = false; 212 mAdditionalProximityMatching = false; 213 mTransposing = false; 214 mExceeding = false; 215 mSkipping = false; 216 217 return true; 218 } 219 220 int Correction::goDownTree( 221 const int parentIndex, const int childCount, const int firstChildPos) { 222 mCorrectionStates[mOutputIndex].mParentIndex = parentIndex; 223 mCorrectionStates[mOutputIndex].mChildCount = childCount; 224 mCorrectionStates[mOutputIndex].mSiblingPos = firstChildPos; 225 return mOutputIndex; 226 } 227 228 // TODO: remove 229 int Correction::getInputIndex() const { 230 return mInputIndex; 231 } 232 233 void Correction::incrementInputIndex() { 234 ++mInputIndex; 235 } 236 237 void Correction::incrementOutputIndex() { 238 ++mOutputIndex; 239 mCorrectionStates[mOutputIndex].mParentIndex = mCorrectionStates[mOutputIndex - 1].mParentIndex; 240 mCorrectionStates[mOutputIndex].mChildCount = mCorrectionStates[mOutputIndex - 1].mChildCount; 241 mCorrectionStates[mOutputIndex].mSiblingPos = mCorrectionStates[mOutputIndex - 1].mSiblingPos; 242 mCorrectionStates[mOutputIndex].mInputIndex = mInputIndex; 243 mCorrectionStates[mOutputIndex].mNeedsToTraverseAllNodes = mNeedsToTraverseAllNodes; 244 245 mCorrectionStates[mOutputIndex].mEquivalentCharCount = mEquivalentCharCount; 246 mCorrectionStates[mOutputIndex].mProximityCount = mProximityCount; 247 mCorrectionStates[mOutputIndex].mTransposedCount = mTransposedCount; 248 mCorrectionStates[mOutputIndex].mExcessiveCount = mExcessiveCount; 249 mCorrectionStates[mOutputIndex].mSkippedCount = mSkippedCount; 250 251 mCorrectionStates[mOutputIndex].mSkipPos = mSkipPos; 252 mCorrectionStates[mOutputIndex].mTransposedPos = mTransposedPos; 253 mCorrectionStates[mOutputIndex].mExcessivePos = mExcessivePos; 254 255 mCorrectionStates[mOutputIndex].mLastCharExceeded = mLastCharExceeded; 256 257 mCorrectionStates[mOutputIndex].mMatching = mMatching; 258 mCorrectionStates[mOutputIndex].mProximityMatching = mProximityMatching; 259 mCorrectionStates[mOutputIndex].mAdditionalProximityMatching = mAdditionalProximityMatching; 260 mCorrectionStates[mOutputIndex].mTransposing = mTransposing; 261 mCorrectionStates[mOutputIndex].mExceeding = mExceeding; 262 mCorrectionStates[mOutputIndex].mSkipping = mSkipping; 263 } 264 265 void Correction::startToTraverseAllNodes() { 266 mNeedsToTraverseAllNodes = true; 267 } 268 269 bool Correction::needsToPrune() const { 270 // TODO: use edit distance here 271 return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance 272 // Allow one char longer word for missing character 273 || (!mDoAutoCompletion && (mOutputIndex > mInputSize)); 274 } 275 276 void Correction::addCharToCurrentWord(const int32_t c) { 277 mWord[mOutputIndex] = c; 278 const unsigned short *primaryInputWord = mProximityInfoState.getPrimaryInputWord(); 279 calcEditDistanceOneStep(mEditDistanceTable, primaryInputWord, mInputSize, 280 mWord, mOutputIndex + 1); 281 } 282 283 Correction::CorrectionType Correction::processSkipChar( 284 const int32_t c, const bool isTerminal, const bool inputIndexIncremented) { 285 addCharToCurrentWord(c); 286 mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0); 287 mTerminalOutputIndex = mOutputIndex; 288 if (mNeedsToTraverseAllNodes && isTerminal) { 289 incrementOutputIndex(); 290 return TRAVERSE_ALL_ON_TERMINAL; 291 } else { 292 incrementOutputIndex(); 293 return TRAVERSE_ALL_NOT_ON_TERMINAL; 294 } 295 } 296 297 Correction::CorrectionType Correction::processUnrelatedCorrectionType() { 298 // Needs to set mTerminalInputIndex and mTerminalOutputIndex before returning any CorrectionType 299 mTerminalInputIndex = mInputIndex; 300 mTerminalOutputIndex = mOutputIndex; 301 return UNRELATED; 302 } 303 304 inline bool isEquivalentChar(ProximityType type) { 305 return type == EQUIVALENT_CHAR; 306 } 307 308 inline bool isProximityCharOrEquivalentChar(ProximityType type) { 309 return type == EQUIVALENT_CHAR || type == NEAR_PROXIMITY_CHAR; 310 } 311 312 Correction::CorrectionType Correction::processCharAndCalcState( 313 const int32_t c, const bool isTerminal) { 314 const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount); 315 if (correctionCount > mMaxErrors) { 316 return processUnrelatedCorrectionType(); 317 } 318 319 // TODO: Change the limit if we'll allow two or more corrections 320 const bool noCorrectionsHappenedSoFar = correctionCount == 0; 321 const bool canTryCorrection = noCorrectionsHappenedSoFar; 322 int proximityIndex = 0; 323 mDistances[mOutputIndex] = NOT_A_DISTANCE; 324 325 // Skip checking this node 326 if (mNeedsToTraverseAllNodes || isSingleQuote(c)) { 327 bool incremented = false; 328 if (mLastCharExceeded && mInputIndex == mInputSize - 1) { 329 // TODO: Do not check the proximity if EditDistance exceeds the threshold 330 const ProximityType matchId = mProximityInfoState.getMatchedProximityId( 331 mInputIndex, c, true, &proximityIndex); 332 if (isEquivalentChar(matchId)) { 333 mLastCharExceeded = false; 334 --mExcessiveCount; 335 mDistances[mOutputIndex] = 336 mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, 0); 337 } else if (matchId == NEAR_PROXIMITY_CHAR) { 338 mLastCharExceeded = false; 339 --mExcessiveCount; 340 ++mProximityCount; 341 mDistances[mOutputIndex] = mProximityInfoState.getNormalizedSquaredDistance( 342 mInputIndex, proximityIndex); 343 } 344 if (!isSingleQuote(c)) { 345 incrementInputIndex(); 346 incremented = true; 347 } 348 } 349 return processSkipChar(c, isTerminal, incremented); 350 } 351 352 // Check possible corrections. 353 if (mExcessivePos >= 0) { 354 if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) { 355 mExcessivePos = mOutputIndex; 356 } 357 if (mExcessivePos < mInputSize - 1) { 358 mExceeding = mExcessivePos == mInputIndex && canTryCorrection; 359 } 360 } 361 362 if (mSkipPos >= 0) { 363 if (mSkippedCount == 0 && mSkipPos < mOutputIndex) { 364 if (DEBUG_DICT) { 365 // TODO: Enable this assertion. 366 //assert(mSkipPos == mOutputIndex - 1); 367 } 368 mSkipPos = mOutputIndex; 369 } 370 mSkipping = mSkipPos == mOutputIndex && canTryCorrection; 371 } 372 373 if (mTransposedPos >= 0) { 374 if (mTransposedCount == 0 && mTransposedPos < mOutputIndex) { 375 mTransposedPos = mOutputIndex; 376 } 377 if (mTransposedPos < mInputSize - 1) { 378 mTransposing = mInputIndex == mTransposedPos && canTryCorrection; 379 } 380 } 381 382 bool secondTransposing = false; 383 if (mTransposedCount % 2 == 1) { 384 if (isEquivalentChar(mProximityInfoState.getMatchedProximityId( 385 mInputIndex - 1, c, false))) { 386 ++mTransposedCount; 387 secondTransposing = true; 388 } else if (mCorrectionStates[mOutputIndex].mExceeding) { 389 --mTransposedCount; 390 ++mExcessiveCount; 391 --mExcessivePos; 392 incrementInputIndex(); 393 } else { 394 --mTransposedCount; 395 if (DEBUG_CORRECTION 396 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) 397 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 398 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 399 DUMP_WORD(mWord, mOutputIndex); 400 AKLOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 401 mTransposedCount, mExcessiveCount, c); 402 } 403 return processUnrelatedCorrectionType(); 404 } 405 } 406 407 // TODO: Change the limit if we'll allow two or more proximity chars with corrections 408 // Work around: When the mMaxErrors is 1, we only allow just one error 409 // including proximity correction. 410 const bool checkProximityChars = (mMaxErrors > 1) 411 ? (noCorrectionsHappenedSoFar || mProximityCount == 0) 412 : (noCorrectionsHappenedSoFar && mProximityCount == 0); 413 414 ProximityType matchedProximityCharId = secondTransposing 415 ? EQUIVALENT_CHAR 416 : mProximityInfoState.getMatchedProximityId( 417 mInputIndex, c, checkProximityChars, &proximityIndex); 418 419 if (UNRELATED_CHAR == matchedProximityCharId 420 || ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { 421 if (canTryCorrection && mOutputIndex > 0 422 && mCorrectionStates[mOutputIndex].mProximityMatching 423 && mCorrectionStates[mOutputIndex].mExceeding 424 && isEquivalentChar(mProximityInfoState.getMatchedProximityId( 425 mInputIndex, mWord[mOutputIndex - 1], false))) { 426 if (DEBUG_CORRECTION 427 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) 428 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 429 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 430 AKLOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]); 431 } 432 // Conversion p->e 433 // Example: 434 // wearth -> earth 435 // px -> (E)mmmmm 436 ++mExcessiveCount; 437 --mProximityCount; 438 mExcessivePos = mOutputIndex - 1; 439 ++mInputIndex; 440 // Here, we are doing something equivalent to matchedProximityCharId, 441 // but we already know that "excessive char correction" just happened 442 // so that we just need to check "mProximityCount == 0". 443 matchedProximityCharId = mProximityInfoState.getMatchedProximityId( 444 mInputIndex, c, mProximityCount == 0, &proximityIndex); 445 } 446 } 447 448 if (UNRELATED_CHAR == matchedProximityCharId 449 || ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { 450 if (ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { 451 mAdditionalProximityMatching = true; 452 } 453 // TODO: Optimize 454 // As the current char turned out to be an unrelated char, 455 // we will try other correction-types. Please note that mCorrectionStates[mOutputIndex] 456 // here refers to the previous state. 457 if (mInputIndex < mInputSize - 1 && mOutputIndex > 0 && mTransposedCount > 0 458 && !mCorrectionStates[mOutputIndex].mTransposing 459 && mCorrectionStates[mOutputIndex - 1].mTransposing 460 && isEquivalentChar(mProximityInfoState.getMatchedProximityId( 461 mInputIndex, mWord[mOutputIndex - 1], false)) 462 && isEquivalentChar( 463 mProximityInfoState.getMatchedProximityId(mInputIndex + 1, c, false))) { 464 // Conversion t->e 465 // Example: 466 // occaisional -> occa sional 467 // mmmmttx -> mmmm(E)mmmmmm 468 mTransposedCount -= 2; 469 ++mExcessiveCount; 470 ++mInputIndex; 471 } else if (mOutputIndex > 0 && mInputIndex > 0 && mTransposedCount > 0 472 && !mCorrectionStates[mOutputIndex].mTransposing 473 && mCorrectionStates[mOutputIndex - 1].mTransposing 474 && isEquivalentChar( 475 mProximityInfoState.getMatchedProximityId(mInputIndex - 1, c, false))) { 476 // Conversion t->s 477 // Example: 478 // chcolate -> chocolate 479 // mmttx -> mmsmmmmmm 480 mTransposedCount -= 2; 481 ++mSkippedCount; 482 --mInputIndex; 483 } else if (canTryCorrection && mInputIndex > 0 484 && mCorrectionStates[mOutputIndex].mProximityMatching 485 && mCorrectionStates[mOutputIndex].mSkipping 486 && isEquivalentChar( 487 mProximityInfoState.getMatchedProximityId(mInputIndex - 1, c, false))) { 488 // Conversion p->s 489 // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of 490 // proximity chars of "s", but it should rather be handled as a skipped char. 491 ++mSkippedCount; 492 --mProximityCount; 493 return processSkipChar(c, isTerminal, false); 494 } else if (mInputIndex - 1 < mInputSize 495 && mSkippedCount > 0 496 && mCorrectionStates[mOutputIndex].mSkipping 497 && mCorrectionStates[mOutputIndex].mAdditionalProximityMatching 498 && isProximityCharOrEquivalentChar( 499 mProximityInfoState.getMatchedProximityId(mInputIndex + 1, c, false))) { 500 // Conversion s->a 501 incrementInputIndex(); 502 --mSkippedCount; 503 mProximityMatching = true; 504 ++mProximityCount; 505 mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO; 506 } else if ((mExceeding || mTransposing) && mInputIndex - 1 < mInputSize 507 && isEquivalentChar( 508 mProximityInfoState.getMatchedProximityId(mInputIndex + 1, c, false))) { 509 // 1.2. Excessive or transpose correction 510 if (mTransposing) { 511 ++mTransposedCount; 512 } else { 513 ++mExcessiveCount; 514 incrementInputIndex(); 515 } 516 if (DEBUG_CORRECTION 517 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) 518 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 519 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 520 DUMP_WORD(mWord, mOutputIndex); 521 if (mTransposing) { 522 AKLOGI("TRANSPOSE: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 523 mTransposedCount, mExcessiveCount, c); 524 } else { 525 AKLOGI("EXCEED: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 526 mTransposedCount, mExcessiveCount, c); 527 } 528 } 529 } else if (mSkipping) { 530 // 3. Skip correction 531 ++mSkippedCount; 532 if (DEBUG_CORRECTION 533 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) 534 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 535 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 536 AKLOGI("SKIP: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 537 mTransposedCount, mExcessiveCount, c); 538 } 539 return processSkipChar(c, isTerminal, false); 540 } else if (ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { 541 // As a last resort, use additional proximity characters 542 mProximityMatching = true; 543 ++mProximityCount; 544 mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO; 545 if (DEBUG_CORRECTION 546 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) 547 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 548 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 549 AKLOGI("ADDITIONALPROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 550 mTransposedCount, mExcessiveCount, c); 551 } 552 } else { 553 if (DEBUG_CORRECTION 554 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) 555 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 556 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 557 DUMP_WORD(mWord, mOutputIndex); 558 AKLOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 559 mTransposedCount, mExcessiveCount, c); 560 } 561 return processUnrelatedCorrectionType(); 562 } 563 } else if (secondTransposing) { 564 // If inputIndex is greater than mInputSize, that means there is no 565 // proximity chars. So, we don't need to check proximity. 566 mMatching = true; 567 } else if (isEquivalentChar(matchedProximityCharId)) { 568 mMatching = true; 569 ++mEquivalentCharCount; 570 mDistances[mOutputIndex] = mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, 0); 571 } else if (NEAR_PROXIMITY_CHAR == matchedProximityCharId) { 572 mProximityMatching = true; 573 ++mProximityCount; 574 mDistances[mOutputIndex] = 575 mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, proximityIndex); 576 if (DEBUG_CORRECTION 577 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) 578 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 579 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 580 AKLOGI("PROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 581 mTransposedCount, mExcessiveCount, c); 582 } 583 } 584 585 addCharToCurrentWord(c); 586 587 // 4. Last char excessive correction 588 mLastCharExceeded = mExcessiveCount == 0 && mSkippedCount == 0 && mTransposedCount == 0 589 && mProximityCount == 0 && (mInputIndex == mInputSize - 2); 590 const bool isSameAsUserTypedLength = (mInputSize == mInputIndex + 1) || mLastCharExceeded; 591 if (mLastCharExceeded) { 592 ++mExcessiveCount; 593 } 594 595 // Start traversing all nodes after the index exceeds the user typed length 596 if (isSameAsUserTypedLength) { 597 startToTraverseAllNodes(); 598 } 599 600 const bool needsToTryOnTerminalForTheLastPossibleExcessiveChar = 601 mExceeding && mInputIndex == mInputSize - 2; 602 603 // Finally, we are ready to go to the next character, the next "virtual node". 604 // We should advance the input index. 605 // We do this in this branch of the 'if traverseAllNodes' because we are still matching 606 // characters to input; the other branch is not matching them but searching for 607 // completions, this is why it does not have to do it. 608 incrementInputIndex(); 609 // Also, the next char is one "virtual node" depth more than this char. 610 incrementOutputIndex(); 611 612 if ((needsToTryOnTerminalForTheLastPossibleExcessiveChar 613 || isSameAsUserTypedLength) && isTerminal) { 614 mTerminalInputIndex = mInputIndex - 1; 615 mTerminalOutputIndex = mOutputIndex - 1; 616 if (DEBUG_CORRECTION 617 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) 618 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { 619 DUMP_WORD(mWord, mOutputIndex); 620 AKLOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, 621 mTransposedCount, mExcessiveCount, c); 622 } 623 return ON_TERMINAL; 624 } else { 625 mTerminalInputIndex = mInputIndex - 1; 626 mTerminalOutputIndex = mOutputIndex - 1; 627 return NOT_ON_TERMINAL; 628 } 629 } 630 631 inline static int getQuoteCount(const unsigned short *word, const int length) { 632 int quoteCount = 0; 633 for (int i = 0; i < length; ++i) { 634 if (word[i] == SINGLE_QUOTE) { 635 ++quoteCount; 636 } 637 } 638 return quoteCount; 639 } 640 641 inline static bool isUpperCase(unsigned short c) { 642 return isAsciiUpper(toBaseChar(c)); 643 } 644 645 ////////////////////// 646 // RankingAlgorithm // 647 ////////////////////// 648 649 /* static */ 650 int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex, 651 const int outputIndex, const int freq, int *editDistanceTable, const Correction *correction, 652 const int inputSize) { 653 const int excessivePos = correction->getExcessivePos(); 654 const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; 655 const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER; 656 const ProximityInfoState *proximityInfoState = &correction->mProximityInfoState; 657 const int skippedCount = correction->mSkippedCount; 658 const int transposedCount = correction->mTransposedCount / 2; 659 const int excessiveCount = correction->mExcessiveCount + correction->mTransposedCount % 2; 660 const int proximityMatchedCount = correction->mProximityCount; 661 const bool lastCharExceeded = correction->mLastCharExceeded; 662 const bool useFullEditDistance = correction->mUseFullEditDistance; 663 const int outputLength = outputIndex + 1; 664 if (skippedCount >= inputSize || inputSize == 0) { 665 return -1; 666 } 667 668 // TODO: find more robust way 669 bool sameLength = lastCharExceeded ? (inputSize == inputIndex + 2) 670 : (inputSize == inputIndex + 1); 671 672 // TODO: use mExcessiveCount 673 const int matchCount = inputSize - correction->mProximityCount - excessiveCount; 674 675 const unsigned short *word = correction->mWord; 676 const bool skipped = skippedCount > 0; 677 678 const int quoteDiffCount = max(0, getQuoteCount(word, outputLength) 679 - getQuoteCount(proximityInfoState->getPrimaryInputWord(), inputSize)); 680 681 // TODO: Calculate edit distance for transposed and excessive 682 int ed = 0; 683 if (DEBUG_DICT_FULL) { 684 dumpEditDistance10ForDebug(editDistanceTable, correction->mInputSize, outputLength); 685 } 686 int adjustedProximityMatchedCount = proximityMatchedCount; 687 688 int finalFreq = freq; 689 690 if (DEBUG_CORRECTION_FREQ 691 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputSize)) { 692 AKLOGI("FinalFreq0: %d", finalFreq); 693 } 694 // TODO: Optimize this. 695 if (transposedCount > 0 || proximityMatchedCount > 0 || skipped || excessiveCount > 0) { 696 ed = getCurrentEditDistance(editDistanceTable, correction->mInputSize, outputLength, 697 inputSize) - transposedCount; 698 699 const int matchWeight = powerIntCapped(typedLetterMultiplier, 700 max(inputSize, outputLength) - ed); 701 multiplyIntCapped(matchWeight, &finalFreq); 702 703 // TODO: Demote further if there are two or more excessive chars with longer user input? 704 if (inputSize > outputLength) { 705 multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq); 706 } 707 708 ed = max(0, ed - quoteDiffCount); 709 adjustedProximityMatchedCount = min(max(0, ed - (outputLength - inputSize)), 710 proximityMatchedCount); 711 if (transposedCount <= 0) { 712 if (ed == 1 && (inputSize == outputLength - 1 || inputSize == outputLength + 1)) { 713 // Promote a word with just one skipped or excessive char 714 if (sameLength) { 715 multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE 716 + WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_MULTIPLIER * outputLength, 717 &finalFreq); 718 } else { 719 multiplyIntCapped(typedLetterMultiplier, &finalFreq); 720 } 721 } else if (ed == 0) { 722 multiplyIntCapped(typedLetterMultiplier, &finalFreq); 723 sameLength = true; 724 } 725 } 726 } else { 727 const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount); 728 multiplyIntCapped(matchWeight, &finalFreq); 729 } 730 731 if (proximityInfoState->getMatchedProximityId(0, word[0], true) == UNRELATED_CHAR) { 732 multiplyRate(FIRST_CHAR_DIFFERENT_DEMOTION_RATE, &finalFreq); 733 } 734 735 /////////////////////////////////////////////// 736 // Promotion and Demotion for each correction 737 738 // Demotion for a word with missing character 739 if (skipped) { 740 const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 741 * (10 * inputSize - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X) 742 / (10 * inputSize 743 - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); 744 if (DEBUG_DICT_FULL) { 745 AKLOGI("Demotion rate for missing character is %d.", demotionRate); 746 } 747 multiplyRate(demotionRate, &finalFreq); 748 } 749 750 // Demotion for a word with transposed character 751 if (transposedCount > 0) multiplyRate( 752 WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq); 753 754 // Demotion for a word with excessive character 755 if (excessiveCount > 0) { 756 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq); 757 if (!lastCharExceeded && !proximityInfoState->existsAdjacentProximityChars(excessivePos)) { 758 if (DEBUG_DICT_FULL) { 759 AKLOGI("Double excessive demotion"); 760 } 761 // If an excessive character is not adjacent to the left char or the right char, 762 // we will demote this word. 763 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq); 764 } 765 } 766 767 int additionalProximityCount = 0; 768 // Demote additional proximity characters 769 for (int i = 0; i < outputLength; ++i) { 770 const int squaredDistance = correction->mDistances[i]; 771 if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) { 772 ++additionalProximityCount; 773 } 774 } 775 776 const bool performTouchPositionCorrection = 777 CALIBRATE_SCORE_BY_TOUCH_COORDINATES 778 && proximityInfoState->touchPositionCorrectionEnabled() 779 && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0 780 && additionalProximityCount == 0; 781 782 // Score calibration by touch coordinates is being done only for pure-fat finger typing error 783 // cases. 784 // TODO: Remove this constraint. 785 if (performTouchPositionCorrection) { 786 for (int i = 0; i < outputLength; ++i) { 787 const int squaredDistance = correction->mDistances[i]; 788 if (i < adjustedProximityMatchedCount) { 789 multiplyIntCapped(typedLetterMultiplier, &finalFreq); 790 } 791 if (squaredDistance >= 0) { 792 // Promote or demote the score according to the distance from the sweet spot 793 static const float A = ZERO_DISTANCE_PROMOTION_RATE / 100.0f; 794 static const float B = 1.0f; 795 static const float C = 0.5f; 796 static const float MIN = 0.3f; 797 static const float R1 = NEUTRAL_SCORE_SQUARED_RADIUS; 798 static const float R2 = HALF_SCORE_SQUARED_RADIUS; 799 const float x = static_cast<float>(squaredDistance) 800 / ProximityInfoState::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR; 801 const float factor = max((x < R1) 802 ? (A * (R1 - x) + B * x) / R1 803 : (B * (R2 - x) + C * (x - R1)) / (R2 - R1), MIN); 804 // factor is a piecewise linear function like: 805 // A -_ . 806 // ^-_ . 807 // B \ . 808 // \_ . 809 // C ------------. 810 // . 811 // 0 R1 R2 . 812 multiplyRate((int)(factor * 100.0f), &finalFreq); 813 } else if (squaredDistance == PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO) { 814 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); 815 } 816 } 817 } else { 818 // Promotion for a word with proximity characters 819 for (int i = 0; i < adjustedProximityMatchedCount; ++i) { 820 // A word with proximity corrections 821 if (DEBUG_DICT_FULL) { 822 AKLOGI("Found a proximity correction."); 823 } 824 multiplyIntCapped(typedLetterMultiplier, &finalFreq); 825 if (i < additionalProximityCount) { 826 multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); 827 } else { 828 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); 829 } 830 } 831 } 832 833 // If the user types too many(three or more) proximity characters with additional proximity 834 // character,do not treat as the same length word. 835 if (sameLength && additionalProximityCount > 0 && (adjustedProximityMatchedCount >= 3 836 || transposedCount > 0 || skipped || excessiveCount > 0)) { 837 sameLength = false; 838 } 839 840 const int errorCount = adjustedProximityMatchedCount > 0 841 ? adjustedProximityMatchedCount 842 : (proximityMatchedCount + transposedCount); 843 multiplyRate( 844 100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputSize, &finalFreq); 845 846 // Promotion for an exactly matched word 847 if (ed == 0) { 848 // Full exact match 849 if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0 850 && quoteDiffCount == 0 && additionalProximityCount == 0) { 851 finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); 852 } 853 } 854 855 // Promote a word with no correction 856 if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0 857 && additionalProximityCount == 0) { 858 multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); 859 } 860 861 // TODO: Check excessive count and transposed count 862 // TODO: Remove this if possible 863 /* 864 If the last character of the user input word is the same as the next character 865 of the output word, and also all of characters of the user input are matched 866 to the output word, we'll promote that word a bit because 867 that word can be considered the combination of skipped and matched characters. 868 This means that the 'sm' pattern wins over the 'ma' pattern. 869 e.g.) 870 shel -> shell [mmmma] or [mmmsm] 871 hel -> hello [mmmaa] or [mmsma] 872 m ... matching 873 s ... skipping 874 a ... traversing all 875 t ... transposing 876 e ... exceeding 877 p ... proximity matching 878 */ 879 if (matchCount == inputSize && matchCount >= 2 && !skipped 880 && word[matchCount] == word[matchCount - 1]) { 881 multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq); 882 } 883 884 // TODO: Do not use sameLength? 885 if (sameLength) { 886 multiplyIntCapped(fullWordMultiplier, &finalFreq); 887 } 888 889 if (useFullEditDistance && outputLength > inputSize + 1) { 890 const int diff = outputLength - inputSize - 1; 891 const int divider = diff < 31 ? 1 << diff : S_INT_MAX; 892 finalFreq = divider > finalFreq ? 1 : finalFreq / divider; 893 } 894 895 if (DEBUG_DICT_FULL) { 896 AKLOGI("calc: %d, %d", outputLength, sameLength); 897 } 898 899 if (DEBUG_CORRECTION_FREQ 900 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputSize)) { 901 DUMP_WORD(correction->getPrimaryInputWord(), inputSize); 902 DUMP_WORD(correction->mWord, outputLength); 903 AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d, A%d] %d, %d, %d, %d, %d, %d", proximityMatchedCount, 904 skippedCount, transposedCount, excessiveCount, additionalProximityCount, 905 outputLength, lastCharExceeded, sameLength, quoteDiffCount, ed, finalFreq); 906 } 907 908 return finalFreq; 909 } 910 911 /* static */ 912 int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords( 913 const int *freqArray, const int *wordLengthArray, const int wordCount, 914 const Correction *correction, const bool isSpaceProximity, const unsigned short *word) { 915 const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; 916 917 bool firstCapitalizedWordDemotion = false; 918 bool secondCapitalizedWordDemotion = false; 919 920 { 921 // TODO: Handle multiple capitalized word demotion properly 922 const int firstWordLength = wordLengthArray[0]; 923 const int secondWordLength = wordLengthArray[1]; 924 if (firstWordLength >= 2) { 925 firstCapitalizedWordDemotion = isUpperCase(word[0]); 926 } 927 928 if (secondWordLength >= 2) { 929 // FIXME: word[firstWordLength + 1] is incorrect. 930 secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]); 931 } 932 } 933 934 935 const bool capitalizedWordDemotion = 936 firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion; 937 938 int totalLength = 0; 939 int totalFreq = 0; 940 for (int i = 0; i < wordCount; ++i) { 941 const int wordLength = wordLengthArray[i]; 942 if (wordLength <= 0) { 943 return 0; 944 } 945 totalLength += wordLength; 946 const int demotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (wordLength + 1); 947 int tempFirstFreq = freqArray[i]; 948 multiplyRate(demotionRate, &tempFirstFreq); 949 totalFreq += tempFirstFreq; 950 } 951 952 if (totalLength <= 0 || totalFreq <= 0) { 953 return 0; 954 } 955 956 // TODO: Currently totalFreq is adjusted to two word metrix. 957 // Promote pairFreq with multiplying by 2, because the word length is the same as the typed 958 // length. 959 totalFreq = totalFreq * 2 / wordCount; 960 if (wordCount > 2) { 961 // Safety net for 3+ words -- Caveats: many heuristics and workarounds here. 962 int oneLengthCounter = 0; 963 int twoLengthCounter = 0; 964 for (int i = 0; i < wordCount; ++i) { 965 const int wordLength = wordLengthArray[i]; 966 // TODO: Use bigram instead of this safety net 967 if (i < wordCount - 1) { 968 const int nextWordLength = wordLengthArray[i + 1]; 969 if (wordLength == 1 && nextWordLength == 2) { 970 // Safety net to filter 1 length and 2 length sequential words 971 return 0; 972 } 973 } 974 const int freq = freqArray[i]; 975 // Demote too short weak words 976 if (wordLength <= 4 && freq <= SUPPRESS_SHORT_MULTIPLE_WORDS_THRESHOLD_FREQ) { 977 multiplyRate(100 * freq / MAX_FREQ, &totalFreq); 978 } 979 if (wordLength == 1) { 980 ++oneLengthCounter; 981 } else if (wordLength == 2) { 982 ++twoLengthCounter; 983 } 984 if (oneLengthCounter >= 2 || (oneLengthCounter + twoLengthCounter) >= 4) { 985 // Safety net to filter too many short words 986 return 0; 987 } 988 } 989 multiplyRate(MULTIPLE_WORDS_DEMOTION_RATE, &totalFreq); 990 } 991 992 // This is a workaround to try offsetting the not-enough-demotion which will be done in 993 // calcNormalizedScore in Utils.java. 994 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) 995 // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by 996 // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length)) 997 const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength); 998 multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq); 999 1000 // At this moment, totalFreq is calculated by the following formula: 1001 // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1))) 1002 // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1)) 1003 1004 multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq); 1005 1006 // This is another workaround to offset the demotion which will be done in 1007 // calcNormalizedScore in Utils.java. 1008 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote 1009 // the same amount because we already have adjusted the synthetic freq of this "missing or 1010 // mistyped space" suggestion candidate above in this method. 1011 const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength); 1012 multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq); 1013 1014 if (isSpaceProximity) { 1015 // A word pair with one space proximity correction 1016 if (DEBUG_DICT) { 1017 AKLOGI("Found a word pair with space proximity correction."); 1018 } 1019 multiplyIntCapped(typedLetterMultiplier, &totalFreq); 1020 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); 1021 } 1022 1023 if (isSpaceProximity) { 1024 multiplyRate(WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE, &totalFreq); 1025 } else { 1026 multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); 1027 } 1028 1029 if (capitalizedWordDemotion) { 1030 multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq); 1031 } 1032 1033 if (DEBUG_CORRECTION_FREQ) { 1034 AKLOGI("Multiple words (%d, %d) (%d, %d) %d, %d", freqArray[0], freqArray[1], 1035 wordLengthArray[0], wordLengthArray[1], capitalizedWordDemotion, totalFreq); 1036 DUMP_WORD(word, wordLengthArray[0]); 1037 } 1038 1039 return totalFreq; 1040 } 1041 1042 /* Damerau-Levenshtein distance */ 1043 inline static int editDistanceInternal( 1044 int *editDistanceTable, const unsigned short *before, 1045 const int beforeLength, const unsigned short *after, const int afterLength) { 1046 // dp[li][lo] dp[a][b] = dp[ a * lo + b] 1047 int *dp = editDistanceTable; 1048 const int li = beforeLength + 1; 1049 const int lo = afterLength + 1; 1050 for (int i = 0; i < li; ++i) { 1051 dp[lo * i] = i; 1052 } 1053 for (int i = 0; i < lo; ++i) { 1054 dp[i] = i; 1055 } 1056 1057 for (int i = 0; i < li - 1; ++i) { 1058 for (int j = 0; j < lo - 1; ++j) { 1059 const uint32_t ci = toBaseLowerCase(before[i]); 1060 const uint32_t co = toBaseLowerCase(after[j]); 1061 const uint16_t cost = (ci == co) ? 0 : 1; 1062 dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1, 1063 min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost)); 1064 if (i > 0 && j > 0 && ci == toBaseLowerCase(after[j - 1]) 1065 && co == toBaseLowerCase(before[i - 1])) { 1066 dp[(i + 1) * lo + (j + 1)] = min( 1067 dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost); 1068 } 1069 } 1070 } 1071 1072 if (DEBUG_EDIT_DISTANCE) { 1073 AKLOGI("IN = %d, OUT = %d", beforeLength, afterLength); 1074 for (int i = 0; i < li; ++i) { 1075 for (int j = 0; j < lo; ++j) { 1076 AKLOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); 1077 } 1078 } 1079 } 1080 return dp[li * lo - 1]; 1081 } 1082 1083 int Correction::RankingAlgorithm::editDistance(const unsigned short *before, 1084 const int beforeLength, const unsigned short *after, const int afterLength) { 1085 int table[(beforeLength + 1) * (afterLength + 1)]; 1086 return editDistanceInternal(table, before, beforeLength, after, afterLength); 1087 } 1088 1089 1090 // In dictionary.cpp, getSuggestion() method, 1091 // suggestion scores are computed using the below formula. 1092 // original score 1093 // := powf(mTypedLetterMultiplier (this is defined 2), 1094 // (the number of matched characters between typed word and suggested word)) 1095 // * (individual word's score which defined in the unigram dictionary, 1096 // and this score is defined in range [0, 255].) 1097 // Then, the following processing is applied. 1098 // - If the dictionary word is matched up to the point of the user entry 1099 // (full match up to min(before.length(), after.length()) 1100 // => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2) 1101 // - If the word is a true full match except for differences in accents or 1102 // capitalization, then treat it as if the score was 255. 1103 // - If before.length() == after.length() 1104 // => multiply by mFullWordMultiplier (this is defined 2)) 1105 // So, maximum original score is powf(2, min(before.length(), after.length())) * 255 * 2 * 1.2 1106 // For historical reasons we ignore the 1.2 modifier (because the measure for a good 1107 // autocorrection threshold was done at a time when it didn't exist). This doesn't change 1108 // the result. 1109 // So, we can normalize original score by dividing powf(2, min(b.l(),a.l())) * 255 * 2. 1110 1111 /* static */ 1112 float Correction::RankingAlgorithm::calcNormalizedScore(const unsigned short *before, 1113 const int beforeLength, const unsigned short *after, const int afterLength, 1114 const int score) { 1115 if (0 == beforeLength || 0 == afterLength) { 1116 return 0; 1117 } 1118 const int distance = editDistance(before, beforeLength, after, afterLength); 1119 int spaceCount = 0; 1120 for (int i = 0; i < afterLength; ++i) { 1121 if (after[i] == CODE_SPACE) { 1122 ++spaceCount; 1123 } 1124 } 1125 1126 if (spaceCount == afterLength) { 1127 return 0; 1128 } 1129 1130 const float maxScore = score >= S_INT_MAX ? static_cast<float>(S_INT_MAX) 1131 : static_cast<float>(MAX_INITIAL_SCORE) 1132 * powf(static_cast<float>(TYPED_LETTER_MULTIPLIER), 1133 static_cast<float>(min(beforeLength, afterLength - spaceCount))) 1134 * static_cast<float>(FULL_WORD_MULTIPLIER); 1135 1136 // add a weight based on edit distance. 1137 // distance <= max(afterLength, beforeLength) == afterLength, 1138 // so, 0 <= distance / afterLength <= 1 1139 const float weight = 1.0f - static_cast<float>(distance) / static_cast<float>(afterLength); 1140 return (static_cast<float>(score) / maxScore) * weight; 1141 } 1142 } // namespace latinime 1143