Home | History | Annotate | Download | only in src
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #define LOG_TAG "LatinIME: correction.cpp"
     18 
     19 #include <cmath>
     20 
     21 #include "char_utils.h"
     22 #include "correction.h"
     23 #include "defines.h"
     24 #include "proximity_info_state.h"
     25 #include "suggest_utils.h"
     26 #include "suggest/policyimpl/utils/edit_distance.h"
     27 #include "suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h"
     28 
     29 namespace latinime {
     30 
     31 class ProximityInfo;
     32 
     33 /////////////////////////////
     34 // edit distance funcitons //
     35 /////////////////////////////
     36 
     37 inline static void initEditDistance(int *editDistanceTable) {
     38     for (int i = 0; i <= MAX_WORD_LENGTH; ++i) {
     39         editDistanceTable[i] = i;
     40     }
     41 }
     42 
     43 inline static void dumpEditDistance10ForDebug(int *editDistanceTable,
     44         const int editDistanceTableWidth, const int outputLength) {
     45     if (DEBUG_DICT) {
     46         AKLOGI("EditDistanceTable");
     47         for (int i = 0; i <= 10; ++i) {
     48             int c[11];
     49             for (int j = 0; j <= 10; ++j) {
     50                 if (j < editDistanceTableWidth + 1 && i < outputLength + 1) {
     51                     c[j] = (editDistanceTable + i * (editDistanceTableWidth + 1))[j];
     52                 } else {
     53                     c[j] = -1;
     54                 }
     55             }
     56             AKLOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]",
     57                     c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]);
     58             (void)c; // To suppress compiler warning
     59         }
     60     }
     61 }
     62 
     63 inline static int getCurrentEditDistance(int *editDistanceTable, const int editDistanceTableWidth,
     64         const int outputLength, const int inputSize) {
     65     if (DEBUG_EDIT_DISTANCE) {
     66         AKLOGI("getCurrentEditDistance %d, %d", inputSize, outputLength);
     67     }
     68     return editDistanceTable[(editDistanceTableWidth + 1) * (outputLength) + inputSize];
     69 }
     70 
     71 ////////////////
     72 // Correction //
     73 ////////////////
     74 
     75 void Correction::resetCorrection() {
     76     mTotalTraverseCount = 0;
     77 }
     78 
     79 void Correction::initCorrection(const ProximityInfo *pi, const int inputSize, const int maxDepth) {
     80     mProximityInfo = pi;
     81     mInputSize = inputSize;
     82     mMaxDepth = maxDepth;
     83     mMaxEditDistance = mInputSize < 5 ? 2 : mInputSize / 2;
     84     // TODO: This is not supposed to be required.  Check what's going wrong with
     85     // editDistance[0 ~ MAX_WORD_LENGTH]
     86     initEditDistance(mEditDistanceTable);
     87 }
     88 
     89 void Correction::initCorrectionState(
     90         const int rootPos, const int childCount, const bool traverseAll) {
     91     latinime::initCorrectionState(mCorrectionStates, rootPos, childCount, traverseAll);
     92     // TODO: remove
     93     mCorrectionStates[0].mTransposedPos = mTransposedPos;
     94     mCorrectionStates[0].mExcessivePos = mExcessivePos;
     95     mCorrectionStates[0].mSkipPos = mSkipPos;
     96 }
     97 
     98 void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
     99         const int transposedPos, const int spaceProximityPos, const int missingSpacePos,
    100         const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) {
    101     // TODO: remove
    102     mTransposedPos = transposedPos;
    103     mExcessivePos = excessivePos;
    104     mSkipPos = skipPos;
    105     // TODO: remove
    106     mCorrectionStates[0].mTransposedPos = transposedPos;
    107     mCorrectionStates[0].mExcessivePos = excessivePos;
    108     mCorrectionStates[0].mSkipPos = skipPos;
    109 
    110     mSpaceProximityPos = spaceProximityPos;
    111     mMissingSpacePos = missingSpacePos;
    112     mUseFullEditDistance = useFullEditDistance;
    113     mDoAutoCompletion = doAutoCompletion;
    114     mMaxErrors = maxErrors;
    115 }
    116 
    117 void Correction::checkState() const {
    118     if (DEBUG_DICT) {
    119         int inputCount = 0;
    120         if (mSkipPos >= 0) ++inputCount;
    121         if (mExcessivePos >= 0) ++inputCount;
    122         if (mTransposedPos >= 0) ++inputCount;
    123     }
    124 }
    125 
    126 bool Correction::sameAsTyped() const {
    127     return mProximityInfoState.sameAsTyped(mWord, mOutputIndex);
    128 }
    129 
    130 int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray,
    131         const int wordCount, const bool isSpaceProximity, const int *word) const {
    132     return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray,
    133             wordCount, this, isSpaceProximity, word);
    134 }
    135 
    136 int Correction::getFinalProbability(const int probability, int **word, int *wordLength) {
    137     return getFinalProbabilityInternal(probability, word, wordLength, mInputSize);
    138 }
    139 
    140 int Correction::getFinalProbabilityForSubQueue(const int probability, int **word, int *wordLength,
    141         const int inputSize) {
    142     return getFinalProbabilityInternal(probability, word, wordLength, inputSize);
    143 }
    144 
    145 bool Correction::initProcessState(const int outputIndex) {
    146     if (mCorrectionStates[outputIndex].mChildCount <= 0) {
    147         return false;
    148     }
    149     mOutputIndex = outputIndex;
    150     --(mCorrectionStates[outputIndex].mChildCount);
    151     mInputIndex = mCorrectionStates[outputIndex].mInputIndex;
    152     mNeedsToTraverseAllNodes = mCorrectionStates[outputIndex].mNeedsToTraverseAllNodes;
    153 
    154     mEquivalentCharCount = mCorrectionStates[outputIndex].mEquivalentCharCount;
    155     mProximityCount = mCorrectionStates[outputIndex].mProximityCount;
    156     mTransposedCount = mCorrectionStates[outputIndex].mTransposedCount;
    157     mExcessiveCount = mCorrectionStates[outputIndex].mExcessiveCount;
    158     mSkippedCount = mCorrectionStates[outputIndex].mSkippedCount;
    159     mLastCharExceeded = mCorrectionStates[outputIndex].mLastCharExceeded;
    160 
    161     mTransposedPos = mCorrectionStates[outputIndex].mTransposedPos;
    162     mExcessivePos = mCorrectionStates[outputIndex].mExcessivePos;
    163     mSkipPos = mCorrectionStates[outputIndex].mSkipPos;
    164 
    165     mMatching = false;
    166     mProximityMatching = false;
    167     mAdditionalProximityMatching = false;
    168     mTransposing = false;
    169     mExceeding = false;
    170     mSkipping = false;
    171 
    172     return true;
    173 }
    174 
    175 int Correction::goDownTree(const int parentIndex, const int childCount, const int firstChildPos) {
    176     mCorrectionStates[mOutputIndex].mParentIndex = parentIndex;
    177     mCorrectionStates[mOutputIndex].mChildCount = childCount;
    178     mCorrectionStates[mOutputIndex].mSiblingPos = firstChildPos;
    179     return mOutputIndex;
    180 }
    181 
    182 // TODO: remove
    183 int Correction::getInputIndex() const {
    184     return mInputIndex;
    185 }
    186 
    187 bool Correction::needsToPrune() const {
    188     // TODO: use edit distance here
    189     return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance
    190             // Allow one char longer word for missing character
    191             || (!mDoAutoCompletion && (mOutputIndex > mInputSize));
    192 }
    193 
    194 inline static bool isEquivalentChar(ProximityType type) {
    195     return type == MATCH_CHAR;
    196 }
    197 
    198 inline static bool isProximityCharOrEquivalentChar(ProximityType type) {
    199     return type == MATCH_CHAR || type == PROXIMITY_CHAR;
    200 }
    201 
    202 Correction::CorrectionType Correction::processCharAndCalcState(const int c, const bool isTerminal) {
    203     const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount);
    204     if (correctionCount > mMaxErrors) {
    205         return processUnrelatedCorrectionType();
    206     }
    207 
    208     // TODO: Change the limit if we'll allow two or more corrections
    209     const bool noCorrectionsHappenedSoFar = correctionCount == 0;
    210     const bool canTryCorrection = noCorrectionsHappenedSoFar;
    211     int proximityIndex = 0;
    212     mDistances[mOutputIndex] = NOT_A_DISTANCE;
    213 
    214     // Skip checking this node
    215     if (mNeedsToTraverseAllNodes || isSingleQuote(c)) {
    216         bool incremented = false;
    217         if (mLastCharExceeded && mInputIndex == mInputSize - 1) {
    218             // TODO: Do not check the proximity if EditDistance exceeds the threshold
    219             const ProximityType matchId = mProximityInfoState.getProximityType(
    220                     mInputIndex, c, true, &proximityIndex);
    221             if (isEquivalentChar(matchId)) {
    222                 mLastCharExceeded = false;
    223                 --mExcessiveCount;
    224                 mDistances[mOutputIndex] =
    225                         mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, 0);
    226             } else if (matchId == PROXIMITY_CHAR) {
    227                 mLastCharExceeded = false;
    228                 --mExcessiveCount;
    229                 ++mProximityCount;
    230                 mDistances[mOutputIndex] = mProximityInfoState.getNormalizedSquaredDistance(
    231                         mInputIndex, proximityIndex);
    232             }
    233             if (!isSingleQuote(c)) {
    234                 incrementInputIndex();
    235                 incremented = true;
    236             }
    237         }
    238         return processSkipChar(c, isTerminal, incremented);
    239     }
    240 
    241     // Check possible corrections.
    242     if (mExcessivePos >= 0) {
    243         if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) {
    244             mExcessivePos = mOutputIndex;
    245         }
    246         if (mExcessivePos < mInputSize - 1) {
    247             mExceeding = mExcessivePos == mInputIndex && canTryCorrection;
    248         }
    249     }
    250 
    251     if (mSkipPos >= 0) {
    252         if (mSkippedCount == 0 && mSkipPos < mOutputIndex) {
    253             if (DEBUG_DICT) {
    254                 // TODO: Enable this assertion.
    255                 //ASSERT(mSkipPos == mOutputIndex - 1);
    256             }
    257             mSkipPos = mOutputIndex;
    258         }
    259         mSkipping = mSkipPos == mOutputIndex && canTryCorrection;
    260     }
    261 
    262     if (mTransposedPos >= 0) {
    263         if (mTransposedCount == 0 && mTransposedPos < mOutputIndex) {
    264             mTransposedPos = mOutputIndex;
    265         }
    266         if (mTransposedPos < mInputSize - 1) {
    267             mTransposing = mInputIndex == mTransposedPos && canTryCorrection;
    268         }
    269     }
    270 
    271     bool secondTransposing = false;
    272     if (mTransposedCount % 2 == 1) {
    273         if (isEquivalentChar(mProximityInfoState.getProximityType(
    274                 mInputIndex - 1, c, false))) {
    275             ++mTransposedCount;
    276             secondTransposing = true;
    277         } else if (mCorrectionStates[mOutputIndex].mExceeding) {
    278             --mTransposedCount;
    279             ++mExcessiveCount;
    280             --mExcessivePos;
    281             incrementInputIndex();
    282         } else {
    283             --mTransposedCount;
    284             if (DEBUG_CORRECTION
    285                     && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize)
    286                     && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
    287                             || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
    288                 DUMP_WORD(mWord, mOutputIndex);
    289                 AKLOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
    290                         mTransposedCount, mExcessiveCount, c);
    291             }
    292             return processUnrelatedCorrectionType();
    293         }
    294     }
    295 
    296     // TODO: Change the limit if we'll allow two or more proximity chars with corrections
    297     // Work around: When the mMaxErrors is 1, we only allow just one error
    298     // including proximity correction.
    299     const bool checkProximityChars = (mMaxErrors > 1)
    300             ? (noCorrectionsHappenedSoFar || mProximityCount == 0)
    301             : (noCorrectionsHappenedSoFar && mProximityCount == 0);
    302 
    303     ProximityType matchedProximityCharId = secondTransposing
    304             ? MATCH_CHAR
    305             : mProximityInfoState.getProximityType(
    306                     mInputIndex, c, checkProximityChars, &proximityIndex);
    307 
    308     if (SUBSTITUTION_CHAR == matchedProximityCharId
    309             || ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
    310         if (canTryCorrection && mOutputIndex > 0
    311                 && mCorrectionStates[mOutputIndex].mProximityMatching
    312                 && mCorrectionStates[mOutputIndex].mExceeding
    313                 && isEquivalentChar(mProximityInfoState.getProximityType(
    314                         mInputIndex, mWord[mOutputIndex - 1], false))) {
    315             if (DEBUG_CORRECTION
    316                     && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize)
    317                     && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
    318                             || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
    319                 AKLOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]);
    320             }
    321             // Conversion p->e
    322             // Example:
    323             // wearth ->    earth
    324             // px     -> (E)mmmmm
    325             ++mExcessiveCount;
    326             --mProximityCount;
    327             mExcessivePos = mOutputIndex - 1;
    328             ++mInputIndex;
    329             // Here, we are doing something equivalent to matchedProximityCharId,
    330             // but we already know that "excessive char correction" just happened
    331             // so that we just need to check "mProximityCount == 0".
    332             matchedProximityCharId = mProximityInfoState.getProximityType(
    333                     mInputIndex, c, mProximityCount == 0, &proximityIndex);
    334         }
    335     }
    336 
    337     if (SUBSTITUTION_CHAR == matchedProximityCharId
    338             || ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
    339         if (ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
    340             mAdditionalProximityMatching = true;
    341         }
    342         // TODO: Optimize
    343         // As the current char turned out to be an unrelated char,
    344         // we will try other correction-types. Please note that mCorrectionStates[mOutputIndex]
    345         // here refers to the previous state.
    346         if (mInputIndex < mInputSize - 1 && mOutputIndex > 0 && mTransposedCount > 0
    347                 && !mCorrectionStates[mOutputIndex].mTransposing
    348                 && mCorrectionStates[mOutputIndex - 1].mTransposing
    349                 && isEquivalentChar(mProximityInfoState.getProximityType(
    350                         mInputIndex, mWord[mOutputIndex - 1], false))
    351                 && isEquivalentChar(
    352                         mProximityInfoState.getProximityType(mInputIndex + 1, c, false))) {
    353             // Conversion t->e
    354             // Example:
    355             // occaisional -> occa   sional
    356             // mmmmttx     -> mmmm(E)mmmmmm
    357             mTransposedCount -= 2;
    358             ++mExcessiveCount;
    359             ++mInputIndex;
    360         } else if (mOutputIndex > 0 && mInputIndex > 0 && mTransposedCount > 0
    361                 && !mCorrectionStates[mOutputIndex].mTransposing
    362                 && mCorrectionStates[mOutputIndex - 1].mTransposing
    363                 && isEquivalentChar(
    364                         mProximityInfoState.getProximityType(mInputIndex - 1, c, false))) {
    365             // Conversion t->s
    366             // Example:
    367             // chcolate -> chocolate
    368             // mmttx    -> mmsmmmmmm
    369             mTransposedCount -= 2;
    370             ++mSkippedCount;
    371             --mInputIndex;
    372         } else if (canTryCorrection && mInputIndex > 0
    373                 && mCorrectionStates[mOutputIndex].mProximityMatching
    374                 && mCorrectionStates[mOutputIndex].mSkipping
    375                 && isEquivalentChar(
    376                         mProximityInfoState.getProximityType(mInputIndex - 1, c, false))) {
    377             // Conversion p->s
    378             // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
    379             // proximity chars of "s", but it should rather be handled as a skipped char.
    380             ++mSkippedCount;
    381             --mProximityCount;
    382             return processSkipChar(c, isTerminal, false);
    383         } else if (mInputIndex - 1 < mInputSize
    384                 && mSkippedCount > 0
    385                 && mCorrectionStates[mOutputIndex].mSkipping
    386                 && mCorrectionStates[mOutputIndex].mAdditionalProximityMatching
    387                 && isProximityCharOrEquivalentChar(
    388                         mProximityInfoState.getProximityType(mInputIndex + 1, c, false))) {
    389             // Conversion s->a
    390             incrementInputIndex();
    391             --mSkippedCount;
    392             mProximityMatching = true;
    393             ++mProximityCount;
    394             mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO;
    395         } else if ((mExceeding || mTransposing) && mInputIndex - 1 < mInputSize
    396                 && isEquivalentChar(
    397                         mProximityInfoState.getProximityType(mInputIndex + 1, c, false))) {
    398             // 1.2. Excessive or transpose correction
    399             if (mTransposing) {
    400                 ++mTransposedCount;
    401             } else {
    402                 ++mExcessiveCount;
    403                 incrementInputIndex();
    404             }
    405             if (DEBUG_CORRECTION
    406                     && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize)
    407                     && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
    408                             || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
    409                 DUMP_WORD(mWord, mOutputIndex);
    410                 if (mTransposing) {
    411                     AKLOGI("TRANSPOSE: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
    412                             mTransposedCount, mExcessiveCount, c);
    413                 } else {
    414                     AKLOGI("EXCEED: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
    415                             mTransposedCount, mExcessiveCount, c);
    416                 }
    417             }
    418         } else if (mSkipping) {
    419             // 3. Skip correction
    420             ++mSkippedCount;
    421             if (DEBUG_CORRECTION
    422                     && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize)
    423                     && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
    424                             || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
    425                 AKLOGI("SKIP: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
    426                         mTransposedCount, mExcessiveCount, c);
    427             }
    428             return processSkipChar(c, isTerminal, false);
    429         } else if (ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
    430             // As a last resort, use additional proximity characters
    431             mProximityMatching = true;
    432             ++mProximityCount;
    433             mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO;
    434             if (DEBUG_CORRECTION
    435                     && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize)
    436                     && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
    437                             || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
    438                 AKLOGI("ADDITIONALPROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
    439                         mTransposedCount, mExcessiveCount, c);
    440             }
    441         } else {
    442             if (DEBUG_CORRECTION
    443                     && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize)
    444                     && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
    445                             || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
    446                 DUMP_WORD(mWord, mOutputIndex);
    447                 AKLOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
    448                         mTransposedCount, mExcessiveCount, c);
    449             }
    450             return processUnrelatedCorrectionType();
    451         }
    452     } else if (secondTransposing) {
    453         // If inputIndex is greater than mInputSize, that means there is no
    454         // proximity chars. So, we don't need to check proximity.
    455         mMatching = true;
    456     } else if (isEquivalentChar(matchedProximityCharId)) {
    457         mMatching = true;
    458         ++mEquivalentCharCount;
    459         mDistances[mOutputIndex] = mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, 0);
    460     } else if (PROXIMITY_CHAR == matchedProximityCharId) {
    461         mProximityMatching = true;
    462         ++mProximityCount;
    463         mDistances[mOutputIndex] =
    464                 mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, proximityIndex);
    465         if (DEBUG_CORRECTION
    466                 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize)
    467                 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
    468                         || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
    469             AKLOGI("PROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
    470                     mTransposedCount, mExcessiveCount, c);
    471         }
    472     }
    473 
    474     addCharToCurrentWord(c);
    475 
    476     // 4. Last char excessive correction
    477     mLastCharExceeded = mExcessiveCount == 0 && mSkippedCount == 0 && mTransposedCount == 0
    478             && mProximityCount == 0 && (mInputIndex == mInputSize - 2);
    479     const bool isSameAsUserTypedLength = (mInputSize == mInputIndex + 1) || mLastCharExceeded;
    480     if (mLastCharExceeded) {
    481         ++mExcessiveCount;
    482     }
    483 
    484     // Start traversing all nodes after the index exceeds the user typed length
    485     if (isSameAsUserTypedLength) {
    486         startToTraverseAllNodes();
    487     }
    488 
    489     const bool needsToTryOnTerminalForTheLastPossibleExcessiveChar =
    490             mExceeding && mInputIndex == mInputSize - 2;
    491 
    492     // Finally, we are ready to go to the next character, the next "virtual node".
    493     // We should advance the input index.
    494     // We do this in this branch of the 'if traverseAllNodes' because we are still matching
    495     // characters to input; the other branch is not matching them but searching for
    496     // completions, this is why it does not have to do it.
    497     incrementInputIndex();
    498     // Also, the next char is one "virtual node" depth more than this char.
    499     incrementOutputIndex();
    500 
    501     if ((needsToTryOnTerminalForTheLastPossibleExcessiveChar
    502             || isSameAsUserTypedLength) && isTerminal) {
    503         mTerminalInputIndex = mInputIndex - 1;
    504         mTerminalOutputIndex = mOutputIndex - 1;
    505         if (DEBUG_CORRECTION
    506                 && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize)
    507                 && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
    508             DUMP_WORD(mWord, mOutputIndex);
    509             AKLOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
    510                     mTransposedCount, mExcessiveCount, c);
    511         }
    512         return ON_TERMINAL;
    513     } else {
    514         mTerminalInputIndex = mInputIndex - 1;
    515         mTerminalOutputIndex = mOutputIndex - 1;
    516         return NOT_ON_TERMINAL;
    517     }
    518 }
    519 
    520 inline static int getQuoteCount(const int *word, const int length) {
    521     int quoteCount = 0;
    522     for (int i = 0; i < length; ++i) {
    523         if (word[i] == KEYCODE_SINGLE_QUOTE) {
    524             ++quoteCount;
    525         }
    526     }
    527     return quoteCount;
    528 }
    529 
    530 inline static bool isUpperCase(unsigned short c) {
    531     return isAsciiUpper(toBaseCodePoint(c));
    532 }
    533 
    534 //////////////////////
    535 // RankingAlgorithm //
    536 //////////////////////
    537 
    538 /* static */ int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex,
    539         const int outputIndex, const int freq, int *editDistanceTable, const Correction *correction,
    540         const int inputSize) {
    541     const int excessivePos = correction->getExcessivePos();
    542     const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
    543     const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER;
    544     const ProximityInfoState *proximityInfoState = &correction->mProximityInfoState;
    545     const int skippedCount = correction->mSkippedCount;
    546     const int transposedCount = correction->mTransposedCount / 2;
    547     const int excessiveCount = correction->mExcessiveCount + correction->mTransposedCount % 2;
    548     const int proximityMatchedCount = correction->mProximityCount;
    549     const bool lastCharExceeded = correction->mLastCharExceeded;
    550     const bool useFullEditDistance = correction->mUseFullEditDistance;
    551     const int outputLength = outputIndex + 1;
    552     if (skippedCount >= inputSize || inputSize == 0) {
    553         return -1;
    554     }
    555 
    556     // TODO: find more robust way
    557     bool sameLength = lastCharExceeded ? (inputSize == inputIndex + 2)
    558             : (inputSize == inputIndex + 1);
    559 
    560     // TODO: use mExcessiveCount
    561     const int matchCount = inputSize - correction->mProximityCount - excessiveCount;
    562 
    563     const int *word = correction->mWord;
    564     const bool skipped = skippedCount > 0;
    565 
    566     const int quoteDiffCount = max(0, getQuoteCount(word, outputLength)
    567             - getQuoteCount(proximityInfoState->getPrimaryInputWord(), inputSize));
    568 
    569     // TODO: Calculate edit distance for transposed and excessive
    570     int ed = 0;
    571     if (DEBUG_DICT_FULL) {
    572         dumpEditDistance10ForDebug(editDistanceTable, correction->mInputSize, outputLength);
    573     }
    574     int adjustedProximityMatchedCount = proximityMatchedCount;
    575 
    576     int finalFreq = freq;
    577 
    578     if (DEBUG_CORRECTION_FREQ
    579             && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputSize)) {
    580         AKLOGI("FinalFreq0: %d", finalFreq);
    581     }
    582     // TODO: Optimize this.
    583     if (transposedCount > 0 || proximityMatchedCount > 0 || skipped || excessiveCount > 0) {
    584         ed = getCurrentEditDistance(editDistanceTable, correction->mInputSize, outputLength,
    585                 inputSize) - transposedCount;
    586 
    587         const int matchWeight = powerIntCapped(typedLetterMultiplier,
    588                 max(inputSize, outputLength) - ed);
    589         multiplyIntCapped(matchWeight, &finalFreq);
    590 
    591         // TODO: Demote further if there are two or more excessive chars with longer user input?
    592         if (inputSize > outputLength) {
    593             multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq);
    594         }
    595 
    596         ed = max(0, ed - quoteDiffCount);
    597         adjustedProximityMatchedCount = min(max(0, ed - (outputLength - inputSize)),
    598                 proximityMatchedCount);
    599         if (transposedCount <= 0) {
    600             if (ed == 1 && (inputSize == outputLength - 1 || inputSize == outputLength + 1)) {
    601                 // Promote a word with just one skipped or excessive char
    602                 if (sameLength) {
    603                     multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE
    604                             + WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_MULTIPLIER * outputLength,
    605                             &finalFreq);
    606                 } else {
    607                     multiplyIntCapped(typedLetterMultiplier, &finalFreq);
    608                 }
    609             } else if (ed == 0) {
    610                 multiplyIntCapped(typedLetterMultiplier, &finalFreq);
    611                 sameLength = true;
    612             }
    613         }
    614     } else {
    615         const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
    616         multiplyIntCapped(matchWeight, &finalFreq);
    617     }
    618 
    619     if (proximityInfoState->getProximityType(0, word[0], true) == SUBSTITUTION_CHAR) {
    620         multiplyRate(FIRST_CHAR_DIFFERENT_DEMOTION_RATE, &finalFreq);
    621     }
    622 
    623     ///////////////////////////////////////////////
    624     // Promotion and Demotion for each correction
    625 
    626     // Demotion for a word with missing character
    627     if (skipped) {
    628         const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
    629                 * (10 * inputSize - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
    630                 / (10 * inputSize
    631                         - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
    632         if (DEBUG_DICT_FULL) {
    633             AKLOGI("Demotion rate for missing character is %d.", demotionRate);
    634         }
    635         multiplyRate(demotionRate, &finalFreq);
    636     }
    637 
    638     // Demotion for a word with transposed character
    639     if (transposedCount > 0) multiplyRate(
    640             WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
    641 
    642     // Demotion for a word with excessive character
    643     if (excessiveCount > 0) {
    644         multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
    645         if (!lastCharExceeded && !proximityInfoState->existsAdjacentProximityChars(excessivePos)) {
    646             if (DEBUG_DICT_FULL) {
    647                 AKLOGI("Double excessive demotion");
    648             }
    649             // If an excessive character is not adjacent to the left char or the right char,
    650             // we will demote this word.
    651             multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
    652         }
    653     }
    654 
    655     int additionalProximityCount = 0;
    656     // Demote additional proximity characters
    657     for (int i = 0; i < outputLength; ++i) {
    658         const int squaredDistance = correction->mDistances[i];
    659         if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) {
    660             ++additionalProximityCount;
    661         }
    662     }
    663 
    664     const bool performTouchPositionCorrection =
    665             CALIBRATE_SCORE_BY_TOUCH_COORDINATES
    666                     && proximityInfoState->touchPositionCorrectionEnabled()
    667                     && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0
    668                     && additionalProximityCount == 0;
    669 
    670     // Score calibration by touch coordinates is being done only for pure-fat finger typing error
    671     // cases.
    672     // TODO: Remove this constraint.
    673     if (performTouchPositionCorrection) {
    674         for (int i = 0; i < outputLength; ++i) {
    675             const int squaredDistance = correction->mDistances[i];
    676             if (i < adjustedProximityMatchedCount) {
    677                 multiplyIntCapped(typedLetterMultiplier, &finalFreq);
    678             }
    679             const float factor =
    680                     SuggestUtils::getLengthScalingFactor(static_cast<float>(squaredDistance));
    681             if (factor > 0.0f) {
    682                 multiplyRate(static_cast<int>(factor * 100.0f), &finalFreq);
    683             } else if (squaredDistance == PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO) {
    684                 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
    685             }
    686         }
    687     } else {
    688         // Promotion for a word with proximity characters
    689         for (int i = 0; i < adjustedProximityMatchedCount; ++i) {
    690             // A word with proximity corrections
    691             if (DEBUG_DICT_FULL) {
    692                 AKLOGI("Found a proximity correction.");
    693             }
    694             multiplyIntCapped(typedLetterMultiplier, &finalFreq);
    695             if (i < additionalProximityCount) {
    696                 multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
    697             } else {
    698                 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
    699             }
    700         }
    701     }
    702 
    703     // If the user types too many(three or more) proximity characters with additional proximity
    704     // character,do not treat as the same length word.
    705     if (sameLength && additionalProximityCount > 0 && (adjustedProximityMatchedCount >= 3
    706             || transposedCount > 0 || skipped || excessiveCount > 0)) {
    707         sameLength = false;
    708     }
    709 
    710     const int errorCount = adjustedProximityMatchedCount > 0
    711             ? adjustedProximityMatchedCount
    712             : (proximityMatchedCount + transposedCount);
    713     multiplyRate(
    714             100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputSize, &finalFreq);
    715 
    716     // Promotion for an exactly matched word
    717     if (ed == 0) {
    718         // Full exact match
    719         if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0
    720                 && quoteDiffCount == 0 && additionalProximityCount == 0) {
    721             finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
    722         }
    723     }
    724 
    725     // Promote a word with no correction
    726     if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0
    727             && additionalProximityCount == 0) {
    728         multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
    729     }
    730 
    731     // TODO: Check excessive count and transposed count
    732     // TODO: Remove this if possible
    733     /*
    734          If the last character of the user input word is the same as the next character
    735          of the output word, and also all of characters of the user input are matched
    736          to the output word, we'll promote that word a bit because
    737          that word can be considered the combination of skipped and matched characters.
    738          This means that the 'sm' pattern wins over the 'ma' pattern.
    739          e.g.)
    740          shel -> shell [mmmma] or [mmmsm]
    741          hel -> hello [mmmaa] or [mmsma]
    742          m ... matching
    743          s ... skipping
    744          a ... traversing all
    745          t ... transposing
    746          e ... exceeding
    747          p ... proximity matching
    748      */
    749     if (matchCount == inputSize && matchCount >= 2 && !skipped
    750             && word[matchCount] == word[matchCount - 1]) {
    751         multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq);
    752     }
    753 
    754     // TODO: Do not use sameLength?
    755     if (sameLength) {
    756         multiplyIntCapped(fullWordMultiplier, &finalFreq);
    757     }
    758 
    759     if (useFullEditDistance && outputLength > inputSize + 1) {
    760         const int diff = outputLength - inputSize - 1;
    761         const int divider = diff < 31 ? 1 << diff : S_INT_MAX;
    762         finalFreq = divider > finalFreq ? 1 : finalFreq / divider;
    763     }
    764 
    765     if (DEBUG_DICT_FULL) {
    766         AKLOGI("calc: %d, %d", outputLength, sameLength);
    767     }
    768 
    769     if (DEBUG_CORRECTION_FREQ
    770             && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputSize)) {
    771         DUMP_WORD(correction->getPrimaryInputWord(), inputSize);
    772         DUMP_WORD(correction->mWord, outputLength);
    773         AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d, A%d] %d, %d, %d, %d, %d, %d", proximityMatchedCount,
    774                 skippedCount, transposedCount, excessiveCount, additionalProximityCount,
    775                 outputLength, lastCharExceeded, sameLength, quoteDiffCount, ed, finalFreq);
    776     }
    777 
    778     return finalFreq;
    779 }
    780 
    781 /* static */ int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(const int *freqArray,
    782         const int *wordLengthArray, const int wordCount, const Correction *correction,
    783         const bool isSpaceProximity, const int *word) {
    784     const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
    785 
    786     bool firstCapitalizedWordDemotion = false;
    787     bool secondCapitalizedWordDemotion = false;
    788 
    789     {
    790         // TODO: Handle multiple capitalized word demotion properly
    791         const int firstWordLength = wordLengthArray[0];
    792         const int secondWordLength = wordLengthArray[1];
    793         if (firstWordLength >= 2) {
    794             firstCapitalizedWordDemotion = isUpperCase(word[0]);
    795         }
    796 
    797         if (secondWordLength >= 2) {
    798             // FIXME: word[firstWordLength + 1] is incorrect.
    799             secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
    800         }
    801     }
    802 
    803 
    804     const bool capitalizedWordDemotion =
    805             firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
    806 
    807     int totalLength = 0;
    808     int totalFreq = 0;
    809     for (int i = 0; i < wordCount; ++i) {
    810         const int wordLength = wordLengthArray[i];
    811         if (wordLength <= 0) {
    812             return 0;
    813         }
    814         totalLength += wordLength;
    815         const int demotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (wordLength + 1);
    816         int tempFirstFreq = freqArray[i];
    817         multiplyRate(demotionRate, &tempFirstFreq);
    818         totalFreq += tempFirstFreq;
    819     }
    820 
    821     if (totalLength <= 0 || totalFreq <= 0) {
    822         return 0;
    823     }
    824 
    825     // TODO: Currently totalFreq is adjusted to two word metrix.
    826     // Promote pairFreq with multiplying by 2, because the word length is the same as the typed
    827     // length.
    828     totalFreq = totalFreq * 2 / wordCount;
    829     if (wordCount > 2) {
    830         // Safety net for 3+ words -- Caveats: many heuristics and workarounds here.
    831         int oneLengthCounter = 0;
    832         int twoLengthCounter = 0;
    833         for (int i = 0; i < wordCount; ++i) {
    834             const int wordLength = wordLengthArray[i];
    835             // TODO: Use bigram instead of this safety net
    836             if (i < wordCount - 1) {
    837                 const int nextWordLength = wordLengthArray[i + 1];
    838                 if (wordLength == 1 && nextWordLength == 2) {
    839                     // Safety net to filter 1 length and 2 length sequential words
    840                     return 0;
    841                 }
    842             }
    843             const int freq = freqArray[i];
    844             // Demote too short weak words
    845             if (wordLength <= 4 && freq <= SUPPRESS_SHORT_MULTIPLE_WORDS_THRESHOLD_FREQ) {
    846                 multiplyRate(100 * freq / MAX_PROBABILITY, &totalFreq);
    847             }
    848             if (wordLength == 1) {
    849                 ++oneLengthCounter;
    850             } else if (wordLength == 2) {
    851                 ++twoLengthCounter;
    852             }
    853             if (oneLengthCounter >= 2 || (oneLengthCounter + twoLengthCounter) >= 4) {
    854                 // Safety net to filter too many short words
    855                 return 0;
    856             }
    857         }
    858         multiplyRate(MULTIPLE_WORDS_DEMOTION_RATE, &totalFreq);
    859     }
    860 
    861     // This is a workaround to try offsetting the not-enough-demotion which will be done in
    862     // calcNormalizedScore in Utils.java.
    863     // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
    864     // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
    865     // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
    866     const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
    867     multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
    868 
    869     // At this moment, totalFreq is calculated by the following formula:
    870     // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
    871     //        * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
    872 
    873     multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
    874 
    875     // This is another workaround to offset the demotion which will be done in
    876     // calcNormalizedScore in Utils.java.
    877     // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
    878     // the same amount because we already have adjusted the synthetic freq of this "missing or
    879     // mistyped space" suggestion candidate above in this method.
    880     const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
    881     multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
    882 
    883     if (isSpaceProximity) {
    884         // A word pair with one space proximity correction
    885         if (DEBUG_DICT) {
    886             AKLOGI("Found a word pair with space proximity correction.");
    887         }
    888         multiplyIntCapped(typedLetterMultiplier, &totalFreq);
    889         multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
    890     }
    891 
    892     if (isSpaceProximity) {
    893         multiplyRate(WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE, &totalFreq);
    894     } else {
    895         multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
    896     }
    897 
    898     if (capitalizedWordDemotion) {
    899         multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
    900     }
    901 
    902     if (DEBUG_CORRECTION_FREQ) {
    903         AKLOGI("Multiple words (%d, %d) (%d, %d) %d, %d", freqArray[0], freqArray[1],
    904                 wordLengthArray[0], wordLengthArray[1], capitalizedWordDemotion, totalFreq);
    905         DUMP_WORD(word, wordLengthArray[0]);
    906     }
    907 
    908     return totalFreq;
    909 }
    910 
    911 /* static */ int Correction::RankingAlgorithm::editDistance(const int *before,
    912         const int beforeLength, const int *after, const int afterLength) {
    913     const DamerauLevenshteinEditDistancePolicy daemaruLevenshtein(
    914             before, beforeLength, after, afterLength);
    915     return static_cast<int>(EditDistance::getEditDistance(&daemaruLevenshtein));
    916 }
    917 
    918 
    919 // In dictionary.cpp, getSuggestion() method,
    920 // When USE_SUGGEST_INTERFACE_FOR_TYPING is true:
    921 //   SUGGEST_INTERFACE_OUTPUT_SCALE was multiplied to the original suggestion scores to convert
    922 //   them to integers.
    923 //     score = (int)((original score) * SUGGEST_INTERFACE_OUTPUT_SCALE)
    924 //   Undo the scaling here to recover the original score.
    925 //     normalizedScore = ((float)score) / SUGGEST_INTERFACE_OUTPUT_SCALE
    926 // Otherwise: suggestion scores are computed using the below formula.
    927 // original score
    928 //  := powf(mTypedLetterMultiplier (this is defined 2),
    929 //         (the number of matched characters between typed word and suggested word))
    930 //     * (individual word's score which defined in the unigram dictionary,
    931 //         and this score is defined in range [0, 255].)
    932 // Then, the following processing is applied.
    933 //     - If the dictionary word is matched up to the point of the user entry
    934 //       (full match up to min(before.length(), after.length())
    935 //       => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2)
    936 //     - If the word is a true full match except for differences in accents or
    937 //       capitalization, then treat it as if the score was 255.
    938 //     - If before.length() == after.length()
    939 //       => multiply by mFullWordMultiplier (this is defined 2))
    940 // So, maximum original score is powf(2, min(before.length(), after.length())) * 255 * 2 * 1.2
    941 // For historical reasons we ignore the 1.2 modifier (because the measure for a good
    942 // autocorrection threshold was done at a time when it didn't exist). This doesn't change
    943 // the result.
    944 // So, we can normalize original score by dividing powf(2, min(b.l(),a.l())) * 255 * 2.
    945 
    946 /* static */ float Correction::RankingAlgorithm::calcNormalizedScore(const int *before,
    947         const int beforeLength, const int *after, const int afterLength, const int score) {
    948     if (0 == beforeLength || 0 == afterLength) {
    949         return 0.0f;
    950     }
    951     const int distance = editDistance(before, beforeLength, after, afterLength);
    952     int spaceCount = 0;
    953     for (int i = 0; i < afterLength; ++i) {
    954         if (after[i] == KEYCODE_SPACE) {
    955             ++spaceCount;
    956         }
    957     }
    958 
    959     if (spaceCount == afterLength) {
    960         return 0.0f;
    961     }
    962 
    963     // add a weight based on edit distance.
    964     // distance <= max(afterLength, beforeLength) == afterLength,
    965     // so, 0 <= distance / afterLength <= 1
    966     const float weight = 1.0f - static_cast<float>(distance) / static_cast<float>(afterLength);
    967 
    968     if (USE_SUGGEST_INTERFACE_FOR_TYPING) {
    969         return (static_cast<float>(score) / SUGGEST_INTERFACE_OUTPUT_SCALE) * weight;
    970     }
    971     const float maxScore = score >= S_INT_MAX ? static_cast<float>(S_INT_MAX)
    972             : static_cast<float>(MAX_INITIAL_SCORE)
    973                     * powf(static_cast<float>(TYPED_LETTER_MULTIPLIER),
    974                             static_cast<float>(min(beforeLength, afterLength - spaceCount)))
    975                     * static_cast<float>(FULL_WORD_MULTIPLIER);
    976 
    977     return (static_cast<float>(score) / maxScore) * weight;
    978 }
    979 } // namespace latinime
    980