Home | History | Annotate | Download | only in result
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "suggest/core/result/suggestions_output_utils.h"
     18 
     19 #include <algorithm>
     20 #include <vector>
     21 
     22 #include "suggest/core/dicnode/dic_node.h"
     23 #include "suggest/core/dicnode/dic_node_utils.h"
     24 #include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h"
     25 #include "suggest/core/dictionary/error_type_utils.h"
     26 #include "suggest/core/policy/scoring.h"
     27 #include "suggest/core/result/suggestion_results.h"
     28 #include "suggest/core/session/dic_traverse_session.h"
     29 #include "suggest/core/suggest_options.h"
     30 
     31 namespace latinime {
     32 
     33 const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
     34 
     35 /* static */ void SuggestionsOutputUtils::outputSuggestions(
     36         const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
     37         const float languageWeight, SuggestionResults *const outSuggestionResults) {
     38 #if DEBUG_EVALUATE_MOST_PROBABLE_STRING
     39     const int terminalSize = 0;
     40 #else
     41     const int terminalSize = traverseSession->getDicTraverseCache()->terminalSize();
     42 #endif
     43     std::vector<DicNode> terminals(terminalSize);
     44     for (int index = terminalSize - 1; index >= 0; --index) {
     45         traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]);
     46     }
     47     // Compute a language weight when an invalid language weight is passed.
     48     // NOT_A_LANGUAGE_WEIGHT (-1) is assumed as an invalid language weight.
     49     const float languageWeightToOutputSuggestions = (languageWeight < 0.0f) ?
     50             scoringPolicy->getAdjustedLanguageWeight(
     51                     traverseSession, terminals.data(), terminalSize) : languageWeight;
     52     outSuggestionResults->setLanguageWeight(languageWeightToOutputSuggestions);
     53     // Force autocorrection for obvious long multi-word suggestions when the top suggestion is
     54     // a long multiple words suggestion.
     55     // TODO: Implement a smarter auto-commit method for handling multi-word suggestions.
     56     const bool forceCommitMultiWords = scoringPolicy->autoCorrectsToMultiWordSuggestionIfTop()
     57             && (traverseSession->getInputSize() >= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT
     58                     && !terminals.empty() && terminals.front().hasMultipleWords());
     59     // TODO: have partial commit work even with multiple pointers.
     60     const bool outputSecondWordFirstLetterInputIndex =
     61             traverseSession->isOnlyOnePointerUsed(0 /* pointerId */);
     62     const bool boostExactMatches = traverseSession->getDictionaryStructurePolicy()->
     63             getHeaderStructurePolicy()->shouldBoostExactMatches();
     64 
     65     // Output suggestion results here
     66     for (auto &terminalDicNode : terminals) {
     67         outputSuggestionsOfDicNode(scoringPolicy, traverseSession, &terminalDicNode,
     68                 languageWeightToOutputSuggestions, boostExactMatches, forceCommitMultiWords,
     69                 outputSecondWordFirstLetterInputIndex, outSuggestionResults);
     70     }
     71     scoringPolicy->getMostProbableString(traverseSession, languageWeightToOutputSuggestions,
     72             outSuggestionResults);
     73 }
     74 
     75 /* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode(
     76         const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
     77         const DicNode *const terminalDicNode, const float languageWeight,
     78         const bool boostExactMatches, const bool forceCommitMultiWords,
     79         const bool outputSecondWordFirstLetterInputIndex,
     80         SuggestionResults *const outSuggestionResults) {
     81     if (DEBUG_GEO_FULL) {
     82         terminalDicNode->dump("OUT:");
     83     }
     84     const float doubleLetterCost =
     85             scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode);
     86     const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight)
     87             + doubleLetterCost;
     88     const bool isPossiblyOffensiveWord =
     89             traverseSession->getDictionaryStructurePolicy()->getProbability(
     90                     terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0;
     91     const bool isExactMatch =
     92             ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
     93     const bool isExactMatchWithIntentionalOmission =
     94             ErrorTypeUtils::isExactMatchWithIntentionalOmission(
     95                     terminalDicNode->getContainedErrorTypes());
     96     const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
     97     // Heuristic: We exclude probability=0 first-char-uppercase words from exact match.
     98     // (e.g. "AMD" and "and")
     99     const bool isSafeExactMatch = isExactMatch
    100             && !(isPossiblyOffensiveWord && isFirstCharUppercase);
    101     const int outputTypeFlags =
    102             (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
    103             | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
    104             | (isExactMatchWithIntentionalOmission ?
    105                     Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0);
    106 
    107     // Entries that are blacklisted or do not represent a word should not be output.
    108     const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord();
    109     // When we have to block offensive words, non-exact matched offensive words should not be
    110     // output.
    111     const bool blockOffensiveWords = traverseSession->getSuggestOptions()->blockOffensiveWords();
    112     const bool isBlockedOffensiveWord = blockOffensiveWords && isPossiblyOffensiveWord
    113             && !isSafeExactMatch;
    114 
    115     // Increase output score of top typing suggestion to ensure autocorrection.
    116     // TODO: Better integration with java side autocorrection logic.
    117     const int finalScore = scoringPolicy->calculateFinalScore(
    118             compoundDistance, traverseSession->getInputSize(),
    119             terminalDicNode->getContainedErrorTypes(),
    120             (forceCommitMultiWords && terminalDicNode->hasMultipleWords()),
    121             boostExactMatches);
    122 
    123     // Don't output invalid or blocked offensive words. However, we still need to submit their
    124     // shortcuts if any.
    125     if (isValidWord && !isBlockedOffensiveWord) {
    126         int codePoints[MAX_WORD_LENGTH];
    127         terminalDicNode->outputResult(codePoints);
    128         const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ?
    129                 terminalDicNode->getSecondWordFirstInputIndex(
    130                         traverseSession->getProximityInfoState(0)) :
    131                 NOT_AN_INDEX;
    132         outSuggestionResults->addSuggestion(codePoints,
    133                 terminalDicNode->getTotalNodeCodePointCount(),
    134                 finalScore, Dictionary::KIND_CORRECTION | outputTypeFlags,
    135                 indexToPartialCommit, computeFirstWordConfidence(terminalDicNode));
    136     }
    137 
    138     // Output shortcuts.
    139     // Shortcut is not supported for multiple words suggestions.
    140     // TODO: Check shortcuts during traversal for multiple words suggestions.
    141     if (!terminalDicNode->hasMultipleWords()) {
    142         BinaryDictionaryShortcutIterator shortcutIt(
    143                 traverseSession->getDictionaryStructurePolicy()->getShortcutsStructurePolicy(),
    144                 traverseSession->getDictionaryStructurePolicy()
    145                         ->getShortcutPositionOfPtNode(terminalDicNode->getPtNodePos()));
    146         const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode);
    147         outputShortcuts(&shortcutIt, finalScore, sameAsTyped, outSuggestionResults);
    148     }
    149 }
    150 
    151 /* static */ int SuggestionsOutputUtils::computeFirstWordConfidence(
    152         const DicNode *const terminalDicNode) {
    153     // Get the number of spaces in the first suggestion
    154     const int spaceCount = terminalDicNode->getTotalNodeSpaceCount();
    155     // Get the number of characters in the first suggestion
    156     const int length = terminalDicNode->getTotalNodeCodePointCount();
    157     // Get the distance for the first word of the suggestion
    158     const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord();
    159 
    160     // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000.
    161     // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or
    162     // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means
    163     // we are very confident.
    164     // Expected space count is 1 ~ 5
    165     static const int MIN_EXPECTED_SPACE_COUNT = 1;
    166     static const int MAX_EXPECTED_SPACE_COUNT = 5;
    167     // Expected length is about 4 ~ 30
    168     static const int MIN_EXPECTED_LENGTH = 4;
    169     static const int MAX_EXPECTED_LENGTH = 30;
    170     // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0
    171     static const float MIN_EXPECTED_DISTANCE = 0.0;
    172     static const float MAX_EXPECTED_DISTANCE = 2.0;
    173     // This is not strict: it's where most stuff will be falling, but it's still fine if it's
    174     // outside these values. We want to output a value that reflects all of these. Each factor
    175     // contributes a bit.
    176 
    177     // We need at least a space.
    178     if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE;
    179 
    180     // The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0
    181     // contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the
    182     // weight of the distance. Clamp to avoid overflows.
    183     const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE
    184             : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance;
    185     const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT
    186             * (MAX_EXPECTED_DISTANCE - clampedDistance)
    187             / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE);
    188     // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no
    189     // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the
    190     // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp.
    191     const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT
    192             * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH);
    193     // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no
    194     // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the
    195     // weight of the space count.
    196     const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT
    197             * (spaceCount - MIN_EXPECTED_SPACE_COUNT)
    198             / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT);
    199 
    200     return distanceContribution + lengthContribution + spaceContribution;
    201 }
    202 
    203 /* static */ void SuggestionsOutputUtils::outputShortcuts(
    204         BinaryDictionaryShortcutIterator *const shortcutIt, const int finalScore,
    205         const bool sameAsTyped, SuggestionResults *const outSuggestionResults) {
    206     int shortcutTarget[MAX_WORD_LENGTH];
    207     while (shortcutIt->hasNextShortcutTarget()) {
    208         bool isWhilelist;
    209         int shortcutTargetStringLength;
    210         shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget,
    211                 &shortcutTargetStringLength, &isWhilelist);
    212         int shortcutScore;
    213         int kind;
    214         if (isWhilelist && sameAsTyped) {
    215             shortcutScore = S_INT_MAX;
    216             kind = Dictionary::KIND_WHITELIST;
    217         } else {
    218             // shortcut entry's score == its base entry's score - 1
    219             shortcutScore = finalScore;
    220             // Protection against int underflow
    221             shortcutScore = std::max(S_INT_MIN + 1, shortcutScore) - 1;
    222             kind = Dictionary::KIND_SHORTCUT;
    223         }
    224         outSuggestionResults->addSuggestion(shortcutTarget, shortcutTargetStringLength,
    225                 std::max(S_INT_MIN + 1, shortcutScore) - 1, kind, NOT_AN_INDEX,
    226                 NOT_A_FIRST_WORD_CONFIDENCE);
    227     }
    228 }
    229 } // namespace latinime
    230