Home | History | Annotate | Download | only in result
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "suggest/core/result/suggestions_output_utils.h"
     18 
     19 #include <algorithm>
     20 #include <vector>
     21 
     22 #include "dictionary/utils/binary_dictionary_shortcut_iterator.h"
     23 #include "suggest/core/dicnode/dic_node.h"
     24 #include "suggest/core/dicnode/dic_node_utils.h"
     25 #include "suggest/core/dictionary/error_type_utils.h"
     26 #include "suggest/core/policy/scoring.h"
     27 #include "suggest/core/result/suggestion_results.h"
     28 #include "suggest/core/session/dic_traverse_session.h"
     29 #include "suggest/core/suggest_options.h"
     30 
     31 namespace latinime {
     32 
     33 const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
     34 
     35 /* static */ void SuggestionsOutputUtils::outputSuggestions(
     36         const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
     37         const float weightOfLangModelVsSpatialModel,
     38         SuggestionResults *const outSuggestionResults) {
     39 #if DEBUG_EVALUATE_MOST_PROBABLE_STRING
     40     const int terminalSize = 0;
     41 #else
     42     const int terminalSize = traverseSession->getDicTraverseCache()->terminalSize();
     43 #endif
     44     std::vector<DicNode> terminals(terminalSize);
     45     for (int index = terminalSize - 1; index >= 0; --index) {
     46         traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]);
     47     }
     48     // Compute a weight of language model when an invalid weight is passed.
     49     // NOT_A_WEIGHT_OF_LANG_MODEL_VS_SPATIAL_MODEL (-1) is taken as an invalid value.
     50     const float weightOfLangModelVsSpatialModelToOutputSuggestions =
     51             (weightOfLangModelVsSpatialModel < 0.0f)
     52             ? scoringPolicy->getAdjustedWeightOfLangModelVsSpatialModel(traverseSession,
     53                     terminals.data(), terminalSize)
     54             : weightOfLangModelVsSpatialModel;
     55     outSuggestionResults->setWeightOfLangModelVsSpatialModel(
     56             weightOfLangModelVsSpatialModelToOutputSuggestions);
     57     // Force autocorrection for obvious long multi-word suggestions when the top suggestion is
     58     // a long multiple words suggestion.
     59     // TODO: Implement a smarter auto-commit method for handling multi-word suggestions.
     60     const bool forceCommitMultiWords = scoringPolicy->autoCorrectsToMultiWordSuggestionIfTop()
     61             && (traverseSession->getInputSize() >= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT
     62                     && !terminals.empty() && terminals.front().hasMultipleWords());
     63     // TODO: have partial commit work even with multiple pointers.
     64     const bool outputSecondWordFirstLetterInputIndex =
     65             traverseSession->isOnlyOnePointerUsed(0 /* pointerId */);
     66     const bool boostExactMatches = traverseSession->getDictionaryStructurePolicy()->
     67             getHeaderStructurePolicy()->shouldBoostExactMatches();
     68 
     69     // Output suggestion results here
     70     for (auto &terminalDicNode : terminals) {
     71         outputSuggestionsOfDicNode(scoringPolicy, traverseSession, &terminalDicNode,
     72                 weightOfLangModelVsSpatialModelToOutputSuggestions, boostExactMatches,
     73                 forceCommitMultiWords, outputSecondWordFirstLetterInputIndex, outSuggestionResults);
     74     }
     75     scoringPolicy->getMostProbableString(traverseSession,
     76             weightOfLangModelVsSpatialModelToOutputSuggestions, outSuggestionResults);
     77 }
     78 
     79 /* static */ bool SuggestionsOutputUtils::shouldBlockWord(
     80         const SuggestOptions *const suggestOptions, const DicNode *const terminalDicNode,
     81         const WordAttributes wordAttributes, const bool isLastWord) {
     82     const bool currentWordExactMatch =
     83             ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
     84     // When we have to block offensive words, non-exact matched offensive words should not be
     85     // output.
     86     const bool shouldBlockOffensiveWords = suggestOptions->blockOffensiveWords();
     87 
     88     const bool isBlockedOffensiveWord = shouldBlockOffensiveWords &&
     89             wordAttributes.isPossiblyOffensive();
     90 
     91     // This function is called in two situations:
     92     //
     93     // 1) At the end of a search, in which case terminalDicNode will point to the last DicNode
     94     //    of the search, and isLastWord will be true.
     95     //                    "fuck"
     96     //                        |
     97     //                        \ terminalDicNode (isLastWord=true, currentWordExactMatch=true)
     98     //    In this case, if the current word is an exact match, we will always let the word
     99     //    through, even if the user is blocking offensive words (it's exactly what they typed!)
    100     //
    101     // 2) In the middle of the search, when we hit a terminal node, to decide whether or not
    102     //    to start a new search at root, to try to match the rest of the input. In this case,
    103     //    terminalDicNode will point to the terminal node we just hit, and isLastWord will be
    104     //    false.
    105     //                    "fuckvthis"
    106     //                        |
    107     //                        \ terminalDicNode (isLastWord=false, currentWordExactMatch=true)
    108     //
    109     // In this case, we should NOT allow the match through (correcting "fuckthis" to "fuck this"
    110     // when offensive words are blocked would be a bad idea).
    111     //
    112     // In the case of a multi-word correction where the offensive word is typed last (eg.
    113     // for the input "allfuck"), this function will be called with isLastWord==true, but
    114     // currentWordExactMatch==false. So we are OK in this case as well.
    115     //                    "allfuck"
    116     //                           |
    117     //                           \ terminalDicNode (isLastWord=true, currentWordExactMatch=false)
    118     if (isLastWord && currentWordExactMatch) {
    119         return false;
    120     } else {
    121         return isBlockedOffensiveWord;
    122     }
    123 }
    124 
    125 /* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode(
    126         const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
    127         const DicNode *const terminalDicNode, const float weightOfLangModelVsSpatialModel,
    128         const bool boostExactMatches, const bool forceCommitMultiWords,
    129         const bool outputSecondWordFirstLetterInputIndex,
    130         SuggestionResults *const outSuggestionResults) {
    131     if (DEBUG_GEO_FULL) {
    132         terminalDicNode->dump("OUT:");
    133     }
    134     const float doubleLetterCost =
    135             scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode);
    136     const float compoundDistance =
    137             terminalDicNode->getCompoundDistance(weightOfLangModelVsSpatialModel)
    138                     + doubleLetterCost;
    139     const WordAttributes wordAttributes = traverseSession->getDictionaryStructurePolicy()
    140             ->getWordAttributesInContext(terminalDicNode->getPrevWordIds(),
    141                     terminalDicNode->getWordId(), nullptr /* multiBigramMap */);
    142     const bool isExactMatch =
    143             ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
    144     const bool isExactMatchWithIntentionalOmission =
    145             ErrorTypeUtils::isExactMatchWithIntentionalOmission(
    146                     terminalDicNode->getContainedErrorTypes());
    147     // TODO: Decide whether the word should be auto-corrected or not here.
    148     const bool isAppropriateForAutoCorrection = !ErrorTypeUtils::isMissingExplicitAccent(
    149             terminalDicNode->getContainedErrorTypes());
    150     const int outputTypeFlags =
    151             (wordAttributes.isPossiblyOffensive() ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
    152             | ((isExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
    153             | (isExactMatchWithIntentionalOmission ?
    154                     Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0)
    155             | (isAppropriateForAutoCorrection ?
    156                     Dictionary::KIND_FLAG_APPROPRIATE_FOR_AUTOCORRECTION : 0);
    157     // Entries that are blacklisted or do not represent a word should not be output.
    158     const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord());
    159 
    160     const bool shouldBlockThisWord = shouldBlockWord(traverseSession->getSuggestOptions(),
    161             terminalDicNode, wordAttributes, true /* isLastWord */);
    162 
    163     // Increase output score of top typing suggestion to ensure autocorrection.
    164     // TODO: Better integration with java side autocorrection logic.
    165     const int finalScore = scoringPolicy->calculateFinalScore(
    166             compoundDistance, traverseSession->getInputSize(),
    167             terminalDicNode->getContainedErrorTypes(),
    168             (forceCommitMultiWords && terminalDicNode->hasMultipleWords()),
    169             boostExactMatches, wordAttributes.getProbability() == 0);
    170 
    171     // Don't output invalid or blocked offensive words. However, we still need to submit their
    172     // shortcuts if any.
    173     if (isValidWord && !shouldBlockThisWord) {
    174         int codePoints[MAX_WORD_LENGTH];
    175         terminalDicNode->outputResult(codePoints);
    176         const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ?
    177                 terminalDicNode->getSecondWordFirstInputIndex(
    178                         traverseSession->getProximityInfoState(0)) :
    179                 NOT_AN_INDEX;
    180         outSuggestionResults->addSuggestion(codePoints,
    181                 terminalDicNode->getTotalNodeCodePointCount(),
    182                 finalScore, Dictionary::KIND_CORRECTION | outputTypeFlags,
    183                 indexToPartialCommit, computeFirstWordConfidence(terminalDicNode));
    184     }
    185 
    186     // Output shortcuts.
    187     // Shortcut is not supported for multiple words suggestions.
    188     // TODO: Check shortcuts during traversal for multiple words suggestions.
    189     if (!terminalDicNode->hasMultipleWords()) {
    190         BinaryDictionaryShortcutIterator shortcutIt =
    191                 traverseSession->getDictionaryStructurePolicy()->getShortcutIterator(
    192                         terminalDicNode->getWordId());
    193         const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode);
    194         outputShortcuts(&shortcutIt, finalScore, sameAsTyped, outSuggestionResults);
    195     }
    196 }
    197 
    198 /* static */ int SuggestionsOutputUtils::computeFirstWordConfidence(
    199         const DicNode *const terminalDicNode) {
    200     // Get the number of spaces in the first suggestion
    201     const int spaceCount = terminalDicNode->getTotalNodeSpaceCount();
    202     // Get the number of characters in the first suggestion
    203     const int length = terminalDicNode->getTotalNodeCodePointCount();
    204     // Get the distance for the first word of the suggestion
    205     const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord();
    206 
    207     // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000.
    208     // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or
    209     // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means
    210     // we are very confident.
    211     // Expected space count is 1 ~ 5
    212     static const int MIN_EXPECTED_SPACE_COUNT = 1;
    213     static const int MAX_EXPECTED_SPACE_COUNT = 5;
    214     // Expected length is about 4 ~ 30
    215     static const int MIN_EXPECTED_LENGTH = 4;
    216     static const int MAX_EXPECTED_LENGTH = 30;
    217     // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0
    218     static const float MIN_EXPECTED_DISTANCE = 0.0;
    219     static const float MAX_EXPECTED_DISTANCE = 2.0;
    220     // This is not strict: it's where most stuff will be falling, but it's still fine if it's
    221     // outside these values. We want to output a value that reflects all of these. Each factor
    222     // contributes a bit.
    223 
    224     // We need at least a space.
    225     if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE;
    226 
    227     // The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0
    228     // contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the
    229     // weight of the distance. Clamp to avoid overflows.
    230     const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE
    231             : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance;
    232     const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT
    233             * (MAX_EXPECTED_DISTANCE - clampedDistance)
    234             / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE);
    235     // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no
    236     // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the
    237     // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp.
    238     const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT
    239             * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH);
    240     // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no
    241     // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the
    242     // weight of the space count.
    243     const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT
    244             * (spaceCount - MIN_EXPECTED_SPACE_COUNT)
    245             / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT);
    246 
    247     return distanceContribution + lengthContribution + spaceContribution;
    248 }
    249 
    250 /* static */ void SuggestionsOutputUtils::outputShortcuts(
    251         BinaryDictionaryShortcutIterator *const shortcutIt, const int finalScore,
    252         const bool sameAsTyped, SuggestionResults *const outSuggestionResults) {
    253     int shortcutTarget[MAX_WORD_LENGTH];
    254     while (shortcutIt->hasNextShortcutTarget()) {
    255         bool isWhilelist;
    256         int shortcutTargetStringLength;
    257         shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget,
    258                 &shortcutTargetStringLength, &isWhilelist);
    259         int shortcutScore;
    260         int kind;
    261         if (isWhilelist && sameAsTyped) {
    262             shortcutScore = S_INT_MAX;
    263             kind = Dictionary::KIND_WHITELIST;
    264         } else {
    265             // shortcut entry's score == its base entry's score - 1
    266             shortcutScore = finalScore;
    267             // Protection against int underflow
    268             shortcutScore = std::max(S_INT_MIN + 1, shortcutScore) - 1;
    269             kind = Dictionary::KIND_SHORTCUT;
    270         }
    271         outSuggestionResults->addSuggestion(shortcutTarget, shortcutTargetStringLength,
    272                 std::max(S_INT_MIN + 1, shortcutScore) - 1, kind, NOT_AN_INDEX,
    273                 NOT_A_FIRST_WORD_CONFIDENCE);
    274     }
    275 }
    276 } // namespace latinime
    277