1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "suggest/core/result/suggestions_output_utils.h" 18 19 #include <algorithm> 20 #include <vector> 21 22 #include "dictionary/utils/binary_dictionary_shortcut_iterator.h" 23 #include "suggest/core/dicnode/dic_node.h" 24 #include "suggest/core/dicnode/dic_node_utils.h" 25 #include "suggest/core/dictionary/error_type_utils.h" 26 #include "suggest/core/policy/scoring.h" 27 #include "suggest/core/result/suggestion_results.h" 28 #include "suggest/core/session/dic_traverse_session.h" 29 #include "suggest/core/suggest_options.h" 30 31 namespace latinime { 32 33 const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; 34 35 /* static */ void SuggestionsOutputUtils::outputSuggestions( 36 const Scoring *const scoringPolicy, DicTraverseSession *traverseSession, 37 const float weightOfLangModelVsSpatialModel, 38 SuggestionResults *const outSuggestionResults) { 39 #if DEBUG_EVALUATE_MOST_PROBABLE_STRING 40 const int terminalSize = 0; 41 #else 42 const int terminalSize = traverseSession->getDicTraverseCache()->terminalSize(); 43 #endif 44 std::vector<DicNode> terminals(terminalSize); 45 for (int index = terminalSize - 1; index >= 0; --index) { 46 traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]); 47 } 48 // Compute a weight of language model when an invalid weight is passed. 49 // NOT_A_WEIGHT_OF_LANG_MODEL_VS_SPATIAL_MODEL (-1) is taken as an invalid value. 50 const float weightOfLangModelVsSpatialModelToOutputSuggestions = 51 (weightOfLangModelVsSpatialModel < 0.0f) 52 ? scoringPolicy->getAdjustedWeightOfLangModelVsSpatialModel(traverseSession, 53 terminals.data(), terminalSize) 54 : weightOfLangModelVsSpatialModel; 55 outSuggestionResults->setWeightOfLangModelVsSpatialModel( 56 weightOfLangModelVsSpatialModelToOutputSuggestions); 57 // Force autocorrection for obvious long multi-word suggestions when the top suggestion is 58 // a long multiple words suggestion. 59 // TODO: Implement a smarter auto-commit method for handling multi-word suggestions. 60 const bool forceCommitMultiWords = scoringPolicy->autoCorrectsToMultiWordSuggestionIfTop() 61 && (traverseSession->getInputSize() >= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT 62 && !terminals.empty() && terminals.front().hasMultipleWords()); 63 // TODO: have partial commit work even with multiple pointers. 64 const bool outputSecondWordFirstLetterInputIndex = 65 traverseSession->isOnlyOnePointerUsed(0 /* pointerId */); 66 const bool boostExactMatches = traverseSession->getDictionaryStructurePolicy()-> 67 getHeaderStructurePolicy()->shouldBoostExactMatches(); 68 69 // Output suggestion results here 70 for (auto &terminalDicNode : terminals) { 71 outputSuggestionsOfDicNode(scoringPolicy, traverseSession, &terminalDicNode, 72 weightOfLangModelVsSpatialModelToOutputSuggestions, boostExactMatches, 73 forceCommitMultiWords, outputSecondWordFirstLetterInputIndex, outSuggestionResults); 74 } 75 scoringPolicy->getMostProbableString(traverseSession, 76 weightOfLangModelVsSpatialModelToOutputSuggestions, outSuggestionResults); 77 } 78 79 /* static */ bool SuggestionsOutputUtils::shouldBlockWord( 80 const SuggestOptions *const suggestOptions, const DicNode *const terminalDicNode, 81 const WordAttributes wordAttributes, const bool isLastWord) { 82 const bool currentWordExactMatch = 83 ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes()); 84 // When we have to block offensive words, non-exact matched offensive words should not be 85 // output. 86 const bool shouldBlockOffensiveWords = suggestOptions->blockOffensiveWords(); 87 88 const bool isBlockedOffensiveWord = shouldBlockOffensiveWords && 89 wordAttributes.isPossiblyOffensive(); 90 91 // This function is called in two situations: 92 // 93 // 1) At the end of a search, in which case terminalDicNode will point to the last DicNode 94 // of the search, and isLastWord will be true. 95 // "fuck" 96 // | 97 // \ terminalDicNode (isLastWord=true, currentWordExactMatch=true) 98 // In this case, if the current word is an exact match, we will always let the word 99 // through, even if the user is blocking offensive words (it's exactly what they typed!) 100 // 101 // 2) In the middle of the search, when we hit a terminal node, to decide whether or not 102 // to start a new search at root, to try to match the rest of the input. In this case, 103 // terminalDicNode will point to the terminal node we just hit, and isLastWord will be 104 // false. 105 // "fuckvthis" 106 // | 107 // \ terminalDicNode (isLastWord=false, currentWordExactMatch=true) 108 // 109 // In this case, we should NOT allow the match through (correcting "fuckthis" to "fuck this" 110 // when offensive words are blocked would be a bad idea). 111 // 112 // In the case of a multi-word correction where the offensive word is typed last (eg. 113 // for the input "allfuck"), this function will be called with isLastWord==true, but 114 // currentWordExactMatch==false. So we are OK in this case as well. 115 // "allfuck" 116 // | 117 // \ terminalDicNode (isLastWord=true, currentWordExactMatch=false) 118 if (isLastWord && currentWordExactMatch) { 119 return false; 120 } else { 121 return isBlockedOffensiveWord; 122 } 123 } 124 125 /* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode( 126 const Scoring *const scoringPolicy, DicTraverseSession *traverseSession, 127 const DicNode *const terminalDicNode, const float weightOfLangModelVsSpatialModel, 128 const bool boostExactMatches, const bool forceCommitMultiWords, 129 const bool outputSecondWordFirstLetterInputIndex, 130 SuggestionResults *const outSuggestionResults) { 131 if (DEBUG_GEO_FULL) { 132 terminalDicNode->dump("OUT:"); 133 } 134 const float doubleLetterCost = 135 scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode); 136 const float compoundDistance = 137 terminalDicNode->getCompoundDistance(weightOfLangModelVsSpatialModel) 138 + doubleLetterCost; 139 const WordAttributes wordAttributes = traverseSession->getDictionaryStructurePolicy() 140 ->getWordAttributesInContext(terminalDicNode->getPrevWordIds(), 141 terminalDicNode->getWordId(), nullptr /* multiBigramMap */); 142 const bool isExactMatch = 143 ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes()); 144 const bool isExactMatchWithIntentionalOmission = 145 ErrorTypeUtils::isExactMatchWithIntentionalOmission( 146 terminalDicNode->getContainedErrorTypes()); 147 // TODO: Decide whether the word should be auto-corrected or not here. 148 const bool isAppropriateForAutoCorrection = !ErrorTypeUtils::isMissingExplicitAccent( 149 terminalDicNode->getContainedErrorTypes()); 150 const int outputTypeFlags = 151 (wordAttributes.isPossiblyOffensive() ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) 152 | ((isExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0) 153 | (isExactMatchWithIntentionalOmission ? 154 Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0) 155 | (isAppropriateForAutoCorrection ? 156 Dictionary::KIND_FLAG_APPROPRIATE_FOR_AUTOCORRECTION : 0); 157 // Entries that are blacklisted or do not represent a word should not be output. 158 const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()); 159 160 const bool shouldBlockThisWord = shouldBlockWord(traverseSession->getSuggestOptions(), 161 terminalDicNode, wordAttributes, true /* isLastWord */); 162 163 // Increase output score of top typing suggestion to ensure autocorrection. 164 // TODO: Better integration with java side autocorrection logic. 165 const int finalScore = scoringPolicy->calculateFinalScore( 166 compoundDistance, traverseSession->getInputSize(), 167 terminalDicNode->getContainedErrorTypes(), 168 (forceCommitMultiWords && terminalDicNode->hasMultipleWords()), 169 boostExactMatches, wordAttributes.getProbability() == 0); 170 171 // Don't output invalid or blocked offensive words. However, we still need to submit their 172 // shortcuts if any. 173 if (isValidWord && !shouldBlockThisWord) { 174 int codePoints[MAX_WORD_LENGTH]; 175 terminalDicNode->outputResult(codePoints); 176 const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ? 177 terminalDicNode->getSecondWordFirstInputIndex( 178 traverseSession->getProximityInfoState(0)) : 179 NOT_AN_INDEX; 180 outSuggestionResults->addSuggestion(codePoints, 181 terminalDicNode->getTotalNodeCodePointCount(), 182 finalScore, Dictionary::KIND_CORRECTION | outputTypeFlags, 183 indexToPartialCommit, computeFirstWordConfidence(terminalDicNode)); 184 } 185 186 // Output shortcuts. 187 // Shortcut is not supported for multiple words suggestions. 188 // TODO: Check shortcuts during traversal for multiple words suggestions. 189 if (!terminalDicNode->hasMultipleWords()) { 190 BinaryDictionaryShortcutIterator shortcutIt = 191 traverseSession->getDictionaryStructurePolicy()->getShortcutIterator( 192 terminalDicNode->getWordId()); 193 const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode); 194 outputShortcuts(&shortcutIt, finalScore, sameAsTyped, outSuggestionResults); 195 } 196 } 197 198 /* static */ int SuggestionsOutputUtils::computeFirstWordConfidence( 199 const DicNode *const terminalDicNode) { 200 // Get the number of spaces in the first suggestion 201 const int spaceCount = terminalDicNode->getTotalNodeSpaceCount(); 202 // Get the number of characters in the first suggestion 203 const int length = terminalDicNode->getTotalNodeCodePointCount(); 204 // Get the distance for the first word of the suggestion 205 const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord(); 206 207 // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000. 208 // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or 209 // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means 210 // we are very confident. 211 // Expected space count is 1 ~ 5 212 static const int MIN_EXPECTED_SPACE_COUNT = 1; 213 static const int MAX_EXPECTED_SPACE_COUNT = 5; 214 // Expected length is about 4 ~ 30 215 static const int MIN_EXPECTED_LENGTH = 4; 216 static const int MAX_EXPECTED_LENGTH = 30; 217 // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0 218 static const float MIN_EXPECTED_DISTANCE = 0.0; 219 static const float MAX_EXPECTED_DISTANCE = 2.0; 220 // This is not strict: it's where most stuff will be falling, but it's still fine if it's 221 // outside these values. We want to output a value that reflects all of these. Each factor 222 // contributes a bit. 223 224 // We need at least a space. 225 if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE; 226 227 // The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0 228 // contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the 229 // weight of the distance. Clamp to avoid overflows. 230 const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE 231 : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance; 232 const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT 233 * (MAX_EXPECTED_DISTANCE - clampedDistance) 234 / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE); 235 // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no 236 // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the 237 // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp. 238 const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT 239 * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH); 240 // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no 241 // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the 242 // weight of the space count. 243 const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT 244 * (spaceCount - MIN_EXPECTED_SPACE_COUNT) 245 / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT); 246 247 return distanceContribution + lengthContribution + spaceContribution; 248 } 249 250 /* static */ void SuggestionsOutputUtils::outputShortcuts( 251 BinaryDictionaryShortcutIterator *const shortcutIt, const int finalScore, 252 const bool sameAsTyped, SuggestionResults *const outSuggestionResults) { 253 int shortcutTarget[MAX_WORD_LENGTH]; 254 while (shortcutIt->hasNextShortcutTarget()) { 255 bool isWhilelist; 256 int shortcutTargetStringLength; 257 shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget, 258 &shortcutTargetStringLength, &isWhilelist); 259 int shortcutScore; 260 int kind; 261 if (isWhilelist && sameAsTyped) { 262 shortcutScore = S_INT_MAX; 263 kind = Dictionary::KIND_WHITELIST; 264 } else { 265 // shortcut entry's score == its base entry's score - 1 266 shortcutScore = finalScore; 267 // Protection against int underflow 268 shortcutScore = std::max(S_INT_MIN + 1, shortcutScore) - 1; 269 kind = Dictionary::KIND_SHORTCUT; 270 } 271 outSuggestionResults->addSuggestion(shortcutTarget, shortcutTargetStringLength, 272 std::max(S_INT_MIN + 1, shortcutScore) - 1, kind, NOT_AN_INDEX, 273 NOT_A_FIRST_WORD_CONFIDENCE); 274 } 275 } 276 } // namespace latinime 277