1 /* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "dictionary/utils/forgetting_curve_utils.h" 18 19 #include <algorithm> 20 #include <cmath> 21 #include <stdlib.h> 22 23 #include "dictionary/header/header_policy.h" 24 #include "dictionary/utils/probability_utils.h" 25 #include "utils/time_keeper.h" 26 27 namespace latinime { 28 29 const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8; 30 const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60; 31 32 const int ForgettingCurveUtils::MAX_LEVEL = 15; 33 const int ForgettingCurveUtils::MIN_VISIBLE_LEVEL = 2; 34 const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 31; 35 const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 30; 36 const int ForgettingCurveUtils::OCCURRENCES_TO_RAISE_THE_LEVEL = 1; 37 // TODO: Evaluate whether this should be 7.5 days. 38 // 15 days 39 const int ForgettingCurveUtils::DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS = 15 * 24 * 60 * 60; 40 41 const float ForgettingCurveUtils::ENTRY_COUNT_HARD_LIMIT_WEIGHT = 1.2; 42 43 const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable; 44 45 // TODO: Revise the logic to decide the initial probability depending on the given probability. 46 /* static */ const HistoricalInfo ForgettingCurveUtils::createUpdatedHistoricalInfo( 47 const HistoricalInfo *const originalHistoricalInfo, const int newProbability, 48 const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy) { 49 const int timestamp = newHistoricalInfo->getTimestamp(); 50 if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) { 51 // Add entry as a valid word. 52 const int level = clampToVisibleEntryLevelRange(newHistoricalInfo->getLevel()); 53 const int count = clampToValidCountRange(newHistoricalInfo->getCount(), headerPolicy); 54 return HistoricalInfo(timestamp, level, count); 55 } else if (!originalHistoricalInfo->isValid() 56 || originalHistoricalInfo->getLevel() < newHistoricalInfo->getLevel() 57 || (originalHistoricalInfo->getLevel() == newHistoricalInfo->getLevel() 58 && originalHistoricalInfo->getCount() < newHistoricalInfo->getCount())) { 59 // Initial information. 60 int count = newHistoricalInfo->getCount(); 61 if (count >= OCCURRENCES_TO_RAISE_THE_LEVEL) { 62 const int level = clampToValidLevelRange(newHistoricalInfo->getLevel() + 1); 63 return HistoricalInfo(timestamp, level, 0 /* count */); 64 } 65 const int level = clampToValidLevelRange(newHistoricalInfo->getLevel()); 66 return HistoricalInfo(timestamp, level, clampToValidCountRange(count, headerPolicy)); 67 } else { 68 const int updatedCount = originalHistoricalInfo->getCount() + 1; 69 if (updatedCount >= OCCURRENCES_TO_RAISE_THE_LEVEL) { 70 // The count exceeds the max value the level can be incremented. 71 if (originalHistoricalInfo->getLevel() >= MAX_LEVEL) { 72 // The level is already max. 73 return HistoricalInfo(timestamp, 74 originalHistoricalInfo->getLevel(), originalHistoricalInfo->getCount()); 75 } else { 76 // Raise the level. 77 return HistoricalInfo(timestamp, 78 originalHistoricalInfo->getLevel() + 1, 0 /* count */); 79 } 80 } else { 81 return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel(), updatedCount); 82 } 83 } 84 } 85 86 /* static */ int ForgettingCurveUtils::decodeProbability( 87 const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) { 88 const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimestamp(), 89 DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS); 90 return sProbabilityTable.getProbability( 91 headerPolicy->getForgettingCurveProbabilityValuesTableId(), 92 clampToValidLevelRange(historicalInfo->getLevel()), 93 clampToValidTimeStepCountRange(elapsedTimeStepCount)); 94 } 95 96 /* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo, 97 const HeaderPolicy *const headerPolicy) { 98 return historicalInfo->getLevel() > 0 99 || getElapsedTimeStepCount(historicalInfo->getTimestamp(), 100 DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS) 101 < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; 102 } 103 104 /* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave( 105 const HistoricalInfo *const originalHistoricalInfo, 106 const HeaderPolicy *const headerPolicy) { 107 if (originalHistoricalInfo->getTimestamp() == NOT_A_TIMESTAMP) { 108 return HistoricalInfo(); 109 } 110 const int durationToLevelDownInSeconds = DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS; 111 const int elapsedTimeStep = getElapsedTimeStepCount( 112 originalHistoricalInfo->getTimestamp(), durationToLevelDownInSeconds); 113 if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) { 114 // No need to update historical info. 115 return *originalHistoricalInfo; 116 } 117 // Lower the level. 118 const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1); 119 const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ? 120 originalHistoricalInfo->getLevel() : maxLevelDownAmonut; 121 const int adjustedTimestampInSeconds = originalHistoricalInfo->getTimestamp() + 122 levelDownAmount * durationToLevelDownInSeconds; 123 return HistoricalInfo(adjustedTimestampInSeconds, 124 originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */); 125 } 126 127 /* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay, 128 const EntryCounts &entryCounts, const HeaderPolicy *const headerPolicy) { 129 const EntryCounts &maxNgramCounts = headerPolicy->getMaxNgramCounts(); 130 for (const auto ngramType : AllNgramTypes::ASCENDING) { 131 if (entryCounts.getNgramCount(ngramType) 132 >= getEntryCountHardLimit(maxNgramCounts.getNgramCount(ngramType))) { 133 // Unigram count exceeds the limit. 134 return true; 135 } 136 } 137 if (mindsBlockByDecay) { 138 return false; 139 } 140 if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS 141 < TimeKeeper::peekCurrentTime()) { 142 // Time to decay. 143 return true; 144 } 145 return false; 146 } 147 148 // See comments in ProbabilityUtils::backoff(). 149 /* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) { 150 // See TODO comments in ForgettingCurveUtils::getProbability(). 151 return unigramProbability; 152 } 153 154 /* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp, 155 const int durationToLevelDownInSeconds) { 156 const int elapsedTimeInSeconds = TimeKeeper::peekCurrentTime() - timestamp; 157 const int timeStepDurationInSeconds = 158 durationToLevelDownInSeconds / (MAX_ELAPSED_TIME_STEP_COUNT + 1); 159 return elapsedTimeInSeconds / timeStepDurationInSeconds; 160 } 161 162 /* static */ int ForgettingCurveUtils::clampToVisibleEntryLevelRange(const int level) { 163 return std::min(std::max(level, MIN_VISIBLE_LEVEL), MAX_LEVEL); 164 } 165 166 /* static */ int ForgettingCurveUtils::clampToValidCountRange(const int count, 167 const HeaderPolicy *const headerPolicy) { 168 return std::min(std::max(count, 0), OCCURRENCES_TO_RAISE_THE_LEVEL - 1); 169 } 170 171 /* static */ int ForgettingCurveUtils::clampToValidLevelRange(const int level) { 172 return std::min(std::max(level, 0), MAX_LEVEL); 173 } 174 175 /* static */ int ForgettingCurveUtils::clampToValidTimeStepCountRange(const int timeStepCount) { 176 return std::min(std::max(timeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT); 177 } 178 179 const int ForgettingCurveUtils::ProbabilityTable::PROBABILITY_TABLE_COUNT = 4; 180 const int ForgettingCurveUtils::ProbabilityTable::WEAK_PROBABILITY_TABLE_ID = 0; 181 const int ForgettingCurveUtils::ProbabilityTable::MODEST_PROBABILITY_TABLE_ID = 1; 182 const int ForgettingCurveUtils::ProbabilityTable::STRONG_PROBABILITY_TABLE_ID = 2; 183 const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_PROBABILITY_TABLE_ID = 3; 184 const int ForgettingCurveUtils::ProbabilityTable::WEAK_MAX_PROBABILITY = 127; 185 const int ForgettingCurveUtils::ProbabilityTable::MODEST_BASE_PROBABILITY = 8; 186 const int ForgettingCurveUtils::ProbabilityTable::STRONG_BASE_PROBABILITY = 9; 187 const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_BASE_PROBABILITY = 10; 188 189 190 ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() { 191 mTables.resize(PROBABILITY_TABLE_COUNT); 192 for (int tableId = 0; tableId < PROBABILITY_TABLE_COUNT; ++tableId) { 193 mTables[tableId].resize(MAX_LEVEL + 1); 194 for (int level = 0; level <= MAX_LEVEL; ++level) { 195 mTables[tableId][level].resize(MAX_ELAPSED_TIME_STEP_COUNT + 1); 196 const float initialProbability = getBaseProbabilityForLevel(tableId, level); 197 const float endProbability = getBaseProbabilityForLevel(tableId, level - 1); 198 for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT; 199 ++timeStepCount) { 200 if (level < MIN_VISIBLE_LEVEL) { 201 mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY; 202 continue; 203 } 204 const float probability = initialProbability 205 * powf(initialProbability / endProbability, 206 -1.0f * static_cast<float>(timeStepCount) 207 / static_cast<float>(MAX_ELAPSED_TIME_STEP_COUNT + 1)); 208 mTables[tableId][level][timeStepCount] = 209 std::min(std::max(static_cast<int>(probability), 1), MAX_PROBABILITY); 210 } 211 } 212 } 213 } 214 215 /* static */ int ForgettingCurveUtils::ProbabilityTable::getBaseProbabilityForLevel( 216 const int tableId, const int level) { 217 if (tableId == WEAK_PROBABILITY_TABLE_ID) { 218 // Max probability is 127. 219 return static_cast<float>(WEAK_MAX_PROBABILITY / (1 << (MAX_LEVEL - level))); 220 } else if (tableId == MODEST_PROBABILITY_TABLE_ID) { 221 // Max probability is 128. 222 return static_cast<float>(MODEST_BASE_PROBABILITY * (level + 1)); 223 } else if (tableId == STRONG_PROBABILITY_TABLE_ID) { 224 // Max probability is 140. 225 return static_cast<float>(STRONG_BASE_PROBABILITY * (level + 1)); 226 } else if (tableId == AGGRESSIVE_PROBABILITY_TABLE_ID) { 227 // Max probability is 160. 228 return static_cast<float>(AGGRESSIVE_BASE_PROBABILITY * (level + 1)); 229 } else { 230 return NOT_A_PROBABILITY; 231 } 232 } 233 234 } // namespace latinime 235