Home | History | Annotate | Download | only in utils
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "dictionary/utils/forgetting_curve_utils.h"
     18 
     19 #include <algorithm>
     20 #include <cmath>
     21 #include <stdlib.h>
     22 
     23 #include "dictionary/header/header_policy.h"
     24 #include "dictionary/utils/probability_utils.h"
     25 #include "utils/time_keeper.h"
     26 
     27 namespace latinime {
     28 
     29 const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8;
     30 const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60;
     31 
     32 const int ForgettingCurveUtils::MAX_LEVEL = 15;
     33 const int ForgettingCurveUtils::MIN_VISIBLE_LEVEL = 2;
     34 const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 31;
     35 const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 30;
     36 const int ForgettingCurveUtils::OCCURRENCES_TO_RAISE_THE_LEVEL = 1;
     37 // TODO: Evaluate whether this should be 7.5 days.
     38 // 15 days
     39 const int ForgettingCurveUtils::DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS = 15 * 24 * 60 * 60;
     40 
     41 const float ForgettingCurveUtils::ENTRY_COUNT_HARD_LIMIT_WEIGHT = 1.2;
     42 
     43 const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable;
     44 
     45 // TODO: Revise the logic to decide the initial probability depending on the given probability.
     46 /* static */ const HistoricalInfo ForgettingCurveUtils::createUpdatedHistoricalInfo(
     47         const HistoricalInfo *const originalHistoricalInfo, const int newProbability,
     48         const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy) {
     49     const int timestamp = newHistoricalInfo->getTimestamp();
     50     if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) {
     51         // Add entry as a valid word.
     52         const int level = clampToVisibleEntryLevelRange(newHistoricalInfo->getLevel());
     53         const int count = clampToValidCountRange(newHistoricalInfo->getCount(), headerPolicy);
     54         return HistoricalInfo(timestamp, level, count);
     55     } else if (!originalHistoricalInfo->isValid()
     56             || originalHistoricalInfo->getLevel() < newHistoricalInfo->getLevel()
     57             || (originalHistoricalInfo->getLevel() == newHistoricalInfo->getLevel()
     58                     && originalHistoricalInfo->getCount() < newHistoricalInfo->getCount())) {
     59         // Initial information.
     60         int count = newHistoricalInfo->getCount();
     61         if (count >= OCCURRENCES_TO_RAISE_THE_LEVEL) {
     62             const int level = clampToValidLevelRange(newHistoricalInfo->getLevel() + 1);
     63             return HistoricalInfo(timestamp, level, 0 /* count */);
     64         }
     65         const int level = clampToValidLevelRange(newHistoricalInfo->getLevel());
     66         return HistoricalInfo(timestamp, level, clampToValidCountRange(count, headerPolicy));
     67     } else {
     68         const int updatedCount = originalHistoricalInfo->getCount() + 1;
     69         if (updatedCount >= OCCURRENCES_TO_RAISE_THE_LEVEL) {
     70             // The count exceeds the max value the level can be incremented.
     71             if (originalHistoricalInfo->getLevel() >= MAX_LEVEL) {
     72                 // The level is already max.
     73                 return HistoricalInfo(timestamp,
     74                         originalHistoricalInfo->getLevel(), originalHistoricalInfo->getCount());
     75             } else {
     76                 // Raise the level.
     77                 return HistoricalInfo(timestamp,
     78                         originalHistoricalInfo->getLevel() + 1, 0 /* count */);
     79             }
     80         } else {
     81             return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel(), updatedCount);
     82         }
     83     }
     84 }
     85 
     86 /* static */ int ForgettingCurveUtils::decodeProbability(
     87         const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) {
     88     const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimestamp(),
     89             DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS);
     90     return sProbabilityTable.getProbability(
     91             headerPolicy->getForgettingCurveProbabilityValuesTableId(),
     92             clampToValidLevelRange(historicalInfo->getLevel()),
     93             clampToValidTimeStepCountRange(elapsedTimeStepCount));
     94 }
     95 
     96 /* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo,
     97         const HeaderPolicy *const headerPolicy) {
     98     return historicalInfo->getLevel() > 0
     99             || getElapsedTimeStepCount(historicalInfo->getTimestamp(),
    100                     DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS)
    101                             < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
    102 }
    103 
    104 /* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave(
    105         const HistoricalInfo *const originalHistoricalInfo,
    106         const HeaderPolicy *const headerPolicy) {
    107     if (originalHistoricalInfo->getTimestamp() == NOT_A_TIMESTAMP) {
    108         return HistoricalInfo();
    109     }
    110     const int durationToLevelDownInSeconds = DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS;
    111     const int elapsedTimeStep = getElapsedTimeStepCount(
    112             originalHistoricalInfo->getTimestamp(), durationToLevelDownInSeconds);
    113     if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) {
    114         // No need to update historical info.
    115         return *originalHistoricalInfo;
    116     }
    117     // Lower the level.
    118     const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
    119     const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ?
    120             originalHistoricalInfo->getLevel() : maxLevelDownAmonut;
    121     const int adjustedTimestampInSeconds = originalHistoricalInfo->getTimestamp() +
    122             levelDownAmount * durationToLevelDownInSeconds;
    123     return HistoricalInfo(adjustedTimestampInSeconds,
    124             originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */);
    125 }
    126 
    127 /* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay,
    128         const EntryCounts &entryCounts, const HeaderPolicy *const headerPolicy) {
    129     const EntryCounts &maxNgramCounts = headerPolicy->getMaxNgramCounts();
    130     for (const auto ngramType : AllNgramTypes::ASCENDING) {
    131         if (entryCounts.getNgramCount(ngramType)
    132                 >= getEntryCountHardLimit(maxNgramCounts.getNgramCount(ngramType))) {
    133             // Unigram count exceeds the limit.
    134             return true;
    135         }
    136     }
    137     if (mindsBlockByDecay) {
    138         return false;
    139     }
    140     if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS
    141             < TimeKeeper::peekCurrentTime()) {
    142         // Time to decay.
    143         return true;
    144     }
    145     return false;
    146 }
    147 
    148 // See comments in ProbabilityUtils::backoff().
    149 /* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) {
    150     // See TODO comments in ForgettingCurveUtils::getProbability().
    151     return unigramProbability;
    152 }
    153 
    154 /* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp,
    155         const int durationToLevelDownInSeconds) {
    156     const int elapsedTimeInSeconds = TimeKeeper::peekCurrentTime() - timestamp;
    157     const int timeStepDurationInSeconds =
    158             durationToLevelDownInSeconds / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
    159     return elapsedTimeInSeconds / timeStepDurationInSeconds;
    160 }
    161 
    162 /* static */ int ForgettingCurveUtils::clampToVisibleEntryLevelRange(const int level) {
    163     return std::min(std::max(level, MIN_VISIBLE_LEVEL), MAX_LEVEL);
    164 }
    165 
    166 /* static */ int ForgettingCurveUtils::clampToValidCountRange(const int count,
    167         const HeaderPolicy *const headerPolicy) {
    168     return std::min(std::max(count, 0), OCCURRENCES_TO_RAISE_THE_LEVEL - 1);
    169 }
    170 
    171 /* static */ int ForgettingCurveUtils::clampToValidLevelRange(const int level) {
    172     return std::min(std::max(level, 0), MAX_LEVEL);
    173 }
    174 
    175 /* static */ int ForgettingCurveUtils::clampToValidTimeStepCountRange(const int timeStepCount) {
    176     return std::min(std::max(timeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT);
    177 }
    178 
    179 const int ForgettingCurveUtils::ProbabilityTable::PROBABILITY_TABLE_COUNT = 4;
    180 const int ForgettingCurveUtils::ProbabilityTable::WEAK_PROBABILITY_TABLE_ID = 0;
    181 const int ForgettingCurveUtils::ProbabilityTable::MODEST_PROBABILITY_TABLE_ID = 1;
    182 const int ForgettingCurveUtils::ProbabilityTable::STRONG_PROBABILITY_TABLE_ID = 2;
    183 const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_PROBABILITY_TABLE_ID = 3;
    184 const int ForgettingCurveUtils::ProbabilityTable::WEAK_MAX_PROBABILITY = 127;
    185 const int ForgettingCurveUtils::ProbabilityTable::MODEST_BASE_PROBABILITY = 8;
    186 const int ForgettingCurveUtils::ProbabilityTable::STRONG_BASE_PROBABILITY = 9;
    187 const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_BASE_PROBABILITY = 10;
    188 
    189 
    190 ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() {
    191     mTables.resize(PROBABILITY_TABLE_COUNT);
    192     for (int tableId = 0; tableId < PROBABILITY_TABLE_COUNT; ++tableId) {
    193         mTables[tableId].resize(MAX_LEVEL + 1);
    194         for (int level = 0; level <= MAX_LEVEL; ++level) {
    195             mTables[tableId][level].resize(MAX_ELAPSED_TIME_STEP_COUNT + 1);
    196             const float initialProbability = getBaseProbabilityForLevel(tableId, level);
    197             const float endProbability = getBaseProbabilityForLevel(tableId, level - 1);
    198             for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT;
    199                     ++timeStepCount) {
    200                 if (level < MIN_VISIBLE_LEVEL) {
    201                     mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY;
    202                     continue;
    203                 }
    204                 const float probability = initialProbability
    205                         * powf(initialProbability / endProbability,
    206                                 -1.0f * static_cast<float>(timeStepCount)
    207                                         / static_cast<float>(MAX_ELAPSED_TIME_STEP_COUNT + 1));
    208                 mTables[tableId][level][timeStepCount] =
    209                         std::min(std::max(static_cast<int>(probability), 1), MAX_PROBABILITY);
    210             }
    211         }
    212     }
    213 }
    214 
    215 /* static */ int ForgettingCurveUtils::ProbabilityTable::getBaseProbabilityForLevel(
    216         const int tableId, const int level) {
    217     if (tableId == WEAK_PROBABILITY_TABLE_ID) {
    218         // Max probability is 127.
    219         return static_cast<float>(WEAK_MAX_PROBABILITY / (1 << (MAX_LEVEL - level)));
    220     } else if (tableId == MODEST_PROBABILITY_TABLE_ID) {
    221         // Max probability is 128.
    222         return static_cast<float>(MODEST_BASE_PROBABILITY * (level + 1));
    223     } else if (tableId == STRONG_PROBABILITY_TABLE_ID) {
    224         // Max probability is 140.
    225         return static_cast<float>(STRONG_BASE_PROBABILITY * (level + 1));
    226     } else if (tableId == AGGRESSIVE_PROBABILITY_TABLE_ID) {
    227         // Max probability is 160.
    228         return static_cast<float>(AGGRESSIVE_BASE_PROBABILITY * (level + 1));
    229     } else {
    230         return NOT_A_PROBABILITY;
    231     }
    232 }
    233 
    234 } // namespace latinime
    235