Home | History | Annotate | Download | only in utils
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <cmath>
     18 #include <ctime>
     19 #include <stdlib.h>
     20 
     21 #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
     22 
     23 #include "suggest/core/policy/dictionary_header_structure_policy.h"
     24 #include "suggest/policyimpl/dictionary/utils/probability_utils.h"
     25 
     26 namespace latinime {
     27 
     28 const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT = 12000;
     29 const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC = 10000;
     30 const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000;
     31 const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000;
     32 
     33 const int ForgettingCurveUtils::MAX_COMPUTED_PROBABILITY = 127;
     34 const int ForgettingCurveUtils::MAX_ENCODED_PROBABILITY = 15;
     35 const int ForgettingCurveUtils::MIN_VALID_ENCODED_PROBABILITY = 3;
     36 const int ForgettingCurveUtils::ENCODED_PROBABILITY_STEP = 1;
     37 // Currently, we try to decay each uni/bigram once every 2 hours. Accordingly, the expected
     38 // duration of the decay is approximately 66hours.
     39 const float ForgettingCurveUtils::MIN_PROBABILITY_TO_DECAY = 0.03f;
     40 const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60;
     41 
     42 const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable;
     43 ForgettingCurveUtils::TimeKeeper ForgettingCurveUtils::sTimeKeeper;
     44 
     45 void ForgettingCurveUtils::TimeKeeper::setCurrentTime() {
     46     mCurrentTime = time(0);
     47 }
     48 
     49 /* static */ int ForgettingCurveUtils::getProbability(const int encodedUnigramProbability,
     50         const int encodedBigramProbability) {
     51     if (encodedUnigramProbability == NOT_A_PROBABILITY) {
     52         return NOT_A_PROBABILITY;
     53     } else if (encodedBigramProbability == NOT_A_PROBABILITY) {
     54         return backoff(decodeProbability(encodedUnigramProbability));
     55     } else {
     56         const int unigramProbability = decodeProbability(encodedUnigramProbability);
     57         const int bigramProbability = decodeProbability(encodedBigramProbability);
     58         return min(max(unigramProbability, bigramProbability), MAX_COMPUTED_PROBABILITY);
     59     }
     60 }
     61 
     62 // Caveat: Unlike getProbability(), this method doesn't assume special bigram probability encoding
     63 // (i.e. unigram probability + bigram probability delta).
     64 /* static */ int ForgettingCurveUtils::getUpdatedEncodedProbability(
     65         const int originalEncodedProbability, const int newProbability) {
     66     if (originalEncodedProbability == NOT_A_PROBABILITY) {
     67         // The bigram relation is not in this dictionary.
     68         if (newProbability == NOT_A_PROBABILITY) {
     69             // The bigram target is not in other dictionaries.
     70             return 0;
     71         } else {
     72             return MIN_VALID_ENCODED_PROBABILITY;
     73         }
     74     } else {
     75         if (newProbability != NOT_A_PROBABILITY
     76                 && originalEncodedProbability < MIN_VALID_ENCODED_PROBABILITY) {
     77             return MIN_VALID_ENCODED_PROBABILITY;
     78         }
     79         return min(originalEncodedProbability + ENCODED_PROBABILITY_STEP, MAX_ENCODED_PROBABILITY);
     80     }
     81 }
     82 
     83 /* static */ int ForgettingCurveUtils::isValidEncodedProbability(const int encodedProbability) {
     84     return encodedProbability >= MIN_VALID_ENCODED_PROBABILITY;
     85 }
     86 
     87 /* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability,
     88         const DictionaryHeaderStructurePolicy *const headerPolicy) {
     89     const int elapsedTime = sTimeKeeper.peekCurrentTime() - headerPolicy->getLastDecayedTime();
     90     const int decayIterationCount = max(elapsedTime / DECAY_INTERVAL_SECONDS, 1);
     91     int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0);
     92     // TODO: Implement the decay in more proper way.
     93     for (int i = 0; i < decayIterationCount; ++i) {
     94         const float currentRate = static_cast<float>(currentEncodedProbability)
     95                 / static_cast<float>(MAX_ENCODED_PROBABILITY);
     96         const float thresholdToDecay = (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate;
     97         const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
     98         if (thresholdToDecay < randValue) {
     99             currentEncodedProbability = max(currentEncodedProbability - ENCODED_PROBABILITY_STEP,
    100                     0);
    101         }
    102     }
    103     return currentEncodedProbability;
    104 }
    105 
    106 /* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay,
    107         const int unigramCount, const int bigramCount,
    108         const DictionaryHeaderStructurePolicy *const headerPolicy) {
    109     if (unigramCount >= ForgettingCurveUtils::MAX_UNIGRAM_COUNT) {
    110         // Unigram count exceeds the limit.
    111         return true;
    112     } else if (bigramCount >= ForgettingCurveUtils::MAX_BIGRAM_COUNT) {
    113         // Bigram count exceeds the limit.
    114         return true;
    115     }
    116     if (mindsBlockByDecay) {
    117         return false;
    118     }
    119     if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS < time(0)) {
    120         // Time to decay.
    121         return true;
    122     }
    123     return false;
    124 }
    125 
    126 /* static */ int ForgettingCurveUtils::decodeProbability(const int encodedProbability) {
    127     if (encodedProbability < MIN_VALID_ENCODED_PROBABILITY) {
    128         return NOT_A_PROBABILITY;
    129     } else {
    130         return min(sProbabilityTable.getProbability(encodedProbability), MAX_ENCODED_PROBABILITY);
    131     }
    132 }
    133 
    134 // See comments in ProbabilityUtils::backoff().
    135 /* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) {
    136     if (unigramProbability == NOT_A_PROBABILITY) {
    137         return NOT_A_PROBABILITY;
    138     } else {
    139         return max(unigramProbability - 8, 0);
    140     }
    141 }
    142 
    143 ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTable() {
    144     // Table entry is as follows:
    145     // 1, 1, 1, 2, 3, 5, 6, 9, 13, 18, 25, 34, 48, 66, 91, 127.
    146     // Note that first MIN_VALID_ENCODED_PROBABILITY values are not used.
    147     mTable.resize(MAX_ENCODED_PROBABILITY + 1);
    148     for (int i = 0; i <= MAX_ENCODED_PROBABILITY; ++i) {
    149         const int probability = static_cast<int>(powf(static_cast<float>(MAX_COMPUTED_PROBABILITY),
    150                 static_cast<float>(i) / static_cast<float>(MAX_ENCODED_PROBABILITY)));
    151          mTable[i] = min(MAX_COMPUTED_PROBABILITY, max(0, probability));
    152     }
    153 }
    154 
    155 } // namespace latinime
    156