1 /* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <cmath> 18 #include <ctime> 19 #include <stdlib.h> 20 21 #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" 22 23 #include "suggest/core/policy/dictionary_header_structure_policy.h" 24 #include "suggest/policyimpl/dictionary/utils/probability_utils.h" 25 26 namespace latinime { 27 28 const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT = 12000; 29 const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC = 10000; 30 const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000; 31 const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000; 32 33 const int ForgettingCurveUtils::MAX_COMPUTED_PROBABILITY = 127; 34 const int ForgettingCurveUtils::MAX_ENCODED_PROBABILITY = 15; 35 const int ForgettingCurveUtils::MIN_VALID_ENCODED_PROBABILITY = 3; 36 const int ForgettingCurveUtils::ENCODED_PROBABILITY_STEP = 1; 37 // Currently, we try to decay each uni/bigram once every 2 hours. Accordingly, the expected 38 // duration of the decay is approximately 66hours. 39 const float ForgettingCurveUtils::MIN_PROBABILITY_TO_DECAY = 0.03f; 40 const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60; 41 42 const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable; 43 ForgettingCurveUtils::TimeKeeper ForgettingCurveUtils::sTimeKeeper; 44 45 void ForgettingCurveUtils::TimeKeeper::setCurrentTime() { 46 mCurrentTime = time(0); 47 } 48 49 /* static */ int ForgettingCurveUtils::getProbability(const int encodedUnigramProbability, 50 const int encodedBigramProbability) { 51 if (encodedUnigramProbability == NOT_A_PROBABILITY) { 52 return NOT_A_PROBABILITY; 53 } else if (encodedBigramProbability == NOT_A_PROBABILITY) { 54 return backoff(decodeProbability(encodedUnigramProbability)); 55 } else { 56 const int unigramProbability = decodeProbability(encodedUnigramProbability); 57 const int bigramProbability = decodeProbability(encodedBigramProbability); 58 return min(max(unigramProbability, bigramProbability), MAX_COMPUTED_PROBABILITY); 59 } 60 } 61 62 // Caveat: Unlike getProbability(), this method doesn't assume special bigram probability encoding 63 // (i.e. unigram probability + bigram probability delta). 64 /* static */ int ForgettingCurveUtils::getUpdatedEncodedProbability( 65 const int originalEncodedProbability, const int newProbability) { 66 if (originalEncodedProbability == NOT_A_PROBABILITY) { 67 // The bigram relation is not in this dictionary. 68 if (newProbability == NOT_A_PROBABILITY) { 69 // The bigram target is not in other dictionaries. 70 return 0; 71 } else { 72 return MIN_VALID_ENCODED_PROBABILITY; 73 } 74 } else { 75 if (newProbability != NOT_A_PROBABILITY 76 && originalEncodedProbability < MIN_VALID_ENCODED_PROBABILITY) { 77 return MIN_VALID_ENCODED_PROBABILITY; 78 } 79 return min(originalEncodedProbability + ENCODED_PROBABILITY_STEP, MAX_ENCODED_PROBABILITY); 80 } 81 } 82 83 /* static */ int ForgettingCurveUtils::isValidEncodedProbability(const int encodedProbability) { 84 return encodedProbability >= MIN_VALID_ENCODED_PROBABILITY; 85 } 86 87 /* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability, 88 const DictionaryHeaderStructurePolicy *const headerPolicy) { 89 const int elapsedTime = sTimeKeeper.peekCurrentTime() - headerPolicy->getLastDecayedTime(); 90 const int decayIterationCount = max(elapsedTime / DECAY_INTERVAL_SECONDS, 1); 91 int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0); 92 // TODO: Implement the decay in more proper way. 93 for (int i = 0; i < decayIterationCount; ++i) { 94 const float currentRate = static_cast<float>(currentEncodedProbability) 95 / static_cast<float>(MAX_ENCODED_PROBABILITY); 96 const float thresholdToDecay = (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate; 97 const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX); 98 if (thresholdToDecay < randValue) { 99 currentEncodedProbability = max(currentEncodedProbability - ENCODED_PROBABILITY_STEP, 100 0); 101 } 102 } 103 return currentEncodedProbability; 104 } 105 106 /* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay, 107 const int unigramCount, const int bigramCount, 108 const DictionaryHeaderStructurePolicy *const headerPolicy) { 109 if (unigramCount >= ForgettingCurveUtils::MAX_UNIGRAM_COUNT) { 110 // Unigram count exceeds the limit. 111 return true; 112 } else if (bigramCount >= ForgettingCurveUtils::MAX_BIGRAM_COUNT) { 113 // Bigram count exceeds the limit. 114 return true; 115 } 116 if (mindsBlockByDecay) { 117 return false; 118 } 119 if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS < time(0)) { 120 // Time to decay. 121 return true; 122 } 123 return false; 124 } 125 126 /* static */ int ForgettingCurveUtils::decodeProbability(const int encodedProbability) { 127 if (encodedProbability < MIN_VALID_ENCODED_PROBABILITY) { 128 return NOT_A_PROBABILITY; 129 } else { 130 return min(sProbabilityTable.getProbability(encodedProbability), MAX_ENCODED_PROBABILITY); 131 } 132 } 133 134 // See comments in ProbabilityUtils::backoff(). 135 /* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) { 136 if (unigramProbability == NOT_A_PROBABILITY) { 137 return NOT_A_PROBABILITY; 138 } else { 139 return max(unigramProbability - 8, 0); 140 } 141 } 142 143 ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTable() { 144 // Table entry is as follows: 145 // 1, 1, 1, 2, 3, 5, 6, 9, 13, 18, 25, 34, 48, 66, 91, 127. 146 // Note that first MIN_VALID_ENCODED_PROBABILITY values are not used. 147 mTable.resize(MAX_ENCODED_PROBABILITY + 1); 148 for (int i = 0; i <= MAX_ENCODED_PROBABILITY; ++i) { 149 const int probability = static_cast<int>(powf(static_cast<float>(MAX_COMPUTED_PROBABILITY), 150 static_cast<float>(i) / static_cast<float>(MAX_ENCODED_PROBABILITY))); 151 mTable[i] = min(MAX_COMPUTED_PROBABILITY, max(0, probability)); 152 } 153 } 154 155 } // namespace latinime 156