1 /* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "dictionary/header/header_policy.h" 18 19 #include <algorithm> 20 21 #include "utils/ngram_utils.h" 22 23 namespace latinime { 24 25 // Note that these are corresponding definitions in Java side in DictionaryHeader. 26 const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE"; 27 const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY = 28 "REQUIRES_GERMAN_UMLAUT_PROCESSING"; 29 // TODO: Change attribute string to "IS_DECAYING_DICT". 30 const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE"; 31 const char *const HeaderPolicy::DATE_KEY = "date"; 32 const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME"; 33 const char *const HeaderPolicy::NGRAM_COUNT_KEYS[] = 34 {"UNIGRAM_COUNT", "BIGRAM_COUNT", "TRIGRAM_COUNT", "QUADGRAM_COUNT"}; 35 const char *const HeaderPolicy::MAX_NGRAM_COUNT_KEYS[] = 36 {"MAX_UNIGRAM_ENTRY_COUNT", "MAX_BIGRAM_ENTRY_COUNT", "MAX_TRIGRAM_ENTRY_COUNT", 37 "MAX_QUADGRAM_ENTRY_COUNT"}; 38 const int HeaderPolicy::DEFAULT_MAX_NGRAM_COUNTS[] = {10000, 30000, 30000, 30000}; 39 const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE"; 40 // Historical info is information that is needed to support decaying such as timestamp, level and 41 // count. 42 const char *const HeaderPolicy::HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO"; 43 const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration 44 const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY = 45 "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID"; 46 47 const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100; 48 const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f; 49 const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3; 50 51 // Used for logging. Question mark is used to indicate that the key is not found. 52 void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue, 53 int outValueSize) const { 54 if (outValueSize <= 0) return; 55 if (outValueSize == 1) { 56 outValue[0] = '\0'; 57 return; 58 } 59 std::vector<int> keyCodePointVector; 60 HeaderReadWriteUtils::insertCharactersIntoVector(key, &keyCodePointVector); 61 DictionaryHeaderStructurePolicy::AttributeMap::const_iterator it = 62 mAttributeMap.find(keyCodePointVector); 63 if (it == mAttributeMap.end()) { 64 // The key was not found. 65 outValue[0] = '?'; 66 outValue[1] = '\0'; 67 return; 68 } 69 const int terminalIndex = std::min(static_cast<int>(it->second.size()), outValueSize - 1); 70 for (int i = 0; i < terminalIndex; ++i) { 71 outValue[i] = it->second[i]; 72 } 73 outValue[terminalIndex] = '\0'; 74 } 75 76 const std::vector<int> HeaderPolicy::readLocale() const { 77 return HeaderReadWriteUtils::readCodePointVectorAttributeValue(&mAttributeMap, LOCALE_KEY); 78 } 79 80 float HeaderPolicy::readMultipleWordCostMultiplier() const { 81 const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, 82 MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE); 83 if (demotionRate <= 0) { 84 return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); 85 } 86 return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate); 87 } 88 89 bool HeaderPolicy::readRequiresGermanUmlautProcessing() const { 90 return HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, 91 REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false); 92 } 93 94 bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, 95 const EntryCounts &entryCounts, const int extendedRegionSize, 96 BufferWithExtendableBuffer *const outBuffer) const { 97 int writingPos = 0; 98 DictionaryHeaderStructurePolicy::AttributeMap attributeMapToWrite(mAttributeMap); 99 fillInHeader(updatesLastDecayedTime, entryCounts, extendedRegionSize, &attributeMapToWrite); 100 if (!HeaderReadWriteUtils::writeDictionaryVersion(outBuffer, mDictFormatVersion, 101 &writingPos)) { 102 return false; 103 } 104 if (!HeaderReadWriteUtils::writeDictionaryFlags(outBuffer, mDictionaryFlags, 105 &writingPos)) { 106 return false; 107 } 108 // Temporarily writes a dummy header size. 109 int headerSizeFieldPos = writingPos; 110 if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, 0 /* size */, 111 &writingPos)) { 112 return false; 113 } 114 if (!HeaderReadWriteUtils::writeHeaderAttributes(outBuffer, &attributeMapToWrite, 115 &writingPos)) { 116 return false; 117 } 118 // Writes the actual header size. 119 if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, writingPos, 120 &headerSizeFieldPos)) { 121 return false; 122 } 123 return true; 124 } 125 126 namespace { 127 128 int getIndexFromNgramType(const NgramType ngramType) { 129 return static_cast<int>(ngramType); 130 } 131 132 } // namespace 133 134 void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, 135 const EntryCounts &entryCounts, const int extendedRegionSize, 136 DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const { 137 for (const auto ngramType : AllNgramTypes::ASCENDING) { 138 HeaderReadWriteUtils::setIntAttribute(outAttributeMap, 139 NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], 140 entryCounts.getNgramCount(ngramType)); 141 } 142 HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY, 143 extendedRegionSize); 144 // Set the current time as the generation time. 145 HeaderReadWriteUtils::setIntAttribute(outAttributeMap, DATE_KEY, 146 TimeKeeper::peekCurrentTime()); 147 HeaderReadWriteUtils::setCodePointVectorAttribute(outAttributeMap, LOCALE_KEY, mLocale); 148 if (updatesLastDecayedTime) { 149 // Set current time as the last updated time. 150 HeaderReadWriteUtils::setIntAttribute(outAttributeMap, LAST_DECAYED_TIME_KEY, 151 TimeKeeper::peekCurrentTime()); 152 } 153 } 154 155 /* static */ DictionaryHeaderStructurePolicy::AttributeMap 156 HeaderPolicy::createAttributeMapAndReadAllAttributes(const uint8_t *const dictBuf) { 157 DictionaryHeaderStructurePolicy::AttributeMap attributeMap; 158 HeaderReadWriteUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap); 159 return attributeMap; 160 } 161 162 /* static */ const EntryCounts HeaderPolicy::readNgramCounts() const { 163 MutableEntryCounters entryCounters; 164 for (const auto ngramType : AllNgramTypes::ASCENDING) { 165 const int entryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, 166 NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], 0 /* defaultValue */); 167 entryCounters.setNgramCount(ngramType, entryCount); 168 } 169 return entryCounters.getEntryCounts(); 170 } 171 172 /* static */ const EntryCounts HeaderPolicy::readMaxNgramCounts() const { 173 MutableEntryCounters entryCounters; 174 for (const auto ngramType : AllNgramTypes::ASCENDING) { 175 const int index = getIndexFromNgramType(ngramType); 176 const int maxEntryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, 177 MAX_NGRAM_COUNT_KEYS[index], DEFAULT_MAX_NGRAM_COUNTS[index]); 178 entryCounters.setNgramCount(ngramType, maxEntryCount); 179 } 180 return entryCounters.getEntryCounts(); 181 } 182 183 } // namespace latinime 184