1 /* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LATINIME_HEADER_POLICY_H 18 #define LATINIME_HEADER_POLICY_H 19 20 #include <cstdint> 21 22 #include "defines.h" 23 #include "dictionary/header/header_read_write_utils.h" 24 #include "dictionary/interface/dictionary_header_structure_policy.h" 25 #include "dictionary/utils/entry_counters.h" 26 #include "dictionary/utils/format_utils.h" 27 #include "utils/char_utils.h" 28 #include "utils/time_keeper.h" 29 30 namespace latinime { 31 32 class HeaderPolicy : public DictionaryHeaderStructurePolicy { 33 public: 34 // Reads information from existing dictionary buffer. 35 HeaderPolicy(const uint8_t *const dictBuf, const FormatUtils::FORMAT_VERSION formatVersion) 36 : mDictFormatVersion(formatVersion), 37 mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)), 38 mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)), 39 mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)), 40 mLocale(readLocale()), 41 mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), 42 mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), 43 mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, 44 IS_DECAYING_DICT_KEY, false /* defaultValue */)), 45 mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, 46 DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), 47 mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, 48 LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), 49 mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), 50 mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, 51 EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)), 52 mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( 53 &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), 54 mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( 55 &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, 56 DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), 57 mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} 58 59 // Constructs header information using an attribute map. 60 HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion, 61 const std::vector<int> &locale, 62 const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) 63 : mDictFormatVersion(dictFormatVersion), 64 mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( 65 attributeMap)), mSize(0), mAttributeMap(*attributeMap), mLocale(locale), 66 mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), 67 mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), 68 mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, 69 IS_DECAYING_DICT_KEY, false /* defaultValue */)), 70 mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, 71 DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), 72 mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, 73 DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), 74 mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), 75 mExtendedRegionSize(0), 76 mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( 77 &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), 78 mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( 79 &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, 80 DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), 81 mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} 82 83 // Copy header information 84 HeaderPolicy(const HeaderPolicy *const headerPolicy) 85 : mDictFormatVersion(headerPolicy->mDictFormatVersion), 86 mDictionaryFlags(headerPolicy->mDictionaryFlags), mSize(headerPolicy->mSize), 87 mAttributeMap(headerPolicy->mAttributeMap), mLocale(headerPolicy->mLocale), 88 mMultiWordCostMultiplier(headerPolicy->mMultiWordCostMultiplier), 89 mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing), 90 mIsDecayingDict(headerPolicy->mIsDecayingDict), 91 mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime), 92 mNgramCounts(headerPolicy->mNgramCounts), 93 mMaxNgramCounts(headerPolicy->mMaxNgramCounts), 94 mExtendedRegionSize(headerPolicy->mExtendedRegionSize), 95 mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords), 96 mForgettingCurveProbabilityValuesTableId( 97 headerPolicy->mForgettingCurveProbabilityValuesTableId), 98 mCodePointTable(headerPolicy->mCodePointTable) {} 99 100 // Temporary dummy header. 101 HeaderPolicy() 102 : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0), 103 mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f), 104 mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false), 105 mDate(0), mLastDecayedTime(0), mNgramCounts(), mMaxNgramCounts(), 106 mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false), 107 mForgettingCurveProbabilityValuesTableId(0), mCodePointTable(nullptr) {} 108 109 ~HeaderPolicy() {} 110 111 virtual int getFormatVersionNumber() const { 112 // Conceptually this converts the symbolic value we use in the code into the 113 // hardcoded of the bytes in the file. But we want the constants to be the 114 // same so we use them for both here. 115 switch (mDictFormatVersion) { 116 case FormatUtils::VERSION_2: 117 case FormatUtils::VERSION_201: 118 AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); 119 return FormatUtils::UNKNOWN_VERSION; 120 case FormatUtils::VERSION_202: 121 return FormatUtils::VERSION_202; 122 case FormatUtils::VERSION_4_ONLY_FOR_TESTING: 123 return FormatUtils::VERSION_4_ONLY_FOR_TESTING; 124 case FormatUtils::VERSION_402: 125 return FormatUtils::VERSION_402; 126 case FormatUtils::VERSION_403: 127 return FormatUtils::VERSION_403; 128 default: 129 return FormatUtils::UNKNOWN_VERSION; 130 } 131 } 132 133 AK_FORCE_INLINE bool isValid() const { 134 // Decaying dictionary must have historical information. 135 if (!mIsDecayingDict) { 136 return true; 137 } 138 if (mHasHistoricalInfoOfWords) { 139 return true; 140 } else { 141 return false; 142 } 143 } 144 145 AK_FORCE_INLINE int getSize() const { 146 return mSize; 147 } 148 149 AK_FORCE_INLINE float getMultiWordCostMultiplier() const { 150 return mMultiWordCostMultiplier; 151 } 152 153 AK_FORCE_INLINE bool isDecayingDict() const { 154 return mIsDecayingDict; 155 } 156 157 AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const { 158 return mRequiresGermanUmlautProcessing; 159 } 160 161 AK_FORCE_INLINE int getDate() const { 162 return mDate; 163 } 164 165 AK_FORCE_INLINE int getLastDecayedTime() const { 166 return mLastDecayedTime; 167 } 168 169 AK_FORCE_INLINE const EntryCounts &getNgramCounts() const { 170 return mNgramCounts; 171 } 172 173 AK_FORCE_INLINE const EntryCounts getMaxNgramCounts() const { 174 return mMaxNgramCounts; 175 } 176 177 AK_FORCE_INLINE int getExtendedRegionSize() const { 178 return mExtendedRegionSize; 179 } 180 181 AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const { 182 return mHasHistoricalInfoOfWords; 183 } 184 185 AK_FORCE_INLINE bool shouldBoostExactMatches() const { 186 // TODO: Investigate better ways to handle exact matches for personalized dictionaries. 187 return !isDecayingDict(); 188 } 189 190 const DictionaryHeaderStructurePolicy::AttributeMap *getAttributeMap() const { 191 return &mAttributeMap; 192 } 193 194 AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const { 195 return mForgettingCurveProbabilityValuesTableId; 196 } 197 198 void readHeaderValueOrQuestionMark(const char *const key, 199 int *outValue, int outValueSize) const; 200 201 bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, 202 const EntryCounts &entryCounts, const int extendedRegionSize, 203 BufferWithExtendableBuffer *const outBuffer) const; 204 205 void fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts, 206 const int extendedRegionSize, 207 DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const; 208 209 AK_FORCE_INLINE const std::vector<int> *getLocale() const { 210 return &mLocale; 211 } 212 213 bool supportsBeginningOfSentence() const { 214 return mDictFormatVersion >= FormatUtils::VERSION_402; 215 } 216 217 const int *getCodePointTable() const { 218 return mCodePointTable; 219 } 220 221 private: 222 DISALLOW_COPY_AND_ASSIGN(HeaderPolicy); 223 224 static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY; 225 static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY; 226 static const char *const IS_DECAYING_DICT_KEY; 227 static const char *const DATE_KEY; 228 static const char *const LAST_DECAYED_TIME_KEY; 229 static const char *const NGRAM_COUNT_KEYS[]; 230 static const char *const MAX_NGRAM_COUNT_KEYS[]; 231 static const int DEFAULT_MAX_NGRAM_COUNTS[]; 232 static const char *const EXTENDED_REGION_SIZE_KEY; 233 static const char *const HAS_HISTORICAL_INFO_KEY; 234 static const char *const LOCALE_KEY; 235 static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY; 236 static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY; 237 static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY; 238 static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; 239 static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; 240 static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID; 241 242 const FormatUtils::FORMAT_VERSION mDictFormatVersion; 243 const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; 244 const int mSize; 245 DictionaryHeaderStructurePolicy::AttributeMap mAttributeMap; 246 const std::vector<int> mLocale; 247 const float mMultiWordCostMultiplier; 248 const bool mRequiresGermanUmlautProcessing; 249 const bool mIsDecayingDict; 250 const int mDate; 251 const int mLastDecayedTime; 252 const EntryCounts mNgramCounts; 253 const EntryCounts mMaxNgramCounts; 254 const int mExtendedRegionSize; 255 const bool mHasHistoricalInfoOfWords; 256 const int mForgettingCurveProbabilityValuesTableId; 257 const int *const mCodePointTable; 258 259 const std::vector<int> readLocale() const; 260 float readMultipleWordCostMultiplier() const; 261 bool readRequiresGermanUmlautProcessing() const; 262 const EntryCounts readNgramCounts() const; 263 const EntryCounts readMaxNgramCounts() const; 264 static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes( 265 const uint8_t *const dictBuf); 266 }; 267 } // namespace latinime 268 #endif /* LATINIME_HEADER_POLICY_H */ 269