Home | History | Annotate | Download | only in header
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LATINIME_HEADER_POLICY_H
     18 #define LATINIME_HEADER_POLICY_H
     19 
     20 #include <cstdint>
     21 
     22 #include "defines.h"
     23 #include "suggest/core/policy/dictionary_header_structure_policy.h"
     24 #include "suggest/policyimpl/dictionary/header/header_read_write_utils.h"
     25 #include "suggest/policyimpl/dictionary/utils/format_utils.h"
     26 #include "utils/char_utils.h"
     27 #include "utils/time_keeper.h"
     28 
     29 namespace latinime {
     30 
     31 class HeaderPolicy : public DictionaryHeaderStructurePolicy {
     32  public:
     33     // Reads information from existing dictionary buffer.
     34     HeaderPolicy(const uint8_t *const dictBuf, const FormatUtils::FORMAT_VERSION formatVersion)
     35             : mDictFormatVersion(formatVersion),
     36               mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)),
     37               mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)),
     38               mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)),
     39               mLocale(readLocale()),
     40               mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
     41               mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()),
     42               mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
     43                       IS_DECAYING_DICT_KEY, false /* defaultValue */)),
     44               mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
     45                       DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
     46               mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
     47                       LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
     48               mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
     49                       UNIGRAM_COUNT_KEY, 0 /* defaultValue */)),
     50               mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
     51                       BIGRAM_COUNT_KEY, 0 /* defaultValue */)),
     52               mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
     53                       EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)),
     54               mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
     55                       &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
     56               mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readIntAttributeValue(
     57                       &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY,
     58                       DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)),
     59               mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
     60                       &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
     61                       DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
     62               mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue(
     63                       &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY,
     64                       DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)),
     65               mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
     66                       &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
     67               mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
     68                       &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
     69 
     70     // Constructs header information using an attribute map.
     71     HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
     72             const std::vector<int> &locale,
     73             const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap)
     74             : mDictFormatVersion(dictFormatVersion),
     75               mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
     76                       attributeMap)), mSize(0), mAttributeMap(*attributeMap), mLocale(locale),
     77               mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
     78               mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()),
     79               mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
     80                       IS_DECAYING_DICT_KEY, false /* defaultValue */)),
     81               mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
     82                       DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
     83               mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
     84                       DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
     85               mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0),
     86               mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
     87                       &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
     88               mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readIntAttributeValue(
     89                       &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY,
     90                       DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)),
     91               mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
     92                       &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
     93                       DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
     94               mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue(
     95                       &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY,
     96                       DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)),
     97               mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
     98                       &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
     99               mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
    100                       &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
    101 
    102     // Copy header information
    103     HeaderPolicy(const HeaderPolicy *const headerPolicy)
    104             : mDictFormatVersion(headerPolicy->mDictFormatVersion),
    105               mDictionaryFlags(headerPolicy->mDictionaryFlags), mSize(headerPolicy->mSize),
    106               mAttributeMap(headerPolicy->mAttributeMap), mLocale(headerPolicy->mLocale),
    107               mMultiWordCostMultiplier(headerPolicy->mMultiWordCostMultiplier),
    108               mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing),
    109               mIsDecayingDict(headerPolicy->mIsDecayingDict),
    110               mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime),
    111               mUnigramCount(headerPolicy->mUnigramCount), mBigramCount(headerPolicy->mBigramCount),
    112               mExtendedRegionSize(headerPolicy->mExtendedRegionSize),
    113               mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords),
    114               mForgettingCurveOccurrencesToLevelUp(
    115                       headerPolicy->mForgettingCurveOccurrencesToLevelUp),
    116               mForgettingCurveProbabilityValuesTableId(
    117                       headerPolicy->mForgettingCurveProbabilityValuesTableId),
    118               mForgettingCurveDurationToLevelDown(
    119                       headerPolicy->mForgettingCurveDurationToLevelDown),
    120               mMaxUnigramCount(headerPolicy->mMaxUnigramCount),
    121               mMaxBigramCount(headerPolicy->mMaxBigramCount) {}
    122 
    123     // Temporary dummy header.
    124     HeaderPolicy()
    125             : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0),
    126               mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f),
    127               mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false),
    128               mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0),
    129               mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
    130               mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0),
    131               mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0) {}
    132 
    133     ~HeaderPolicy() {}
    134 
    135     virtual int getFormatVersionNumber() const {
    136         // Conceptually this converts the symbolic value we use in the code into the
    137         // hardcoded of the bytes in the file. But we want the constants to be the
    138         // same so we use them for both here.
    139         switch (mDictFormatVersion) {
    140             case FormatUtils::VERSION_2:
    141                 return FormatUtils::VERSION_2;
    142             case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
    143                 return FormatUtils::VERSION_4_ONLY_FOR_TESTING;
    144             case FormatUtils::VERSION_4:
    145                 return FormatUtils::VERSION_4;
    146             case FormatUtils::VERSION_4_DEV:
    147                 return FormatUtils::VERSION_4_DEV;
    148             default:
    149                 return FormatUtils::UNKNOWN_VERSION;
    150         }
    151     }
    152 
    153     AK_FORCE_INLINE bool isValid() const {
    154         // Decaying dictionary must have historical information.
    155         if (!mIsDecayingDict) {
    156             return true;
    157         }
    158         if (mHasHistoricalInfoOfWords) {
    159             return true;
    160         } else {
    161             return false;
    162         }
    163     }
    164 
    165     AK_FORCE_INLINE int getSize() const {
    166         return mSize;
    167     }
    168 
    169     AK_FORCE_INLINE float getMultiWordCostMultiplier() const {
    170         return mMultiWordCostMultiplier;
    171     }
    172 
    173     AK_FORCE_INLINE bool isDecayingDict() const {
    174         return mIsDecayingDict;
    175     }
    176 
    177     AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const {
    178         return mRequiresGermanUmlautProcessing;
    179     }
    180 
    181     AK_FORCE_INLINE int getDate() const {
    182         return mDate;
    183     }
    184 
    185     AK_FORCE_INLINE int getLastDecayedTime() const {
    186         return mLastDecayedTime;
    187     }
    188 
    189     AK_FORCE_INLINE int getUnigramCount() const {
    190         return mUnigramCount;
    191     }
    192 
    193     AK_FORCE_INLINE int getBigramCount() const {
    194         return mBigramCount;
    195     }
    196 
    197     AK_FORCE_INLINE int getExtendedRegionSize() const {
    198         return mExtendedRegionSize;
    199     }
    200 
    201     AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const {
    202         return mHasHistoricalInfoOfWords;
    203     }
    204 
    205     AK_FORCE_INLINE bool shouldBoostExactMatches() const {
    206         // TODO: Investigate better ways to handle exact matches for personalized dictionaries.
    207         return !isDecayingDict();
    208     }
    209 
    210     const DictionaryHeaderStructurePolicy::AttributeMap *getAttributeMap() const {
    211         return &mAttributeMap;
    212     }
    213 
    214     AK_FORCE_INLINE int getForgettingCurveOccurrencesToLevelUp() const {
    215         return mForgettingCurveOccurrencesToLevelUp;
    216     }
    217 
    218     AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const {
    219         return mForgettingCurveProbabilityValuesTableId;
    220     }
    221 
    222     AK_FORCE_INLINE int getForgettingCurveDurationToLevelDown() const {
    223         return mForgettingCurveDurationToLevelDown;
    224     }
    225 
    226     AK_FORCE_INLINE int getMaxUnigramCount() const {
    227         return mMaxUnigramCount;
    228     }
    229 
    230     AK_FORCE_INLINE int getMaxBigramCount() const {
    231         return mMaxBigramCount;
    232     }
    233 
    234     void readHeaderValueOrQuestionMark(const char *const key,
    235             int *outValue, int outValueSize) const;
    236 
    237     bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
    238             const int unigramCount, const int bigramCount,
    239             const int extendedRegionSize, BufferWithExtendableBuffer *const outBuffer) const;
    240 
    241     void fillInHeader(const bool updatesLastDecayedTime,
    242             const int unigramCount, const int bigramCount, const int extendedRegionSize,
    243             DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const;
    244 
    245     AK_FORCE_INLINE const std::vector<int> *getLocale() const {
    246         return &mLocale;
    247     }
    248 
    249     bool supportsBeginningOfSentence() const {
    250         return mDictFormatVersion >= FormatUtils::VERSION_4;
    251     }
    252 
    253  private:
    254     DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);
    255 
    256     static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY;
    257     static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY;
    258     static const char *const IS_DECAYING_DICT_KEY;
    259     static const char *const DATE_KEY;
    260     static const char *const LAST_DECAYED_TIME_KEY;
    261     static const char *const UNIGRAM_COUNT_KEY;
    262     static const char *const BIGRAM_COUNT_KEY;
    263     static const char *const EXTENDED_REGION_SIZE_KEY;
    264     static const char *const HAS_HISTORICAL_INFO_KEY;
    265     static const char *const LOCALE_KEY;
    266     static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY;
    267     static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY;
    268     static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY;
    269     static const char *const MAX_UNIGRAM_COUNT_KEY;
    270     static const char *const MAX_BIGRAM_COUNT_KEY;
    271     static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
    272     static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
    273     static const int DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP;
    274     static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID;
    275     static const int DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS;
    276     static const int DEFAULT_MAX_UNIGRAM_COUNT;
    277     static const int DEFAULT_MAX_BIGRAM_COUNT;
    278 
    279     const FormatUtils::FORMAT_VERSION mDictFormatVersion;
    280     const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;
    281     const int mSize;
    282     DictionaryHeaderStructurePolicy::AttributeMap mAttributeMap;
    283     const std::vector<int> mLocale;
    284     const float mMultiWordCostMultiplier;
    285     const bool mRequiresGermanUmlautProcessing;
    286     const bool mIsDecayingDict;
    287     const int mDate;
    288     const int mLastDecayedTime;
    289     const int mUnigramCount;
    290     const int mBigramCount;
    291     const int mExtendedRegionSize;
    292     const bool mHasHistoricalInfoOfWords;
    293     const int mForgettingCurveOccurrencesToLevelUp;
    294     const int mForgettingCurveProbabilityValuesTableId;
    295     const int mForgettingCurveDurationToLevelDown;
    296     const int mMaxUnigramCount;
    297     const int mMaxBigramCount;
    298 
    299     const std::vector<int> readLocale() const;
    300     float readMultipleWordCostMultiplier() const;
    301     bool readRequiresGermanUmlautProcessing() const;
    302 
    303     static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes(
    304             const uint8_t *const dictBuf);
    305 };
    306 } // namespace latinime
    307 #endif /* LATINIME_HEADER_POLICY_H */
    308