Home | History | Annotate | Download | only in header
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LATINIME_HEADER_POLICY_H
     18 #define LATINIME_HEADER_POLICY_H
     19 
     20 #include <cstdint>
     21 
     22 #include "defines.h"
     23 #include "dictionary/header/header_read_write_utils.h"
     24 #include "dictionary/interface/dictionary_header_structure_policy.h"
     25 #include "dictionary/utils/entry_counters.h"
     26 #include "dictionary/utils/format_utils.h"
     27 #include "utils/char_utils.h"
     28 #include "utils/time_keeper.h"
     29 
     30 namespace latinime {
     31 
     32 class HeaderPolicy : public DictionaryHeaderStructurePolicy {
     33  public:
     34     // Reads information from existing dictionary buffer.
     35     HeaderPolicy(const uint8_t *const dictBuf, const FormatUtils::FORMAT_VERSION formatVersion)
     36             : mDictFormatVersion(formatVersion),
     37               mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)),
     38               mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)),
     39               mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)),
     40               mLocale(readLocale()),
     41               mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
     42               mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()),
     43               mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
     44                       IS_DECAYING_DICT_KEY, false /* defaultValue */)),
     45               mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
     46                       DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
     47               mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
     48                       LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
     49               mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()),
     50               mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
     51                       EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)),
     52               mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
     53                       &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
     54               mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
     55                       &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
     56                       DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
     57               mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
     58 
     59     // Constructs header information using an attribute map.
     60     HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
     61             const std::vector<int> &locale,
     62             const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap)
     63             : mDictFormatVersion(dictFormatVersion),
     64               mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
     65                       attributeMap)), mSize(0), mAttributeMap(*attributeMap), mLocale(locale),
     66               mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
     67               mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()),
     68               mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
     69                       IS_DECAYING_DICT_KEY, false /* defaultValue */)),
     70               mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
     71                       DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
     72               mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
     73                       DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
     74               mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()),
     75               mExtendedRegionSize(0),
     76               mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
     77                       &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
     78               mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
     79                       &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
     80                       DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
     81               mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
     82 
     83     // Copy header information
     84     HeaderPolicy(const HeaderPolicy *const headerPolicy)
     85             : mDictFormatVersion(headerPolicy->mDictFormatVersion),
     86               mDictionaryFlags(headerPolicy->mDictionaryFlags), mSize(headerPolicy->mSize),
     87               mAttributeMap(headerPolicy->mAttributeMap), mLocale(headerPolicy->mLocale),
     88               mMultiWordCostMultiplier(headerPolicy->mMultiWordCostMultiplier),
     89               mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing),
     90               mIsDecayingDict(headerPolicy->mIsDecayingDict),
     91               mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime),
     92               mNgramCounts(headerPolicy->mNgramCounts),
     93               mMaxNgramCounts(headerPolicy->mMaxNgramCounts),
     94               mExtendedRegionSize(headerPolicy->mExtendedRegionSize),
     95               mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords),
     96               mForgettingCurveProbabilityValuesTableId(
     97                       headerPolicy->mForgettingCurveProbabilityValuesTableId),
     98               mCodePointTable(headerPolicy->mCodePointTable) {}
     99 
    100     // Temporary dummy header.
    101     HeaderPolicy()
    102             : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0),
    103               mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f),
    104               mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false),
    105               mDate(0), mLastDecayedTime(0), mNgramCounts(), mMaxNgramCounts(),
    106               mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
    107               mForgettingCurveProbabilityValuesTableId(0), mCodePointTable(nullptr) {}
    108 
    109     ~HeaderPolicy() {}
    110 
    111     virtual int getFormatVersionNumber() const {
    112         // Conceptually this converts the symbolic value we use in the code into the
    113         // hardcoded of the bytes in the file. But we want the constants to be the
    114         // same so we use them for both here.
    115         switch (mDictFormatVersion) {
    116             case FormatUtils::VERSION_2:
    117             case FormatUtils::VERSION_201:
    118                 AKLOGE("Dictionary versions 2 and 201 are incompatible with this version");
    119                 return FormatUtils::UNKNOWN_VERSION;
    120             case FormatUtils::VERSION_202:
    121                 return FormatUtils::VERSION_202;
    122             case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
    123                 return FormatUtils::VERSION_4_ONLY_FOR_TESTING;
    124             case FormatUtils::VERSION_402:
    125                 return FormatUtils::VERSION_402;
    126             case FormatUtils::VERSION_403:
    127                 return FormatUtils::VERSION_403;
    128             default:
    129                 return FormatUtils::UNKNOWN_VERSION;
    130         }
    131     }
    132 
    133     AK_FORCE_INLINE bool isValid() const {
    134         // Decaying dictionary must have historical information.
    135         if (!mIsDecayingDict) {
    136             return true;
    137         }
    138         if (mHasHistoricalInfoOfWords) {
    139             return true;
    140         } else {
    141             return false;
    142         }
    143     }
    144 
    145     AK_FORCE_INLINE int getSize() const {
    146         return mSize;
    147     }
    148 
    149     AK_FORCE_INLINE float getMultiWordCostMultiplier() const {
    150         return mMultiWordCostMultiplier;
    151     }
    152 
    153     AK_FORCE_INLINE bool isDecayingDict() const {
    154         return mIsDecayingDict;
    155     }
    156 
    157     AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const {
    158         return mRequiresGermanUmlautProcessing;
    159     }
    160 
    161     AK_FORCE_INLINE int getDate() const {
    162         return mDate;
    163     }
    164 
    165     AK_FORCE_INLINE int getLastDecayedTime() const {
    166         return mLastDecayedTime;
    167     }
    168 
    169     AK_FORCE_INLINE const EntryCounts &getNgramCounts() const {
    170         return mNgramCounts;
    171     }
    172 
    173     AK_FORCE_INLINE const EntryCounts getMaxNgramCounts() const {
    174         return mMaxNgramCounts;
    175     }
    176 
    177     AK_FORCE_INLINE int getExtendedRegionSize() const {
    178         return mExtendedRegionSize;
    179     }
    180 
    181     AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const {
    182         return mHasHistoricalInfoOfWords;
    183     }
    184 
    185     AK_FORCE_INLINE bool shouldBoostExactMatches() const {
    186         // TODO: Investigate better ways to handle exact matches for personalized dictionaries.
    187         return !isDecayingDict();
    188     }
    189 
    190     const DictionaryHeaderStructurePolicy::AttributeMap *getAttributeMap() const {
    191         return &mAttributeMap;
    192     }
    193 
    194     AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const {
    195         return mForgettingCurveProbabilityValuesTableId;
    196     }
    197 
    198     void readHeaderValueOrQuestionMark(const char *const key,
    199             int *outValue, int outValueSize) const;
    200 
    201     bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
    202             const EntryCounts &entryCounts, const int extendedRegionSize,
    203             BufferWithExtendableBuffer *const outBuffer) const;
    204 
    205     void fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts,
    206             const int extendedRegionSize,
    207             DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const;
    208 
    209     AK_FORCE_INLINE const std::vector<int> *getLocale() const {
    210         return &mLocale;
    211     }
    212 
    213     bool supportsBeginningOfSentence() const {
    214         return mDictFormatVersion >= FormatUtils::VERSION_402;
    215     }
    216 
    217     const int *getCodePointTable() const {
    218         return mCodePointTable;
    219     }
    220 
    221  private:
    222     DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);
    223 
    224     static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY;
    225     static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY;
    226     static const char *const IS_DECAYING_DICT_KEY;
    227     static const char *const DATE_KEY;
    228     static const char *const LAST_DECAYED_TIME_KEY;
    229     static const char *const NGRAM_COUNT_KEYS[];
    230     static const char *const MAX_NGRAM_COUNT_KEYS[];
    231     static const int DEFAULT_MAX_NGRAM_COUNTS[];
    232     static const char *const EXTENDED_REGION_SIZE_KEY;
    233     static const char *const HAS_HISTORICAL_INFO_KEY;
    234     static const char *const LOCALE_KEY;
    235     static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY;
    236     static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY;
    237     static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY;
    238     static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
    239     static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
    240     static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID;
    241 
    242     const FormatUtils::FORMAT_VERSION mDictFormatVersion;
    243     const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;
    244     const int mSize;
    245     DictionaryHeaderStructurePolicy::AttributeMap mAttributeMap;
    246     const std::vector<int> mLocale;
    247     const float mMultiWordCostMultiplier;
    248     const bool mRequiresGermanUmlautProcessing;
    249     const bool mIsDecayingDict;
    250     const int mDate;
    251     const int mLastDecayedTime;
    252     const EntryCounts mNgramCounts;
    253     const EntryCounts mMaxNgramCounts;
    254     const int mExtendedRegionSize;
    255     const bool mHasHistoricalInfoOfWords;
    256     const int mForgettingCurveProbabilityValuesTableId;
    257     const int *const mCodePointTable;
    258 
    259     const std::vector<int> readLocale() const;
    260     float readMultipleWordCostMultiplier() const;
    261     bool readRequiresGermanUmlautProcessing() const;
    262     const EntryCounts readNgramCounts() const;
    263     const EntryCounts readMaxNgramCounts() const;
    264     static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes(
    265             const uint8_t *const dictBuf);
    266 };
    267 } // namespace latinime
    268 #endif /* LATINIME_HEADER_POLICY_H */
    269