Home | History | Annotate | Download | only in header
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "dictionary/header/header_policy.h"
     18 
     19 #include <algorithm>
     20 
     21 #include "utils/ngram_utils.h"
     22 
     23 namespace latinime {
     24 
     25 // Note that these are corresponding definitions in Java side in DictionaryHeader.
     26 const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE";
     27 const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY =
     28         "REQUIRES_GERMAN_UMLAUT_PROCESSING";
     29 // TODO: Change attribute string to "IS_DECAYING_DICT".
     30 const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE";
     31 const char *const HeaderPolicy::DATE_KEY = "date";
     32 const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME";
     33 const char *const HeaderPolicy::NGRAM_COUNT_KEYS[] =
     34         {"UNIGRAM_COUNT", "BIGRAM_COUNT", "TRIGRAM_COUNT", "QUADGRAM_COUNT"};
     35 const char *const HeaderPolicy::MAX_NGRAM_COUNT_KEYS[] =
     36         {"MAX_UNIGRAM_ENTRY_COUNT", "MAX_BIGRAM_ENTRY_COUNT", "MAX_TRIGRAM_ENTRY_COUNT",
     37                 "MAX_QUADGRAM_ENTRY_COUNT"};
     38 const int HeaderPolicy::DEFAULT_MAX_NGRAM_COUNTS[] = {10000, 30000, 30000, 30000};
     39 const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE";
     40 // Historical info is information that is needed to support decaying such as timestamp, level and
     41 // count.
     42 const char *const HeaderPolicy::HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO";
     43 const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration
     44 const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
     45         "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
     46 
     47 const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
     48 const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
     49 const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3;
     50 
     51 // Used for logging. Question mark is used to indicate that the key is not found.
     52 void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue,
     53         int outValueSize) const {
     54     if (outValueSize <= 0) return;
     55     if (outValueSize == 1) {
     56         outValue[0] = '\0';
     57         return;
     58     }
     59     std::vector<int> keyCodePointVector;
     60     HeaderReadWriteUtils::insertCharactersIntoVector(key, &keyCodePointVector);
     61     DictionaryHeaderStructurePolicy::AttributeMap::const_iterator it =
     62             mAttributeMap.find(keyCodePointVector);
     63     if (it == mAttributeMap.end()) {
     64         // The key was not found.
     65         outValue[0] = '?';
     66         outValue[1] = '\0';
     67         return;
     68     }
     69     const int terminalIndex = std::min(static_cast<int>(it->second.size()), outValueSize - 1);
     70     for (int i = 0; i < terminalIndex; ++i) {
     71         outValue[i] = it->second[i];
     72     }
     73     outValue[terminalIndex] = '\0';
     74 }
     75 
     76 const std::vector<int> HeaderPolicy::readLocale() const {
     77     return HeaderReadWriteUtils::readCodePointVectorAttributeValue(&mAttributeMap, LOCALE_KEY);
     78 }
     79 
     80 float HeaderPolicy::readMultipleWordCostMultiplier() const {
     81     const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
     82             MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE);
     83     if (demotionRate <= 0) {
     84         return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
     85     }
     86     return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate);
     87 }
     88 
     89 bool HeaderPolicy::readRequiresGermanUmlautProcessing() const {
     90     return HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
     91             REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false);
     92 }
     93 
     94 bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
     95         const EntryCounts &entryCounts, const int extendedRegionSize,
     96         BufferWithExtendableBuffer *const outBuffer) const {
     97     int writingPos = 0;
     98     DictionaryHeaderStructurePolicy::AttributeMap attributeMapToWrite(mAttributeMap);
     99     fillInHeader(updatesLastDecayedTime, entryCounts, extendedRegionSize, &attributeMapToWrite);
    100     if (!HeaderReadWriteUtils::writeDictionaryVersion(outBuffer, mDictFormatVersion,
    101             &writingPos)) {
    102         return false;
    103     }
    104     if (!HeaderReadWriteUtils::writeDictionaryFlags(outBuffer, mDictionaryFlags,
    105             &writingPos)) {
    106         return false;
    107     }
    108     // Temporarily writes a dummy header size.
    109     int headerSizeFieldPos = writingPos;
    110     if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, 0 /* size */,
    111             &writingPos)) {
    112         return false;
    113     }
    114     if (!HeaderReadWriteUtils::writeHeaderAttributes(outBuffer, &attributeMapToWrite,
    115             &writingPos)) {
    116         return false;
    117     }
    118     // Writes the actual header size.
    119     if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, writingPos,
    120             &headerSizeFieldPos)) {
    121         return false;
    122     }
    123     return true;
    124 }
    125 
    126 namespace {
    127 
    128 int getIndexFromNgramType(const NgramType ngramType) {
    129     return static_cast<int>(ngramType);
    130 }
    131 
    132 } // namespace
    133 
    134 void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime,
    135         const EntryCounts &entryCounts, const int extendedRegionSize,
    136         DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const {
    137     for (const auto ngramType : AllNgramTypes::ASCENDING) {
    138         HeaderReadWriteUtils::setIntAttribute(outAttributeMap,
    139                 NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)],
    140                 entryCounts.getNgramCount(ngramType));
    141     }
    142     HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY,
    143             extendedRegionSize);
    144     // Set the current time as the generation time.
    145     HeaderReadWriteUtils::setIntAttribute(outAttributeMap, DATE_KEY,
    146             TimeKeeper::peekCurrentTime());
    147     HeaderReadWriteUtils::setCodePointVectorAttribute(outAttributeMap, LOCALE_KEY, mLocale);
    148     if (updatesLastDecayedTime) {
    149         // Set current time as the last updated time.
    150         HeaderReadWriteUtils::setIntAttribute(outAttributeMap, LAST_DECAYED_TIME_KEY,
    151                 TimeKeeper::peekCurrentTime());
    152     }
    153 }
    154 
    155 /* static */ DictionaryHeaderStructurePolicy::AttributeMap
    156         HeaderPolicy::createAttributeMapAndReadAllAttributes(const uint8_t *const dictBuf) {
    157     DictionaryHeaderStructurePolicy::AttributeMap attributeMap;
    158     HeaderReadWriteUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap);
    159     return attributeMap;
    160 }
    161 
    162 /* static */ const EntryCounts HeaderPolicy::readNgramCounts() const {
    163     MutableEntryCounters entryCounters;
    164     for (const auto ngramType : AllNgramTypes::ASCENDING) {
    165         const int entryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
    166                 NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], 0 /* defaultValue */);
    167         entryCounters.setNgramCount(ngramType, entryCount);
    168     }
    169     return entryCounters.getEntryCounts();
    170 }
    171 
    172 /* static */ const EntryCounts HeaderPolicy::readMaxNgramCounts() const {
    173     MutableEntryCounters entryCounters;
    174     for (const auto ngramType : AllNgramTypes::ASCENDING) {
    175         const int index = getIndexFromNgramType(ngramType);
    176         const int maxEntryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
    177                 MAX_NGRAM_COUNT_KEYS[index], DEFAULT_MAX_NGRAM_COUNTS[index]);
    178         entryCounters.setNgramCount(ngramType, maxEntryCount);
    179     }
    180     return entryCounters.getEntryCounts();
    181 }
    182 
    183 } // namespace latinime
    184