Home | History | Annotate | Download | only in session
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LATINIME_PREV_WORDS_INFO_H
     18 #define LATINIME_PREV_WORDS_INFO_H
     19 
     20 #include "defines.h"
     21 #include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
     22 #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
     23 #include "utils/char_utils.h"
     24 
     25 namespace latinime {
     26 
     27 // TODO: Support n-gram.
     28 class PrevWordsInfo {
     29  public:
     30     // No prev word information.
     31     PrevWordsInfo() {
     32         clear();
     33     }
     34 
     35     PrevWordsInfo(PrevWordsInfo &&prevWordsInfo) {
     36         for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
     37             mPrevWordCodePointCount[i] = prevWordsInfo.mPrevWordCodePointCount[i];
     38             memmove(mPrevWordCodePoints[i], prevWordsInfo.mPrevWordCodePoints[i],
     39                     sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]);
     40             mIsBeginningOfSentence[i] = prevWordsInfo.mIsBeginningOfSentence[i];
     41         }
     42     }
     43 
     44     // Construct from previous words.
     45     PrevWordsInfo(const int prevWordCodePoints[][MAX_WORD_LENGTH],
     46             const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
     47             const size_t prevWordCount) {
     48         clear();
     49         for (size_t i = 0; i < std::min(NELEMS(mPrevWordCodePoints), prevWordCount); ++i) {
     50             if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) {
     51                 continue;
     52             }
     53             memmove(mPrevWordCodePoints[i], prevWordCodePoints[i],
     54                     sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]);
     55             mPrevWordCodePointCount[i] = prevWordCodePointCount[i];
     56             mIsBeginningOfSentence[i] = isBeginningOfSentence[i];
     57         }
     58     }
     59 
     60     // Construct from a previous word.
     61     PrevWordsInfo(const int *const prevWordCodePoints, const int prevWordCodePointCount,
     62             const bool isBeginningOfSentence) {
     63         clear();
     64         if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) {
     65             return;
     66         }
     67         memmove(mPrevWordCodePoints[0], prevWordCodePoints,
     68                 sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount);
     69         mPrevWordCodePointCount[0] = prevWordCodePointCount;
     70         mIsBeginningOfSentence[0] = isBeginningOfSentence;
     71     }
     72 
     73     bool isValid() const {
     74         if (mPrevWordCodePointCount[0] > 0) {
     75             return true;
     76         }
     77         if (mIsBeginningOfSentence[0]) {
     78             return true;
     79         }
     80         return false;
     81     }
     82 
     83     void getPrevWordsTerminalPtNodePos(
     84             const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
     85             int *const outPrevWordsTerminalPtNodePos, const bool tryLowerCaseSearch) const {
     86         for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
     87             outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy,
     88                     mPrevWordCodePoints[i], mPrevWordCodePointCount[i],
     89                     mIsBeginningOfSentence[i], tryLowerCaseSearch);
     90         }
     91     }
     92 
     93     // n is 1-indexed.
     94     const int *getNthPrevWordCodePoints(const int n) const {
     95         if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
     96             return nullptr;
     97         }
     98         return mPrevWordCodePoints[n - 1];
     99     }
    100 
    101     // n is 1-indexed.
    102     int getNthPrevWordCodePointCount(const int n) const {
    103         if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
    104             return 0;
    105         }
    106         return mPrevWordCodePointCount[n - 1];
    107     }
    108 
    109     // n is 1-indexed.
    110     bool isNthPrevWordBeginningOfSentence(const int n) const {
    111         if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
    112             return false;
    113         }
    114         return mIsBeginningOfSentence[n - 1];
    115     }
    116 
    117  private:
    118     DISALLOW_COPY_AND_ASSIGN(PrevWordsInfo);
    119 
    120     static int getTerminalPtNodePosOfWord(
    121             const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
    122             const int *const wordCodePoints, const int wordCodePointCount,
    123             const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
    124         if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) {
    125             return NOT_A_DICT_POS;
    126         }
    127         int codePoints[MAX_WORD_LENGTH];
    128         int codePointCount = wordCodePointCount;
    129         memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
    130         if (isBeginningOfSentence) {
    131             codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints,
    132                     codePointCount, MAX_WORD_LENGTH);
    133             if (codePointCount <= 0) {
    134                 return NOT_A_DICT_POS;
    135             }
    136         }
    137         const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord(
    138                 codePoints, codePointCount, false /* forceLowerCaseSearch */);
    139         if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) {
    140             // Return the position when when the word was found or doesn't try lower case
    141             // search.
    142             return wordPtNodePos;
    143         }
    144         // Check bigrams for lower-cased previous word if original was not found. Useful for
    145         // auto-capitalized words like "The [current_word]".
    146         return dictStructurePolicy->getTerminalPtNodePositionOfWord(
    147                 codePoints, codePointCount, true /* forceLowerCaseSearch */);
    148     }
    149 
    150     void clear() {
    151         for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
    152             mPrevWordCodePointCount[i] = 0;
    153             mIsBeginningOfSentence[i] = false;
    154         }
    155     }
    156 
    157     int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
    158     int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
    159     bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
    160 };
    161 } // namespace latinime
    162 #endif // LATINIME_PREV_WORDS_INFO_H
    163