1 /* 2 * Copyright (C) 2014 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LATINIME_PREV_WORDS_INFO_H 18 #define LATINIME_PREV_WORDS_INFO_H 19 20 #include "defines.h" 21 #include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" 22 #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" 23 #include "utils/char_utils.h" 24 25 namespace latinime { 26 27 // TODO: Support n-gram. 28 class PrevWordsInfo { 29 public: 30 // No prev word information. 31 PrevWordsInfo() { 32 clear(); 33 } 34 35 PrevWordsInfo(PrevWordsInfo &&prevWordsInfo) { 36 for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { 37 mPrevWordCodePointCount[i] = prevWordsInfo.mPrevWordCodePointCount[i]; 38 memmove(mPrevWordCodePoints[i], prevWordsInfo.mPrevWordCodePoints[i], 39 sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]); 40 mIsBeginningOfSentence[i] = prevWordsInfo.mIsBeginningOfSentence[i]; 41 } 42 } 43 44 // Construct from previous words. 45 PrevWordsInfo(const int prevWordCodePoints[][MAX_WORD_LENGTH], 46 const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence, 47 const size_t prevWordCount) { 48 clear(); 49 for (size_t i = 0; i < std::min(NELEMS(mPrevWordCodePoints), prevWordCount); ++i) { 50 if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) { 51 continue; 52 } 53 memmove(mPrevWordCodePoints[i], prevWordCodePoints[i], 54 sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]); 55 mPrevWordCodePointCount[i] = prevWordCodePointCount[i]; 56 mIsBeginningOfSentence[i] = isBeginningOfSentence[i]; 57 } 58 } 59 60 // Construct from a previous word. 61 PrevWordsInfo(const int *const prevWordCodePoints, const int prevWordCodePointCount, 62 const bool isBeginningOfSentence) { 63 clear(); 64 if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) { 65 return; 66 } 67 memmove(mPrevWordCodePoints[0], prevWordCodePoints, 68 sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount); 69 mPrevWordCodePointCount[0] = prevWordCodePointCount; 70 mIsBeginningOfSentence[0] = isBeginningOfSentence; 71 } 72 73 bool isValid() const { 74 if (mPrevWordCodePointCount[0] > 0) { 75 return true; 76 } 77 if (mIsBeginningOfSentence[0]) { 78 return true; 79 } 80 return false; 81 } 82 83 void getPrevWordsTerminalPtNodePos( 84 const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, 85 int *const outPrevWordsTerminalPtNodePos, const bool tryLowerCaseSearch) const { 86 for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { 87 outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy, 88 mPrevWordCodePoints[i], mPrevWordCodePointCount[i], 89 mIsBeginningOfSentence[i], tryLowerCaseSearch); 90 } 91 } 92 93 // n is 1-indexed. 94 const int *getNthPrevWordCodePoints(const int n) const { 95 if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { 96 return nullptr; 97 } 98 return mPrevWordCodePoints[n - 1]; 99 } 100 101 // n is 1-indexed. 102 int getNthPrevWordCodePointCount(const int n) const { 103 if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { 104 return 0; 105 } 106 return mPrevWordCodePointCount[n - 1]; 107 } 108 109 // n is 1-indexed. 110 bool isNthPrevWordBeginningOfSentence(const int n) const { 111 if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { 112 return false; 113 } 114 return mIsBeginningOfSentence[n - 1]; 115 } 116 117 private: 118 DISALLOW_COPY_AND_ASSIGN(PrevWordsInfo); 119 120 static int getTerminalPtNodePosOfWord( 121 const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, 122 const int *const wordCodePoints, const int wordCodePointCount, 123 const bool isBeginningOfSentence, const bool tryLowerCaseSearch) { 124 if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) { 125 return NOT_A_DICT_POS; 126 } 127 int codePoints[MAX_WORD_LENGTH]; 128 int codePointCount = wordCodePointCount; 129 memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount); 130 if (isBeginningOfSentence) { 131 codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, 132 codePointCount, MAX_WORD_LENGTH); 133 if (codePointCount <= 0) { 134 return NOT_A_DICT_POS; 135 } 136 } 137 const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord( 138 codePoints, codePointCount, false /* forceLowerCaseSearch */); 139 if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) { 140 // Return the position when when the word was found or doesn't try lower case 141 // search. 142 return wordPtNodePos; 143 } 144 // Check bigrams for lower-cased previous word if original was not found. Useful for 145 // auto-capitalized words like "The [current_word]". 146 return dictStructurePolicy->getTerminalPtNodePositionOfWord( 147 codePoints, codePointCount, true /* forceLowerCaseSearch */); 148 } 149 150 void clear() { 151 for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { 152 mPrevWordCodePointCount[i] = 0; 153 mIsBeginningOfSentence[i] = false; 154 } 155 } 156 157 int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH]; 158 int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 159 bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 160 }; 161 } // namespace latinime 162 #endif // LATINIME_PREV_WORDS_INFO_H 163