1 /* 2 * Copyright (C) 2010, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <cstring> 18 19 #define LOG_TAG "LatinIME: bigram_dictionary.cpp" 20 21 #include "bigram_dictionary.h" 22 #include "binary_format.h" 23 #include "bloom_filter.h" 24 #include "char_utils.h" 25 #include "defines.h" 26 #include "dictionary.h" 27 28 namespace latinime { 29 30 BigramDictionary::BigramDictionary(const uint8_t *const streamStart) : DICT_ROOT(streamStart) { 31 if (DEBUG_DICT) { 32 AKLOGI("BigramDictionary - constructor"); 33 } 34 } 35 36 BigramDictionary::~BigramDictionary() { 37 } 38 39 void BigramDictionary::addWordBigram(int *word, int length, int probability, int *bigramProbability, 40 int *bigramCodePoints, int *outputTypes) const { 41 word[length] = 0; 42 if (DEBUG_DICT_FULL) { 43 #ifdef FLAG_DBG 44 char s[length + 1]; 45 for (int i = 0; i <= length; i++) s[i] = static_cast<char>(word[i]); 46 AKLOGI("Bigram: Found word = %s, freq = %d :", s, probability); 47 #endif 48 } 49 50 // Find the right insertion point 51 int insertAt = 0; 52 while (insertAt < MAX_RESULTS) { 53 if (probability > bigramProbability[insertAt] || (bigramProbability[insertAt] == probability 54 && length < getCodePointCount(MAX_WORD_LENGTH, 55 bigramCodePoints + insertAt * MAX_WORD_LENGTH))) { 56 break; 57 } 58 insertAt++; 59 } 60 if (DEBUG_DICT_FULL) { 61 AKLOGI("Bigram: InsertAt -> %d MAX_RESULTS: %d", insertAt, MAX_RESULTS); 62 } 63 if (insertAt >= MAX_RESULTS) { 64 return; 65 } 66 memmove(bigramProbability + (insertAt + 1), 67 bigramProbability + insertAt, 68 (MAX_RESULTS - insertAt - 1) * sizeof(bigramProbability[0])); 69 bigramProbability[insertAt] = probability; 70 outputTypes[insertAt] = Dictionary::KIND_PREDICTION; 71 memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH, 72 bigramCodePoints + insertAt * MAX_WORD_LENGTH, 73 (MAX_RESULTS - insertAt - 1) * sizeof(bigramCodePoints[0]) * MAX_WORD_LENGTH); 74 int *dest = bigramCodePoints + insertAt * MAX_WORD_LENGTH; 75 while (length--) { 76 *dest++ = *word++; 77 } 78 *dest = 0; // NULL terminate 79 if (DEBUG_DICT_FULL) { 80 AKLOGI("Bigram: Added word at %d", insertAt); 81 } 82 } 83 84 /* Parameters : 85 * prevWord: the word before, the one for which we need to look up bigrams. 86 * prevWordLength: its length. 87 * inputCodePoints: what user typed, in the same format as for UnigramDictionary::getSuggestions. 88 * inputSize: the size of the codes array. 89 * bigramCodePoints: an array for output, at the same format as outwords for getSuggestions. 90 * bigramProbability: an array to output frequencies. 91 * outputTypes: an array to output types. 92 * This method returns the number of bigrams this word has, for backward compatibility. 93 * Note: this is not the number of bigrams output in the array, which is the number of 94 * bigrams this word has WHOSE first letter also matches the letter the user typed. 95 * TODO: this may not be a sensible thing to do. It makes sense when the bigrams are 96 * used to match the first letter of the second word, but once the user has typed more 97 * and the bigrams are used to boost unigram result scores, it makes little sense to 98 * reduce their scope to the ones that match the first letter. 99 */ 100 int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodePoints, 101 int inputSize, int *bigramCodePoints, int *bigramProbability, int *outputTypes) const { 102 // TODO: remove unused arguments, and refrain from storing stuff in members of this class 103 // TODO: have "in" arguments before "out" ones, and make out args explicit in the name 104 105 const uint8_t *const root = DICT_ROOT; 106 int pos = getBigramListPositionForWord(prevWord, prevWordLength, 107 false /* forceLowerCaseSearch */); 108 // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams 109 if (0 == pos) { 110 // If no bigrams for this exact word, search again in lower case. 111 pos = getBigramListPositionForWord(prevWord, prevWordLength, 112 true /* forceLowerCaseSearch */); 113 } 114 // If still no bigrams, we really don't have them! 115 if (0 == pos) return 0; 116 uint8_t bigramFlags; 117 int bigramCount = 0; 118 do { 119 bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); 120 int bigramBuffer[MAX_WORD_LENGTH]; 121 int unigramProbability = 0; 122 const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, 123 &pos); 124 const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH, 125 bigramBuffer, &unigramProbability); 126 127 // inputSize == 0 means we are trying to find bigram predictions. 128 if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) { 129 const int bigramProbabilityTemp = 130 BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags; 131 // Due to space constraints, the probability for bigrams is approximate - the lower the 132 // unigram probability, the worse the precision. The theoritical maximum error in 133 // resulting probability is 8 - although in the practice it's never bigger than 3 or 4 134 // in very bad cases. This means that sometimes, we'll see some bigrams interverted 135 // here, but it can't get too bad. 136 const int probability = BinaryFormat::computeProbabilityForBigram( 137 unigramProbability, bigramProbabilityTemp); 138 addWordBigram(bigramBuffer, length, probability, bigramProbability, bigramCodePoints, 139 outputTypes); 140 ++bigramCount; 141 } 142 } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); 143 return min(bigramCount, MAX_RESULTS); 144 } 145 146 // Returns a pointer to the start of the bigram list. 147 // If the word is not found or has no bigrams, this function returns 0. 148 int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength, 149 const bool forceLowerCaseSearch) const { 150 if (0 >= prevWordLength) return 0; 151 const uint8_t *const root = DICT_ROOT; 152 int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength, 153 forceLowerCaseSearch); 154 155 if (NOT_VALID_WORD == pos) return 0; 156 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); 157 if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) return 0; 158 if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) { 159 BinaryFormat::getCodePointAndForwardPointer(root, &pos); 160 } else { 161 pos = BinaryFormat::skipOtherCharacters(root, pos); 162 } 163 pos = BinaryFormat::skipProbability(flags, pos); 164 pos = BinaryFormat::skipChildrenPosition(flags, pos); 165 pos = BinaryFormat::skipShortcuts(root, flags, pos); 166 return pos; 167 } 168 169 void BigramDictionary::fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, 170 const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const { 171 memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE); 172 const uint8_t *const root = DICT_ROOT; 173 int pos = getBigramListPositionForWord(prevWord, prevWordLength, 174 false /* forceLowerCaseSearch */); 175 if (0 == pos) { 176 // If no bigrams for this exact string, search again in lower case. 177 pos = getBigramListPositionForWord(prevWord, prevWordLength, 178 true /* forceLowerCaseSearch */); 179 } 180 if (0 == pos) return; 181 182 uint8_t bigramFlags; 183 do { 184 bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); 185 const int probability = BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags; 186 const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, 187 &pos); 188 (*map)[bigramPos] = probability; 189 setInFilter(filter, bigramPos); 190 } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); 191 } 192 193 bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) const { 194 // Checks whether this word starts with same character or neighboring characters of 195 // what user typed. 196 197 int maxAlt = MAX_ALTERNATIVES; 198 const int firstBaseLowerCodePoint = toBaseLowerCase(*word); 199 while (maxAlt > 0) { 200 if (toBaseLowerCase(*inputCodePoints) == firstBaseLowerCodePoint) { 201 return true; 202 } 203 inputCodePoints++; 204 maxAlt--; 205 } 206 return false; 207 } 208 209 bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2, 210 int length2) const { 211 const uint8_t *const root = DICT_ROOT; 212 int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */); 213 // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams 214 if (0 == pos) return false; 215 int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2, 216 false /* forceLowerCaseSearch */); 217 if (NOT_VALID_WORD == nextWordPos) return false; 218 uint8_t bigramFlags; 219 do { 220 bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); 221 const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, 222 &pos); 223 if (bigramPos == nextWordPos) { 224 return true; 225 } 226 } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); 227 return false; 228 } 229 230 // TODO: Move functions related to bigram to here 231 } // namespace latinime 232