1 /* 2 ** 3 ** Copyright 2010, The Android Open Source Project 4 ** 5 ** Licensed under the Apache License, Version 2.0 (the "License"); 6 ** you may not use this file except in compliance with the License. 7 ** You may obtain a copy of the License at 8 ** 9 ** http://www.apache.org/licenses/LICENSE-2.0 10 ** 11 ** Unless required by applicable law or agreed to in writing, software 12 ** distributed under the License is distributed on an "AS IS" BASIS, 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 ** See the License for the specific language governing permissions and 15 ** limitations under the License. 16 */ 17 18 #include <string.h> 19 20 #define LOG_TAG "LatinIME: bigram_dictionary.cpp" 21 22 #include "bigram_dictionary.h" 23 #include "dictionary.h" 24 #include "binary_format.h" 25 26 namespace latinime { 27 28 BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, 29 int maxAlternatives, const bool isLatestDictVersion, const bool hasBigram, 30 Dictionary *parentDictionary) 31 : DICT(dict + NEW_DICTIONARY_HEADER_SIZE), MAX_WORD_LENGTH(maxWordLength), 32 MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion), 33 HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) { 34 if (DEBUG_DICT) { 35 LOGI("BigramDictionary - constructor"); 36 LOGI("Has Bigram : %d", hasBigram); 37 } 38 } 39 40 BigramDictionary::~BigramDictionary() { 41 } 42 43 bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequency) { 44 word[length] = 0; 45 if (DEBUG_DICT) { 46 #ifdef FLAG_DBG 47 char s[length + 1]; 48 for (int i = 0; i <= length; i++) s[i] = word[i]; 49 LOGI("Bigram: Found word = %s, freq = %d :", s, frequency); 50 #endif 51 } 52 53 // Find the right insertion point 54 int insertAt = 0; 55 while (insertAt < mMaxBigrams) { 56 if (frequency > mBigramFreq[insertAt] || (mBigramFreq[insertAt] == frequency 57 && length < Dictionary::wideStrLen(mBigramChars + insertAt * MAX_WORD_LENGTH))) { 58 break; 59 } 60 insertAt++; 61 } 62 if (DEBUG_DICT) { 63 LOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams); 64 } 65 if (insertAt < mMaxBigrams) { 66 memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]), 67 (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]), 68 (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0])); 69 mBigramFreq[insertAt] = frequency; 70 memmove((char*) mBigramChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short), 71 (char*) mBigramChars + (insertAt ) * MAX_WORD_LENGTH * sizeof(short), 72 (mMaxBigrams - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH); 73 unsigned short *dest = mBigramChars + (insertAt ) * MAX_WORD_LENGTH; 74 while (length--) { 75 *dest++ = *word++; 76 } 77 *dest = 0; // NULL terminate 78 if (DEBUG_DICT) { 79 LOGI("Bigram: Added word at %d", insertAt); 80 } 81 return true; 82 } 83 return false; 84 } 85 86 /* Parameters : 87 * prevWord: the word before, the one for which we need to look up bigrams. 88 * prevWordLength: its length. 89 * codes: what user typed, in the same format as for UnigramDictionary::getSuggestions. 90 * codesSize: the size of the codes array. 91 * bigramChars: an array for output, at the same format as outwords for getSuggestions. 92 * bigramFreq: an array to output frequencies. 93 * maxWordLength: the maximum size of a word. 94 * maxBigrams: the maximum number of bigrams fitting in the bigramChars array. 95 * maxAlteratives: unused. 96 * This method returns the number of bigrams this word has, for backward compatibility. 97 * Note: this is not the number of bigrams output in the array, which is the number of 98 * bigrams this word has WHOSE first letter also matches the letter the user typed. 99 * TODO: this may not be a sensible thing to do. It makes sense when the bigrams are 100 * used to match the first letter of the second word, but once the user has typed more 101 * and the bigrams are used to boost unigram result scores, it makes little sense to 102 * reduce their scope to the ones that match the first letter. 103 */ 104 int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes, 105 int codesSize, unsigned short *bigramChars, int *bigramFreq, int maxWordLength, 106 int maxBigrams, int maxAlternatives) { 107 // TODO: remove unused arguments, and refrain from storing stuff in members of this class 108 // TODO: have "in" arguments before "out" ones, and make out args explicit in the name 109 mBigramFreq = bigramFreq; 110 mBigramChars = bigramChars; 111 mInputCodes = codes; 112 mMaxBigrams = maxBigrams; 113 114 const uint8_t* const root = DICT; 115 int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength); 116 117 if (NOT_VALID_WORD == pos) return 0; 118 const int flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); 119 if (0 == (flags & UnigramDictionary::FLAG_HAS_BIGRAMS)) return 0; 120 if (0 == (flags & UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS)) { 121 BinaryFormat::getCharCodeAndForwardPointer(root, &pos); 122 } else { 123 pos = BinaryFormat::skipOtherCharacters(root, pos); 124 } 125 pos = BinaryFormat::skipChildrenPosition(flags, pos); 126 pos = BinaryFormat::skipFrequency(flags, pos); 127 int bigramFlags; 128 int bigramCount = 0; 129 do { 130 bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); 131 uint16_t bigramBuffer[MAX_WORD_LENGTH]; 132 const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, 133 &pos); 134 const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH, 135 bigramBuffer); 136 137 if (checkFirstCharacter(bigramBuffer)) { 138 const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; 139 addWordBigram(bigramBuffer, length, frequency); 140 } 141 ++bigramCount; 142 } while (0 != (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags)); 143 return bigramCount; 144 } 145 146 bool BigramDictionary::checkFirstCharacter(unsigned short *word) { 147 // Checks whether this word starts with same character or neighboring characters of 148 // what user typed. 149 150 int *inputCodes = mInputCodes; 151 int maxAlt = MAX_ALTERNATIVES; 152 while (maxAlt > 0) { 153 if ((unsigned int) *inputCodes == (unsigned int) *word) { 154 return true; 155 } 156 inputCodes++; 157 maxAlt--; 158 } 159 return false; 160 } 161 162 // TODO: Move functions related to bigram to here 163 } // namespace latinime 164