1 /* 2 * Copyright (C) 2010, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <cstring> 18 19 #define LOG_TAG "LatinIME: bigram_dictionary.cpp" 20 21 #include "bigram_dictionary.h" 22 23 #include "defines.h" 24 #include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" 25 #include "suggest/core/dictionary/dictionary.h" 26 #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" 27 #include "utils/char_utils.h" 28 29 namespace latinime { 30 31 BigramDictionary::BigramDictionary( 32 const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy) 33 : mDictionaryStructurePolicy(dictionaryStructurePolicy) { 34 if (DEBUG_DICT) { 35 AKLOGI("BigramDictionary - constructor"); 36 } 37 } 38 39 BigramDictionary::~BigramDictionary() { 40 } 41 42 void BigramDictionary::addWordBigram(int *word, int length, int probability, int *bigramProbability, 43 int *bigramCodePoints, int *outputTypes) const { 44 word[length] = 0; 45 if (DEBUG_DICT_FULL) { 46 #ifdef FLAG_DBG 47 char s[length + 1]; 48 for (int i = 0; i <= length; i++) s[i] = static_cast<char>(word[i]); 49 AKLOGI("Bigram: Found word = %s, freq = %d :", s, probability); 50 #endif 51 } 52 53 // Find the right insertion point 54 int insertAt = 0; 55 while (insertAt < MAX_RESULTS) { 56 if (probability > bigramProbability[insertAt] || (bigramProbability[insertAt] == probability 57 && length < CharUtils::getCodePointCount(MAX_WORD_LENGTH, 58 bigramCodePoints + insertAt * MAX_WORD_LENGTH))) { 59 break; 60 } 61 insertAt++; 62 } 63 if (DEBUG_DICT_FULL) { 64 AKLOGI("Bigram: InsertAt -> %d MAX_RESULTS: %d", insertAt, MAX_RESULTS); 65 } 66 if (insertAt >= MAX_RESULTS) { 67 return; 68 } 69 memmove(bigramProbability + (insertAt + 1), 70 bigramProbability + insertAt, 71 (MAX_RESULTS - insertAt - 1) * sizeof(bigramProbability[0])); 72 bigramProbability[insertAt] = probability; 73 outputTypes[insertAt] = Dictionary::KIND_PREDICTION; 74 memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH, 75 bigramCodePoints + insertAt * MAX_WORD_LENGTH, 76 (MAX_RESULTS - insertAt - 1) * sizeof(bigramCodePoints[0]) * MAX_WORD_LENGTH); 77 int *dest = bigramCodePoints + insertAt * MAX_WORD_LENGTH; 78 while (length--) { 79 *dest++ = *word++; 80 } 81 *dest = 0; // NULL terminate 82 if (DEBUG_DICT_FULL) { 83 AKLOGI("Bigram: Added word at %d", insertAt); 84 } 85 } 86 87 /* Parameters : 88 * prevWord: the word before, the one for which we need to look up bigrams. 89 * prevWordLength: its length. 90 * outBigramCodePoints: an array for output, at the same format as outwords for getSuggestions. 91 * outBigramProbability: an array to output frequencies. 92 * outputTypes: an array to output types. 93 * This method returns the number of bigrams this word has, for backward compatibility. 94 */ 95 int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLength, 96 int *const outBigramCodePoints, int *const outBigramProbability, 97 int *const outputTypes) const { 98 // TODO: remove unused arguments, and refrain from storing stuff in members of this class 99 // TODO: have "in" arguments before "out" ones, and make out args explicit in the name 100 101 int pos = getBigramListPositionForWord(prevWord, prevWordLength, 102 false /* forceLowerCaseSearch */); 103 // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams 104 if (NOT_A_DICT_POS == pos) { 105 // If no bigrams for this exact word, search again in lower case. 106 pos = getBigramListPositionForWord(prevWord, prevWordLength, 107 true /* forceLowerCaseSearch */); 108 } 109 // If still no bigrams, we really don't have them! 110 if (NOT_A_DICT_POS == pos) return 0; 111 112 int bigramCount = 0; 113 int unigramProbability = 0; 114 int bigramBuffer[MAX_WORD_LENGTH]; 115 BinaryDictionaryBigramsIterator bigramsIt( 116 mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos); 117 while (bigramsIt.hasNext()) { 118 bigramsIt.next(); 119 if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) { 120 continue; 121 } 122 const int codePointCount = mDictionaryStructurePolicy-> 123 getCodePointsAndProbabilityAndReturnCodePointCount(bigramsIt.getBigramPos(), 124 MAX_WORD_LENGTH, bigramBuffer, &unigramProbability); 125 if (codePointCount <= 0) { 126 continue; 127 } 128 // Due to space constraints, the probability for bigrams is approximate - the lower the 129 // unigram probability, the worse the precision. The theoritical maximum error in 130 // resulting probability is 8 - although in the practice it's never bigger than 3 or 4 131 // in very bad cases. This means that sometimes, we'll see some bigrams interverted 132 // here, but it can't get too bad. 133 const int probability = mDictionaryStructurePolicy->getProbability( 134 unigramProbability, bigramsIt.getProbability()); 135 addWordBigram(bigramBuffer, codePointCount, probability, outBigramProbability, 136 outBigramCodePoints, outputTypes); 137 ++bigramCount; 138 } 139 return min(bigramCount, MAX_RESULTS); 140 } 141 142 // Returns a pointer to the start of the bigram list. 143 // If the word is not found or has no bigrams, this function returns NOT_A_DICT_POS. 144 int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength, 145 const bool forceLowerCaseSearch) const { 146 if (0 >= prevWordLength) return NOT_A_DICT_POS; 147 int pos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(prevWord, prevWordLength, 148 forceLowerCaseSearch); 149 if (NOT_A_DICT_POS == pos) return NOT_A_DICT_POS; 150 return mDictionaryStructurePolicy->getBigramsPositionOfPtNode(pos); 151 } 152 153 int BigramDictionary::getBigramProbability(const int *word0, int length0, const int *word1, 154 int length1) const { 155 int pos = getBigramListPositionForWord(word0, length0, false /* forceLowerCaseSearch */); 156 // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams 157 if (NOT_A_DICT_POS == pos) return NOT_A_PROBABILITY; 158 int nextWordPos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(word1, length1, 159 false /* forceLowerCaseSearch */); 160 if (NOT_A_DICT_POS == nextWordPos) return NOT_A_PROBABILITY; 161 162 BinaryDictionaryBigramsIterator bigramsIt( 163 mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos); 164 while (bigramsIt.hasNext()) { 165 bigramsIt.next(); 166 if (bigramsIt.getBigramPos() == nextWordPos) { 167 return mDictionaryStructurePolicy->getProbability( 168 mDictionaryStructurePolicy->getUnigramProbabilityOfPtNode(nextWordPos), 169 bigramsIt.getProbability()); 170 } 171 } 172 return NOT_A_PROBABILITY; 173 } 174 175 // TODO: Move functions related to bigram to here 176 } // namespace latinime 177