Home | History | Annotate | Download | only in dictionary
      1 /*
      2  * Copyright (C) 2010, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <cstring>
     18 
     19 #define LOG_TAG "LatinIME: bigram_dictionary.cpp"
     20 
     21 #include "bigram_dictionary.h"
     22 
     23 #include "defines.h"
     24 #include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
     25 #include "suggest/core/dictionary/dictionary.h"
     26 #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
     27 #include "utils/char_utils.h"
     28 
     29 namespace latinime {
     30 
     31 BigramDictionary::BigramDictionary(
     32         const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy)
     33         : mDictionaryStructurePolicy(dictionaryStructurePolicy) {
     34     if (DEBUG_DICT) {
     35         AKLOGI("BigramDictionary - constructor");
     36     }
     37 }
     38 
     39 BigramDictionary::~BigramDictionary() {
     40 }
     41 
     42 void BigramDictionary::addWordBigram(int *word, int length, int probability, int *bigramProbability,
     43         int *bigramCodePoints, int *outputTypes) const {
     44     word[length] = 0;
     45     if (DEBUG_DICT_FULL) {
     46 #ifdef FLAG_DBG
     47         char s[length + 1];
     48         for (int i = 0; i <= length; i++) s[i] = static_cast<char>(word[i]);
     49         AKLOGI("Bigram: Found word = %s, freq = %d :", s, probability);
     50 #endif
     51     }
     52 
     53     // Find the right insertion point
     54     int insertAt = 0;
     55     while (insertAt < MAX_RESULTS) {
     56         if (probability > bigramProbability[insertAt] || (bigramProbability[insertAt] == probability
     57                 && length < CharUtils::getCodePointCount(MAX_WORD_LENGTH,
     58                         bigramCodePoints + insertAt * MAX_WORD_LENGTH))) {
     59             break;
     60         }
     61         insertAt++;
     62     }
     63     if (DEBUG_DICT_FULL) {
     64         AKLOGI("Bigram: InsertAt -> %d MAX_RESULTS: %d", insertAt, MAX_RESULTS);
     65     }
     66     if (insertAt >= MAX_RESULTS) {
     67         return;
     68     }
     69     memmove(bigramProbability + (insertAt + 1),
     70             bigramProbability + insertAt,
     71             (MAX_RESULTS - insertAt - 1) * sizeof(bigramProbability[0]));
     72     bigramProbability[insertAt] = probability;
     73     outputTypes[insertAt] = Dictionary::KIND_PREDICTION;
     74     memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH,
     75             bigramCodePoints + insertAt * MAX_WORD_LENGTH,
     76             (MAX_RESULTS - insertAt - 1) * sizeof(bigramCodePoints[0]) * MAX_WORD_LENGTH);
     77     int *dest = bigramCodePoints + insertAt * MAX_WORD_LENGTH;
     78     while (length--) {
     79         *dest++ = *word++;
     80     }
     81     *dest = 0; // NULL terminate
     82     if (DEBUG_DICT_FULL) {
     83         AKLOGI("Bigram: Added word at %d", insertAt);
     84     }
     85 }
     86 
     87 /* Parameters :
     88  * prevWord: the word before, the one for which we need to look up bigrams.
     89  * prevWordLength: its length.
     90  * outBigramCodePoints: an array for output, at the same format as outwords for getSuggestions.
     91  * outBigramProbability: an array to output frequencies.
     92  * outputTypes: an array to output types.
     93  * This method returns the number of bigrams this word has, for backward compatibility.
     94  */
     95 int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLength,
     96         int *const outBigramCodePoints, int *const outBigramProbability,
     97         int *const outputTypes) const {
     98     // TODO: remove unused arguments, and refrain from storing stuff in members of this class
     99     // TODO: have "in" arguments before "out" ones, and make out args explicit in the name
    100 
    101     int pos = getBigramListPositionForWord(prevWord, prevWordLength,
    102             false /* forceLowerCaseSearch */);
    103     // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
    104     if (NOT_A_DICT_POS == pos) {
    105         // If no bigrams for this exact word, search again in lower case.
    106         pos = getBigramListPositionForWord(prevWord, prevWordLength,
    107                 true /* forceLowerCaseSearch */);
    108     }
    109     // If still no bigrams, we really don't have them!
    110     if (NOT_A_DICT_POS == pos) return 0;
    111 
    112     int bigramCount = 0;
    113     int unigramProbability = 0;
    114     int bigramBuffer[MAX_WORD_LENGTH];
    115     BinaryDictionaryBigramsIterator bigramsIt(
    116             mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos);
    117     while (bigramsIt.hasNext()) {
    118         bigramsIt.next();
    119         if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) {
    120             continue;
    121         }
    122         const int codePointCount = mDictionaryStructurePolicy->
    123                 getCodePointsAndProbabilityAndReturnCodePointCount(bigramsIt.getBigramPos(),
    124                         MAX_WORD_LENGTH, bigramBuffer, &unigramProbability);
    125         if (codePointCount <= 0) {
    126             continue;
    127         }
    128         // Due to space constraints, the probability for bigrams is approximate - the lower the
    129         // unigram probability, the worse the precision. The theoritical maximum error in
    130         // resulting probability is 8 - although in the practice it's never bigger than 3 or 4
    131         // in very bad cases. This means that sometimes, we'll see some bigrams interverted
    132         // here, but it can't get too bad.
    133         const int probability = mDictionaryStructurePolicy->getProbability(
    134                 unigramProbability, bigramsIt.getProbability());
    135         addWordBigram(bigramBuffer, codePointCount, probability, outBigramProbability,
    136                 outBigramCodePoints, outputTypes);
    137         ++bigramCount;
    138     }
    139     return min(bigramCount, MAX_RESULTS);
    140 }
    141 
    142 // Returns a pointer to the start of the bigram list.
    143 // If the word is not found or has no bigrams, this function returns NOT_A_DICT_POS.
    144 int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
    145         const bool forceLowerCaseSearch) const {
    146     if (0 >= prevWordLength) return NOT_A_DICT_POS;
    147     int pos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(prevWord, prevWordLength,
    148             forceLowerCaseSearch);
    149     if (NOT_A_DICT_POS == pos) return NOT_A_DICT_POS;
    150     return mDictionaryStructurePolicy->getBigramsPositionOfPtNode(pos);
    151 }
    152 
    153 int BigramDictionary::getBigramProbability(const int *word0, int length0, const int *word1,
    154         int length1) const {
    155     int pos = getBigramListPositionForWord(word0, length0, false /* forceLowerCaseSearch */);
    156     // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
    157     if (NOT_A_DICT_POS == pos) return NOT_A_PROBABILITY;
    158     int nextWordPos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(word1, length1,
    159             false /* forceLowerCaseSearch */);
    160     if (NOT_A_DICT_POS == nextWordPos) return NOT_A_PROBABILITY;
    161 
    162     BinaryDictionaryBigramsIterator bigramsIt(
    163             mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos);
    164     while (bigramsIt.hasNext()) {
    165         bigramsIt.next();
    166         if (bigramsIt.getBigramPos() == nextWordPos) {
    167             return mDictionaryStructurePolicy->getProbability(
    168                     mDictionaryStructurePolicy->getUnigramProbabilityOfPtNode(nextWordPos),
    169                     bigramsIt.getProbability());
    170         }
    171     }
    172     return NOT_A_PROBABILITY;
    173 }
    174 
    175 // TODO: Move functions related to bigram to here
    176 } // namespace latinime
    177