Home | History | Annotate | Download | only in property
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "dictionary/property/ngram_context.h"
     18 
     19 #include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
     20 #include "utils/char_utils.h"
     21 
     22 namespace latinime {
     23 
     24 NgramContext::NgramContext() : mPrevWordCount(0) {}
     25 
     26 NgramContext::NgramContext(const NgramContext &ngramContext)
     27         : mPrevWordCount(ngramContext.mPrevWordCount) {
     28     for (size_t i = 0; i < mPrevWordCount; ++i) {
     29         mPrevWordCodePointCount[i] = ngramContext.mPrevWordCodePointCount[i];
     30         memmove(mPrevWordCodePoints[i], ngramContext.mPrevWordCodePoints[i],
     31                 sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]);
     32         mIsBeginningOfSentence[i] = ngramContext.mIsBeginningOfSentence[i];
     33     }
     34 }
     35 
     36 NgramContext::NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],
     37         const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
     38         const size_t prevWordCount)
     39         : mPrevWordCount(std::min(NELEMS(mPrevWordCodePoints), prevWordCount)) {
     40     clear();
     41     for (size_t i = 0; i < mPrevWordCount; ++i) {
     42         if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) {
     43             continue;
     44         }
     45         memmove(mPrevWordCodePoints[i], prevWordCodePoints[i],
     46                 sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]);
     47         mPrevWordCodePointCount[i] = prevWordCodePointCount[i];
     48         mIsBeginningOfSentence[i] = isBeginningOfSentence[i];
     49     }
     50 }
     51 
     52 NgramContext::NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount,
     53         const bool isBeginningOfSentence) : mPrevWordCount(1) {
     54     clear();
     55     if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) {
     56         return;
     57     }
     58     memmove(mPrevWordCodePoints[0], prevWordCodePoints,
     59             sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount);
     60     mPrevWordCodePointCount[0] = prevWordCodePointCount;
     61     mIsBeginningOfSentence[0] = isBeginningOfSentence;
     62 }
     63 
     64 bool NgramContext::isValid() const {
     65     if (mPrevWordCodePointCount[0] > 0) {
     66         return true;
     67     }
     68     if (mIsBeginningOfSentence[0]) {
     69         return true;
     70     }
     71     return false;
     72 }
     73 
     74 const CodePointArrayView NgramContext::getNthPrevWordCodePoints(const size_t n) const {
     75     if (n <= 0 || n > mPrevWordCount) {
     76         return CodePointArrayView();
     77     }
     78     return CodePointArrayView(mPrevWordCodePoints[n - 1], mPrevWordCodePointCount[n - 1]);
     79 }
     80 
     81 bool NgramContext::isNthPrevWordBeginningOfSentence(const size_t n) const {
     82     if (n <= 0 || n > mPrevWordCount) {
     83         return false;
     84     }
     85     return mIsBeginningOfSentence[n - 1];
     86 }
     87 
     88 /* static */ int NgramContext::getWordId(
     89         const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
     90         const int *const wordCodePoints, const int wordCodePointCount,
     91         const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
     92     if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) {
     93         return NOT_A_WORD_ID;
     94     }
     95     int codePoints[MAX_WORD_LENGTH];
     96     int codePointCount = wordCodePointCount;
     97     memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
     98     if (isBeginningOfSentence) {
     99         codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, codePointCount,
    100                 MAX_WORD_LENGTH);
    101         if (codePointCount <= 0) {
    102             return NOT_A_WORD_ID;
    103         }
    104     }
    105     const CodePointArrayView codePointArrayView(codePoints, codePointCount);
    106     const int wordId = dictStructurePolicy->getWordId(codePointArrayView,
    107             false /* forceLowerCaseSearch */);
    108     if (wordId != NOT_A_WORD_ID || !tryLowerCaseSearch) {
    109         // Return the id when when the word was found or doesn't try lower case search.
    110         return wordId;
    111     }
    112     // Check bigrams for lower-cased previous word if original was not found. Useful for
    113     // auto-capitalized words like "The [current_word]".
    114     return dictStructurePolicy->getWordId(codePointArrayView, true /* forceLowerCaseSearch */);
    115 }
    116 
    117 void NgramContext::clear() {
    118     for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
    119         mPrevWordCodePointCount[i] = 0;
    120         mIsBeginningOfSentence[i] = false;
    121     }
    122 }
    123 } // namespace latinime
    124