Home | History | Annotate | Download | only in pt_common
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
     18 
     19 #include "defines.h"
     20 #include "dictionary/interface/dictionary_bigrams_structure_policy.h"
     21 #include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
     22 #include "dictionary/utils/byte_array_utils.h"
     23 
     24 namespace latinime {
     25 
     26 typedef PatriciaTrieReadingUtils PtReadingUtils;
     27 
     28 const PtReadingUtils::NodeFlags PtReadingUtils::MASK_CHILDREN_POSITION_TYPE = 0xC0;
     29 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_NOPOSITION = 0x00;
     30 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_ONEBYTE = 0x40;
     31 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_TWOBYTES = 0x80;
     32 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_THREEBYTES = 0xC0;
     33 
     34 // Flag for single/multiple char group
     35 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20;
     36 // Flag for terminal PtNodes
     37 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10;
     38 // Flag for shortcut targets presence
     39 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08;
     40 // Flag for bigram presence
     41 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04;
     42 // Flag for non-words (typically, shortcut only entries)
     43 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02;
     44 // Flag for possibly offensive words
     45 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_POSSIBLY_OFFENSIVE = 0x01;
     46 
     47 /* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition(
     48         const uint8_t *const buffer, int *const pos) {
     49     const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
     50     if (firstByte < 0x80) {
     51         return firstByte;
     52     } else {
     53         return ((firstByte & 0x7F) << 8) ^ ByteArrayUtils::readUint8AndAdvancePosition(
     54                 buffer, pos);
     55     }
     56 }
     57 
     58 /* static */ PtReadingUtils::NodeFlags PtReadingUtils::getFlagsAndAdvancePosition(
     59         const uint8_t *const buffer, int *const pos) {
     60     return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
     61 }
     62 
     63 /* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer,
     64         const int *const codePointTable, int *const pos) {
     65     return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos);
     66 }
     67 
     68 // Returns the number of read characters.
     69 /* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer,
     70         const NodeFlags flags, const int maxLength, const int *const codePointTable,
     71         int *const outBuffer, int *const pos) {
     72     int length = 0;
     73     if (hasMultipleChars(flags)) {
     74         length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable,
     75                 outBuffer, pos);
     76     } else {
     77         const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos);
     78         if (codePoint == NOT_A_CODE_POINT) {
     79             // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is
     80             // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR
     81             // when the PtNode has a single code point.
     82             length = 0;
     83             AKLOGE("codePoint is NOT_A_CODE_POINT. pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x",
     84                     *pos - 1, codePoint, buffer[*pos - 1]);
     85             ASSERT(false);
     86         } else if (maxLength > 0) {
     87             outBuffer[0] = codePoint;
     88             length = 1;
     89         }
     90     }
     91     return length;
     92 }
     93 
     94 // Returns the number of skipped characters.
     95 /* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
     96         const int maxLength, const int *const codePointTable, int *const pos) {
     97     if (hasMultipleChars(flags)) {
     98         return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos);
     99     } else {
    100         if (maxLength > 0) {
    101             getCodePointAndAdvancePosition(buffer, codePointTable, pos);
    102             return 1;
    103         } else {
    104             return 0;
    105         }
    106     }
    107 }
    108 
    109 /* static */ int PtReadingUtils::readProbabilityAndAdvancePosition(const uint8_t *const buffer,
    110         int *const pos) {
    111     return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
    112 }
    113 
    114 /* static */ int PtReadingUtils::readChildrenPositionAndAdvancePosition(
    115         const uint8_t *const buffer, const NodeFlags flags, int *const pos) {
    116     const int base = *pos;
    117     int offset = 0;
    118     switch (MASK_CHILDREN_POSITION_TYPE & flags) {
    119         case FLAG_CHILDREN_POSITION_TYPE_ONEBYTE:
    120             offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
    121             break;
    122         case FLAG_CHILDREN_POSITION_TYPE_TWOBYTES:
    123             offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos);
    124             break;
    125         case FLAG_CHILDREN_POSITION_TYPE_THREEBYTES:
    126             offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos);
    127             break;
    128         default:
    129             // If we come here, it means we asked for the children of a word with
    130             // no children.
    131             return NOT_A_DICT_POS;
    132     }
    133     return base + offset;
    134 }
    135 
    136 /* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
    137         const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
    138         const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable,
    139         NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
    140         int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
    141         int *const outBigramPos, int *const outSiblingPos) {
    142     int readingPos = ptNodePos;
    143     const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos);
    144     *outFlags = flags;
    145     *outCodePointCount = getCharsAndAdvancePosition(
    146             dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos);
    147     *outProbability = isTerminal(flags) ?
    148             readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY;
    149     *outChildrenPos = hasChildrenInFlags(flags) ?
    150             readChildrenPositionAndAdvancePosition(dictBuf, flags, &readingPos) : NOT_A_DICT_POS;
    151     *outShortcutPos = NOT_A_DICT_POS;
    152     if (hasShortcutTargets(flags)) {
    153         *outShortcutPos = readingPos;
    154         shortcutPolicy->skipAllShortcuts(&readingPos);
    155     }
    156     *outBigramPos = NOT_A_DICT_POS;
    157     if (hasBigrams(flags)) {
    158         *outBigramPos = readingPos;
    159         bigramPolicy->skipAllBigrams(&readingPos);
    160     }
    161     *outSiblingPos = readingPos;
    162 }
    163 
    164 } // namespace latinime
    165