Home | History | Annotate | Download | only in pt_common
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
     18 
     19 #include "defines.h"
     20 #include "suggest/core/policy/dictionary_bigrams_structure_policy.h"
     21 #include "suggest/core/policy/dictionary_shortcuts_structure_policy.h"
     22 #include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
     23 
     24 namespace latinime {
     25 
// Short alias used throughout this file for the class whose static members are defined below.
typedef PatriciaTrieReadingUtils PtReadingUtils;

// The two most significant bits of a PtNode's flags byte encode how many bytes are used to
// store the children position (see readChildrenPositionAndAdvancePosition()).
// Mask that isolates those two bits.
const PtReadingUtils::NodeFlags PtReadingUtils::MASK_CHILDREN_POSITION_TYPE = 0xC0;
// No children position is stored.
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_NOPOSITION = 0x00;
// The children position is stored in one byte.
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_ONEBYTE = 0x40;
// The children position is stored in two bytes.
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_TWOBYTES = 0x80;
// The children position is stored in three bytes.
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_THREEBYTES = 0xC0;

// The remaining six bits are independent single-bit flags.
// Flag for single/multiple char group
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20;
// Flag for terminal PtNodes
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10;
// Flag for shortcut targets presence
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08;
// Flag for bigram presence
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04;
// Flag for non-words (typically, shortcut only entries)
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02;
// Flag for blacklist
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
     46 
     47 /* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition(
     48         const uint8_t *const buffer, int *const pos) {
     49     const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
     50     if (firstByte < 0x80) {
     51         return firstByte;
     52     } else {
     53         return ((firstByte & 0x7F) << 8) ^ ByteArrayUtils::readUint8AndAdvancePosition(
     54                 buffer, pos);
     55     }
     56 }
     57 
     58 /* static */ PtReadingUtils::NodeFlags PtReadingUtils::getFlagsAndAdvancePosition(
     59         const uint8_t *const buffer, int *const pos) {
     60     return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
     61 }
     62 
     63 /* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer,
     64         int *const pos) {
     65     return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, pos);
     66 }
     67 
     68 // Returns the number of read characters.
     69 /* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer,
     70         const NodeFlags flags, const int maxLength, int *const outBuffer, int *const pos) {
     71     int length = 0;
     72     if (hasMultipleChars(flags)) {
     73         length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, outBuffer,
     74                 pos);
     75     } else {
     76         const int codePoint = getCodePointAndAdvancePosition(buffer, pos);
     77         if (codePoint == NOT_A_CODE_POINT) {
     78             // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is
     79             // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR
     80             // when the PtNode has a single code point.
     81             length = 0;
     82             AKLOGE("codePoint is NOT_A_CODE_POINT. pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x",
     83                     *pos - 1, codePoint, buffer[*pos - 1]);
     84             ASSERT(false);
     85         } else if (maxLength > 0) {
     86             outBuffer[0] = codePoint;
     87             length = 1;
     88         }
     89     }
     90     return length;
     91 }
     92 
     93 // Returns the number of skipped characters.
     94 /* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
     95         const int maxLength, int *const pos) {
     96     if (hasMultipleChars(flags)) {
     97         return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos);
     98     } else {
     99         if (maxLength > 0) {
    100             getCodePointAndAdvancePosition(buffer, pos);
    101             return 1;
    102         } else {
    103             return 0;
    104         }
    105     }
    106 }
    107 
    108 /* static */ int PtReadingUtils::readProbabilityAndAdvancePosition(const uint8_t *const buffer,
    109         int *const pos) {
    110     return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
    111 }
    112 
    113 /* static */ int PtReadingUtils::readChildrenPositionAndAdvancePosition(
    114         const uint8_t *const buffer, const NodeFlags flags, int *const pos) {
    115     const int base = *pos;
    116     int offset = 0;
    117     switch (MASK_CHILDREN_POSITION_TYPE & flags) {
    118         case FLAG_CHILDREN_POSITION_TYPE_ONEBYTE:
    119             offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
    120             break;
    121         case FLAG_CHILDREN_POSITION_TYPE_TWOBYTES:
    122             offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos);
    123             break;
    124         case FLAG_CHILDREN_POSITION_TYPE_THREEBYTES:
    125             offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos);
    126             break;
    127         default:
    128             // If we come here, it means we asked for the children of a word with
    129             // no children.
    130             return NOT_A_DICT_POS;
    131     }
    132     return base + offset;
    133 }
    134 
    135 /* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
    136         const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
    137         const DictionaryBigramsStructurePolicy *const bigramPolicy,
    138         NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
    139         int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
    140         int *const outBigramPos, int *const outSiblingPos) {
    141     int readingPos = ptNodePos;
    142     const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos);
    143     *outFlags = flags;
    144     *outCodePointCount = getCharsAndAdvancePosition(
    145             dictBuf, flags, MAX_WORD_LENGTH, outCodePoint, &readingPos);
    146     *outProbability = isTerminal(flags) ?
    147             readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY;
    148     *outChildrenPos = hasChildrenInFlags(flags) ?
    149             readChildrenPositionAndAdvancePosition(dictBuf, flags, &readingPos) : NOT_A_DICT_POS;
    150     *outShortcutPos = NOT_A_DICT_POS;
    151     if (hasShortcutTargets(flags)) {
    152         *outShortcutPos = readingPos;
    153         shortcutPolicy->skipAllShortcuts(&readingPos);
    154     }
    155     *outBigramPos = NOT_A_DICT_POS;
    156     if (hasBigrams(flags)) {
    157         *outBigramPos = readingPos;
    158         bigramPolicy->skipAllBigrams(&readingPos);
    159     }
    160     *outSiblingPos = readingPos;
    161 }
    162 
    163 } // namespace latinime
    164