Home | History | Annotate | Download | only in dictionary
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
     18 
     19 #include "defines.h"
     20 #include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
     21 
     22 namespace latinime {
     23 
     24 typedef PatriciaTrieReadingUtils PtReadingUtils;
     25 
     26 const PtReadingUtils::NodeFlags PtReadingUtils::MASK_CHILDREN_POSITION_TYPE = 0xC0;
     27 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_NOPOSITION = 0x00;
     28 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_ONEBYTE = 0x40;
     29 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_TWOBYTES = 0x80;
     30 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_THREEBYTES = 0xC0;
     31 
     32 // Flag for single/multiple char group
     33 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20;
     34 // Flag for terminal PtNodes
     35 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10;
     36 // Flag for shortcut targets presence
     37 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08;
     38 // Flag for bigram presence
     39 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04;
     40 // Flag for non-words (typically, shortcut only entries)
     41 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02;
     42 // Flag for blacklist
     43 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
     44 
     45 /* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition(
     46         const uint8_t *const buffer, int *const pos) {
     47     const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
     48     if (firstByte < 0x80) {
     49         return firstByte;
     50     } else {
     51         return ((firstByte & 0x7F) << 8) ^ ByteArrayUtils::readUint8AndAdvancePosition(
     52                 buffer, pos);
     53     }
     54 }
     55 
     56 /* static */ PtReadingUtils::NodeFlags PtReadingUtils::getFlagsAndAdvancePosition(
     57         const uint8_t *const buffer, int *const pos) {
     58     return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
     59 }
     60 
     61 /* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer,
     62         int *const pos) {
     63     return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, pos);
     64 }
     65 
     66 // Returns the number of read characters.
     67 /* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer,
     68         const NodeFlags flags, const int maxLength, int *const outBuffer, int *const pos) {
     69     int length = 0;
     70     if (hasMultipleChars(flags)) {
     71         length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, outBuffer,
     72                 pos);
     73     } else {
     74         const int codePoint = getCodePointAndAdvancePosition(buffer, pos);
     75         if (codePoint == NOT_A_CODE_POINT) {
     76             // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is
     77             // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR
     78             // when the PtNode has a single code point.
     79             length = 0;
     80             AKLOGE("codePoint is NOT_A_CODE_POINT. pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x",
     81                     *pos - 1, codePoint, buffer[*pos - 1]);
     82             ASSERT(false);
     83         } else if (maxLength > 0) {
     84             outBuffer[0] = codePoint;
     85             length = 1;
     86         }
     87     }
     88     return length;
     89 }
     90 
     91 // Returns the number of skipped characters.
     92 /* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
     93         const int maxLength, int *const pos) {
     94     if (hasMultipleChars(flags)) {
     95         return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos);
     96     } else {
     97         if (maxLength > 0) {
     98             getCodePointAndAdvancePosition(buffer, pos);
     99             return 1;
    100         } else {
    101             return 0;
    102         }
    103     }
    104 }
    105 
    106 /* static */ int PtReadingUtils::readProbabilityAndAdvancePosition(const uint8_t *const buffer,
    107         int *const pos) {
    108     return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
    109 }
    110 
    111 /* static */ int PtReadingUtils::readChildrenPositionAndAdvancePosition(
    112         const uint8_t *const buffer, const NodeFlags flags, int *const pos) {
    113     const int base = *pos;
    114     int offset = 0;
    115     switch (MASK_CHILDREN_POSITION_TYPE & flags) {
    116         case FLAG_CHILDREN_POSITION_TYPE_ONEBYTE:
    117             offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
    118             break;
    119         case FLAG_CHILDREN_POSITION_TYPE_TWOBYTES:
    120             offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos);
    121             break;
    122         case FLAG_CHILDREN_POSITION_TYPE_THREEBYTES:
    123             offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos);
    124             break;
    125         default:
    126             // If we come here, it means we asked for the children of a word with
    127             // no children.
    128             return NOT_A_DICT_POS;
    129     }
    130     return base + offset;
    131 }
    132 
    133 } // namespace latinime
    134