Home | History | Annotate | Download | only in pt_common
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
     18 
     19 #include "dictionary/structure/pt_common/pt_node_array_reader.h"
     20 #include "utils/char_utils.h"
     21 
     22 namespace latinime {
     23 
     24 // To avoid infinite loop caused by invalid or malicious forward links.
     25 const int DynamicPtReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
     26 const int DynamicPtReadingHelper::MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
     27 const size_t DynamicPtReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH;
     28 
     29 bool DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions::onVisitingPtNode(
     30         const PtNodeParams *const ptNodeParams) {
     31     if (ptNodeParams->isTerminal() && !ptNodeParams->isDeleted()) {
     32         mTerminalPositions->push_back(ptNodeParams->getHeadPos());
     33     }
     34     return true;
     35 }
     36 
     37 // Visits all PtNodes in post-order depth first manner.
     38 // For example, visits c -> b -> y -> x -> a for the following dictionary:
     39 // a _ b _ c
     40 //   \ x _ y
     41 bool DynamicPtReadingHelper::traverseAllPtNodesInPostorderDepthFirstManner(
     42         TraversingEventListener *const listener) {
     43     bool alreadyVisitedChildren = false;
     44     // Descend from the root to the root PtNode array.
     45     if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) {
     46         return false;
     47     }
     48     while (!isEnd()) {
     49         const PtNodeParams ptNodeParams(getPtNodeParams());
     50         if (!ptNodeParams.isValid()) {
     51             break;
     52         }
     53         if (!alreadyVisitedChildren) {
     54             if (ptNodeParams.hasChildren()) {
     55                 // Move to the first child.
     56                 if (!listener->onDescend(ptNodeParams.getChildrenPos())) {
     57                     return false;
     58                 }
     59                 pushReadingStateToStack();
     60                 readChildNode(ptNodeParams);
     61             } else {
     62                 alreadyVisitedChildren = true;
     63             }
     64         } else {
     65             if (!listener->onVisitingPtNode(&ptNodeParams)) {
     66                 return false;
     67             }
     68             readNextSiblingNode(ptNodeParams);
     69             if (isEnd()) {
     70                 // All PtNodes in current linked PtNode arrays have been visited.
     71                 // Return to the parent.
     72                 if (!listener->onReadingPtNodeArrayTail()) {
     73                     return false;
     74                 }
     75                 if (mReadingStateStack.size() <= 0) {
     76                     break;
     77                 }
     78                 if (!listener->onAscend()) {
     79                     return false;
     80                 }
     81                 popReadingStateFromStack();
     82                 alreadyVisitedChildren = true;
     83             } else {
     84                 // Process sibling PtNode.
     85                 alreadyVisitedChildren = false;
     86             }
     87         }
     88     }
     89     // Ascend from the root PtNode array to the root.
     90     if (!listener->onAscend()) {
     91         return false;
     92     }
     93     return !isError();
     94 }
     95 
     96 // Visits all PtNodes in PtNode array level pre-order depth first manner, which is the same order
     97 // that PtNodes are written in the dictionary buffer.
     98 // For example, visits a -> b -> x -> c -> y for the following dictionary:
     99 // a _ b _ c
    100 //   \ x _ y
    101 bool DynamicPtReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
    102         TraversingEventListener *const listener) {
    103     bool alreadyVisitedAllPtNodesInArray = false;
    104     bool alreadyVisitedChildren = false;
    105     // Descend from the root to the root PtNode array.
    106     if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) {
    107         return false;
    108     }
    109     if (isEnd()) {
    110         // Empty dictionary. Needs to notify the listener of the tail of empty PtNode array.
    111         if (!listener->onReadingPtNodeArrayTail()) {
    112             return false;
    113         }
    114     }
    115     pushReadingStateToStack();
    116     while (!isEnd()) {
    117         const PtNodeParams ptNodeParams(getPtNodeParams());
    118         if (!ptNodeParams.isValid()) {
    119             break;
    120         }
    121         if (alreadyVisitedAllPtNodesInArray) {
    122             if (alreadyVisitedChildren) {
    123                 // Move to next sibling PtNode's children.
    124                 readNextSiblingNode(ptNodeParams);
    125                 if (isEnd()) {
    126                     // Return to the parent PTNode.
    127                     if (!listener->onAscend()) {
    128                         return false;
    129                     }
    130                     if (mReadingStateStack.size() <= 0) {
    131                         break;
    132                     }
    133                     popReadingStateFromStack();
    134                     alreadyVisitedChildren = true;
    135                     alreadyVisitedAllPtNodesInArray = true;
    136                 } else {
    137                     alreadyVisitedChildren = false;
    138                 }
    139             } else {
    140                 if (ptNodeParams.hasChildren()) {
    141                     // Move to the first child.
    142                     if (!listener->onDescend(ptNodeParams.getChildrenPos())) {
    143                         return false;
    144                     }
    145                     pushReadingStateToStack();
    146                     readChildNode(ptNodeParams);
    147                     // Push state to return the head of PtNode array.
    148                     pushReadingStateToStack();
    149                     alreadyVisitedAllPtNodesInArray = false;
    150                     alreadyVisitedChildren = false;
    151                 } else {
    152                     alreadyVisitedChildren = true;
    153                 }
    154             }
    155         } else {
    156             if (!listener->onVisitingPtNode(&ptNodeParams)) {
    157                 return false;
    158             }
    159             readNextSiblingNode(ptNodeParams);
    160             if (isEnd()) {
    161                 if (!listener->onReadingPtNodeArrayTail()) {
    162                     return false;
    163                 }
    164                 // Return to the head of current PtNode array.
    165                 popReadingStateFromStack();
    166                 alreadyVisitedAllPtNodesInArray = true;
    167             }
    168         }
    169     }
    170     popReadingStateFromStack();
    171     // Ascend from the root PtNode array to the root.
    172     if (!listener->onAscend()) {
    173         return false;
    174     }
    175     return !isError();
    176 }
    177 
    178 int DynamicPtReadingHelper::getCodePointsAndReturnCodePointCount(const int maxCodePointCount,
    179         int *const outCodePoints) {
    180     // This method traverses parent nodes from the terminal by following parent pointers; thus,
    181     // node code points are stored in the buffer in the reverse order.
    182     int reverseCodePoints[maxCodePointCount];
    183     const PtNodeParams terminalPtNodeParams(getPtNodeParams());
    184     // First, read the terminal node and get its probability.
    185     if (!isValidTerminalNode(terminalPtNodeParams)) {
    186         // Node at the ptNodePos is not a valid terminal node.
    187         return 0;
    188     }
    189     // Then, following parent node link to the dictionary root and fetch node code points.
    190     int totalCodePointCount = 0;
    191     while (!isEnd()) {
    192         const PtNodeParams ptNodeParams(getPtNodeParams());
    193         totalCodePointCount = getTotalCodePointCount(ptNodeParams);
    194         if (!ptNodeParams.isValid() || totalCodePointCount > maxCodePointCount) {
    195             // The ptNodePos is not a valid terminal node position in the dictionary.
    196             return 0;
    197         }
    198         // Store node code points to buffer in the reverse order.
    199         fetchMergedNodeCodePointsInReverseOrder(ptNodeParams, getPrevTotalCodePointCount(),
    200                 reverseCodePoints);
    201         // Follow parent node toward the root node.
    202         readParentNode(ptNodeParams);
    203     }
    204     if (isError()) {
    205         // The node position or the dictionary is invalid.
    206         return 0;
    207     }
    208     // Reverse the stored code points to output them.
    209     for (int i = 0; i < totalCodePointCount; ++i) {
    210         outCodePoints[i] = reverseCodePoints[totalCodePointCount - i - 1];
    211     }
    212     return totalCodePointCount;
    213 }
    214 
    215 int DynamicPtReadingHelper::getTerminalPtNodePositionOfWord(const int *const inWord,
    216         const size_t length, const bool forceLowerCaseSearch) {
    217     int searchCodePoints[length];
    218     for (size_t i = 0; i < length; ++i) {
    219         searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i];
    220     }
    221     while (!isEnd()) {
    222         const PtNodeParams ptNodeParams(getPtNodeParams());
    223         const int matchedCodePointCount = getPrevTotalCodePointCount();
    224         if (getTotalCodePointCount(ptNodeParams) > length
    225                 || !isMatchedCodePoint(ptNodeParams, 0 /* index */,
    226                         searchCodePoints[matchedCodePointCount])) {
    227             // Current node has too many code points or its first code point is different from
    228             // target code point. Skip this node and read the next sibling node.
    229             readNextSiblingNode(ptNodeParams);
    230             continue;
    231         }
    232         // Check following merged node code points.
    233         const int nodeCodePointCount = ptNodeParams.getCodePointCount();
    234         for (int j = 1; j < nodeCodePointCount; ++j) {
    235             if (!isMatchedCodePoint(ptNodeParams, j, searchCodePoints[matchedCodePointCount + j])) {
    236                 // Different code point is found. The given word is not included in the dictionary.
    237                 return NOT_A_DICT_POS;
    238             }
    239         }
    240         // All characters are matched.
    241         if (length == getTotalCodePointCount(ptNodeParams)) {
    242             if (!ptNodeParams.isTerminal()) {
    243                 return NOT_A_DICT_POS;
    244             }
    245             // Terminal position is found.
    246             return ptNodeParams.getHeadPos();
    247         }
    248         if (!ptNodeParams.hasChildren()) {
    249             return NOT_A_DICT_POS;
    250         }
    251         // Advance to the children nodes.
    252         readChildNode(ptNodeParams);
    253     }
    254     // If we already traversed the tree further than the word is long, there means
    255     // there was no match (or we would have found it).
    256     return NOT_A_DICT_POS;
    257 }
    258 
    259 // Read node array size and process empty node arrays. Nodes and arrays are counted up in this
    260 // method to avoid an infinite loop.
    261 void DynamicPtReadingHelper::nextPtNodeArray() {
    262     int ptNodeCountInArray = 0;
    263     int firstPtNodePos = NOT_A_DICT_POS;
    264     if (!mPtNodeArrayReader->readPtNodeArrayInfoAndReturnIfValid(
    265             mReadingState.mPos, &ptNodeCountInArray, &firstPtNodePos)) {
    266         mIsError = true;
    267         mReadingState.mPos = NOT_A_DICT_POS;
    268         return;
    269     }
    270     mReadingState.mPosOfThisPtNodeArrayHead = mReadingState.mPos;
    271     mReadingState.mRemainingPtNodeCountInThisArray = ptNodeCountInArray;
    272     mReadingState.mPos = firstPtNodePos;
    273     // Count up nodes and node arrays to avoid infinite loop.
    274     mReadingState.mTotalPtNodeIndexInThisArrayChain +=
    275             mReadingState.mRemainingPtNodeCountInThisArray;
    276     mReadingState.mPtNodeArrayIndexInThisArrayChain++;
    277     if (mReadingState.mRemainingPtNodeCountInThisArray < 0
    278             || mReadingState.mTotalPtNodeIndexInThisArrayChain
    279                     > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP
    280             || mReadingState.mPtNodeArrayIndexInThisArrayChain
    281                     > MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) {
    282         // Invalid dictionary.
    283         AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d"
    284                 "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d",
    285                 mReadingState.mRemainingPtNodeCountInThisArray,
    286                 mReadingState.mTotalPtNodeIndexInThisArrayChain,
    287                 MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP,
    288                 mReadingState.mPtNodeArrayIndexInThisArrayChain,
    289                 MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP);
    290         ASSERT(false);
    291         mIsError = true;
    292         mReadingState.mPos = NOT_A_DICT_POS;
    293         return;
    294     }
    295     if (mReadingState.mRemainingPtNodeCountInThisArray == 0) {
    296         // Empty node array. Try following forward link.
    297         followForwardLink();
    298     }
    299 }
    300 
    301 // Follow the forward link and read the next node array if exists.
    302 void DynamicPtReadingHelper::followForwardLink() {
    303     int nextPtNodeArrayPos = NOT_A_DICT_POS;
    304     if (!mPtNodeArrayReader->readForwardLinkAndReturnIfValid(
    305             mReadingState.mPos, &nextPtNodeArrayPos)) {
    306         mIsError = true;
    307         mReadingState.mPos = NOT_A_DICT_POS;
    308         return;
    309     }
    310     mReadingState.mPosOfLastForwardLinkField = mReadingState.mPos;
    311     if (nextPtNodeArrayPos != NOT_A_DICT_POS) {
    312         // Follow the forward link.
    313         mReadingState.mPos = nextPtNodeArrayPos;
    314         nextPtNodeArray();
    315     } else {
    316         // All node arrays have been read.
    317         mReadingState.mPos = NOT_A_DICT_POS;
    318     }
    319 }
    320 
    321 } // namespace latinime
    322