Home | History | Annotate | Download | only in unicode
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *   Copyright (C) 2010-2012, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 *   file name:  bytestrie.h
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2010sep25
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #ifndef __BYTESTRIE_H__
     18 #define __BYTESTRIE_H__
     19 
     20 /**
     21  * \file
     22  * \brief C++ API: Trie for mapping byte sequences to integer values.
     23  */
     24 
     25 #include "unicode/utypes.h"
     26 #include "unicode/stringpiece.h"
     27 #include "unicode/uobject.h"
     28 #include "unicode/ustringtrie.h"
     29 
     30 U_NAMESPACE_BEGIN
     31 
     32 class ByteSink;
     33 class BytesTrieBuilder;
     34 class CharString;
     35 class UVector32;
     36 
     37 /**
     38  * Light-weight, non-const reader class for a BytesTrie.
     39  * Traverses a byte-serialized data structure with minimal state,
     40  * for mapping byte sequences to non-negative integer values.
     41  *
     42  * This class owns the serialized trie data only if it was constructed by
     43  * the builder's build() method.
     44  * The public constructor and the copy constructor only alias the data (only copy the pointer).
     45  * There is no assignment operator.
     46  *
     47  * This class is not intended for public subclassing.
     48  * @stable ICU 4.8
     49  */
     50 class U_COMMON_API BytesTrie : public UMemory {
     51 public:
     52     /**
     53      * Constructs a BytesTrie reader instance.
     54      *
     55      * The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder,
     56      * starting with the first byte of that sequence.
     57      * The BytesTrie object will not read more bytes than
     58      * the BytesTrieBuilder generated in the corresponding build() call.
     59      *
     60      * The array is not copied/cloned and must not be modified while
     61      * the BytesTrie object is in use.
     62      *
     63      * @param trieBytes The byte array that contains the serialized trie.
     64      * @stable ICU 4.8
     65      */
     66     BytesTrie(const void *trieBytes)
     67             : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)),
     68               pos_(bytes_), remainingMatchLength_(-1) {}
     69 
     70     /**
     71      * Destructor.
     72      * @stable ICU 4.8
     73      */
     74     ~BytesTrie();
     75 
     76     /**
     77      * Copy constructor, copies the other trie reader object and its state,
     78      * but not the byte array which will be shared. (Shallow copy.)
     79      * @param other Another BytesTrie object.
     80      * @stable ICU 4.8
     81      */
     82     BytesTrie(const BytesTrie &other)
     83             : ownedArray_(NULL), bytes_(other.bytes_),
     84               pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
     85 
     86     /**
     87      * Resets this trie to its initial state.
     88      * @return *this
     89      * @stable ICU 4.8
     90      */
     91     BytesTrie &reset() {
     92         pos_=bytes_;
     93         remainingMatchLength_=-1;
     94         return *this;
     95     }
     96 
     97     /**
     98      * BytesTrie state object, for saving a trie's current state
     99      * and resetting the trie back to this state later.
    100      * @stable ICU 4.8
    101      */
    102     class State : public UMemory {
    103     public:
    104         /**
    105          * Constructs an empty State.
    106          * @stable ICU 4.8
    107          */
    108         State() { bytes=NULL; }
    109     private:
    110         friend class BytesTrie;
    111 
    112         const uint8_t *bytes;
    113         const uint8_t *pos;
    114         int32_t remainingMatchLength;
    115     };
    116 
    117     /**
    118      * Saves the state of this trie.
    119      * @param state The State object to hold the trie's state.
    120      * @return *this
    121      * @see resetToState
    122      * @stable ICU 4.8
    123      */
    124     const BytesTrie &saveState(State &state) const {
    125         state.bytes=bytes_;
    126         state.pos=pos_;
    127         state.remainingMatchLength=remainingMatchLength_;
    128         return *this;
    129     }
    130 
    131     /**
    132      * Resets this trie to the saved state.
    133      * If the state object contains no state, or the state of a different trie,
    134      * then this trie remains unchanged.
    135      * @param state The State object which holds a saved trie state.
    136      * @return *this
    137      * @see saveState
    138      * @see reset
    139      * @stable ICU 4.8
    140      */
    141     BytesTrie &resetToState(const State &state) {
    142         if(bytes_==state.bytes && bytes_!=NULL) {
    143             pos_=state.pos;
    144             remainingMatchLength_=state.remainingMatchLength;
    145         }
    146         return *this;
    147     }
    148 
    149     /**
    150      * Determines whether the byte sequence so far matches, whether it has a value,
    151      * and whether another input byte can continue a matching byte sequence.
    152      * @return The match/value Result.
    153      * @stable ICU 4.8
    154      */
    155     UStringTrieResult current() const;
    156 
    157     /**
    158      * Traverses the trie from the initial state for this input byte.
    159      * Equivalent to reset().next(inByte).
    160      * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
    161      *               Values below -0x100 and above 0xff will never match.
    162      * @return The match/value Result.
    163      * @stable ICU 4.8
    164      */
    165     inline UStringTrieResult first(int32_t inByte) {
    166         remainingMatchLength_=-1;
    167         if(inByte<0) {
    168             inByte+=0x100;
    169         }
    170         return nextImpl(bytes_, inByte);
    171     }
    172 
    173     /**
    174      * Traverses the trie from the current state for this input byte.
    175      * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
    176      *               Values below -0x100 and above 0xff will never match.
    177      * @return The match/value Result.
    178      * @stable ICU 4.8
    179      */
    180     UStringTrieResult next(int32_t inByte);
    181 
    182     /**
    183      * Traverses the trie from the current state for this byte sequence.
    184      * Equivalent to
    185      * \code
    186      * Result result=current();
    187      * for(each c in s)
    188      *   if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH;
    189      *   result=next(c);
    190      * return result;
    191      * \endcode
    192      * @param s A string or byte sequence. Can be NULL if length is 0.
    193      * @param length The length of the byte sequence. Can be -1 if NUL-terminated.
    194      * @return The match/value Result.
    195      * @stable ICU 4.8
    196      */
    197     UStringTrieResult next(const char *s, int32_t length);
    198 
    199     /**
    200      * Returns a matching byte sequence's value if called immediately after
    201      * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE.
    202      * getValue() can be called multiple times.
    203      *
    204      * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE!
    205      * @return The value for the byte sequence so far.
    206      * @stable ICU 4.8
    207      */
    208     inline int32_t getValue() const {
    209         const uint8_t *pos=pos_;
    210         int32_t leadByte=*pos++;
    211         // U_ASSERT(leadByte>=kMinValueLead);
    212         return readValue(pos, leadByte>>1);
    213     }
    214 
    215     /**
    216      * Determines whether all byte sequences reachable from the current state
    217      * map to the same value.
    218      * @param uniqueValue Receives the unique value, if this function returns TRUE.
    219      *                    (output-only)
    220      * @return TRUE if all byte sequences reachable from the current state
    221      *         map to the same value.
    222      * @stable ICU 4.8
    223      */
    224     inline UBool hasUniqueValue(int32_t &uniqueValue) const {
    225         const uint8_t *pos=pos_;
    226         // Skip the rest of a pending linear-match node.
    227         return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
    228     }
    229 
    230     /**
    231      * Finds each byte which continues the byte sequence from the current state.
    232      * That is, each byte b for which it would be next(b)!=USTRINGTRIE_NO_MATCH now.
    233      * @param out Each next byte is appended to this object.
    234      *            (Only uses the out.Append(s, length) method.)
    235      * @return the number of bytes which continue the byte sequence from here
    236      * @stable ICU 4.8
    237      */
    238     int32_t getNextBytes(ByteSink &out) const;
    239 
    240     /**
    241      * Iterator for all of the (byte sequence, value) pairs in a BytesTrie.
    242      * @stable ICU 4.8
    243      */
    244     class U_COMMON_API Iterator : public UMemory {
    245     public:
    246         /**
    247          * Iterates from the root of a byte-serialized BytesTrie.
    248          * @param trieBytes The trie bytes.
    249          * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
    250          *                        Otherwise, the iterator returns strings with this maximum length.
    251          * @param errorCode Standard ICU error code. Its input value must
    252          *                  pass the U_SUCCESS() test, or else the function returns
    253          *                  immediately. Check for U_FAILURE() on output or use with
    254          *                  function chaining. (See User Guide for details.)
    255          * @stable ICU 4.8
    256          */
    257         Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
    258 
    259         /**
    260          * Iterates from the current state of the specified BytesTrie.
    261          * @param trie The trie whose state will be copied for iteration.
    262          * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
    263          *                        Otherwise, the iterator returns strings with this maximum length.
    264          * @param errorCode Standard ICU error code. Its input value must
    265          *                  pass the U_SUCCESS() test, or else the function returns
    266          *                  immediately. Check for U_FAILURE() on output or use with
    267          *                  function chaining. (See User Guide for details.)
    268          * @stable ICU 4.8
    269          */
    270         Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
    271 
    272         /**
    273          * Destructor.
    274          * @stable ICU 4.8
    275          */
    276         ~Iterator();
    277 
    278         /**
    279          * Resets this iterator to its initial state.
    280          * @return *this
    281          * @stable ICU 4.8
    282          */
    283         Iterator &reset();
    284 
    285         /**
    286          * @return TRUE if there are more elements.
    287          * @stable ICU 4.8
    288          */
    289         UBool hasNext() const;
    290 
    291         /**
    292          * Finds the next (byte sequence, value) pair if there is one.
    293          *
    294          * If the byte sequence is truncated to the maximum length and does not
    295          * have a real value, then the value is set to -1.
    296          * In this case, this "not a real value" is indistinguishable from
    297          * a real value of -1.
    298          * @param errorCode Standard ICU error code. Its input value must
    299          *                  pass the U_SUCCESS() test, or else the function returns
    300          *                  immediately. Check for U_FAILURE() on output or use with
    301          *                  function chaining. (See User Guide for details.)
    302          * @return TRUE if there is another element.
    303          * @stable ICU 4.8
    304          */
    305         UBool next(UErrorCode &errorCode);
    306 
    307         /**
    308          * @return The NUL-terminated byte sequence for the last successful next().
    309          * @stable ICU 4.8
    310          */
    311         StringPiece getString() const;
    312         /**
    313          * @return The value for the last successful next().
    314          * @stable ICU 4.8
    315          */
    316         int32_t getValue() const { return value_; }
    317 
    318     private:
    319         UBool truncateAndStop();
    320 
    321         const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
    322 
    323         const uint8_t *bytes_;
    324         const uint8_t *pos_;
    325         const uint8_t *initialPos_;
    326         int32_t remainingMatchLength_;
    327         int32_t initialRemainingMatchLength_;
    328 
    329         CharString *str_;
    330         int32_t maxLength_;
    331         int32_t value_;
    332 
    333         // The stack stores pairs of integers for backtracking to another
    334         // outbound edge of a branch node.
    335         // The first integer is an offset from bytes_.
    336         // The second integer has the str_->length() from before the node in bits 15..0,
    337         // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
    338         // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
    339         // but the code looks more confusing that way.)
    340         UVector32 *stack_;
    341     };
    342 
    343 private:
    344     friend class BytesTrieBuilder;
    345 
    346     /**
    347      * Constructs a BytesTrie reader instance.
    348      * Unlike the public constructor which just aliases an array,
    349      * this constructor adopts the builder's array.
    350      * This constructor is only called by the builder.
    351      */
    352     BytesTrie(void *adoptBytes, const void *trieBytes)
    353             : ownedArray_(static_cast<uint8_t *>(adoptBytes)),
    354               bytes_(static_cast<const uint8_t *>(trieBytes)),
    355               pos_(bytes_), remainingMatchLength_(-1) {}
    356 
    357     // No assignment operator.
    358     BytesTrie &operator=(const BytesTrie &other);
    359 
    360     inline void stop() {
    361         pos_=NULL;
    362     }
    363 
    364     // Reads a compact 32-bit integer.
    365     // pos is already after the leadByte, and the lead byte is already shifted right by 1.
    366     static int32_t readValue(const uint8_t *pos, int32_t leadByte);
    367     static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
    368         // U_ASSERT(leadByte>=kMinValueLead);
    369         if(leadByte>=(kMinTwoByteValueLead<<1)) {
    370             if(leadByte<(kMinThreeByteValueLead<<1)) {
    371                 ++pos;
    372             } else if(leadByte<(kFourByteValueLead<<1)) {
    373                 pos+=2;
    374             } else {
    375                 pos+=3+((leadByte>>1)&1);
    376             }
    377         }
    378         return pos;
    379     }
    380     static inline const uint8_t *skipValue(const uint8_t *pos) {
    381         int32_t leadByte=*pos++;
    382         return skipValue(pos, leadByte);
    383     }
    384 
    385     // Reads a jump delta and jumps.
    386     static const uint8_t *jumpByDelta(const uint8_t *pos);
    387 
    388     static inline const uint8_t *skipDelta(const uint8_t *pos) {
    389         int32_t delta=*pos++;
    390         if(delta>=kMinTwoByteDeltaLead) {
    391             if(delta<kMinThreeByteDeltaLead) {
    392                 ++pos;
    393             } else if(delta<kFourByteDeltaLead) {
    394                 pos+=2;
    395             } else {
    396                 pos+=3+(delta&1);
    397             }
    398         }
    399         return pos;
    400     }
    401 
    402     static inline UStringTrieResult valueResult(int32_t node) {
    403         return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal));
    404     }
    405 
    406     // Handles a branch node for both next(byte) and next(string).
    407     UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
    408 
    409     // Requires remainingLength_<0.
    410     UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
    411 
    412     // Helper functions for hasUniqueValue().
    413     // Recursively finds a unique value (or whether there is not a unique one)
    414     // from a branch.
    415     static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
    416                                                     UBool haveUniqueValue, int32_t &uniqueValue);
    417     // Recursively finds a unique value (or whether there is not a unique one)
    418     // starting from a position on a node lead byte.
    419     static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
    420 
    421     // Helper functions for getNextBytes().
    422     // getNextBytes() when pos is on a branch node.
    423     static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
    424     static void append(ByteSink &out, int c);
    425 
    426     // BytesTrie data structure
    427     //
    428     // The trie consists of a series of byte-serialized nodes for incremental
    429     // string/byte sequence matching. The root node is at the beginning of the trie data.
    430     //
    431     // Types of nodes are distinguished by their node lead byte ranges.
    432     // After each node, except a final-value node, another node follows to
    433     // encode match values or continue matching further bytes.
    434     //
    435     // Node types:
    436     //  - Value node: Stores a 32-bit integer in a compact, variable-length format.
    437     //    The value is for the string/byte sequence so far.
    438     //    One node bit indicates whether the value is final or whether
    439     //    matching continues with the next node.
    440     //  - Linear-match node: Matches a number of bytes.
    441     //  - Branch node: Branches to other nodes according to the current input byte.
    442     //    The node byte is the length of the branch (number of bytes to select from)
    443     //    minus 1. It is followed by a sub-node:
    444     //    - If the length is at most kMaxBranchLinearSubNodeLength, then
    445     //      there are length-1 (key, value) pairs and then one more comparison byte.
    446     //      If one of the key bytes matches, then the value is either a final value for
    447     //      the string/byte sequence so far, or a "jump" delta to the next node.
    448     //      If the last byte matches, then matching continues with the next node.
    449     //      (Values have the same encoding as value nodes.)
    450     //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
    451     //      there is one byte and one "jump" delta.
    452     //      If the input byte is less than the sub-node byte, then "jump" by delta to
    453     //      the next sub-node which will have a length of length/2.
    454     //      (The delta has its own compact encoding.)
    455     //      Otherwise, skip the "jump" delta to the next sub-node
    456     //      which will have a length of length-length/2.
    457 
    458     // Node lead byte values.
    459 
    460     // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
    461     // the length is one more than the next byte.
    462 
    463     // For a branch sub-node with at most this many entries, we drop down
    464     // to a linear search.
    465     static const int32_t kMaxBranchLinearSubNodeLength=5;
    466 
    467     // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
    468     static const int32_t kMinLinearMatch=0x10;
    469     static const int32_t kMaxLinearMatchLength=0x10;
    470 
    471     // 20..ff: Variable-length value node.
    472     // If odd, the value is final. (Otherwise, intermediate value or jump delta.)
    473     // Then shift-right by 1 bit.
    474     // The remaining lead byte value indicates the number of following bytes (0..4)
    475     // and contains the value's top bits.
    476     static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x20
    477     // It is a final value if bit 0 is set.
    478     static const int32_t kValueIsFinal=1;
    479 
    480     // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
    481     static const int32_t kMinOneByteValueLead=kMinValueLead/2;  // 0x10
    482     static const int32_t kMaxOneByteValue=0x40;  // At least 6 bits in the first byte.
    483 
    484     static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1;  // 0x51
    485     static const int32_t kMaxTwoByteValue=0x1aff;
    486 
    487     static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1;  // 0x6c
    488     static const int32_t kFourByteValueLead=0x7e;
    489 
    490     // A little more than Unicode code points. (0x11ffff)
    491     static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
    492 
    493     static const int32_t kFiveByteValueLead=0x7f;
    494 
    495     // Compact delta integers.
    496     static const int32_t kMaxOneByteDelta=0xbf;
    497     static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1;  // 0xc0
    498     static const int32_t kMinThreeByteDeltaLead=0xf0;
    499     static const int32_t kFourByteDeltaLead=0xfe;
    500     static const int32_t kFiveByteDeltaLead=0xff;
    501 
    502     static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1;  // 0x2fff
    503     static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1;  // 0xdffff
    504 
    505     uint8_t *ownedArray_;
    506 
    507     // Fixed value referencing the BytesTrie bytes.
    508     const uint8_t *bytes_;
    509 
    510     // Iterator variables.
    511 
    512     // Pointer to next trie byte to read. NULL if no more matches.
    513     const uint8_t *pos_;
    514     // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
    515     int32_t remainingMatchLength_;
    516 };
    517 
    518 U_NAMESPACE_END
    519 
    520 #endif  // __BYTESTRIE_H__
    521