Home | History | Annotate | Download | only in unicode
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2010-2012, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  bytestrie.h
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2010sep25
     12 *   created by: Markus W. Scherer
     13 */
     14 
     15 #ifndef __BYTESTRIE_H__
     16 #define __BYTESTRIE_H__
     17 
     18 /**
     19  * \file
     20  * \brief C++ API: Trie for mapping byte sequences to integer values.
     21  */
     22 
     23 #include "unicode/utypes.h"
     24 #include "unicode/stringpiece.h"
     25 #include "unicode/uobject.h"
     26 #include "unicode/ustringtrie.h"
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 class ByteSink;
     31 class BytesTrieBuilder;
     32 class CharString;
     33 class UVector32;
     34 
     35 /**
     36  * Light-weight, non-const reader class for a BytesTrie.
     37  * Traverses a byte-serialized data structure with minimal state,
     38  * for mapping byte sequences to non-negative integer values.
     39  *
     40  * This class owns the serialized trie data only if it was constructed by
     41  * the builder's build() method.
     42  * The public constructor and the copy constructor only alias the data (only copy the pointer).
     43  * There is no assignment operator.
     44  *
     45  * This class is not intended for public subclassing.
     46  * @stable ICU 4.8
     47  */
     48 class U_COMMON_API BytesTrie : public UMemory {
     49 public:
     50     /**
     51      * Constructs a BytesTrie reader instance.
     52      *
     53      * The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder,
     54      * starting with the first byte of that sequence.
     55      * The BytesTrie object will not read more bytes than
     56      * the BytesTrieBuilder generated in the corresponding build() call.
     57      *
     58      * The array is not copied/cloned and must not be modified while
     59      * the BytesTrie object is in use.
     60      *
     61      * @param trieBytes The byte array that contains the serialized trie.
     62      * @stable ICU 4.8
     63      */
     64     BytesTrie(const void *trieBytes)
     65             : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)),
     66               pos_(bytes_), remainingMatchLength_(-1) {}
     67 
     68     /**
     69      * Destructor.
     70      * @stable ICU 4.8
     71      */
     72     ~BytesTrie();
     73 
     74     /**
     75      * Copy constructor, copies the other trie reader object and its state,
     76      * but not the byte array which will be shared. (Shallow copy.)
     77      * @param other Another BytesTrie object.
     78      * @stable ICU 4.8
     79      */
     80     BytesTrie(const BytesTrie &other)
     81             : ownedArray_(NULL), bytes_(other.bytes_),
     82               pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
     83 
     84     /**
     85      * Resets this trie to its initial state.
     86      * @return *this
     87      * @stable ICU 4.8
     88      */
     89     BytesTrie &reset() {
     90         pos_=bytes_;
     91         remainingMatchLength_=-1;
     92         return *this;
     93     }
     94 
     95     /**
     96      * BytesTrie state object, for saving a trie's current state
     97      * and resetting the trie back to this state later.
     98      * @stable ICU 4.8
     99      */
    100     class State : public UMemory {
    101     public:
    102         /**
    103          * Constructs an empty State.
    104          * @stable ICU 4.8
    105          */
    106         State() { bytes=NULL; }
    107     private:
    108         friend class BytesTrie;
    109 
    110         const uint8_t *bytes;
    111         const uint8_t *pos;
    112         int32_t remainingMatchLength;
    113     };
    114 
    115     /**
    116      * Saves the state of this trie.
    117      * @param state The State object to hold the trie's state.
    118      * @return *this
    119      * @see resetToState
    120      * @stable ICU 4.8
    121      */
    122     const BytesTrie &saveState(State &state) const {
    123         state.bytes=bytes_;
    124         state.pos=pos_;
    125         state.remainingMatchLength=remainingMatchLength_;
    126         return *this;
    127     }
    128 
    129     /**
    130      * Resets this trie to the saved state.
    131      * If the state object contains no state, or the state of a different trie,
    132      * then this trie remains unchanged.
    133      * @param state The State object which holds a saved trie state.
    134      * @return *this
    135      * @see saveState
    136      * @see reset
    137      * @stable ICU 4.8
    138      */
    139     BytesTrie &resetToState(const State &state) {
    140         if(bytes_==state.bytes && bytes_!=NULL) {
    141             pos_=state.pos;
    142             remainingMatchLength_=state.remainingMatchLength;
    143         }
    144         return *this;
    145     }
    146 
    147     /**
    148      * Determines whether the byte sequence so far matches, whether it has a value,
    149      * and whether another input byte can continue a matching byte sequence.
    150      * @return The match/value Result.
    151      * @stable ICU 4.8
    152      */
    153     UStringTrieResult current() const;
    154 
    155     /**
    156      * Traverses the trie from the initial state for this input byte.
    157      * Equivalent to reset().next(inByte).
    158      * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
    159      *               Values below -0x100 and above 0xff will never match.
    160      * @return The match/value Result.
    161      * @stable ICU 4.8
    162      */
    163     inline UStringTrieResult first(int32_t inByte) {
    164         remainingMatchLength_=-1;
    165         if(inByte<0) {
    166             inByte+=0x100;
    167         }
    168         return nextImpl(bytes_, inByte);
    169     }
    170 
    171     /**
    172      * Traverses the trie from the current state for this input byte.
    173      * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
    174      *               Values below -0x100 and above 0xff will never match.
    175      * @return The match/value Result.
    176      * @stable ICU 4.8
    177      */
    178     UStringTrieResult next(int32_t inByte);
    179 
    180     /**
    181      * Traverses the trie from the current state for this byte sequence.
    182      * Equivalent to
    183      * \code
    184      * Result result=current();
    185      * for(each c in s)
    186      *   if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH;
    187      *   result=next(c);
    188      * return result;
    189      * \endcode
    190      * @param s A string or byte sequence. Can be NULL if length is 0.
    191      * @param length The length of the byte sequence. Can be -1 if NUL-terminated.
    192      * @return The match/value Result.
    193      * @stable ICU 4.8
    194      */
    195     UStringTrieResult next(const char *s, int32_t length);
    196 
    197     /**
    198      * Returns a matching byte sequence's value if called immediately after
    199      * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE.
    200      * getValue() can be called multiple times.
    201      *
    202      * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE!
    203      * @return The value for the byte sequence so far.
    204      * @stable ICU 4.8
    205      */
    206     inline int32_t getValue() const {
    207         const uint8_t *pos=pos_;
    208         int32_t leadByte=*pos++;
    209         // U_ASSERT(leadByte>=kMinValueLead);
    210         return readValue(pos, leadByte>>1);
    211     }
    212 
    213     /**
    214      * Determines whether all byte sequences reachable from the current state
    215      * map to the same value.
    216      * @param uniqueValue Receives the unique value, if this function returns TRUE.
    217      *                    (output-only)
    218      * @return TRUE if all byte sequences reachable from the current state
    219      *         map to the same value.
    220      * @stable ICU 4.8
    221      */
    222     inline UBool hasUniqueValue(int32_t &uniqueValue) const {
    223         const uint8_t *pos=pos_;
    224         // Skip the rest of a pending linear-match node.
    225         return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
    226     }
    227 
    228     /**
    229      * Finds each byte which continues the byte sequence from the current state.
    230      * That is, each byte b for which it would be next(b)!=USTRINGTRIE_NO_MATCH now.
    231      * @param out Each next byte is appended to this object.
    232      *            (Only uses the out.Append(s, length) method.)
    233      * @return the number of bytes which continue the byte sequence from here
    234      * @stable ICU 4.8
    235      */
    236     int32_t getNextBytes(ByteSink &out) const;
    237 
    238     /**
    239      * Iterator for all of the (byte sequence, value) pairs in a BytesTrie.
    240      * @stable ICU 4.8
    241      */
    242     class U_COMMON_API Iterator : public UMemory {
    243     public:
    244         /**
    245          * Iterates from the root of a byte-serialized BytesTrie.
    246          * @param trieBytes The trie bytes.
    247          * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
    248          *                        Otherwise, the iterator returns strings with this maximum length.
    249          * @param errorCode Standard ICU error code. Its input value must
    250          *                  pass the U_SUCCESS() test, or else the function returns
    251          *                  immediately. Check for U_FAILURE() on output or use with
    252          *                  function chaining. (See User Guide for details.)
    253          * @stable ICU 4.8
    254          */
    255         Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
    256 
    257         /**
    258          * Iterates from the current state of the specified BytesTrie.
    259          * @param trie The trie whose state will be copied for iteration.
    260          * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
    261          *                        Otherwise, the iterator returns strings with this maximum length.
    262          * @param errorCode Standard ICU error code. Its input value must
    263          *                  pass the U_SUCCESS() test, or else the function returns
    264          *                  immediately. Check for U_FAILURE() on output or use with
    265          *                  function chaining. (See User Guide for details.)
    266          * @stable ICU 4.8
    267          */
    268         Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
    269 
    270         /**
    271          * Destructor.
    272          * @stable ICU 4.8
    273          */
    274         ~Iterator();
    275 
    276         /**
    277          * Resets this iterator to its initial state.
    278          * @return *this
    279          * @stable ICU 4.8
    280          */
    281         Iterator &reset();
    282 
    283         /**
    284          * @return TRUE if there are more elements.
    285          * @stable ICU 4.8
    286          */
    287         UBool hasNext() const;
    288 
    289         /**
    290          * Finds the next (byte sequence, value) pair if there is one.
    291          *
    292          * If the byte sequence is truncated to the maximum length and does not
    293          * have a real value, then the value is set to -1.
    294          * In this case, this "not a real value" is indistinguishable from
    295          * a real value of -1.
    296          * @param errorCode Standard ICU error code. Its input value must
    297          *                  pass the U_SUCCESS() test, or else the function returns
    298          *                  immediately. Check for U_FAILURE() on output or use with
    299          *                  function chaining. (See User Guide for details.)
    300          * @return TRUE if there is another element.
    301          * @stable ICU 4.8
    302          */
    303         UBool next(UErrorCode &errorCode);
    304 
    305         /**
    306          * @return The NUL-terminated byte sequence for the last successful next().
    307          * @stable ICU 4.8
    308          */
    309         const StringPiece &getString() const { return sp_; }
    310         /**
    311          * @return The value for the last successful next().
    312          * @stable ICU 4.8
    313          */
    314         int32_t getValue() const { return value_; }
    315 
    316     private:
    317         UBool truncateAndStop();
    318 
    319         const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
    320 
    321         const uint8_t *bytes_;
    322         const uint8_t *pos_;
    323         const uint8_t *initialPos_;
    324         int32_t remainingMatchLength_;
    325         int32_t initialRemainingMatchLength_;
    326 
    327         CharString *str_;
    328         StringPiece sp_;
    329         int32_t maxLength_;
    330         int32_t value_;
    331 
    332         // The stack stores pairs of integers for backtracking to another
    333         // outbound edge of a branch node.
    334         // The first integer is an offset from bytes_.
    335         // The second integer has the str_->length() from before the node in bits 15..0,
    336         // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
    337         // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
    338         // but the code looks more confusing that way.)
    339         UVector32 *stack_;
    340     };
    341 
    342 private:
    343     friend class BytesTrieBuilder;
    344 
    345     /**
    346      * Constructs a BytesTrie reader instance.
    347      * Unlike the public constructor which just aliases an array,
    348      * this constructor adopts the builder's array.
    349      * This constructor is only called by the builder.
    350      */
    351     BytesTrie(void *adoptBytes, const void *trieBytes)
    352             : ownedArray_(static_cast<uint8_t *>(adoptBytes)),
    353               bytes_(static_cast<const uint8_t *>(trieBytes)),
    354               pos_(bytes_), remainingMatchLength_(-1) {}
    355 
    356     // No assignment operator.
    357     BytesTrie &operator=(const BytesTrie &other);
    358 
    359     inline void stop() {
    360         pos_=NULL;
    361     }
    362 
    363     // Reads a compact 32-bit integer.
    364     // pos is already after the leadByte, and the lead byte is already shifted right by 1.
    365     static int32_t readValue(const uint8_t *pos, int32_t leadByte);
    366     static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
    367         // U_ASSERT(leadByte>=kMinValueLead);
    368         if(leadByte>=(kMinTwoByteValueLead<<1)) {
    369             if(leadByte<(kMinThreeByteValueLead<<1)) {
    370                 ++pos;
    371             } else if(leadByte<(kFourByteValueLead<<1)) {
    372                 pos+=2;
    373             } else {
    374                 pos+=3+((leadByte>>1)&1);
    375             }
    376         }
    377         return pos;
    378     }
    379     static inline const uint8_t *skipValue(const uint8_t *pos) {
    380         int32_t leadByte=*pos++;
    381         return skipValue(pos, leadByte);
    382     }
    383 
    384     // Reads a jump delta and jumps.
    385     static const uint8_t *jumpByDelta(const uint8_t *pos);
    386 
    387     static inline const uint8_t *skipDelta(const uint8_t *pos) {
    388         int32_t delta=*pos++;
    389         if(delta>=kMinTwoByteDeltaLead) {
    390             if(delta<kMinThreeByteDeltaLead) {
    391                 ++pos;
    392             } else if(delta<kFourByteDeltaLead) {
    393                 pos+=2;
    394             } else {
    395                 pos+=3+(delta&1);
    396             }
    397         }
    398         return pos;
    399     }
    400 
    401     static inline UStringTrieResult valueResult(int32_t node) {
    402         return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal));
    403     }
    404 
    405     // Handles a branch node for both next(byte) and next(string).
    406     UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
    407 
    408     // Requires remainingLength_<0.
    409     UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
    410 
    411     // Helper functions for hasUniqueValue().
    412     // Recursively finds a unique value (or whether there is not a unique one)
    413     // from a branch.
    414     static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
    415                                                     UBool haveUniqueValue, int32_t &uniqueValue);
    416     // Recursively finds a unique value (or whether there is not a unique one)
    417     // starting from a position on a node lead byte.
    418     static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
    419 
    420     // Helper functions for getNextBytes().
    421     // getNextBytes() when pos is on a branch node.
    422     static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
    423     static void append(ByteSink &out, int c);
    424 
    425     // BytesTrie data structure
    426     //
    427     // The trie consists of a series of byte-serialized nodes for incremental
    428     // string/byte sequence matching. The root node is at the beginning of the trie data.
    429     //
    430     // Types of nodes are distinguished by their node lead byte ranges.
    431     // After each node, except a final-value node, another node follows to
    432     // encode match values or continue matching further bytes.
    433     //
    434     // Node types:
    435     //  - Value node: Stores a 32-bit integer in a compact, variable-length format.
    436     //    The value is for the string/byte sequence so far.
    437     //    One node bit indicates whether the value is final or whether
    438     //    matching continues with the next node.
    439     //  - Linear-match node: Matches a number of bytes.
    440     //  - Branch node: Branches to other nodes according to the current input byte.
    441     //    The node byte is the length of the branch (number of bytes to select from)
    442     //    minus 1. It is followed by a sub-node:
    443     //    - If the length is at most kMaxBranchLinearSubNodeLength, then
    444     //      there are length-1 (key, value) pairs and then one more comparison byte.
    445     //      If one of the key bytes matches, then the value is either a final value for
    446     //      the string/byte sequence so far, or a "jump" delta to the next node.
    447     //      If the last byte matches, then matching continues with the next node.
    448     //      (Values have the same encoding as value nodes.)
    449     //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
    450     //      there is one byte and one "jump" delta.
    451     //      If the input byte is less than the sub-node byte, then "jump" by delta to
    452     //      the next sub-node which will have a length of length/2.
    453     //      (The delta has its own compact encoding.)
    454     //      Otherwise, skip the "jump" delta to the next sub-node
    455     //      which will have a length of length-length/2.
    456 
    457     // Node lead byte values.
    458 
    459     // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
    460     // the length is one more than the next byte.
    461 
    462     // For a branch sub-node with at most this many entries, we drop down
    463     // to a linear search.
    464     static const int32_t kMaxBranchLinearSubNodeLength=5;
    465 
    466     // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
    467     static const int32_t kMinLinearMatch=0x10;
    468     static const int32_t kMaxLinearMatchLength=0x10;
    469 
    470     // 20..ff: Variable-length value node.
    471     // If odd, the value is final. (Otherwise, intermediate value or jump delta.)
    472     // Then shift-right by 1 bit.
    473     // The remaining lead byte value indicates the number of following bytes (0..4)
    474     // and contains the value's top bits.
    475     static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x20
    476     // It is a final value if bit 0 is set.
    477     static const int32_t kValueIsFinal=1;
    478 
    479     // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
    480     static const int32_t kMinOneByteValueLead=kMinValueLead/2;  // 0x10
    481     static const int32_t kMaxOneByteValue=0x40;  // At least 6 bits in the first byte.
    482 
    483     static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1;  // 0x51
    484     static const int32_t kMaxTwoByteValue=0x1aff;
    485 
    486     static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1;  // 0x6c
    487     static const int32_t kFourByteValueLead=0x7e;
    488 
    489     // A little more than Unicode code points. (0x11ffff)
    490     static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
    491 
    492     static const int32_t kFiveByteValueLead=0x7f;
    493 
    494     // Compact delta integers.
    495     static const int32_t kMaxOneByteDelta=0xbf;
    496     static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1;  // 0xc0
    497     static const int32_t kMinThreeByteDeltaLead=0xf0;
    498     static const int32_t kFourByteDeltaLead=0xfe;
    499     static const int32_t kFiveByteDeltaLead=0xff;
    500 
    501     static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1;  // 0x2fff
    502     static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1;  // 0xdffff
    503 
    504     uint8_t *ownedArray_;
    505 
    506     // Fixed value referencing the BytesTrie bytes.
    507     const uint8_t *bytes_;
    508 
    509     // Iterator variables.
    510 
    511     // Pointer to next trie byte to read. NULL if no more matches.
    512     const uint8_t *pos_;
    513     // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
    514     int32_t remainingMatchLength_;
    515 };
    516 
    517 U_NAMESPACE_END
    518 
    519 #endif  // __BYTESTRIE_H__
    520