Home | History | Annotate | Download | only in unicode
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *   Copyright (C) 2010-2012, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 *   file name:  ucharstrie.h
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2010nov14
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #ifndef __UCHARSTRIE_H__
     18 #define __UCHARSTRIE_H__
     19 
     20 /**
     21  * \file
     22  * \brief C++ API: Trie for mapping Unicode strings (or 16-bit-unit sequences)
     23  *                 to integer values.
     24  */
     25 
     26 #include "unicode/utypes.h"
     27 #include "unicode/unistr.h"
     28 #include "unicode/uobject.h"
     29 #include "unicode/ustringtrie.h"
     30 
     31 U_NAMESPACE_BEGIN
     32 
     33 class Appendable;
     34 class UCharsTrieBuilder;
     35 class UVector32;
     36 
     37 /**
     38  * Light-weight, non-const reader class for a UCharsTrie.
     39  * Traverses a char16_t-serialized data structure with minimal state,
     40  * for mapping strings (16-bit-unit sequences) to non-negative integer values.
     41  *
     42  * This class owns the serialized trie data only if it was constructed by
     43  * the builder's build() method.
     44  * The public constructor and the copy constructor only alias the data (only copy the pointer).
     45  * There is no assignment operator.
     46  *
     47  * This class is not intended for public subclassing.
     48  * @stable ICU 4.8
     49  */
     50 class U_COMMON_API UCharsTrie : public UMemory {
     51 public:
     52     /**
     53      * Constructs a UCharsTrie reader instance.
     54      *
     55      * The trieUChars must contain a copy of a char16_t sequence from the UCharsTrieBuilder,
     56      * starting with the first char16_t of that sequence.
     57      * The UCharsTrie object will not read more char16_ts than
     58      * the UCharsTrieBuilder generated in the corresponding build() call.
     59      *
     60      * The array is not copied/cloned and must not be modified while
     61      * the UCharsTrie object is in use.
     62      *
     63      * @param trieUChars The char16_t array that contains the serialized trie.
     64      * @stable ICU 4.8
     65      */
     66     UCharsTrie(ConstChar16Ptr trieUChars)
     67             : ownedArray_(NULL), uchars_(trieUChars),
     68               pos_(uchars_), remainingMatchLength_(-1) {}
     69 
     70     /**
     71      * Destructor.
     72      * @stable ICU 4.8
     73      */
     74     ~UCharsTrie();
     75 
     76     /**
     77      * Copy constructor, copies the other trie reader object and its state,
     78      * but not the char16_t array which will be shared. (Shallow copy.)
     79      * @param other Another UCharsTrie object.
     80      * @stable ICU 4.8
     81      */
     82     UCharsTrie(const UCharsTrie &other)
     83             : ownedArray_(NULL), uchars_(other.uchars_),
     84               pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
     85 
     86     /**
     87      * Resets this trie to its initial state.
     88      * @return *this
     89      * @stable ICU 4.8
     90      */
     91     UCharsTrie &reset() {
     92         pos_=uchars_;
     93         remainingMatchLength_=-1;
     94         return *this;
     95     }
     96 
     97     /**
     98      * UCharsTrie state object, for saving a trie's current state
     99      * and resetting the trie back to this state later.
    100      * @stable ICU 4.8
    101      */
    102     class State : public UMemory {
    103     public:
    104         /**
    105          * Constructs an empty State.
    106          * @stable ICU 4.8
    107          */
    108         State() { uchars=NULL; }
    109     private:
    110         friend class UCharsTrie;
    111 
    112         const char16_t *uchars;
    113         const char16_t *pos;
    114         int32_t remainingMatchLength;
    115     };
    116 
    117     /**
    118      * Saves the state of this trie.
    119      * @param state The State object to hold the trie's state.
    120      * @return *this
    121      * @see resetToState
    122      * @stable ICU 4.8
    123      */
    124     const UCharsTrie &saveState(State &state) const {
    125         state.uchars=uchars_;
    126         state.pos=pos_;
    127         state.remainingMatchLength=remainingMatchLength_;
    128         return *this;
    129     }
    130 
    131     /**
    132      * Resets this trie to the saved state.
    133      * If the state object contains no state, or the state of a different trie,
    134      * then this trie remains unchanged.
    135      * @param state The State object which holds a saved trie state.
    136      * @return *this
    137      * @see saveState
    138      * @see reset
    139      * @stable ICU 4.8
    140      */
    141     UCharsTrie &resetToState(const State &state) {
    142         if(uchars_==state.uchars && uchars_!=NULL) {
    143             pos_=state.pos;
    144             remainingMatchLength_=state.remainingMatchLength;
    145         }
    146         return *this;
    147     }
    148 
    149     /**
    150      * Determines whether the string so far matches, whether it has a value,
    151      * and whether another input char16_t can continue a matching string.
    152      * @return The match/value Result.
    153      * @stable ICU 4.8
    154      */
    155     UStringTrieResult current() const;
    156 
    157     /**
    158      * Traverses the trie from the initial state for this input char16_t.
    159      * Equivalent to reset().next(uchar).
    160      * @param uchar Input char value. Values below 0 and above 0xffff will never match.
    161      * @return The match/value Result.
    162      * @stable ICU 4.8
    163      */
    164     inline UStringTrieResult first(int32_t uchar) {
    165         remainingMatchLength_=-1;
    166         return nextImpl(uchars_, uchar);
    167     }
    168 
    169     /**
    170      * Traverses the trie from the initial state for the
    171      * one or two UTF-16 code units for this input code point.
    172      * Equivalent to reset().nextForCodePoint(cp).
    173      * @param cp A Unicode code point 0..0x10ffff.
    174      * @return The match/value Result.
    175      * @stable ICU 4.8
    176      */
    177     UStringTrieResult firstForCodePoint(UChar32 cp);
    178 
    179     /**
    180      * Traverses the trie from the current state for this input char16_t.
    181      * @param uchar Input char value. Values below 0 and above 0xffff will never match.
    182      * @return The match/value Result.
    183      * @stable ICU 4.8
    184      */
    185     UStringTrieResult next(int32_t uchar);
    186 
    187     /**
    188      * Traverses the trie from the current state for the
    189      * one or two UTF-16 code units for this input code point.
    190      * @param cp A Unicode code point 0..0x10ffff.
    191      * @return The match/value Result.
    192      * @stable ICU 4.8
    193      */
    194     UStringTrieResult nextForCodePoint(UChar32 cp);
    195 
    196     /**
    197      * Traverses the trie from the current state for this string.
    198      * Equivalent to
    199      * \code
    200      * Result result=current();
    201      * for(each c in s)
    202      *   if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH;
    203      *   result=next(c);
    204      * return result;
    205      * \endcode
    206      * @param s A string. Can be NULL if length is 0.
    207      * @param length The length of the string. Can be -1 if NUL-terminated.
    208      * @return The match/value Result.
    209      * @stable ICU 4.8
    210      */
    211     UStringTrieResult next(ConstChar16Ptr s, int32_t length);
    212 
    213     /**
    214      * Returns a matching string's value if called immediately after
    215      * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE.
    216      * getValue() can be called multiple times.
    217      *
    218      * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE!
    219      * @return The value for the string so far.
    220      * @stable ICU 4.8
    221      */
    222     inline int32_t getValue() const {
    223         const char16_t *pos=pos_;
    224         int32_t leadUnit=*pos++;
    225         // U_ASSERT(leadUnit>=kMinValueLead);
    226         return leadUnit&kValueIsFinal ?
    227             readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit);
    228     }
    229 
    230     /**
    231      * Determines whether all strings reachable from the current state
    232      * map to the same value.
    233      * @param uniqueValue Receives the unique value, if this function returns TRUE.
    234      *                    (output-only)
    235      * @return TRUE if all strings reachable from the current state
    236      *         map to the same value.
    237      * @stable ICU 4.8
    238      */
    239     inline UBool hasUniqueValue(int32_t &uniqueValue) const {
    240         const char16_t *pos=pos_;
    241         // Skip the rest of a pending linear-match node.
    242         return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
    243     }
    244 
    245     /**
    246      * Finds each char16_t which continues the string from the current state.
    247      * That is, each char16_t c for which it would be next(c)!=USTRINGTRIE_NO_MATCH now.
    248      * @param out Each next char16_t is appended to this object.
    249      * @return the number of char16_ts which continue the string from here
    250      * @stable ICU 4.8
    251      */
    252     int32_t getNextUChars(Appendable &out) const;
    253 
    254     /**
    255      * Iterator for all of the (string, value) pairs in a UCharsTrie.
    256      * @stable ICU 4.8
    257      */
    258     class U_COMMON_API Iterator : public UMemory {
    259     public:
    260         /**
    261          * Iterates from the root of a char16_t-serialized UCharsTrie.
    262          * @param trieUChars The trie char16_ts.
    263          * @param maxStringLength If 0, the iterator returns full strings.
    264          *                        Otherwise, the iterator returns strings with this maximum length.
    265          * @param errorCode Standard ICU error code. Its input value must
    266          *                  pass the U_SUCCESS() test, or else the function returns
    267          *                  immediately. Check for U_FAILURE() on output or use with
    268          *                  function chaining. (See User Guide for details.)
    269          * @stable ICU 4.8
    270          */
    271         Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength, UErrorCode &errorCode);
    272 
    273         /**
    274          * Iterates from the current state of the specified UCharsTrie.
    275          * @param trie The trie whose state will be copied for iteration.
    276          * @param maxStringLength If 0, the iterator returns full strings.
    277          *                        Otherwise, the iterator returns strings with this maximum length.
    278          * @param errorCode Standard ICU error code. Its input value must
    279          *                  pass the U_SUCCESS() test, or else the function returns
    280          *                  immediately. Check for U_FAILURE() on output or use with
    281          *                  function chaining. (See User Guide for details.)
    282          * @stable ICU 4.8
    283          */
    284         Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
    285 
    286         /**
    287          * Destructor.
    288          * @stable ICU 4.8
    289          */
    290         ~Iterator();
    291 
    292         /**
    293          * Resets this iterator to its initial state.
    294          * @return *this
    295          * @stable ICU 4.8
    296          */
    297         Iterator &reset();
    298 
    299         /**
    300          * @return TRUE if there are more elements.
    301          * @stable ICU 4.8
    302          */
    303         UBool hasNext() const;
    304 
    305         /**
    306          * Finds the next (string, value) pair if there is one.
    307          *
    308          * If the string is truncated to the maximum length and does not
    309          * have a real value, then the value is set to -1.
    310          * In this case, this "not a real value" is indistinguishable from
    311          * a real value of -1.
    312          * @param errorCode Standard ICU error code. Its input value must
    313          *                  pass the U_SUCCESS() test, or else the function returns
    314          *                  immediately. Check for U_FAILURE() on output or use with
    315          *                  function chaining. (See User Guide for details.)
    316          * @return TRUE if there is another element.
    317          * @stable ICU 4.8
    318          */
    319         UBool next(UErrorCode &errorCode);
    320 
    321         /**
    322          * @return The string for the last successful next().
    323          * @stable ICU 4.8
    324          */
    325         const UnicodeString &getString() const { return str_; }
    326         /**
    327          * @return The value for the last successful next().
    328          * @stable ICU 4.8
    329          */
    330         int32_t getValue() const { return value_; }
    331 
    332     private:
    333         UBool truncateAndStop() {
    334             pos_=NULL;
    335             value_=-1;  // no real value for str
    336             return TRUE;
    337         }
    338 
    339         const char16_t *branchNext(const char16_t *pos, int32_t length, UErrorCode &errorCode);
    340 
    341         const char16_t *uchars_;
    342         const char16_t *pos_;
    343         const char16_t *initialPos_;
    344         int32_t remainingMatchLength_;
    345         int32_t initialRemainingMatchLength_;
    346         UBool skipValue_;  // Skip intermediate value which was already delivered.
    347 
    348         UnicodeString str_;
    349         int32_t maxLength_;
    350         int32_t value_;
    351 
    352         // The stack stores pairs of integers for backtracking to another
    353         // outbound edge of a branch node.
    354         // The first integer is an offset from uchars_.
    355         // The second integer has the str_.length() from before the node in bits 15..0,
    356         // and the remaining branch length in bits 31..16.
    357         // (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit,
    358         // but the code looks more confusing that way.)
    359         UVector32 *stack_;
    360     };
    361 
    362 private:
    363     friend class UCharsTrieBuilder;
    364 
    365     /**
    366      * Constructs a UCharsTrie reader instance.
    367      * Unlike the public constructor which just aliases an array,
    368      * this constructor adopts the builder's array.
    369      * This constructor is only called by the builder.
    370      */
    371     UCharsTrie(char16_t *adoptUChars, const char16_t *trieUChars)
    372             : ownedArray_(adoptUChars), uchars_(trieUChars),
    373               pos_(uchars_), remainingMatchLength_(-1) {}
    374 
    375     // No assignment operator.
    376     UCharsTrie &operator=(const UCharsTrie &other);
    377 
    378     inline void stop() {
    379         pos_=NULL;
    380     }
    381 
    382     // Reads a compact 32-bit integer.
    383     // pos is already after the leadUnit, and the lead unit has bit 15 reset.
    384     static inline int32_t readValue(const char16_t *pos, int32_t leadUnit) {
    385         int32_t value;
    386         if(leadUnit<kMinTwoUnitValueLead) {
    387             value=leadUnit;
    388         } else if(leadUnit<kThreeUnitValueLead) {
    389             value=((leadUnit-kMinTwoUnitValueLead)<<16)|*pos;
    390         } else {
    391             value=(pos[0]<<16)|pos[1];
    392         }
    393         return value;
    394     }
    395     static inline const char16_t *skipValue(const char16_t *pos, int32_t leadUnit) {
    396         if(leadUnit>=kMinTwoUnitValueLead) {
    397             if(leadUnit<kThreeUnitValueLead) {
    398                 ++pos;
    399             } else {
    400                 pos+=2;
    401             }
    402         }
    403         return pos;
    404     }
    405     static inline const char16_t *skipValue(const char16_t *pos) {
    406         int32_t leadUnit=*pos++;
    407         return skipValue(pos, leadUnit&0x7fff);
    408     }
    409 
    410     static inline int32_t readNodeValue(const char16_t *pos, int32_t leadUnit) {
    411         // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
    412         int32_t value;
    413         if(leadUnit<kMinTwoUnitNodeValueLead) {
    414             value=(leadUnit>>6)-1;
    415         } else if(leadUnit<kThreeUnitNodeValueLead) {
    416             value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|*pos;
    417         } else {
    418             value=(pos[0]<<16)|pos[1];
    419         }
    420         return value;
    421     }
    422     static inline const char16_t *skipNodeValue(const char16_t *pos, int32_t leadUnit) {
    423         // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
    424         if(leadUnit>=kMinTwoUnitNodeValueLead) {
    425             if(leadUnit<kThreeUnitNodeValueLead) {
    426                 ++pos;
    427             } else {
    428                 pos+=2;
    429             }
    430         }
    431         return pos;
    432     }
    433 
    434     static inline const char16_t *jumpByDelta(const char16_t *pos) {
    435         int32_t delta=*pos++;
    436         if(delta>=kMinTwoUnitDeltaLead) {
    437             if(delta==kThreeUnitDeltaLead) {
    438                 delta=(pos[0]<<16)|pos[1];
    439                 pos+=2;
    440             } else {
    441                 delta=((delta-kMinTwoUnitDeltaLead)<<16)|*pos++;
    442             }
    443         }
    444         return pos+delta;
    445     }
    446 
    447     static const char16_t *skipDelta(const char16_t *pos) {
    448         int32_t delta=*pos++;
    449         if(delta>=kMinTwoUnitDeltaLead) {
    450             if(delta==kThreeUnitDeltaLead) {
    451                 pos+=2;
    452             } else {
    453                 ++pos;
    454             }
    455         }
    456         return pos;
    457     }
    458 
    459     static inline UStringTrieResult valueResult(int32_t node) {
    460         return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node>>15));
    461     }
    462 
    463     // Handles a branch node for both next(uchar) and next(string).
    464     UStringTrieResult branchNext(const char16_t *pos, int32_t length, int32_t uchar);
    465 
    466     // Requires remainingLength_<0.
    467     UStringTrieResult nextImpl(const char16_t *pos, int32_t uchar);
    468 
    469     // Helper functions for hasUniqueValue().
    470     // Recursively finds a unique value (or whether there is not a unique one)
    471     // from a branch.
    472     static const char16_t *findUniqueValueFromBranch(const char16_t *pos, int32_t length,
    473                                                   UBool haveUniqueValue, int32_t &uniqueValue);
    474     // Recursively finds a unique value (or whether there is not a unique one)
    475     // starting from a position on a node lead unit.
    476     static UBool findUniqueValue(const char16_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
    477 
    478     // Helper functions for getNextUChars().
    479     // getNextUChars() when pos is on a branch node.
    480     static void getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out);
    481 
    482     // UCharsTrie data structure
    483     //
    484     // The trie consists of a series of char16_t-serialized nodes for incremental
    485     // Unicode string/char16_t sequence matching. (char16_t=16-bit unsigned integer)
    486     // The root node is at the beginning of the trie data.
    487     //
    488     // Types of nodes are distinguished by their node lead unit ranges.
    489     // After each node, except a final-value node, another node follows to
    490     // encode match values or continue matching further units.
    491     //
    492     // Node types:
    493     //  - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
    494     //    The value is for the string/char16_t sequence so far.
    495     //  - Match node, optionally with an intermediate value in a different compact format.
    496     //    The value, if present, is for the string/char16_t sequence so far.
    497     //
    498     //  Aside from the value, which uses the node lead unit's high bits:
    499     //
    500     //  - Linear-match node: Matches a number of units.
    501     //  - Branch node: Branches to other nodes according to the current input unit.
    502     //    The node unit is the length of the branch (number of units to select from)
    503     //    minus 1. It is followed by a sub-node:
    504     //    - If the length is at most kMaxBranchLinearSubNodeLength, then
    505     //      there are length-1 (key, value) pairs and then one more comparison unit.
    506     //      If one of the key units matches, then the value is either a final value for
    507     //      the string so far, or a "jump" delta to the next node.
    508     //      If the last unit matches, then matching continues with the next node.
    509     //      (Values have the same encoding as final-value nodes.)
    510     //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
    511     //      there is one unit and one "jump" delta.
    512     //      If the input unit is less than the sub-node unit, then "jump" by delta to
    513     //      the next sub-node which will have a length of length/2.
    514     //      (The delta has its own compact encoding.)
    515     //      Otherwise, skip the "jump" delta to the next sub-node
    516     //      which will have a length of length-length/2.
    517 
    518     // Match-node lead unit values, after masking off intermediate-value bits:
    519 
    520     // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
    521     // the length is one more than the next unit.
    522 
    523     // For a branch sub-node with at most this many entries, we drop down
    524     // to a linear search.
    525     static const int32_t kMaxBranchLinearSubNodeLength=5;
    526 
    527     // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
    528     static const int32_t kMinLinearMatch=0x30;
    529     static const int32_t kMaxLinearMatchLength=0x10;
    530 
    531     // Match-node lead unit bits 14..6 for the optional intermediate value.
    532     // If these bits are 0, then there is no intermediate value.
    533     // Otherwise, see the *NodeValue* constants below.
    534     static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x0040
    535     static const int32_t kNodeTypeMask=kMinValueLead-1;  // 0x003f
    536 
    537     // A final-value node has bit 15 set.
    538     static const int32_t kValueIsFinal=0x8000;
    539 
    540     // Compact value: After testing and masking off bit 15, use the following thresholds.
    541     static const int32_t kMaxOneUnitValue=0x3fff;
    542 
    543     static const int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1;  // 0x4000
    544     static const int32_t kThreeUnitValueLead=0x7fff;
    545 
    546     static const int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1;  // 0x3ffeffff
    547 
    548     // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
    549     static const int32_t kMaxOneUnitNodeValue=0xff;
    550     static const int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6);  // 0x4040
    551     static const int32_t kThreeUnitNodeValueLead=0x7fc0;
    552 
    553     static const int32_t kMaxTwoUnitNodeValue=
    554         ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1;  // 0xfdffff
    555 
    556     // Compact delta integers.
    557     static const int32_t kMaxOneUnitDelta=0xfbff;
    558     static const int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1;  // 0xfc00
    559     static const int32_t kThreeUnitDeltaLead=0xffff;
    560 
    561     static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1;  // 0x03feffff
    562 
    563     char16_t *ownedArray_;
    564 
    565     // Fixed value referencing the UCharsTrie words.
    566     const char16_t *uchars_;
    567 
    568     // Iterator variables.
    569 
    570     // Pointer to next trie unit to read. NULL if no more matches.
    571     const char16_t *pos_;
    572     // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
    573     int32_t remainingMatchLength_;
    574 };
    575 
    576 U_NAMESPACE_END
    577 
    578 #endif  // __UCHARSTRIE_H__
    579