Home | History | Annotate | Download | only in unicode
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *   Copyright (C) 2010-2012,2014, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 *   file name:  stringtriebuilder.h
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2010dec24
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #ifndef __STRINGTRIEBUILDER_H__
     18 #define __STRINGTRIEBUILDER_H__
     19 
     20 #include "unicode/utypes.h"
     21 #include "unicode/uobject.h"
     22 
     23 /**
     24  * \file
     25  * \brief C++ API: Builder API for trie builders
     26  */
     27 
     28 // Forward declaration.
     29 struct UHashtable;
     30 typedef struct UHashtable UHashtable;
     31 
     32 /**
     33  * Build options for BytesTrieBuilder and CharsTrieBuilder.
     34  * @stable ICU 4.8
     35  */
     36 enum UStringTrieBuildOption {
     37     /**
     38      * Builds a trie quickly.
     39      * @stable ICU 4.8
     40      */
     41     USTRINGTRIE_BUILD_FAST,
     42     /**
     43      * Builds a trie more slowly, attempting to generate
     44      * a shorter but equivalent serialization.
     45      * This build option also uses more memory.
     46      *
     47      * This option can be effective when many integer values are the same
     48      * and string/byte sequence suffixes can be shared.
     49      * Runtime speed is not expected to improve.
     50      * @stable ICU 4.8
     51      */
     52     USTRINGTRIE_BUILD_SMALL
     53 };
     54 
     55 U_NAMESPACE_BEGIN
     56 
     57 /**
     58  * Base class for string trie builder classes.
     59  *
     60  * This class is not intended for public subclassing.
     61  * @stable ICU 4.8
     62  */
     63 class U_COMMON_API StringTrieBuilder : public UObject {
     64 public:
     65 #ifndef U_HIDE_INTERNAL_API
     66     /** @internal */
     67     static UBool hashNode(const void *node);
     68     /** @internal */
     69     static UBool equalNodes(const void *left, const void *right);
     70 #endif  /* U_HIDE_INTERNAL_API */
     71 
     72 protected:
     73     // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
     74     // or else the compiler will create a public default constructor.
     75     /** @internal */
     76     StringTrieBuilder();
     77     /** @internal */
     78     virtual ~StringTrieBuilder();
     79 
     80 #ifndef U_HIDE_INTERNAL_API
     81     /** @internal */
     82     void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
     83     /** @internal */
     84     void deleteCompactBuilder();
     85 
     86     /** @internal */
     87     void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
     88 
     89     /** @internal */
     90     int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
     91     /** @internal */
     92     int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
     93 #endif  /* U_HIDE_INTERNAL_API */
     94 
     95     class Node;
     96 
     97 #ifndef U_HIDE_INTERNAL_API
     98     /** @internal */
     99     Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
    100     /** @internal */
    101     Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
    102                             int32_t length, UErrorCode &errorCode);
    103 #endif  /* U_HIDE_INTERNAL_API */
    104 
    105     /** @internal */
    106     virtual int32_t getElementStringLength(int32_t i) const = 0;
    107     /** @internal */
    108     virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const = 0;
    109     /** @internal */
    110     virtual int32_t getElementValue(int32_t i) const = 0;
    111 
    112     // Finds the first unit index after this one where
    113     // the first and last element have different units again.
    114     /** @internal */
    115     virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
    116 
    117     // Number of different units at unitIndex.
    118     /** @internal */
    119     virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
    120     /** @internal */
    121     virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
    122     /** @internal */
    123     virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const = 0;
    124 
    125     /** @internal */
    126     virtual UBool matchNodesCanHaveValues() const = 0;
    127 
    128     /** @internal */
    129     virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
    130     /** @internal */
    131     virtual int32_t getMinLinearMatch() const = 0;
    132     /** @internal */
    133     virtual int32_t getMaxLinearMatchLength() const = 0;
    134 
    135 #ifndef U_HIDE_INTERNAL_API
    136     // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
    137     /** @internal */
    138     static const int32_t kMaxBranchLinearSubNodeLength=5;
    139 
    140     // Maximum number of nested split-branch levels for a branch on all 2^16 possible char16_t units.
    141     // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
    142     /** @internal */
    143     static const int32_t kMaxSplitBranchLevels=14;
    144 
    145     /**
    146      * Makes sure that there is only one unique node registered that is
    147      * equivalent to newNode.
    148      * @param newNode Input node. The builder takes ownership.
    149      * @param errorCode ICU in/out UErrorCode.
    150                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
    151      * @return newNode if it is the first of its kind, or
    152      *         an equivalent node if newNode is a duplicate.
    153      * @internal
    154      */
    155     Node *registerNode(Node *newNode, UErrorCode &errorCode);
    156     /**
    157      * Makes sure that there is only one unique FinalValueNode registered
    158      * with this value.
    159      * Avoids creating a node if the value is a duplicate.
    160      * @param value A final value.
    161      * @param errorCode ICU in/out UErrorCode.
    162                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
    163      * @return A FinalValueNode with the given value.
    164      * @internal
    165      */
    166     Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
    167 #endif  /* U_HIDE_INTERNAL_API */
    168 
    169     /*
    170      * C++ note:
    171      * registerNode() and registerFinalValue() take ownership of their input nodes,
    172      * and only return owned nodes.
    173      * If they see a failure UErrorCode, they will delete the input node.
    174      * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
    175      * If there is a failure, they return NULL.
    176      *
    177      * NULL Node pointers can be safely passed into other Nodes because
    178      * they call the static Node::hashCode() which checks for a NULL pointer first.
    179      *
    180      * Therefore, as long as builder functions register a new node,
    181      * they need to check for failures only before explicitly dereferencing
    182      * a Node pointer, or before setting a new UErrorCode.
    183      */
    184 
    185     // Hash set of nodes, maps from nodes to integer 1.
    186     /** @internal */
    187     UHashtable *nodes;
    188 
    189     // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
    190     // it is needed for layout of other objects.
    191     /** @internal */
    192     class Node : public UObject {
    193     public:
    194         Node(int32_t initialHash) : hash(initialHash), offset(0) {}
    195         inline int32_t hashCode() const { return hash; }
    196         // Handles node==NULL.
    197         static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
    198         // Base class operator==() compares the actual class types.
    199         virtual UBool operator==(const Node &other) const;
    200         inline UBool operator!=(const Node &other) const { return !operator==(other); }
    201         /**
    202          * Traverses the Node graph and numbers branch edges, with rightmost edges first.
    203          * This is to avoid writing a duplicate node twice.
    204          *
    205          * Branch nodes in this trie data structure are not symmetric.
    206          * Most branch edges "jump" to other nodes but the rightmost branch edges
    207          * just continue without a jump.
    208          * Therefore, write() must write the rightmost branch edge last
    209          * (trie units are written backwards), and must write it at that point even if
    210          * it is a duplicate of a node previously written elsewhere.
    211          *
    212          * This function visits and marks right branch edges first.
    213          * Edges are numbered with increasingly negative values because we share the
    214          * offset field which gets positive values when nodes are written.
    215          * A branch edge also remembers the first number for any of its edges.
    216          *
    217          * When a further-left branch edge has a number in the range of the rightmost
    218          * edge's numbers, then it will be written as part of the required right edge
    219          * and we can avoid writing it first.
    220          *
    221          * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
    222          * edge numbers.
    223          *
    224          * @param edgeNumber The first edge number for this node and its sub-nodes.
    225          * @return An edge number that is at least the maximum-negative
    226          *         of the input edge number and the numbers of this node and all of its sub-nodes.
    227          */
    228         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    229         // write() must set the offset to a positive value.
    230         virtual void write(StringTrieBuilder &builder) = 0;
    231         // See markRightEdgesFirst.
    232         inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
    233                                                StringTrieBuilder &builder) {
    234             // Note: Edge numbers are negative, lastRight<=firstRight.
    235             // If offset>0 then this node and its sub-nodes have been written already
    236             // and we need not write them again.
    237             // If this node is part of the unwritten right branch edge,
    238             // then we wait until that is written.
    239             if(offset<0 && (offset<lastRight || firstRight<offset)) {
    240                 write(builder);
    241             }
    242         }
    243         inline int32_t getOffset() const { return offset; }
    244     protected:
    245         int32_t hash;
    246         int32_t offset;
    247     };
    248 
    249 #ifndef U_HIDE_INTERNAL_API
    250     // This class should not be overridden because
    251     // registerFinalValue() compares a stack-allocated FinalValueNode
    252     // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
    253     // with the input node, and the
    254     // !Node::operator==(other) used inside FinalValueNode::operator==(other)
    255     // will be false if the typeid's are different.
    256     /** @internal */
    257     class FinalValueNode : public Node {
    258     public:
    259         FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {}
    260         virtual UBool operator==(const Node &other) const;
    261         virtual void write(StringTrieBuilder &builder);
    262     protected:
    263         int32_t value;
    264     };
    265 #endif  /* U_HIDE_INTERNAL_API */
    266 
    267     // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
    268     // it is needed for layout of other objects.
    269     /**
    270      * @internal
    271      */
    272     class ValueNode : public Node {
    273     public:
    274         ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
    275         virtual UBool operator==(const Node &other) const;
    276         void setValue(int32_t v) {
    277             hasValue=TRUE;
    278             value=v;
    279             hash=hash*37u+v;
    280         }
    281     protected:
    282         UBool hasValue;
    283         int32_t value;
    284     };
    285 
    286 #ifndef U_HIDE_INTERNAL_API
    287     /**
    288      * @internal
    289      */
    290     class IntermediateValueNode : public ValueNode {
    291     public:
    292         IntermediateValueNode(int32_t v, Node *nextNode)
    293                 : ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); }
    294         virtual UBool operator==(const Node &other) const;
    295         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    296         virtual void write(StringTrieBuilder &builder);
    297     protected:
    298         Node *next;
    299     };
    300 #endif  /* U_HIDE_INTERNAL_API */
    301 
    302     // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
    303     // it is needed for layout of other objects.
    304     /**
    305      * @internal
    306      */
    307     class LinearMatchNode : public ValueNode {
    308     public:
    309         LinearMatchNode(int32_t len, Node *nextNode)
    310                 : ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)),
    311                   length(len), next(nextNode) {}
    312         virtual UBool operator==(const Node &other) const;
    313         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    314     protected:
    315         int32_t length;
    316         Node *next;
    317     };
    318 
    319 #ifndef U_HIDE_INTERNAL_API
    320     /**
    321      * @internal
    322      */
    323     class BranchNode : public Node {
    324     public:
    325         BranchNode(int32_t initialHash) : Node(initialHash) {}
    326     protected:
    327         int32_t firstEdgeNumber;
    328     };
    329 
    330     /**
    331      * @internal
    332      */
    333     class ListBranchNode : public BranchNode {
    334     public:
    335         ListBranchNode() : BranchNode(0x444444), length(0) {}
    336         virtual UBool operator==(const Node &other) const;
    337         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    338         virtual void write(StringTrieBuilder &builder);
    339         // Adds a unit with a final value.
    340         void add(int32_t c, int32_t value) {
    341             units[length]=(char16_t)c;
    342             equal[length]=NULL;
    343             values[length]=value;
    344             ++length;
    345             hash=(hash*37u+c)*37u+value;
    346         }
    347         // Adds a unit which leads to another match node.
    348         void add(int32_t c, Node *node) {
    349             units[length]=(char16_t)c;
    350             equal[length]=node;
    351             values[length]=0;
    352             ++length;
    353             hash=(hash*37u+c)*37u+hashCode(node);
    354         }
    355     protected:
    356         Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".
    357         int32_t length;
    358         int32_t values[kMaxBranchLinearSubNodeLength];
    359         char16_t units[kMaxBranchLinearSubNodeLength];
    360     };
    361 
    362     /**
    363      * @internal
    364      */
    365     class SplitBranchNode : public BranchNode {
    366     public:
    367         SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
    368                 : BranchNode(((0x555555u*37u+middleUnit)*37u+
    369                               hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)),
    370                   unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
    371         virtual UBool operator==(const Node &other) const;
    372         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    373         virtual void write(StringTrieBuilder &builder);
    374     protected:
    375         char16_t unit;
    376         Node *lessThan;
    377         Node *greaterOrEqual;
    378     };
    379 
    380     // Branch head node, for writing the actual node lead unit.
    381     /** @internal */
    382     class BranchHeadNode : public ValueNode {
    383     public:
    384         BranchHeadNode(int32_t len, Node *subNode)
    385                 : ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)),
    386                   length(len), next(subNode) {}
    387         virtual UBool operator==(const Node &other) const;
    388         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    389         virtual void write(StringTrieBuilder &builder);
    390     protected:
    391         int32_t length;
    392         Node *next;  // A branch sub-node.
    393     };
    394 #endif  /* U_HIDE_INTERNAL_API */
    395 
    396     /** @internal */
    397     virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
    398                                         Node *nextNode) const = 0;
    399 
    400     /** @internal */
    401     virtual int32_t write(int32_t unit) = 0;
    402     /** @internal */
    403     virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
    404     /** @internal */
    405     virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
    406     /** @internal */
    407     virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
    408     /** @internal */
    409     virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
    410 };
    411 
    412 U_NAMESPACE_END
    413 
    414 #endif  // __STRINGTRIEBUILDER_H__
    415