Home | History | Annotate | Download | only in unicode
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2010-2012, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  stringtriebuilder.h
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2010dec24
     12 *   created by: Markus W. Scherer
     13 */
     14 
     15 #ifndef __STRINGTRIEBUILDER_H__
     16 #define __STRINGTRIEBUILDER_H__
     17 
     18 #include "unicode/utypes.h"
     19 #include "unicode/uobject.h"
     20 
     21 /**
     22  * \file
     23  * \brief C++ API: Builder API for trie builders
     24  */
     25 
     26 // Forward declaration.
     27 struct UHashtable;
     28 typedef struct UHashtable UHashtable;
     29 
     30 /**
     31  * Build options for BytesTrieBuilder and CharsTrieBuilder.
     32  * @stable ICU 4.8
     33  */
     34 enum UStringTrieBuildOption {
     35     /**
     36      * Builds a trie quickly.
     37      * @stable ICU 4.8
     38      */
     39     USTRINGTRIE_BUILD_FAST,
     40     /**
     41      * Builds a trie more slowly, attempting to generate
     42      * a shorter but equivalent serialization.
     43      * This build option also uses more memory.
     44      *
     45      * This option can be effective when many integer values are the same
     46      * and string/byte sequence suffixes can be shared.
     47      * Runtime speed is not expected to improve.
     48      * @stable ICU 4.8
     49      */
     50     USTRINGTRIE_BUILD_SMALL
     51 };
     52 
     53 U_NAMESPACE_BEGIN
     54 
     55 /**
     56  * Base class for string trie builder classes.
     57  *
     58  * This class is not intended for public subclassing.
     59  * @stable ICU 4.8
     60  */
     61 class U_COMMON_API StringTrieBuilder : public UObject {
     62 public:
     63 #ifndef U_HIDE_INTERNAL_API
     64     /** @internal */
     65     static UBool hashNode(const void *node);
     66     /** @internal */
     67     static UBool equalNodes(const void *left, const void *right);
     68 #endif  /* U_HIDE_INTERNAL_API */
     69 
     70 protected:
     71     // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
     72     // or else the compiler will create a public default constructor.
     73     /** @internal */
     74     StringTrieBuilder();
     75     /** @internal */
     76     virtual ~StringTrieBuilder();
     77 
     78 #ifndef U_HIDE_INTERNAL_API
     79     /** @internal */
     80     void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
     81     /** @internal */
     82     void deleteCompactBuilder();
     83 
     84     /** @internal */
     85     void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
     86 
     87     /** @internal */
     88     int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
     89     /** @internal */
     90     int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
     91 #endif  /* U_HIDE_INTERNAL_API */
     92 
     93     class Node;
     94 
     95 #ifndef U_HIDE_INTERNAL_API
     96     /** @internal */
     97     Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
     98     /** @internal */
     99     Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
    100                             int32_t length, UErrorCode &errorCode);
    101 #endif  /* U_HIDE_INTERNAL_API */
    102 
    103     /** @internal */
    104     virtual int32_t getElementStringLength(int32_t i) const = 0;
    105     /** @internal */
    106     virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
    107     /** @internal */
    108     virtual int32_t getElementValue(int32_t i) const = 0;
    109 
    110     // Finds the first unit index after this one where
    111     // the first and last element have different units again.
    112     /** @internal */
    113     virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
    114 
    115     // Number of different units at unitIndex.
    116     /** @internal */
    117     virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
    118     /** @internal */
    119     virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
    120     /** @internal */
    121     virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
    122 
    123     /** @internal */
    124     virtual UBool matchNodesCanHaveValues() const = 0;
    125 
    126     /** @internal */
    127     virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
    128     /** @internal */
    129     virtual int32_t getMinLinearMatch() const = 0;
    130     /** @internal */
    131     virtual int32_t getMaxLinearMatchLength() const = 0;
    132 
    133 #ifndef U_HIDE_INTERNAL_API
    134     // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
    135     /** @internal */
    136     static const int32_t kMaxBranchLinearSubNodeLength=5;
    137 
    138     // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
    139     // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
    140     /** @internal */
    141     static const int32_t kMaxSplitBranchLevels=14;
    142 
    143     /**
    144      * Makes sure that there is only one unique node registered that is
    145      * equivalent to newNode.
    146      * @param newNode Input node. The builder takes ownership.
    147      * @param errorCode ICU in/out UErrorCode.
    148                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
    149      * @return newNode if it is the first of its kind, or
    150      *         an equivalent node if newNode is a duplicate.
    151      * @internal
    152      */
    153     Node *registerNode(Node *newNode, UErrorCode &errorCode);
    154     /**
    155      * Makes sure that there is only one unique FinalValueNode registered
    156      * with this value.
    157      * Avoids creating a node if the value is a duplicate.
    158      * @param value A final value.
    159      * @param errorCode ICU in/out UErrorCode.
    160                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
    161      * @return A FinalValueNode with the given value.
    162      * @internal
    163      */
    164     Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
    165 
    166     /*
    167      * C++ note:
    168      * registerNode() and registerFinalValue() take ownership of their input nodes,
    169      * and only return owned nodes.
    170      * If they see a failure UErrorCode, they will delete the input node.
    171      * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
    172      * If there is a failure, they return NULL.
    173      *
    174      * NULL Node pointers can be safely passed into other Nodes because
    175      * they call the static Node::hashCode() which checks for a NULL pointer first.
    176      *
    177      * Therefore, as long as builder functions register a new node,
    178      * they need to check for failures only before explicitly dereferencing
    179      * a Node pointer, or before setting a new UErrorCode.
    180      */
    181 
    182     // Hash set of nodes, maps from nodes to integer 1.
    183     /** @internal */
    184     UHashtable *nodes;
    185 
    186     /** @internal */
    187     class Node : public UObject {
    188     public:
    189         Node(int32_t initialHash) : hash(initialHash), offset(0) {}
    190         inline int32_t hashCode() const { return hash; }
    191         // Handles node==NULL.
    192         static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
    193         // Base class operator==() compares the actual class types.
    194         virtual UBool operator==(const Node &other) const;
    195         inline UBool operator!=(const Node &other) const { return !operator==(other); }
    196         /**
    197          * Traverses the Node graph and numbers branch edges, with rightmost edges first.
    198          * This is to avoid writing a duplicate node twice.
    199          *
    200          * Branch nodes in this trie data structure are not symmetric.
    201          * Most branch edges "jump" to other nodes but the rightmost branch edges
    202          * just continue without a jump.
    203          * Therefore, write() must write the rightmost branch edge last
    204          * (trie units are written backwards), and must write it at that point even if
    205          * it is a duplicate of a node previously written elsewhere.
    206          *
    207          * This function visits and marks right branch edges first.
    208          * Edges are numbered with increasingly negative values because we share the
    209          * offset field which gets positive values when nodes are written.
    210          * A branch edge also remembers the first number for any of its edges.
    211          *
    212          * When a further-left branch edge has a number in the range of the rightmost
    213          * edge's numbers, then it will be written as part of the required right edge
    214          * and we can avoid writing it first.
    215          *
    216          * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
    217          * edge numbers.
    218          *
    219          * @param edgeNumber The first edge number for this node and its sub-nodes.
    220          * @return An edge number that is at least the maximum-negative
    221          *         of the input edge number and the numbers of this node and all of its sub-nodes.
    222          */
    223         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    224         // write() must set the offset to a positive value.
    225         virtual void write(StringTrieBuilder &builder) = 0;
    226         // See markRightEdgesFirst.
    227         inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
    228                                                StringTrieBuilder &builder) {
    229             // Note: Edge numbers are negative, lastRight<=firstRight.
    230             // If offset>0 then this node and its sub-nodes have been written already
    231             // and we need not write them again.
    232             // If this node is part of the unwritten right branch edge,
    233             // then we wait until that is written.
    234             if(offset<0 && (offset<lastRight || firstRight<offset)) {
    235                 write(builder);
    236             }
    237         }
    238         inline int32_t getOffset() const { return offset; }
    239     protected:
    240         int32_t hash;
    241         int32_t offset;
    242     private:
    243         // No ICU "poor man's RTTI" for this class nor its subclasses.
    244         virtual UClassID getDynamicClassID() const;
    245     };
    246 
    247     // This class should not be overridden because
    248     // registerFinalValue() compares a stack-allocated FinalValueNode
    249     // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
    250     // with the input node, and the
    251     // !Node::operator==(other) used inside FinalValueNode::operator==(other)
    252     // will be false if the typeid's are different.
    253     /** @internal */
    254     class FinalValueNode : public Node {
    255     public:
    256         FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
    257         virtual UBool operator==(const Node &other) const;
    258         virtual void write(StringTrieBuilder &builder);
    259     protected:
    260         int32_t value;
    261     };
    262 
    263     /**
    264      * @internal
    265      */
    266     class ValueNode : public Node {
    267     public:
    268         ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
    269         virtual UBool operator==(const Node &other) const;
    270         void setValue(int32_t v) {
    271             hasValue=TRUE;
    272             value=v;
    273             hash=hash*37+v;
    274         }
    275     protected:
    276         UBool hasValue;
    277         int32_t value;
    278     };
    279 
    280     /**
    281      * @internal
    282      */
    283     class IntermediateValueNode : public ValueNode {
    284     public:
    285         IntermediateValueNode(int32_t v, Node *nextNode)
    286                 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
    287         virtual UBool operator==(const Node &other) const;
    288         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    289         virtual void write(StringTrieBuilder &builder);
    290     protected:
    291         Node *next;
    292     };
    293 
    294     /**
    295      * @internal
    296      */
    297     class LinearMatchNode : public ValueNode {
    298     public:
    299         LinearMatchNode(int32_t len, Node *nextNode)
    300                 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
    301                   length(len), next(nextNode) {}
    302         virtual UBool operator==(const Node &other) const;
    303         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    304     protected:
    305         int32_t length;
    306         Node *next;
    307     };
    308 
    309     /**
    310      * @internal
    311      */
    312     class BranchNode : public Node {
    313     public:
    314         BranchNode(int32_t initialHash) : Node(initialHash) {}
    315     protected:
    316         int32_t firstEdgeNumber;
    317     };
    318 
    319     /**
    320      * @internal
    321      */
    322     class ListBranchNode : public BranchNode {
    323     public:
    324         ListBranchNode() : BranchNode(0x444444), length(0) {}
    325         virtual UBool operator==(const Node &other) const;
    326         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    327         virtual void write(StringTrieBuilder &builder);
    328         // Adds a unit with a final value.
    329         void add(int32_t c, int32_t value) {
    330             units[length]=(UChar)c;
    331             equal[length]=NULL;
    332             values[length]=value;
    333             ++length;
    334             hash=(hash*37+c)*37+value;
    335         }
    336         // Adds a unit which leads to another match node.
    337         void add(int32_t c, Node *node) {
    338             units[length]=(UChar)c;
    339             equal[length]=node;
    340             values[length]=0;
    341             ++length;
    342             hash=(hash*37+c)*37+hashCode(node);
    343         }
    344     protected:
    345         Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".
    346         int32_t length;
    347         int32_t values[kMaxBranchLinearSubNodeLength];
    348         UChar units[kMaxBranchLinearSubNodeLength];
    349     };
    350 
    351     /**
    352      * @internal
    353      */
    354     class SplitBranchNode : public BranchNode {
    355     public:
    356         SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
    357                 : BranchNode(((0x555555*37+middleUnit)*37+
    358                               hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
    359                   unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
    360         virtual UBool operator==(const Node &other) const;
    361         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    362         virtual void write(StringTrieBuilder &builder);
    363     protected:
    364         UChar unit;
    365         Node *lessThan;
    366         Node *greaterOrEqual;
    367     };
    368 
    369     // Branch head node, for writing the actual node lead unit.
    370     /** @internal */
    371     class BranchHeadNode : public ValueNode {
    372     public:
    373         BranchHeadNode(int32_t len, Node *subNode)
    374                 : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
    375                   length(len), next(subNode) {}
    376         virtual UBool operator==(const Node &other) const;
    377         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    378         virtual void write(StringTrieBuilder &builder);
    379     protected:
    380         int32_t length;
    381         Node *next;  // A branch sub-node.
    382     };
    383 #endif  /* U_HIDE_INTERNAL_API */
    384 
    385     /** @internal */
    386     virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
    387                                         Node *nextNode) const = 0;
    388 
    389     /** @internal */
    390     virtual int32_t write(int32_t unit) = 0;
    391     /** @internal */
    392     virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
    393     /** @internal */
    394     virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
    395     /** @internal */
    396     virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
    397     /** @internal */
    398     virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
    399 
    400 private:
    401     // No ICU "poor man's RTTI" for this class nor its subclasses.
    402     virtual UClassID getDynamicClassID() const;
    403 };
    404 
    405 U_NAMESPACE_END
    406 
    407 #endif  // __STRINGTRIEBUILDER_H__
    408