Home | History | Annotate | Download | only in unicode
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2010-2011, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  stringtriebuilder.h
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2010dec24
     12 *   created by: Markus W. Scherer
     13 */
     14 
     15 #ifndef __STRINGTRIEBUILDER_H__
     16 #define __STRINGTRIEBUILDER_H__
     17 
     18 #include "unicode/utypes.h"
     19 #include "unicode/uobject.h"
     20 
     21 // Forward declaration.
     22 struct UHashtable;
     23 typedef struct UHashtable UHashtable;
     24 
     25 /**
     26  * Build options for BytesTrieBuilder and CharsTrieBuilder.
     27  * @draft ICU 4.8
     28  */
     29 enum UStringTrieBuildOption {
     30     /**
     31      * Builds a trie quickly.
     32      * @draft ICU 4.8
     33      */
     34     USTRINGTRIE_BUILD_FAST,
     35     /**
     36      * Builds a trie more slowly, attempting to generate
     37      * a shorter but equivalent serialization.
     38      * This build option also uses more memory.
     39      *
     40      * This option can be effective when many integer values are the same
     41      * and string/byte sequence suffixes can be shared.
     42      * Runtime speed is not expected to improve.
     43      * @draft ICU 4.8
     44      */
     45     USTRINGTRIE_BUILD_SMALL
     46 };
     47 
     48 U_NAMESPACE_BEGIN
     49 
     50 /**
     51  * Base class for string trie builder classes.
     52  *
     53  * This class is not intended for public subclassing.
     54  * @draft ICU 4.8
     55  */
     56 class U_COMMON_API StringTrieBuilder : public UObject {
     57 public:
     58     /** @internal */
     59     static UBool hashNode(const void *node);
     60     /** @internal */
     61     static UBool equalNodes(const void *left, const void *right);
     62 
     63 protected:
     64     /** @internal */
     65     StringTrieBuilder();
     66     /** @internal */
     67     virtual ~StringTrieBuilder();
     68 
     69     /** @internal */
     70     void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
     71     /** @internal */
     72     void deleteCompactBuilder();
     73 
     74     /** @internal */
     75     void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
     76 
     77     /** @internal */
     78     int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
     79     /** @internal */
     80     int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
     81 
     82     class Node;
     83 
     84     /** @internal */
     85     Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
     86     /** @internal */
     87     Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
     88                             int32_t length, UErrorCode &errorCode);
     89 
     90     /** @internal */
     91     virtual int32_t getElementStringLength(int32_t i) const = 0;
     92     /** @internal */
     93     virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
     94     /** @internal */
     95     virtual int32_t getElementValue(int32_t i) const = 0;
     96 
     97     // Finds the first unit index after this one where
     98     // the first and last element have different units again.
     99     /** @internal */
    100     virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
    101 
    102     // Number of different units at unitIndex.
    103     /** @internal */
    104     virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
    105     /** @internal */
    106     virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
    107     /** @internal */
    108     virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
    109 
    110     /** @internal */
    111     virtual UBool matchNodesCanHaveValues() const = 0;
    112 
    113     /** @internal */
    114     virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
    115     /** @internal */
    116     virtual int32_t getMinLinearMatch() const = 0;
    117     /** @internal */
    118     virtual int32_t getMaxLinearMatchLength() const = 0;
    119 
    120     // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
    121     /** @internal */
    122     static const int32_t kMaxBranchLinearSubNodeLength=5;
    123 
    124     // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
    125     // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
    126     /** @internal */
    127     static const int32_t kMaxSplitBranchLevels=14;
    128 
    129     /**
    130      * Makes sure that there is only one unique node registered that is
    131      * equivalent to newNode.
    132      * @param newNode Input node. The builder takes ownership.
    133      * @param errorCode ICU in/out UErrorCode.
    134                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
    135      * @return newNode if it is the first of its kind, or
    136      *         an equivalent node if newNode is a duplicate.
    137      * @internal
    138      */
    139     Node *registerNode(Node *newNode, UErrorCode &errorCode);
    140     /**
    141      * Makes sure that there is only one unique FinalValueNode registered
    142      * with this value.
    143      * Avoids creating a node if the value is a duplicate.
    144      * @param value A final value.
    145      * @param errorCode ICU in/out UErrorCode.
    146                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
    147      * @return A FinalValueNode with the given value.
    148      * @internal
    149      */
    150     Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
    151 
    152     /*
    153      * C++ note:
    154      * registerNode() and registerFinalValue() take ownership of their input nodes,
    155      * and only return owned nodes.
    156      * If they see a failure UErrorCode, they will delete the input node.
    157      * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
    158      * If there is a failure, they return NULL.
    159      *
    160      * NULL Node pointers can be safely passed into other Nodes because
    161      * they call the static Node::hashCode() which checks for a NULL pointer first.
    162      *
    163      * Therefore, as long as builder functions register a new node,
    164      * they need to check for failures only before explicitly dereferencing
    165      * a Node pointer, or before setting a new UErrorCode.
    166      */
    167 
    168     // Hash set of nodes, maps from nodes to integer 1.
    169     /** @internal */
    170     UHashtable *nodes;
    171 
    172     /** @internal */
    173     class Node : public UObject {
    174     public:
    175         Node(int32_t initialHash) : hash(initialHash), offset(0) {}
    176         inline int32_t hashCode() const { return hash; }
    177         // Handles node==NULL.
    178         static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
    179         // Base class operator==() compares the actual class types.
    180         virtual UBool operator==(const Node &other) const;
    181         inline UBool operator!=(const Node &other) const { return !operator==(other); }
    182         /**
    183          * Traverses the Node graph and numbers branch edges, with rightmost edges first.
    184          * This is to avoid writing a duplicate node twice.
    185          *
    186          * Branch nodes in this trie data structure are not symmetric.
    187          * Most branch edges "jump" to other nodes but the rightmost branch edges
    188          * just continue without a jump.
    189          * Therefore, write() must write the rightmost branch edge last
    190          * (trie units are written backwards), and must write it at that point even if
    191          * it is a duplicate of a node previously written elsewhere.
    192          *
    193          * This function visits and marks right branch edges first.
    194          * Edges are numbered with increasingly negative values because we share the
    195          * offset field which gets positive values when nodes are written.
    196          * A branch edge also remembers the first number for any of its edges.
    197          *
    198          * When a further-left branch edge has a number in the range of the rightmost
    199          * edge's numbers, then it will be written as part of the required right edge
    200          * and we can avoid writing it first.
    201          *
    202          * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
    203          * edge numbers.
    204          *
    205          * @param edgeNumber The first edge number for this node and its sub-nodes.
    206          * @return An edge number that is at least the maximum-negative
    207          *         of the input edge number and the numbers of this node and all of its sub-nodes.
    208          */
    209         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    210         // write() must set the offset to a positive value.
    211         virtual void write(StringTrieBuilder &builder) = 0;
    212         // See markRightEdgesFirst.
    213         inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
    214                                                StringTrieBuilder &builder) {
    215             // Note: Edge numbers are negative, lastRight<=firstRight.
    216             // If offset>0 then this node and its sub-nodes have been written already
    217             // and we need not write them again.
    218             // If this node is part of the unwritten right branch edge,
    219             // then we wait until that is written.
    220             if(offset<0 && (offset<lastRight || firstRight<offset)) {
    221                 write(builder);
    222             }
    223         }
    224         inline int32_t getOffset() const { return offset; }
    225     protected:
    226         int32_t hash;
    227         int32_t offset;
    228     private:
    229         // No ICU "poor man's RTTI" for this class nor its subclasses.
    230         virtual UClassID getDynamicClassID() const;
    231     };
    232 
    233     // This class should not be overridden because
    234     // registerFinalValue() compares a stack-allocated FinalValueNode
    235     // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
    236     // with the input node, and the
    237     // !Node::operator==(other) used inside FinalValueNode::operator==(other)
    238     // will be false if the typeid's are different.
    239     /** @internal */
    240     class FinalValueNode : public Node {
    241     public:
    242         FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
    243         virtual UBool operator==(const Node &other) const;
    244         virtual void write(StringTrieBuilder &builder);
    245     protected:
    246         int32_t value;
    247     };
    248 
    249     /** @internal */
    250     class ValueNode : public Node {
    251     public:
    252         ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
    253         virtual UBool operator==(const Node &other) const;
    254         void setValue(int32_t v) {
    255             hasValue=TRUE;
    256             value=v;
    257             hash=hash*37+v;
    258         }
    259     protected:
    260         UBool hasValue;
    261         int32_t value;
    262     };
    263 
    264     /** @internal */
    265     class IntermediateValueNode : public ValueNode {
    266     public:
    267         IntermediateValueNode(int32_t v, Node *nextNode)
    268                 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
    269         virtual UBool operator==(const Node &other) const;
    270         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    271         virtual void write(StringTrieBuilder &builder);
    272     protected:
    273         Node *next;
    274     };
    275 
    276     /** @internal */
    277     class LinearMatchNode : public ValueNode {
    278     public:
    279         LinearMatchNode(int32_t len, Node *nextNode)
    280                 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
    281                   length(len), next(nextNode) {}
    282         virtual UBool operator==(const Node &other) const;
    283         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    284     protected:
    285         int32_t length;
    286         Node *next;
    287     };
    288 
    289     /** @internal */
    290     class BranchNode : public Node {
    291     public:
    292         BranchNode(int32_t initialHash) : Node(initialHash) {}
    293     protected:
    294         int32_t firstEdgeNumber;
    295     };
    296 
    297     /** @internal */
    298     class ListBranchNode : public BranchNode {
    299     public:
    300         ListBranchNode() : BranchNode(0x444444), length(0) {}
    301         virtual UBool operator==(const Node &other) const;
    302         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    303         virtual void write(StringTrieBuilder &builder);
    304         // Adds a unit with a final value.
    305         void add(int32_t c, int32_t value) {
    306             units[length]=(UChar)c;
    307             equal[length]=NULL;
    308             values[length]=value;
    309             ++length;
    310             hash=(hash*37+c)*37+value;
    311         }
    312         // Adds a unit which leads to another match node.
    313         void add(int32_t c, Node *node) {
    314             units[length]=(UChar)c;
    315             equal[length]=node;
    316             values[length]=0;
    317             ++length;
    318             hash=(hash*37+c)*37+hashCode(node);
    319         }
    320     protected:
    321         Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".
    322         int32_t length;
    323         int32_t values[kMaxBranchLinearSubNodeLength];
    324         UChar units[kMaxBranchLinearSubNodeLength];
    325     };
    326 
    327     /** @internal */
    328     class SplitBranchNode : public BranchNode {
    329     public:
    330         SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
    331                 : BranchNode(((0x555555*37+middleUnit)*37+
    332                               hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
    333                   unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
    334         virtual UBool operator==(const Node &other) const;
    335         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    336         virtual void write(StringTrieBuilder &builder);
    337     protected:
    338         UChar unit;
    339         Node *lessThan;
    340         Node *greaterOrEqual;
    341     };
    342 
    343     // Branch head node, for writing the actual node lead unit.
    344     /** @internal */
    345     class BranchHeadNode : public ValueNode {
    346     public:
    347         BranchHeadNode(int32_t len, Node *subNode)
    348                 : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
    349                   length(len), next(subNode) {}
    350         virtual UBool operator==(const Node &other) const;
    351         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    352         virtual void write(StringTrieBuilder &builder);
    353     protected:
    354         int32_t length;
    355         Node *next;  // A branch sub-node.
    356     };
    357 
    358     /** @internal */
    359     virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
    360                                         Node *nextNode) const = 0;
    361 
    362     /** @internal */
    363     virtual int32_t write(int32_t unit) = 0;
    364     /** @internal */
    365     virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
    366     /** @internal */
    367     virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
    368     /** @internal */
    369     virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
    370     /** @internal */
    371     virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
    372 
    373 private:
    374     // No ICU "poor man's RTTI" for this class nor its subclasses.
    375     virtual UClassID getDynamicClassID() const;
    376 };
    377 
    378 U_NAMESPACE_END
    379 
    380 #endif  // __STRINGTRIEBUILDER_H__
    381