1 // 2 // rbbisetb.h 3 /* 4 ********************************************************************** 5 * Copyright (c) 2001-2005, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 */ 9 10 #ifndef RBBISETB_H 11 #define RBBISETB_H 12 13 #include "unicode/utypes.h" 14 #include "unicode/uobject.h" 15 #include "rbbirb.h" 16 #include "uvector.h" 17 18 struct UNewTrie; 19 20 U_NAMESPACE_BEGIN 21 22 // 23 // RBBISetBuilder Derives the character categories used by the runtime RBBI engine 24 // from the Unicode Sets appearing in the source RBBI rules, and 25 // creates the TRIE table used to map from Unicode to the 26 // character categories. 27 // 28 29 30 // 31 // RangeDescriptor 32 // 33 // Each of the non-overlapping character ranges gets one of these descriptors. 34 // All of them are strung together in a linked list, which is kept in order 35 // (by character) 36 // 37 class RangeDescriptor : public UMemory { 38 public: 39 UChar32 fStartChar; // Start of range, unicode 32 bit value. 40 UChar32 fEndChar; // End of range, unicode 32 bit value. 41 int32_t fNum; // runtime-mapped input value for this range. 42 UVector *fIncludesSets; // vector of the the original 43 // Unicode sets that include this range. 44 // (Contains ptrs to uset nodes) 45 RangeDescriptor *fNext; // Next RangeDescriptor in the linked list. 46 47 RangeDescriptor(UErrorCode &status); 48 RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); 49 ~RangeDescriptor(); 50 void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with 51 // where appearing in the second (higher) part. 52 void setDictionaryFlag(); // Check whether this range appears as part of 53 // the Unicode set named "dictionary" 54 55 private: 56 RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class 57 RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class 58 }; 59 60 61 // 62 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. 63 // 64 // Starting with the rules parse tree from the scanner, 65 // 66 // - Enumerate the set of UnicodeSets that are referenced 67 // by the RBBI rules. 68 // - compute a derived set of non-overlapping UnicodeSets 69 // that will correspond to columns in the state table for 70 // the RBBI execution engine. 71 // - construct the trie table that maps input characters 72 // to set numbers in the non-overlapping set of sets. 73 // 74 75 76 class RBBISetBuilder : public UMemory { 77 public: 78 RBBISetBuilder(RBBIRuleBuilder *rb); 79 ~RBBISetBuilder(); 80 81 void build(); 82 void addValToSets(UVector *sets, uint32_t val); 83 void addValToSet (RBBINode *usetNode, uint32_t val); 84 int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the 85 // runtime state machine, which are the same as 86 // columns in the DFA state table 87 int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. 88 void serializeTrie(uint8_t *where); // write out the serialized Trie. 89 UChar32 getFirstChar(int32_t val) const; 90 UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo 91 // character were encountered. 92 #ifdef RBBI_DEBUG 93 void printSets(); 94 void printRanges(); 95 void printRangeGroups(); 96 #else 97 #define printSets() 98 #define printRanges() 99 #define printRangeGroups() 100 #endif 101 102 private: 103 void numberSets(); 104 105 RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. 106 UErrorCode *fStatus; 107 108 RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors 109 110 UNewTrie *fTrie; // The mapping TRIE that is the end result of processing 111 uint32_t fTrieSize; // the Unicode Sets. 112 113 // Groups correspond to character categories - 114 // groups of ranges that are in the same original UnicodeSets. 115 // fGroupCount is the index of the last used group. 116 // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. 117 // State table column 0 is not used. Column 1 is for end-of-input. 118 // column 2 is for group 0. Funny counting. 119 int32_t fGroupCount; 120 121 UBool fSawBOF; 122 123 RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class 124 RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class 125 }; 126 127 128 129 U_NAMESPACE_END 130 #endif 131