1 // 2 // rbbirb.h 3 // 4 // Copyright (C) 2002-2008, International Business Machines Corporation and others. 5 // All Rights Reserved. 6 // 7 // This file contains declarations for several classes from the 8 // Rule Based Break Iterator rule builder. 9 // 10 11 12 #ifndef RBBIRB_H 13 #define RBBIRB_H 14 15 #include "unicode/utypes.h" 16 #include "unicode/uobject.h" 17 #include "unicode/rbbi.h" 18 #include "unicode/uniset.h" 19 #include "unicode/parseerr.h" 20 #include "uhash.h" 21 #include "uvector.h" 22 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that 23 // looks up references to $variables within a set. 24 25 26 27 U_NAMESPACE_BEGIN 28 29 class RBBIRuleScanner; 30 struct RBBIRuleTableEl; 31 class RBBISetBuilder; 32 class RBBINode; 33 class RBBITableBuilder; 34 35 36 37 //-------------------------------------------------------------------------------- 38 // 39 // RBBISymbolTable. Implements SymbolTable interface that is used by the 40 // UnicodeSet parser to resolve references to $variables. 41 // 42 //-------------------------------------------------------------------------------- 43 class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one 44 public: // of these structs for each entry. 45 RBBISymbolTableEntry(); 46 UnicodeString key; 47 RBBINode *val; 48 ~RBBISymbolTableEntry(); 49 50 private: 51 RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class 52 RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class 53 }; 54 55 56 class RBBISymbolTable : public UMemory, public SymbolTable { 57 private: 58 const UnicodeString &fRules; 59 UHashtable *fHashTable; 60 RBBIRuleScanner *fRuleScanner; 61 62 // These next two fields are part of the mechanism for passing references to 63 // already-constructed UnicodeSets back to the UnicodeSet constructor 64 // when the pattern includes $variable references. 65 const UnicodeString ffffString; // = "/uffff" 66 UnicodeSet *fCachedSetLookup; 67 68 public: 69 // API inherited from class SymbolTable 70 virtual const UnicodeString* lookup(const UnicodeString& s) const; 71 virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const; 72 virtual UnicodeString parseReference(const UnicodeString& text, 73 ParsePosition& pos, int32_t limit) const; 74 75 // Additional Functions 76 RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status); 77 virtual ~RBBISymbolTable(); 78 79 virtual RBBINode *lookupNode(const UnicodeString &key) const; 80 virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err); 81 82 #ifdef RBBI_DEBUG 83 virtual void rbbiSymtablePrint() const; 84 #else 85 // A do-nothing inline function for non-debug builds. Member funcs can't be empty 86 // or the call sites won't compile. 87 int32_t fFakeField; 88 #define rbbiSymtablePrint() fFakeField=0; 89 #endif 90 91 private: 92 RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class 93 RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class 94 }; 95 96 97 //-------------------------------------------------------------------------------- 98 // 99 // class RBBIRuleBuilder The top-level class handling RBBI rule compiling. 100 // 101 //-------------------------------------------------------------------------------- 102 class RBBIRuleBuilder : public UMemory { 103 public: 104 105 // Create a rule based break iterator from a set of rules. 106 // This function is the main entry point into the rule builder. The 107 // public ICU API for creating RBBIs uses this function to do the actual work. 108 // 109 static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules, 110 UParseError *parseError, 111 UErrorCode &status); 112 113 public: 114 // The "public" functions and data members that appear below are accessed 115 // (and shared) by the various parts that make up the rule builder. They 116 // are NOT intended to be accessed by anything outside of the 117 // rule builder implementation. 118 RBBIRuleBuilder(const UnicodeString &rules, 119 UParseError *parseErr, 120 UErrorCode &status 121 ); 122 123 virtual ~RBBIRuleBuilder(); 124 char *fDebugEnv; // controls debug trace output 125 UErrorCode *fStatus; // Error reporting. Keeping status 126 UParseError *fParseError; // here avoids passing it everywhere. 127 const UnicodeString &fRules; // The rule string that we are compiling 128 129 RBBIRuleScanner *fScanner; // The scanner. 130 RBBINode *fForwardTree; // The parse trees, generated by the scanner, 131 RBBINode *fReverseTree; // then manipulated by subsequent steps. 132 RBBINode *fSafeFwdTree; 133 RBBINode *fSafeRevTree; 134 135 RBBINode **fDefaultTree; // For rules not qualified with a ! 136 // the tree to which they belong to. 137 138 UBool fChainRules; // True for chained Unicode TR style rules. 139 // False for traditional regexp rules. 140 141 UBool fLBCMNoChain; // True: suppress chaining of rules on 142 // chars with LineBreak property == CM. 143 144 UBool fLookAheadHardBreak; // True: Look ahead matches cause an 145 // immediate break, no continuing for the 146 // longest match. 147 148 RBBISetBuilder *fSetBuilder; // Set and Character Category builder. 149 UVector *fUSetNodes; // Vector of all uset nodes. 150 151 RBBITableBuilder *fForwardTables; // State transition tables 152 RBBITableBuilder *fReverseTables; 153 RBBITableBuilder *fSafeFwdTables; 154 RBBITableBuilder *fSafeRevTables; 155 156 UVector *fRuleStatusVals; // The values that can be returned 157 // from getRuleStatus(). 158 159 RBBIDataHeader *flattenData(); // Create the flattened (runtime format) 160 // data tables.. 161 private: 162 RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class 163 RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class 164 }; 165 166 167 168 169 //---------------------------------------------------------------------------- 170 // 171 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have 172 // been encountered. The val Node will be of nodetype uset 173 // and contain pointers to the actual UnicodeSets. 174 // The Key is the source string for initializing the set. 175 // 176 // The hash table is used to avoid creating duplicate 177 // unnamed (not $var references) UnicodeSets. 178 // 179 // Memory Management: 180 // The Hash Table owns these RBBISetTableEl structs and 181 // the key strings. It does NOT own the val nodes. 182 // 183 //---------------------------------------------------------------------------- 184 struct RBBISetTableEl { 185 UnicodeString *key; 186 RBBINode *val; 187 }; 188 189 190 //---------------------------------------------------------------------------- 191 // 192 // RBBIDebugPrintf Printf equivalent, for debugging output. 193 // Conditional compilation of the implementation lets us 194 // get rid of the stdio dependency in environments where it 195 // is unavailable. 196 // 197 //---------------------------------------------------------------------------- 198 #ifdef RBBI_DEBUG 199 #include <stdio.h> 200 #define RBBIDebugPrintf printf 201 #define RBBIDebugPuts puts 202 #else 203 #undef RBBIDebugPrintf 204 #define RBBIDebugPuts(arg) 205 #endif 206 207 U_NAMESPACE_END 208 #endif 209 210 211 212