1 // 2 // rbbiscan.h 3 // 4 // Copyright (C) 2002-2008, International Business Machines Corporation and others. 5 // All Rights Reserved. 6 // 7 // This file contains declarations for class RBBIRuleScanner 8 // 9 10 11 #ifndef RBBISCAN_H 12 #define RBBISCAN_H 13 14 #include "unicode/utypes.h" 15 #include "unicode/uobject.h" 16 #include "unicode/rbbi.h" 17 #include "unicode/uniset.h" 18 #include "unicode/parseerr.h" 19 #include "uhash.h" 20 #include "uvector.h" 21 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that 22 // looks up references to $variables within a set. 23 #include "rbbinode.h" 24 //#include "rbbitblb.h" 25 26 27 28 U_NAMESPACE_BEGIN 29 30 class RBBIRuleBuilder; 31 class RBBISymbolTable; 32 33 34 //-------------------------------------------------------------------------------- 35 // 36 // class RBBIRuleScanner does the lowest level, character-at-a-time 37 // scanning of break iterator rules. 38 // 39 // The output of the scanner is parse trees for 40 // the rule expressions and a list of all Unicode Sets 41 // encountered. 42 // 43 //-------------------------------------------------------------------------------- 44 45 class RBBIRuleScanner : public UMemory { 46 public: 47 48 enum { 49 kStackSize = 100 // The size of the state stack for 50 }; // rules parsing. Corresponds roughly 51 // to the depth of parentheses nesting 52 // that is allowed in the rules. 53 54 struct RBBIRuleChar { 55 UChar32 fChar; 56 UBool fEscaped; 57 }; 58 59 RBBIRuleScanner(RBBIRuleBuilder *rb); 60 61 62 virtual ~RBBIRuleScanner(); 63 64 void nextChar(RBBIRuleChar &c); // Get the next char from the input stream. 65 // Return false if at end. 66 67 UBool push(const RBBIRuleChar &c); // Push (unget) one character. 68 // Only a single character may be pushed. 69 70 void parse(); // Parse the rules, generating two parse 71 // trees, one each for the forward and 72 // reverse rules, 73 // and a list of UnicodeSets encountered. 74 75 /** 76 * Return a rules string without unnecessary 77 * characters. 78 */ 79 static UnicodeString stripRules(const UnicodeString &rules); 80 private: 81 82 UBool doParseActions(int32_t a); 83 void error(UErrorCode e); // error reporting convenience function. 84 void fixOpStack(RBBINode::OpPrecedence p); 85 // a character. 86 void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL); 87 88 UChar32 nextCharLL(); 89 #ifdef RBBI_DEBUG 90 void printNodeStack(const char *title); 91 #endif 92 RBBINode *pushNewNode(RBBINode::NodeType t); 93 void scanSet(); 94 95 96 RBBIRuleBuilder *fRB; // The rule builder that we are part of. 97 98 int32_t fScanIndex; // Index of current character being processed 99 // in the rule input string. 100 int32_t fNextIndex; // Index of the next character, which 101 // is the first character not yet scanned. 102 UBool fQuoteMode; // Scan is in a 'quoted region' 103 int32_t fLineNum; // Line number in input file. 104 int32_t fCharNum; // Char position within the line. 105 UChar32 fLastChar; // Previous char, needed to count CR-LF 106 // as a single line, not two. 107 108 RBBIRuleChar fC; // Current char for parse state machine 109 // processing. 110 UnicodeString fVarName; // $variableName, valid when we've just 111 // scanned one. 112 113 RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule 114 // parsing. index by p[state][char-class] 115 116 uint16_t fStack[kStackSize]; // State stack, holds state pushes 117 int32_t fStackPtr; // and pops as specified in the state 118 // transition rules. 119 120 RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created 121 // during the parse of a rule 122 int32_t fNodeStackPtr; 123 124 125 UBool fReverseRule; // True if the rule currently being scanned 126 // is a reverse direction rule (if it 127 // starts with a '!') 128 129 UBool fLookAheadRule; // True if the rule includes a '/' 130 // somewhere within it. 131 132 RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of 133 // $variable symbols. 134 135 UHashtable *fSetTable; // UnicocodeSet hash table, holds indexes to 136 // the sets created while parsing rules. 137 // The key is the string used for creating 138 // the set. 139 140 UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during 141 // the scanning of RBBI rules. The 142 // indicies for these are assigned by the 143 // perl script that builds the state tables. 144 // See rbbirpt.h. 145 146 int32_t fRuleNum; // Counts each rule as it is scanned. 147 148 int32_t fOptionStart; // Input index of start of a !!option 149 // keyword, while being scanned. 150 151 UnicodeSet *gRuleSet_rule_char; 152 UnicodeSet *gRuleSet_white_space; 153 UnicodeSet *gRuleSet_name_char; 154 UnicodeSet *gRuleSet_name_start_char; 155 156 RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class 157 RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class 158 }; 159 160 U_NAMESPACE_END 161 162 #endif 163