#*****************************************************************************
#
#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#*****************************************************************************
#
#   file:  rbbirpt.txt
#   ICU Break Iterator Rule Parser State Table
#
#     This state table is used when reading and parsing a set of RBBI rules.
#     The rule parser uses a state machine; the data in this file define the
#     state transitions that occur for each input character.
#
#     *** This file defines the RBBI rule grammar.   This is it.
#     *** The determination of what is accepted is here.
#
#     This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
#     that are then built with the rule parser.
#

#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
#   input-char           n next-state           ^push-state     action
#   input-char           n next-state           ^push-state     action
#       |                |   |                      |             |
#       |                |   |                      |             |--- action to be performed by state machine
#       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
#       |                |   |                      |
#       |                |   |                      |--- Push this named state onto the state stack.
#       |                |   |                           Later, when next state is specified as "pop",
#       |                |   |                           the pushed state will become the current state.
#       |                |   |
#       |                |   |--- Transition to this state if the current input character matches the input
#       |                |        character or char class in the left hand column.  "pop" causes the next
#       |                |        state to be popped from the state stack.
#       |                |
#       |                |--- When making the state transition specified on this line, advance to the next
#       |                     character from the input only if 'n' appears here.
#       |
#       |--- Character or named character classes to test for.  If the current character being scanned
#            matches, perform the actions and go to the state specified on this line.
#            The input character is tested sequentially, in the order written.
#            The characters and character classes tested for do not need to be
#            mutually exclusive.  The first match wins.
#


#
#  start             Scan position is at the very beginning of the rules file,
#                    or in between two rules.
#
start:
    escaped                 term                    ^break-rule-end     doExprStart
    white_space           n start
    '$'                     scan-var-name           ^assign-or-rule     doExprStart
    '!'                   n rev-option
    ';'                   n start                                                       # ignore empty rules.
    eof                     exit
    default                 term                    ^break-rule-end     doExprStart


#
#  break-rule-end    Reached by a pop after a break-rule expression has been scanned.
#                    Only the terminating ';' (after optional white space) is accepted.
#
break-rule-end:
    ';'                   n start                                       doEndOfRule
    white_space           n break-rule-end
    default                 errorDeath                                  doRuleError


#
#  rev-option        A '!' has just been scanned.  It introduces either a
#                    !!key word flag or a !Reverse rule.
#
rev-option:
    '!'                   n option-scan1
    default                 reverse-rule            ^break-rule-end     doReverseDir


#
#  option-scan1..3   Scanning of the key word that follows "!!":
#                    first name character, remaining name characters,
#                    then the closing ';'.
#
option-scan1:
    name_start_char       n option-scan2                                doOptionStart
    default                 errorDeath                                  doRuleError

option-scan2:
    name_char             n option-scan2
    default                 option-scan3                                doOptionEnd

option-scan3:
    ';'                   n start
    white_space           n option-scan3
    default                 errorDeath                                  doRuleError


#
#  reverse-rule      Body of a !Reverse rule; scanned as an ordinary expression.
#
reverse-rule:
    default                 term                    ^break-rule-end     doExprStart


#
#  term              Eat through a single rule character, or a composite thing, which
#                    could be a parenthesized expression, a variable name, or a Unicode Set.
#
term:
    escaped               n expr-mod                                    doRuleChar
    white_space           n term
    rule_char             n expr-mod                                    doRuleChar
    '['                     scan-unicode-set        ^expr-mod
    '('                   n term                    ^expr-mod           doLParen
    '$'                     scan-var-name           ^term-var-ref
    '.'                   n expr-mod                                    doDotAny
    default                 errorDeath                                  doRuleError



#
#  term-var-ref      We've just finished scanning a reference to a $variable.
#                    Check that the variable was defined.
#                    The variable name scanning is in common with assignment statements,
#                    so the check can't be done there.
term-var-ref:
    default                 expr-mod                                    doCheckVarDef


#
#  expr-mod          We've just finished scanning a term, now look for the optional
#                    trailing '*', '?', '+'
#
expr-mod:
    white_space           n expr-mod
    '*'                   n expr-cont                                   doUnaryOpStar
    '+'                   n expr-cont                                   doUnaryOpPlus
    '?'                   n expr-cont                                   doUnaryOpQuestion
    default                 expr-cont


#
#  expr-cont         Expression, continuation.  At a point where additional terms are
#                    allowed, but not required.
#
#                    Characters that begin a new term are not consumed here; they are
#                    left for the "term" state, after the concatenation operator
#                    action has been performed.
#
expr-cont:
    escaped                 term                                        doExprCatOperator
    white_space           n expr-cont
    rule_char               term                                        doExprCatOperator
    '['                     term                                        doExprCatOperator
    '('                     term                                        doExprCatOperator
    '$'                     term                                        doExprCatOperator
    '.'                     term                                        doExprCatOperator
    '/'                     look-ahead                                  doExprCatOperator
    '{'                   n tag-open                                    doExprCatOperator
    '|'                   n term                                        doExprOrOperator
    ')'                   n pop                                         doExprRParen
    default                 pop                                         doExprFinished


#
#  look-ahead        Scanning a '/', which identifies a break point, assuming that the
#                    remainder of the expression matches.
#
#                    Generate a parse tree as if this was a special kind of input symbol
#                    appearing in an otherwise normal concatenation expression.
#
look-ahead:
    '/'                   n expr-cont-no-slash                          doSlash
    default                 errorDeath


#
#  expr-cont-no-slash    Expression, continuation.  At a point where additional terms are
#                        allowed, but not required.  Just like
#                        expr-cont, above, except that no '/'
#                        look-ahead symbol is permitted.
#
#                        White space loops back to this same state.  Sending it to
#                        expr-cont would let a blank after the '/' re-enable the
#                        rows that this state deliberately leaves out.
#
expr-cont-no-slash:
    escaped                 term                                        doExprCatOperator
    white_space           n expr-cont-no-slash
    rule_char               term                                        doExprCatOperator
    '['                     term                                        doExprCatOperator
    '('                     term                                        doExprCatOperator
    '$'                     term                                        doExprCatOperator
    '.'                     term                                        doExprCatOperator
    '|'                   n term                                        doExprOrOperator
    ')'                   n pop                                         doExprRParen
    default                 pop                                         doExprFinished


#
#  tags              scanning a '{', the opening delimiter for a tag that identifies
#                    the kind of match.
#                    Scan the whole {dddd} tag, where d=digit
#
tag-open:
    white_space           n tag-open
    digit_char              tag-value                                   doStartTagValue
    default                 errorDeath                                  doTagExpectedError

tag-value:
    white_space           n tag-close
    '}'                     tag-close
    digit_char            n tag-value                                   doTagDigit
    default                 errorDeath                                  doTagExpectedError

tag-close:
    white_space           n tag-close
    '}'                   n expr-cont-no-tag                            doTagValue
    default                 errorDeath                                  doTagExpectedError



#
#  expr-cont-no-tag  Expression, continuation.  At a point where additional terms are
#                    allowed, but not required.  Just like
#                    expr-cont, above, except that no "{ddd}"
#                    tagging is permitted.
#
expr-cont-no-tag:
    escaped                 term                                        doExprCatOperator
    white_space           n expr-cont-no-tag
    rule_char               term                                        doExprCatOperator
    '['                     term                                        doExprCatOperator
    '('                     term                                        doExprCatOperator
    '$'                     term                                        doExprCatOperator
    '.'                     term                                        doExprCatOperator
    '/'                     look-ahead                                  doExprCatOperator
    '|'                   n term                                        doExprOrOperator
    ')'                   n pop                                         doExprRParen
    default                 pop                                         doExprFinished




#
#  Variable Name Scanning.
#
#                    The state that branched to here must have pushed a return state
#                    to go to after completion of the variable name scanning.
#
#                    The current input character must be the $ that introduces the name.
#                    The $ is consumed here rather than in the state that first detected it
#                    so that the doStartVariableName action only needs to happen in one
#                    place (here), and the other states don't need to worry about it.
#
scan-var-name:
    '$'                   n scan-var-start                              doStartVariableName
    default                 errorDeath


scan-var-start:
    name_start_char       n scan-var-body
    default                 errorDeath                                  doVariableNameExpectedErr

scan-var-body:
    name_char             n scan-var-body
    default                 pop                                         doEndVariableName



#
#  scan-unicode-set  Unicode Sets are parsed by the UnicodeSet class.
#                    Within the RBBI parser, after finding the first character
#                    of a Unicode Set, we just hand the rule input at that
#                    point off to the Unicode Set constructor, then pick
#                    up parsing after the close of the set.
#
#                    The action for this state invokes the UnicodeSet parser.
#
scan-unicode-set:
    '['                   n pop                                         doScanUnicodeSet
    'p'                   n pop                                         doScanUnicodeSet
    'P'                   n pop                                         doScanUnicodeSet
    default                 errorDeath







#
#  assign-or-rule    A $variable was encountered at the start of something, could be
#                    either an assignment statement or a rule, depending on whether an '='
#                    follows the variable name.  We get to this state when the variable name
#                    scanning does a return.
#
assign-or-rule:
    white_space           n assign-or-rule
    '='                   n term                    ^assign-end         doStartAssign   # variable was target of assignment
    default                 term-var-ref            ^break-rule-end                     # variable was a term in a rule



#
#  assign-end        This state is entered when the end of the expression on the
#                    right hand side of an assignment is found.  We get here via
#                    a pop; this state is pushed when the '=' in an assignment is found.
#
#                    The only thing allowed at this point is a ';'.  The RHS of an
#                    assignment must look like a rule expression, and we come here
#                    when what is being scanned no longer looks like an expression.
#
assign-end:
    ';'                   n start                                       doEndAssign
    default                 errorDeath                                  doRuleErrorAssignExpr



#
#  errorDeath        This state is specified as the next state whenever a syntax error
#                    in the source rules is detected.  Barring bugs, the state machine will never
#                    actually get here, but will stop because of the action associated with the error.
#                    But, just in case, this state asks the state machine to exit.
#
errorDeath:
    default               n errorDeath                                  doExit