#*****************************************************************************
#
#   Copyright (C) 2016 and later: Unicode, Inc. and others.
#   License & terms of use: http://www.unicode.org/copyright.html#License
#
#*****************************************************************************
#*****************************************************************************
#
#   Copyright (C) 2002-2016, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#*****************************************************************************
#
#  file:  rbbirpt.txt
#  ICU Break Iterator Rule Parser State Table
#
#     This state table is used when reading and parsing a set of RBBI rules.
#     The rule parser uses a state machine; the data in this file define the
#     state transitions that occur for each input character.
#
#     *** This file defines the RBBI rule grammar.   This is it.
#     *** The determination of what is accepted is here.
#
#     This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
#     that are then built with the rule parser.
#
#    perl rbbicst.pl    < rbbirpt.txt > rbbirpt.h

#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
#   input-char           n next-state           ^push-state     action
#   input-char           n next-state           ^push-state     action
#       |                |   |                      |             |
#       |                |   |                      |             |--- action to be performed by state machine
#       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
#       |                |   |                      |
#       |                |   |                      |--- Push this named state onto the state stack.
#       |                |   |                           Later, when next state is specified as "pop",
#       |                |   |                           the pushed state will become the current state.
#       |                |   |
#       |                |   |--- Transition to this state if the current input character matches the input
#       |                |        character or char class in the left hand column.  "pop" causes the next
#       |                |        state to be popped from the state stack.
#       |                |
#       |                |--- When making the state transition specified on this line, advance to the next
#       |                     character from the input only if 'n' appears here.
#       |
#       |--- Character or named character classes to test for.  If the current character being scanned
#            matches, perform the actions and go to the state specified on this line.
#            The input character is tested sequentially, in the order written.  The characters and
#            character classes tested for do not need to be mutually exclusive.  The first match wins.
#




#
#  start state, scan position is at the beginning of the rules file, or in between two rules.
#
start:
    escaped                 term                  ^break-rule-end   doExprStart
    white_space          n  start
    '^'                  n  start-after-caret     ^break-rule-end   doNoChain
    '$'                     scan-var-name         ^assign-or-rule   doExprStart
    '!'                  n  rev-option
    ';'                  n  start                                                   # ignore empty rules.
    eof                     exit
    default                 term                  ^break-rule-end   doExprStart

#
#  break-rule-end:  Returned from doing a break-rule expression.
#
break-rule-end:
    ';'                  n  start                                   doEndOfRule
    white_space          n  break-rule-end
    default                 errorDeath                              doRuleError

#
#  start of a rule, after having seen a '^' (inhibits rule chain in).
#     Similar to the main 'start' state in most respects, except
#          - empty rule is an error.
#          - A second '^' is an error.
#
start-after-caret:
    escaped                 term                                    doExprStart
    white_space          n  start-after-caret
    '^'                     errorDeath                              doRuleError     # two '^'s
    '$'                     scan-var-name         ^term-var-ref     doExprStart
    ';'                     errorDeath                              doRuleError     # ^ ;
    eof                     errorDeath                              doRuleError
    default                 term                                    doExprStart

#
#  !               We've just scanned a '!', indicating either a !!key word flag or a
#                  !Reverse rule.
#
rev-option:
    '!'                  n  option-scan1
    default                 reverse-rule          ^break-rule-end   doReverseDir

#
#  option-scan1 .. option-scan3   Scan a "!!option" key word:  a name_start_char,
#                  then any number of name_char, then optional white space and a ';'.
#
option-scan1:
    name_start_char      n  option-scan2                            doOptionStart
    default                 errorDeath                              doRuleError

option-scan2:
    name_char            n  option-scan2
    default                 option-scan3                            doOptionEnd

option-scan3:
    ';'                  n  start
    white_space          n  option-scan3
    default                 errorDeath                              doRuleError


reverse-rule:
    default                 term                  ^break-rule-end   doExprStart


#
#  term.  Eat through a single rule character, or a composite thing, which
#         could be a parenthesized expression, a variable name, or a Unicode Set.
#
term:
    escaped              n  expr-mod                                doRuleChar
    white_space          n  term
    rule_char            n  expr-mod                                doRuleChar
    '['                     scan-unicode-set      ^expr-mod
    '('                  n  term                  ^expr-mod         doLParen
    '$'                     scan-var-name         ^term-var-ref
    '.'                  n  expr-mod                                doDotAny
    default                 errorDeath                              doRuleError



#
#  term-var-ref   We've just finished scanning a reference to a $variable.
#                 Check that the variable was defined.
#                 The variable name scanning is in common with assignment statements,
#                 so the check can't be done there.
term-var-ref:
    default                 expr-mod                                doCheckVarDef


#
#   expr-mod      We've just finished scanning a term, now look for the optional
#                 trailing '*', '?', '+'
#
expr-mod:
    white_space          n  expr-mod
    '*'                  n  expr-cont                               doUnaryOpStar
    '+'                  n  expr-cont                               doUnaryOpPlus
    '?'                  n  expr-cont                               doUnaryOpQuestion
    default                 expr-cont


#
#  expr-cont      Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.
#
expr-cont:
    escaped                 term                                    doExprCatOperator
    white_space          n  expr-cont
    rule_char               term                                    doExprCatOperator
    '['                     term                                    doExprCatOperator
    '('                     term                                    doExprCatOperator
    '$'                     term                                    doExprCatOperator
    '.'                     term                                    doExprCatOperator
    '/'                     look-ahead                              doExprCatOperator
    '{'                  n  tag-open                                doExprCatOperator
    '|'                  n  term                                    doExprOrOperator
    ')'                  n  pop                                     doExprRParen
    default                 pop                                     doExprFinished


#
#  look-ahead    Scanning a '/', which identifies a break point, assuming that the
#                remainder of the expression matches.
#
#                Generate a parse tree as if this was a special kind of input symbol
#                appearing in an otherwise normal concatenation expression.
#
look-ahead:
    '/'                  n  expr-cont-no-slash                      doSlash
    default                 errorDeath


#
#  expr-cont-no-slash    Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.   Just like
#                                            expr-cont, above, except that no '/'
#                                            look-ahead symbol is permitted.
#
#                        White space loops back to this state rather than to expr-cont,
#                        so that the '/' restriction still applies after intervening
#                        white space.  expr-cont-no-tag treats white space the same way.
#
expr-cont-no-slash:
    escaped                 term                                    doExprCatOperator
    white_space          n  expr-cont-no-slash
    rule_char               term                                    doExprCatOperator
    '['                     term                                    doExprCatOperator
    '('                     term                                    doExprCatOperator
    '$'                     term                                    doExprCatOperator
    '.'                     term                                    doExprCatOperator
    '|'                  n  term                                    doExprOrOperator
    ')'                  n  pop                                     doExprRParen
    default                 pop                                     doExprFinished


#
#  tags             scanning a '{', the opening delimiter for a tag that identifies
#                   the kind of match.  Scan the whole {dddd} tag, where d=digit
#
tag-open:
    white_space          n  tag-open
    digit_char              tag-value                               doStartTagValue
    default                 errorDeath                              doTagExpectedError

tag-value:
    white_space          n  tag-close
    '}'                     tag-close
    digit_char           n  tag-value                               doTagDigit
    default                 errorDeath                              doTagExpectedError

tag-close:
    white_space          n  tag-close
    '}'                  n  expr-cont-no-tag                        doTagValue
    default                 errorDeath                              doTagExpectedError



#
#  expr-cont-no-tag    Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.   Just like
#                                            expr-cont, above, except that no "{ddd}"
#                                            tagging is permitted.
#
expr-cont-no-tag:
    escaped                 term                                    doExprCatOperator
    white_space          n  expr-cont-no-tag
    rule_char               term                                    doExprCatOperator
    '['                     term                                    doExprCatOperator
    '('                     term                                    doExprCatOperator
    '$'                     term                                    doExprCatOperator
    '.'                     term                                    doExprCatOperator
    '/'                     look-ahead                              doExprCatOperator
    '|'                  n  term                                    doExprOrOperator
    ')'                  n  pop                                     doExprRParen
    default                 pop                                     doExprFinished




#
#   Variable Name Scanning.
#
#                    The state that branched to here must have pushed a return state
#                    to go to after completion of the variable name scanning.
#
#                    The current input character must be the $ that introduces the name.
#                    The $ is consumed here rather than in the state that first detected it
#                    so that the doStartVariableName action only needs to happen in one
#                    place (here), and the other states don't need to worry about it.
#
scan-var-name:
    '$'                  n  scan-var-start                          doStartVariableName
    default                 errorDeath


scan-var-start:
    name_start_char      n  scan-var-body
    default                 errorDeath                              doVariableNameExpectedErr

scan-var-body:
    name_char            n  scan-var-body
    default                 pop                                     doEndVariableName



#
#  scan-unicode-set   Unicode Sets are parsed by the UnicodeSet class.
#                     Within the RBBI parser, after finding the first character
#                     of a Unicode Set, we just hand the rule input at that
#                     point off to the Unicode Set constructor, then pick
#                     up parsing after the close of the set.
#
#                     The action for this state invokes the UnicodeSet parser.
#
scan-unicode-set:
    '['                  n  pop                                     doScanUnicodeSet
    'p'                  n  pop                                     doScanUnicodeSet
    'P'                  n  pop                                     doScanUnicodeSet
    default                 errorDeath







#
#  assign-or-rule.   A $variable was encountered at the start of something, could be
#                    either an assignment statement or a rule, depending on whether an '='
#                    follows the variable name.  We get to this state when the variable name
#                    scanning does a return.
#
assign-or-rule:
    white_space          n  assign-or-rule
    '='                  n  term                  ^assign-end       doStartAssign   # variable was target of assignment
    default                 term-var-ref          ^break-rule-end                   # variable was a term in a rule



#
#  assign-end        This state is entered when the end of the expression on the
#                    right hand side of an assignment is found.  We get here via
#                    a pop; this state is pushed when the '=' in an assignment is found.
#
#                    The only thing allowed at this point is a ';'.  The RHS of an
#                    assignment must look like a rule expression, and we come here
#                    when what is being scanned no longer looks like an expression.
#
assign-end:
    ';'                  n  start                                   doEndAssign
    default                 errorDeath                              doRuleErrorAssignExpr



#
# errorDeath.   This state is specified as the next state whenever a syntax error
#               in the source rules is detected.  Barring bugs, the state machine will never
#               actually get here, but will stop because of the action associated with the error.
#               But, just in case, this state asks the state machine to exit.
errorDeath:
    default              n  errorDeath                              doExit

