1 2 #***************************************************************************** 3 # 4 # Copyright (C) 2002-2007, International Business Machines Corporation and others. 5 # All Rights Reserved. 6 # 7 #***************************************************************************** 8 # 9 # file: regexcst.txt 10 # ICU Regular Expression Parser State Table 11 # 12 # This state table is used when reading and parsing a regular expression pattern. 13 # The pattern parser uses a state machine; the data in this file define the 14 # state transitions that occur for each input character. 15 # 16 # *** This file defines the regex pattern grammar. This is it. 17 # *** The determination of what is accepted is here. 18 # 19 # This file is processed by a perl script "regexcst.pl" to produce initialized C arrays 20 # that are then built with the regular expression pattern parser. 21 # 22 23 # 24 # Here is the syntax of the state definitions in this file: 25 # 26 # 27 #StateName: 28 # input-char n next-state ^push-state action 29 # input-char n next-state ^push-state action 30 # | | | | | 31 # | | | | |--- action to be performed by state machine 32 # | | | | See function RegexCompile::doParseActions() 33 # | | | | 34 # | | | |--- Push this named state onto the state stack. 35 # | | | Later, when next state is specified as "pop", 36 # | | | the pushed state will become the current state. 37 # | | | 38 # | | |--- Transition to this state if the current input character matches the input 39 # | | character or char class in the left hand column. "pop" causes the next 40 # | | state to be popped from the state stack. 41 # | | 42 # | |--- When making the state transition specified on this line, advance to the next 43 # | character from the input only if 'n' appears here. 44 # | 45 # |--- Character or named character classes to test for. If the current character being scanned 46 # matches, perform the actions and go to the state specified on this line. 
47 # The input character is tested sequentially, in the order written. The characters and 48 # character classes tested for do not need to be mutually exclusive. The first match wins. 49 # 50 51 52 53 54 # 55 # start state, scan position is at the beginning of the pattern. 56 # 57 start: 58 default term doPatStart 59 60 61 62 63 # 64 # term. At a position where we can accept the start of most items in a pattern. 65 # 66 term: 67 quoted n expr-quant doLiteralChar 68 rule_char n expr-quant doLiteralChar 69 '[' n set-open ^set-finish doSetBegin 70 '(' n open-paren 71 '.' n expr-quant doDotAny 72 '^' n expr-quant doCaret 73 '$' n expr-quant doDollar 74 '\' n backslash 75 '|' n term doOrOperator 76 ')' n pop doCloseParen 77 eof term doPatFinish 78 default errorDeath doRuleError 79 80 81 82 # 83 # expr-quant We've just finished scanning a term, now look for the optional 84 # trailing quantifier - *, +, ?, *?, etc. 85 # 86 expr-quant: 87 '*' n quant-star 88 '+' n quant-plus 89 '?' n quant-opt 90 '{' n interval-open doIntervalInit 91 '(' n open-paren-quant 92 default expr-cont 93 94 95 # 96 # expr-cont Expression, continuation. At a point where additional terms are 97 # allowed, but not required. No quantifiers. 98 # 99 expr-cont: 100 '|' n term doOrOperator 101 ')' n pop doCloseParen 102 default term 103 104 105 # 106 # open-paren-quant Special case handling for comments appearing before a quantifier, 107 # e.g. x(?#comment )* 108 # Open parens from expr-quant come here; anything but a (?# comment 109 # branches into the normal parenthesis sequence as quickly as possible. 110 # 111 open-paren-quant: 112 '?' n open-paren-quant2 doSuppressComments 113 default open-paren 114 115 open-paren-quant2: 116 '#' n paren-comment ^expr-quant 117 default open-paren-extended 118 119 120 # 121 # open-paren We've got an open paren. We need to scan further to 122 # determine what kind of group it is - plain (, (?:, (?>, or whatever. 123 # 124 open-paren: 125 '?' 
n open-paren-extended doSuppressComments 126 default term ^expr-quant doOpenCaptureParen 127 128 open-paren-extended: 129 ':' n term ^expr-quant doOpenNonCaptureParen # (?: 130 '>' n term ^expr-quant doOpenAtomicParen # (?> 131 '=' n term ^expr-cont doOpenLookAhead # (?= 132 '!' n term ^expr-cont doOpenLookAheadNeg # (?! 133 '<' n open-paren-lookbehind 134 '#' n paren-comment ^term 135 'i' paren-flag doBeginMatchMode 136 'd' paren-flag doBeginMatchMode 137 'm' paren-flag doBeginMatchMode 138 's' paren-flag doBeginMatchMode 139 'u' paren-flag doBeginMatchMode 140 'w' paren-flag doBeginMatchMode 141 'x' paren-flag doBeginMatchMode 142 '-' paren-flag doBeginMatchMode 143 '(' n errorDeath doConditionalExpr 144 '{' n errorDeath doPerlInline 145 default errorDeath doBadOpenParenType 146 147 open-paren-lookbehind: 148 '=' n term ^expr-cont doOpenLookBehind # (?<= 149 '!' n term ^expr-cont doOpenLookBehindNeg # (?<! 150 default errorDeath doBadOpenParenType 151 152 153 # 154 # paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')' 155 # 156 paren-comment: 157 ')' n pop 158 eof errorDeath doMismatchedParenErr 159 default n paren-comment 160 161 # 162 # paren-flag Scanned a (?idmsuwx-idmsuwx flag setting 163 # 164 paren-flag: 165 'i' n paren-flag doMatchMode 166 'd' n paren-flag doMatchMode 167 'm' n paren-flag doMatchMode 168 's' n paren-flag doMatchMode 169 'u' n paren-flag doMatchMode 170 'w' n paren-flag doMatchMode 171 'x' n paren-flag doMatchMode 172 '-' n paren-flag doMatchMode 173 ')' n term doSetMatchMode 174 ':' n term ^expr-quant doMatchModeParen 175 default errorDeath doBadModeFlag 176 177 178 # 179 # quant-star Scanning a '*' quantifier. Need to look ahead to decide 180 # between plain '*', '*?', '*+' 181 # 182 quant-star: 183 '?' n expr-cont doNGStar # *? 184 '+' n expr-cont doPossessiveStar # *+ 185 default expr-cont doStar # * 186 187 188 # 189 # quant-plus Scanning a '+' quantifier. 
Need to look ahead to decide 190 # between plain '+', '+?', '++' 191 # 192 quant-plus: 193 '?' n expr-cont doNGPlus # +? 194 '+' n expr-cont doPossessivePlus # ++ 195 default expr-cont doPlus # + 196 197 198 # 199 # quant-opt Scanning a '?' quantifier. Need to look ahead to decide 200 # between plain '?', '??', '?+' 201 # 202 quant-opt: 203 '?' n expr-cont doNGOpt # ?? 204 '+' n expr-cont doPossessiveOpt # ?+ 205 default expr-cont doOpt # ? 206 207 208 # 209 # Interval scanning a '{', the opening delimiter for an interval specification 210 # {number} or {min, max} or {min,} 211 # 212 interval-open: 213 digit_char interval-lower 214 default errorDeath doIntervalError 215 216 interval-lower: 217 digit_char n interval-lower doIntevalLowerDigit 218 ',' n interval-upper 219 '}' n interval-type doIntervalSame # {n} 220 default errorDeath doIntervalError 221 222 interval-upper: 223 digit_char n interval-upper doIntervalUpperDigit 224 '}' n interval-type 225 default errorDeath doIntervalError 226 227 interval-type: 228 '?' n expr-cont doNGInterval # {n,m}? 229 '+' n expr-cont doPossessiveInterval # {n,m}+ 230 default expr-cont doInterval # {n,m} 231 232 233 # 234 # backslash # Backslash. Figure out which of the \thingies we have encountered. 235 # The low level next-char function will have preprocessed 236 # some of them already; those won't come here. 
237 backslash: 238 'A' n term doBackslashA 239 'B' n term doBackslashB 240 'b' n term doBackslashb 241 'd' n expr-quant doBackslashd 242 'D' n expr-quant doBackslashD 243 'G' n term doBackslashG 244 'N' expr-quant doNamedChar # \N{NAME} named char 245 'p' expr-quant doProperty # \p{Lu} style property 246 'P' expr-quant doProperty # \P{Lu} style property, same action as \p 247 'Q' n term doEnterQuoteMode 248 'S' n expr-quant doBackslashS 249 's' n expr-quant doBackslashs 250 'W' n expr-quant doBackslashW 251 'w' n expr-quant doBackslashw 252 'X' n expr-quant doBackslashX 253 'Z' n term doBackslashZ 254 'z' n term doBackslashz 255 digit_char n expr-quant doBackRef # Will scan multiple digits 256 eof errorDeath doEscapeError 257 default n expr-quant doEscapedLiteralChar 258 259 260 261 # 262 # [set expression] parsing. 263 # All states involved in parsing set expressions have names beginning with "set-" 264 # 265 266 set-open: 267 '^' n set-open2 doSetNegate 268 ':' set-posix doSetPosixProp 269 default set-open2 270 271 set-open2: 272 ']' n set-after-lit doSetLiteral 273 default set-start 274 275 # set-posix: 276 # Scanned a '[:'. If it really is a [:property:], doSetPosixProp will have 277 # moved the scan to the closing ']'. If it wasn't a property 278 # expression, the scan will still be at the opening ':', which should 279 # be interpreted as a normal set expression. 280 set-posix: 281 ']' n pop doSetEnd 282 ':' set-start 283 default errorDeath doRuleError # should not be possible. 284 285 # 286 # set-start After the [ and special case leading characters (^ and/or ]) but before 287 # everything else. A '-' is literal at this point. 288 # 289 set-start: 290 ']' n pop doSetEnd 291 '[' n set-open ^set-after-set doSetBeginUnion 292 '\' n set-escape 293 '-' n set-start-dash 294 '&' n set-start-amp 295 default n set-after-lit doSetLiteral 296 297 # set-start-dash Turn "[--" into a syntax error. 298 # "[-x" is good, - and x are literals. 
299 # 300 set-start-dash: 301 '-' errorDeath doRuleError 302 default set-after-lit doSetAddDash 303 304 # set-start-amp Turn "[&&" into a syntax error. 305 # "[&x" is good, & and x are literals. 306 # 307 set-start-amp: 308 '&' errorDeath doRuleError 309 default set-after-lit doSetAddAmp 310 311 # 312 # set-after-lit The last thing scanned was a literal character within a set. 313 # Can be followed by anything. Single '-' or '&' are 314 # literals in this context, not operators. 315 set-after-lit: 316 ']' n pop doSetEnd 317 '[' n set-open ^set-after-set doSetBeginUnion 318 '-' n set-lit-dash 319 '&' n set-lit-amp 320 '\' n set-escape 321 eof errorDeath doSetNoCloseError 322 default n set-after-lit doSetLiteral 323 324 set-after-set: 325 ']' n pop doSetEnd 326 '[' n set-open ^set-after-set doSetBeginUnion 327 '-' n set-set-dash 328 '&' n set-set-amp 329 '\' n set-escape 330 eof errorDeath doSetNoCloseError 331 default n set-after-lit doSetLiteral 332 333 set-after-range: 334 ']' n pop doSetEnd 335 '[' n set-open ^set-after-set doSetBeginUnion 336 '-' n set-range-dash 337 '&' n set-range-amp 338 '\' n set-escape 339 eof errorDeath doSetNoCloseError 340 default n set-after-lit doSetLiteral 341 342 343 # set-after-op 344 # After a -- or && 345 # It is an error to close a set at this point. 346 # 347 set-after-op: 348 '[' n set-open ^set-after-set doSetBeginUnion 349 ']' errorDeath doSetOpError 350 '\' n set-escape 351 default n set-after-lit doSetLiteral 352 353 # 354 # set-set-amp 355 # Have scanned [[set]& 356 # Could be a '&' intersection operator, if a set follows. 357 # Could be the start of a '&&' operator. 358 # Otherwise is a literal. 
359 set-set-amp: 360 '[' n set-open ^set-after-set doSetBeginIntersection1 361 '&' n set-after-op doSetIntersection2 362 default set-after-lit doSetAddAmp 363 364 365 # set-lit-amp Have scanned "[literals&" 366 # Could be a start of "&&" operator or a literal 367 # In [abc&[def]], the '&' is a literal 368 # 369 set-lit-amp: 370 '&' n set-after-op doSetIntersection2 371 default set-after-lit doSetAddAmp 372 373 374 # 375 # set-set-dash 376 # Have scanned [set]- 377 # Could be a '-' difference operator, if a [set] follows. 378 # Could be the start of a '--' operator. 379 # Otherwise is a literal. 380 set-set-dash: 381 '[' n set-open ^set-after-set doSetBeginDifference1 382 '-' n set-after-op doSetDifference2 383 default set-after-lit doSetAddDash 384 385 386 # 387 # set-range-dash 388 # Scanned a-b- or \w- 389 # Any set or range like item where the trailing single '-' should 390 # be literal, not a set difference operation. 391 # A trailing "--" is still a difference operator. 392 set-range-dash: 393 '-' n set-after-op doSetDifference2 394 default set-after-lit doSetAddDash 395 396 397 set-range-amp: 398 '&' n set-after-op doSetIntersection2 399 default set-after-lit doSetAddAmp 400 401 402 # set-lit-dash 403 # Have scanned "[literals-" Could be a range or a -- operator or a literal 404 # In [abc-[def]], the '-' is a literal (confirmed with a Java test) 405 # [abc-\p{xx} the '-' is an error 406 # [abc-] the '-' is a literal 407 # [ab-xy] the '-' is a range 408 # 409 set-lit-dash: 410 '-' n set-after-op doSetDifference2 411 '[' set-after-lit doSetAddDash 412 ']' set-after-lit doSetAddDash 413 '\' n set-lit-dash-escape 414 default n set-after-range doSetRange 415 416 # set-lit-dash-escape 417 # 418 # Scanned "[literal-\" 419 # Could be a range, if the \ introduces an escaped literal char or a named char. 420 # Otherwise it is an error. 
421 # NOTE(review): 'p' and 'P' have no row below and fall through to the default doSetRange row - confirm that a property escape after "literal-" is still rejected. 422 set-lit-dash-escape: 423 's' errorDeath doSetOpError 424 'S' errorDeath doSetOpError 425 'w' errorDeath doSetOpError 426 'W' errorDeath doSetOpError 427 'd' errorDeath doSetOpError 428 'D' errorDeath doSetOpError 429 'N' set-after-range doSetNamedRange 430 default n set-after-range doSetRange 431 432 433 # 434 # set-escape 435 # Common back-slash escape processing within set expressions 436 # 437 set-escape: 438 'p' set-after-set doSetProp 439 'P' set-after-set doSetProp 440 'N' set-after-lit doSetNamedChar 441 's' n set-after-range doSetBackslash_s 442 'S' n set-after-range doSetBackslash_S 443 'w' n set-after-range doSetBackslash_w 444 'W' n set-after-range doSetBackslash_W 445 'd' n set-after-range doSetBackslash_d 446 'D' n set-after-range doSetBackslash_D 447 default n set-after-lit doSetLiteralEscaped 448 449 # 450 # set-finish 451 # Have just encountered the final ']' that completes a [set], and 452 # arrived here via a pop. From here, we exit the set parsing world, and go 453 # back to generic regular expression parsing. 454 # 455 set-finish: 456 default expr-quant doSetFinish 457 458 459 # 460 # errorDeath. This state is specified as the next state whenever a syntax error 461 # in the source pattern is detected. Barring bugs, the state machine will never 462 # actually get here, but will stop because of the action associated with the error. 463 # But, just in case, this state asks the state machine to exit. 464 errorDeath: 465 default n errorDeath doExit 466 467 468