#*****************************************************************************
#
#   Copyright (C) 2002-2015, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#*****************************************************************************
#
#  file:  regexcst.txt
#  ICU Regular Expression Parser State Table
#
#     This state table is used when reading and parsing a regular expression pattern.
#     The pattern parser uses a state machine; the data in this file define the
#     state transitions that occur for each input character.
#
#     *** This file defines the regex pattern grammar.   This is it.
#     *** The determination of what is accepted is here.
#
#     This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
#     that are then built with the rule parser.
#

#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
#   input-char           n next-state           ^push-state     action
#   input-char           n next-state           ^push-state     action
#       |                |   |                      |             |
#       |                |   |                      |             |--- action to be performed by state machine
#       |                |   |                      |                  See function RegexCompile::doParseActions()
#       |                |   |                      |
#       |                |   |                      |---  Push this named state onto the state stack.
#       |                |   |                            Later, when next state is specified as "pop",
#       |                |   |                            the pushed state will become the current state.
#       |                |   |
#       |                |   |--- Transition to this state if the current input character matches the input
#       |                |        character or char class in the left hand column.  "pop" causes the next
#       |                |        state to be popped from the state stack.
#       |                |
#       |                |--- When making the state transition specified on this line, advance to the next
#       |                     character from the input only if 'n' appears here.
#       |
#       |--- Character or named character classes to test for.  If the current character being scanned
#            matches, perform the actions and go to the state specified on this line.
#            The input character is tested sequentially, in the order written.  The characters and
#            character classes tested for do not need to be mutually exclusive.  The first match wins.
#




#
#  start state, scan position is at the beginning of the pattern.
#               No input is consumed here (no 'n'); control always passes straight to "term".
#
start:
    default                 term                                  doPatStart




#
#  term.  At a position where we can accept the start of most items in a pattern.
#         eof has its own row (doPatFinish); any input that matches none of the
#         rows above "default" is a syntax error.
#
term:
    quoted               n  expr-quant                            doLiteralChar
    rule_char            n  expr-quant                            doLiteralChar
    '['                  n  set-open              ^set-finish     doSetBegin
    '('                  n  open-paren
    '.'                  n  expr-quant                            doDotAny
    '^'                  n  expr-quant                            doCaret
    '$'                  n  expr-quant                            doDollar
    '\'                  n  backslash
    '|'                  n  term                                  doOrOperator
    ')'                  n  pop                                   doCloseParen
    eof                     term                                  doPatFinish
    default                 errorDeath                            doRuleError



#
#   expr-quant    We've just finished scanning a term, now look for the optional
#                 trailing quantifier - *, +, ?, *?,  etc.
#                 A '(' goes through open-paren-quant rather than open-paren so that
#                 a (?# comment ) may sit between a term and its quantifier.
#
expr-quant:
    '*'                  n  quant-star
    '+'                  n  quant-plus
    '?'                  n  quant-opt
    '{'                  n  interval-open                         doIntervalInit
    '('                  n  open-paren-quant
    default                 expr-cont


#
#  expr-cont      Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.  No Quantifiers
#
expr-cont:
    '|'                  n  term                                  doOrOperator
    ')'                  n  pop                                   doCloseParen
    default                 term


#
#   open-paren-quant   Special case handling for comments appearing before a quantifier,
#                        e.g.   x(?#comment )*
#                      Open parens from expr-quant come here; anything but a (?# comment
#                      branches into the normal parenthesis sequence as quickly as possible.
#
open-paren-quant:
    '?'                  n  open-paren-quant2                     doSuppressComments
    default                 open-paren

#
#   open-paren-quant2  Scanned "(?" while a quantifier was still possible.
#                      For "(?#" the pushed state is expr-quant, so once paren-comment
#                      pops at the closing ')' the scan resumes looking for a quantifier.
#                      Anything else is handed, unconsumed, to open-paren-extended.
#
open-paren-quant2:
    '#'                  n  paren-comment         ^expr-quant
    default                 open-paren-extended


#
#   open-paren    We've got an open paren.  We need to scan further to
#                 determine what kind of group it is - plain (, (?:, (?>, or whatever.
#
open-paren:
    '?'                  n  open-paren-extended                   doSuppressComments
    default                 term                  ^expr-quant     doOpenCaptureParen

#
#   open-paren-extended   Scanned "(?"; the next character selects the construct.
#                 Capturing-style groups push expr-quant; the look-ahead forms push
#                 expr-cont instead, so their closing ')' pops to the state that does
#                 not look for a quantifier.
#                 The flag letters and '-' are matched without 'n': the character is left
#                 in place and consumed by paren-flag, which lists the same characters.
#
open-paren-extended:
    ':'                  n  term                  ^expr-quant     doOpenNonCaptureParen    #  (?:
    '>'                  n  term                  ^expr-quant     doOpenAtomicParen        #  (?>
    '='                  n  term                  ^expr-cont      doOpenLookAhead          #  (?=
    '!'                  n  term                  ^expr-cont      doOpenLookAheadNeg       #  (?!
    '<'                  n  open-paren-lookbehind
    '#'                  n  paren-comment         ^term
    'i'                     paren-flag                            doBeginMatchMode
    'd'                     paren-flag                            doBeginMatchMode
    'm'                     paren-flag                            doBeginMatchMode
    's'                     paren-flag                            doBeginMatchMode
    'u'                     paren-flag                            doBeginMatchMode
    'w'                     paren-flag                            doBeginMatchMode
    'x'                     paren-flag                            doBeginMatchMode
    '-'                     paren-flag                            doBeginMatchMode
    '('                  n  errorDeath                            doConditionalExpr        #  (?(   always an error state
    '{'                  n  errorDeath                            doPerlInline             #  (?{   always an error state
    default                 errorDeath                            doBadOpenParenType

#
#   open-paren-lookbehind   Scanned "(?<".  Either a look-behind or the start of a
#                 named capture group.  The first letter of a name is matched without 'n',
#                 so named-capture sees (and consumes) it again.
#
open-paren-lookbehind:
    '='                  n  term                  ^expr-cont      doOpenLookBehind         #  (?<=
    '!'                  n  term                  ^expr-cont      doOpenLookBehindNeg      #  (?<!
    ascii_letter            named-capture                         doBeginNamedCapture      #  (?<name
    default                 errorDeath                            doBadOpenParenType


#
#   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
#                    The state to resume in was pushed by whichever state sent us here.
#
paren-comment:
    ')'                  n  pop
    eof                     errorDeath                            doMismatchedParenErr
    default              n  paren-comment

#
#  paren-flag    Scanned a (?idmsuwx-idmsuwx flag setting
#                ')' ends a bare flag setting, (?flags); ':' opens a flagged group, (?flags: ... )
#
paren-flag:
    'i'                  n  paren-flag                            doMatchMode
    'd'                  n  paren-flag                            doMatchMode
    'm'                  n  paren-flag                            doMatchMode
    's'                  n  paren-flag                            doMatchMode
    'u'                  n  paren-flag                            doMatchMode
    'w'                  n  paren-flag                            doMatchMode
    'x'                  n  paren-flag                            doMatchMode
    '-'                  n  paren-flag                            doMatchMode
    ')'                  n  term                                  doSetMatchMode
    ':'                  n  term                  ^expr-quant     doMatchModeParen
    default                 errorDeath                            doBadModeFlag

#
#  named-capture    (?<name> ... ), position currently on the name.
#                   Names are made of ASCII letters and digits; the first character was
#                   already required to be a letter by open-paren-lookbehind.
#
named-capture:
    ascii_letter         n  named-capture                         doContinueNamedCapture
    digit_char           n  named-capture                         doContinueNamedCapture
    '>'                  n  term                  ^expr-quant     doOpenCaptureParen       # common w non-named capture.
    default                 errorDeath                            doBadNamedCapture

#
#  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
#                 between plain '*', '*?', '*+'
#
quant-star:
    '?'                  n  expr-cont                             doNGStar                 #  *?
    '+'                  n  expr-cont                             doPossessiveStar         #  *+
    default                 expr-cont                             doStar                   #  *


#
#  quant-plus     Scanning a '+' quantifier.  Need to look ahead to decide
#                 between plain '+', '+?', '++'
#
quant-plus:
    '?'                  n  expr-cont                             doNGPlus                 #  +?
    '+'                  n  expr-cont                             doPossessivePlus         #  ++
    default                 expr-cont                             doPlus                   #  +


#
#  quant-opt  Scanning a '?' quantifier.  Need to look ahead to decide
#                 between plain '?', '??', '?+'
#
quant-opt:
    '?'                  n  expr-cont                             doNGOpt                  #  ??
    '+'                  n  expr-cont                             doPossessiveOpt          #  ?+
    default                 expr-cont                             doOpt                    #  ?


#
#   Interval         scanning a '{', the opening delimiter for an interval specification
#                                   {number}  or  {min, max}   or  {min,}
#
#   interval-open    The '{' has been consumed.  A digit must follow; it is matched
#                    without 'n' and left for interval-lower to consume.
#
interval-open:
    digit_char              interval-lower
    default                 errorDeath                            doIntervalError

#   interval-lower   Accumulating the digits of the lower bound.
interval-lower:
    digit_char           n  interval-lower                        doIntevalLowerDigit
    ','                  n  interval-upper
    '}'                  n  interval-type                         doIntervalSame           # {n}
    default                 errorDeath                            doIntervalError

#   interval-upper   After the ','.  Zero or more digits of the upper bound, then '}'.
interval-upper:
    digit_char           n  interval-upper                        doIntervalUpperDigit
    '}'                  n  interval-type
    default                 errorDeath                            doIntervalError

#   interval-type    After the closing '}'.  Look ahead for a '?' or '+' suffix.
interval-type:
    '?'                  n  expr-cont                             doNGInterval             # {n,m}?
    '+'                  n  expr-cont                             doPossessiveInterval     # {n,m}+
    default                 expr-cont                             doInterval               # {n,m}


#
#  backslash        #  Backslash.  Figure out which of the \thingies we have encountered.
#                                  The low level next-char function will have preprocessed
#                                  some of them already; those won't come here.
#                   Rows that go to "term" cannot be followed by a quantifier; rows that go
#                   to "expr-quant" can.
#                   \N, \p and \P are matched without 'n', so the state machine itself does
#                   not advance past the letter.
#                   NOTE(review): presumably doNamedChar / doProperty consume the letter and
#                   the {...} text themselves - confirm in the action code.
backslash:
    'A'                  n  term                                  doBackslashA
    'B'                  n  term                                  doBackslashB
    'b'                  n  term                                  doBackslashb
    'd'                  n  expr-quant                            doBackslashd
    'D'                  n  expr-quant                            doBackslashD
    'G'                  n  term                                  doBackslashG
    'h'                  n  expr-quant                            doBackslashh
    'H'                  n  expr-quant                            doBackslashH
    'k'                  n  named-backref
    'N'                     expr-quant                            doNamedChar              #   \N{NAME}  named char
    'p'                     expr-quant                            doProperty               #   \p{Lu}  style property
    'P'                     expr-quant                            doProperty
    'R'                  n  expr-quant                            doBackslashR
    'Q'                  n  term                                  doEnterQuoteMode
    'S'                  n  expr-quant                            doBackslashS
    's'                  n  expr-quant                            doBackslashs
    'v'                  n  expr-quant                            doBackslashv
    'V'                  n  expr-quant                            doBackslashV
    'W'                  n  expr-quant                            doBackslashW
    'w'                  n  expr-quant                            doBackslashw
    'X'                  n  expr-quant                            doBackslashX
    'Z'                  n  term                                  doBackslashZ
    'z'                  n  term                                  doBackslashz
    digit_char           n  expr-quant                            doBackRef                #  Will scan multiple digits
    eof                     errorDeath                            doEscapeError
    default              n  expr-quant                            doEscapedLiteralChar


#
#  named-backref   Scanned \k
#                  Leading to \k<captureName>
#                  Failure to get the full sequence is an error.
#
named-backref:
    '<'                  n  named-backref-2                       doBeginNamedBackRef
    default                 errorDeath                            doBadNamedCapture

#  named-backref-2   Scanned \k<   The first character of the name must be an ASCII letter.
named-backref-2:
    ascii_letter         n  named-backref-3                       doContinueNamedBackRef
    default                 errorDeath                            doBadNamedCapture

#  named-backref-3   Inside the name.  ASCII letters and digits continue it, '>' completes it.
named-backref-3:
    ascii_letter         n  named-backref-3                       doContinueNamedBackRef
    digit_char           n  named-backref-3                       doContinueNamedBackRef
    '>'                  n  expr-quant                            doCompleteNamedBackRef
    default                 errorDeath                            doBadNamedCapture


#
#   [set expression] parsing,
#      All states involved in parsing set expressions have names beginning with "set-"
#

#
#  set-open    Just after a '['.  Handles a leading '^' and a possible "[:" POSIX-style
#              property.  The ':' is matched without 'n'; doSetPosixProp decides what
#              to do with it (see set-posix).
#
set-open:
    '^'                  n  set-open2                             doSetNegate
    ':'                     set-posix                             doSetPosixProp
    default                 set-open2

#
#  set-open2   After '[' or '[^'.  A ']' in this position is a literal member of the set,
#              not the closing bracket.
#
set-open2:
    ']'                  n  set-after-lit                         doSetLiteral
    default                 set-start

#  set-posix:
#                  scanned a '[:'  If it really is a [:property:], doSetPosixProp will have
#                  moved the scan to the closing ']'.
#                  If it wasn't a property
#                  expression, the scan will still be at the opening ':', which should
#                  be interpreted as a normal set expression.
set-posix:
    ']'                  n  pop                                   doSetEnd
    ':'                     set-start
    default                 errorDeath                            doRuleError              # should not be possible.

#
#   set-start   after the [ and special case leading characters (^ and/or ]) but before
#               everything else.   A '-' is literal at this point.
#               NOTE(review): unlike set-after-lit there is no explicit eof row here, so eof
#               falls through to the default row - confirm that is intended.
#
set-start:
    ']'                  n  pop                                   doSetEnd
    '['                  n  set-open              ^set-after-set  doSetBeginUnion
    '\'                  n  set-escape
    '-'                  n  set-start-dash
    '&'                  n  set-start-amp
    default              n  set-after-lit                         doSetLiteral

#    set-start-dash    Turn "[--" into a syntax error.
#                           "[-x" is good, - and x are literals.
#                      The character after the '-' is not consumed here; set-after-lit
#                      examines it next.
#
set-start-dash:
    '-'                     errorDeath                            doRuleError
    default                 set-after-lit                         doSetAddDash

#    set-start-amp     Turn "[&&" into a syntax error.
#                           "[&x" is good, & and x are literals.
#                      The character after the '&' is not consumed here; set-after-lit
#                      examines it next.
#
set-start-amp:
    '&'                     errorDeath                            doRuleError
    default                 set-after-lit                         doSetAddAmp

#
#   set-after-lit    The last thing scanned was a literal character within a set.
#                    Can be followed by anything.  Single '-' or '&' are
#                    literals in this context, not operators.
set-after-lit:
    ']'                  n  pop                                   doSetEnd
    '['                  n  set-open              ^set-after-set  doSetBeginUnion
    '-'                  n  set-lit-dash
    '&'                  n  set-lit-amp
    '\'                  n  set-escape
    eof                     errorDeath                            doSetNoCloseError
    default              n  set-after-lit                         doSetLiteral

#
#   set-after-set    The last thing scanned was a complete nested [set] (this state is the
#                    one pushed when a nested '[' is opened) or a \p / \P property.
#                    Identical to set-after-lit except that '-' and '&' go to
#                    set-set-dash / set-set-amp.
#
set-after-set:
    ']'                  n  pop                                   doSetEnd
    '['                  n  set-open              ^set-after-set  doSetBeginUnion
    '-'                  n  set-set-dash
    '&'                  n  set-set-amp
    '\'                  n  set-escape
    eof                     errorDeath                            doSetNoCloseError
    default              n  set-after-lit                         doSetLiteral

#
#   set-after-range  The last thing scanned was a range such as a-b, or one of the
#                    backslash classes that set-escape sends here (\s \w \d \h \v and
#                    their upper-case forms).
#                    Identical to set-after-lit except that '-' and '&' go to
#                    set-range-dash / set-range-amp.
#
set-after-range:
    ']'                  n  pop                                   doSetEnd
    '['                  n  set-open              ^set-after-set  doSetBeginUnion
    '-'                  n  set-range-dash
    '&'                  n  set-range-amp
    '\'                  n  set-escape
    eof                     errorDeath                            doSetNoCloseError
    default              n  set-after-lit                         doSetLiteral


#  set-after-op
#              After a --  or &&
#              It is an error to close a set at this point.
#
set-after-op:
    '['                  n  set-open              ^set-after-set  doSetBeginUnion
    ']'                     errorDeath                            doSetOpError
    '\'                  n  set-escape
    default              n  set-after-lit                         doSetLiteral

#
#   set-set-amp
#               Have scanned [[set]&
#               Could be a '&' intersection operator, if a set follows.
#               Could be the start of a '&&' operator.
#               Otherwise is a literal.
set-set-amp:
    '['                  n  set-open              ^set-after-set  doSetBeginIntersection1
    '&'                  n  set-after-op                          doSetIntersection2
    default                 set-after-lit                         doSetAddAmp


# set-lit-amp   Have scanned "[literals&"
#               Could be a start of "&&" operator or a literal
#               In [abc&[def]],   the '&' is a literal
#
set-lit-amp:
    '&'                  n  set-after-op                          doSetIntersection2
    default                 set-after-lit                         doSetAddAmp


#
#  set-set-dash
#              Have scanned [set]-
#              Could be a '-' difference operator, if a [set] follows.
#              Could be the start of a '--' operator.
#              Otherwise is a literal.
set-set-dash:
    '['                  n  set-open              ^set-after-set  doSetBeginDifference1
    '-'                  n  set-after-op                          doSetDifference2
    default                 set-after-lit                         doSetAddDash


#
#  set-range-dash
#            scanned  a-b-  or \w-
#               any set or range like item where the trailing single '-' should
#               be literal, not a set difference operation.
#            A trailing "--" is still a difference operator.
set-range-dash:
    '-'                  n  set-after-op                          doSetDifference2
    default                 set-after-lit                         doSetAddDash


#
#  set-range-amp
#            scanned  a-b&  or \w&
#            The '&' counterpart of set-range-dash: a second '&' goes to set-after-op,
#            anything else adds the single '&' and continues in set-after-lit.
set-range-amp:
    '&'                  n  set-after-op                          doSetIntersection2
    default                 set-after-lit                         doSetAddAmp


#  set-lit-dash
#      Have scanned "[literals-"  Could be a range or a -- operator or a literal
#      In [abc-[def]], the '-' is a literal (confirmed with a Java test)
#         [abc-\p{xx}] the '-' is an error
#         [abc-]       the '-' is a literal
#         [ab-xy]      the '-' is a range
#      The '[' and ']' rows have no 'n': the bracket is left in place and handled
#      next by set-after-lit.
#
set-lit-dash:
    '-'                  n  set-after-op                          doSetDifference2
    '['                     set-after-lit                         doSetAddDash
    ']'                     set-after-lit                         doSetAddDash
    '\'                  n  set-lit-dash-escape
    default              n  set-after-range                       doSetRange

# set-lit-dash-escape
#
#      scanned "[literal-\"
#      Could be a range, if the \ introduces an escaped literal char or a named char.
#      Otherwise it is an error.
#      The class escapes \s \S \w \W \d \D cannot end a range and are rejected.
#      \N is matched without 'n'.
#      NOTE(review): presumably doSetNamedRange consumes the {NAME} text itself - confirm.
#
set-lit-dash-escape:
    's'                     errorDeath                            doSetOpError
    'S'                     errorDeath                            doSetOpError
    'w'                     errorDeath                            doSetOpError
    'W'                     errorDeath                            doSetOpError
    'd'                     errorDeath                            doSetOpError
    'D'                     errorDeath                            doSetOpError
    'N'                     set-after-range                       doSetNamedRange
    default              n  set-after-range                       doSetRange


#
#  set-escape
#       Common back-slash escape processing within set expressions
#       \p, \P and \N are matched without 'n'.
#       NOTE(review): presumably their actions consume the rest of the escape - confirm.
#       Class escapes continue in set-after-range, so a following single '-' is a literal.
#
set-escape:
    'p'                     set-after-set                         doSetProp
    'P'                     set-after-set                         doSetProp
    'N'                     set-after-lit                         doSetNamedChar
    's'                  n  set-after-range                       doSetBackslash_s
    'S'                  n  set-after-range                       doSetBackslash_S
    'w'                  n  set-after-range                       doSetBackslash_w
    'W'                  n  set-after-range                       doSetBackslash_W
    'd'                  n  set-after-range                       doSetBackslash_d
    'D'                  n  set-after-range                       doSetBackslash_D
    'h'                  n  set-after-range                       doSetBackslash_h
    'H'                  n  set-after-range                       doSetBackslash_H
    'v'                  n  set-after-range                       doSetBackslash_v
    'V'                  n  set-after-range                       doSetBackslash_V
    default              n  set-after-lit                         doSetLiteralEscaped

#
# set-finish
#     Have just encountered the final ']' that completes a [set], and
#     arrived here via a pop.  From here, we exit the set parsing world, and go
#     back to generic regular expression parsing.
#
set-finish:
    default                 expr-quant                            doSetFinish


#
# errorDeath.   This state is specified as the next state whenever a syntax error
#               in the pattern being parsed is detected.  Barring bugs, the state machine will never
#               actually get here, but will stop because of the action associated with the error.
#               But, just in case, this state asks the state machine to exit.
errorDeath:
    default              n  errorDeath                            doExit