# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#*****************************************************************************
#
#   Copyright (C) 2002-2015, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#*****************************************************************************
#
#  file:  regexcst.txt
#  ICU Regular Expression Parser State Table
#
#     This state table is used when reading and parsing a regular expression pattern.
#     The pattern parser uses a state machine; the data in this file define the
#     state transitions that occur for each input character.
#
#     *** This file defines the regex pattern grammar.   This is it.
#     *** The determination of what is accepted is here.
#
#     This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
#     that are then built with the rule parser.
#

#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
#   input-char           n next-state           ^push-state     action
#   input-char           n next-state           ^push-state     action
#       |                |   |                      |             |
#       |                |   |                      |             |--- action to be performed by state machine
#       |                |   |                      |                  See function RegexCompile::doParseActions()
#       |                |   |                      |
#       |                |   |                      |--- Push this named state onto the state stack.
#       |                |   |                           Later, when next state is specified as "pop",
#       |                |   |                           the pushed state will become the current state.
#       |                |   |
#       |                |   |--- Transition to this state if the current input character matches the input
#       |                |        character or char class in the left hand column.  "pop" causes the next
#       |                |        state to be popped from the state stack.
#       |                |
#       |                |--- When making the state transition specified on this line, advance to the next
#       |                     character from the input only if 'n' appears here.
#       |
#       |--- Character or named character classes to test for.
#            If the current character being scanned
#            matches, perform the actions and go to the state specified on this line.
#            The input character is tested sequentially, in the order written.  The characters and
#            character classes tested for do not need to be mutually exclusive.  The first match wins.
#




#
#  start state, scan position is at the beginning of the pattern.
#
start:
    default                 term                                    doPatStart




#
#  term.  At a position where we can accept the start of most items in a pattern.
#
term:
    quoted               n  expr-quant                              doLiteralChar
    rule_char            n  expr-quant                              doLiteralChar
    '['                  n  set-open              ^set-finish       doSetBegin
    '('                  n  open-paren
    '.'                  n  expr-quant                              doDotAny
    '^'                  n  expr-quant                              doCaret
    '$'                  n  expr-quant                              doDollar
    '\'                  n  backslash
    '|'                  n  term                                    doOrOperator
    ')'                  n  pop                                     doCloseParen
    eof                     term                                    doPatFinish
    default                 errorDeath                              doRuleError



#
#   expr-quant    We've just finished scanning a term, now look for the optional
#                 trailing quantifier - *, +, ?, *?,  etc.
#
expr-quant:
    '*'                  n  quant-star
    '+'                  n  quant-plus
    '?'                  n  quant-opt
    '{'                  n  interval-open                           doIntervalInit
    '('                  n  open-paren-quant
    default                 expr-cont


#
#  expr-cont      Expression, continuation.  At a point where additional terms are
#                 allowed, but not required.  No Quantifiers
#
expr-cont:
    '|'                  n  term                                    doOrOperator
    ')'                  n  pop                                     doCloseParen
    default                 term


#
#   open-paren-quant   Special case handling for comments appearing before a quantifier,
#                        e.g.   x(?#comment )*
#                      Open parens from expr-quant come here; anything but a (?# comment
#                      branches into the normal parenthesis sequence as quickly as possible.
#
open-paren-quant:
    '?'                  n  open-paren-quant2                       doSuppressComments
    default                 open-paren

#
#   open-paren-quant2  Have scanned "(?" directly after a term.  A '#' starts a comment, after
#                      which expr-quant is resumed (it is pushed here and popped by paren-comment).
#
open-paren-quant2:
    '#'                  n  paren-comment         ^expr-quant
    default                 open-paren-extended


#
#   open-paren    We've got an open paren.
#                 We need to scan further to
#                 determine what kind of parenthesized group it is - plain (, (?:, (?>, or whatever.
#
open-paren:
    '?'                  n  open-paren-extended                     doSuppressComments
    default                 term                  ^expr-quant       doOpenCaptureParen

#
#   open-paren-extended   Have scanned "(?".  The next character selects the kind of group.
#                         The flag characters (i d m s u w x -) are NOT consumed here (no 'n');
#                         paren-flag tests the same character again.
#
open-paren-extended:
    ':'                  n  term                  ^expr-quant       doOpenNonCaptureParen    #  (?:
    '>'                  n  term                  ^expr-quant       doOpenAtomicParen        #  (?>
    '='                  n  term                  ^expr-cont        doOpenLookAhead          #  (?=
    '!'                  n  term                  ^expr-cont        doOpenLookAheadNeg       #  (?!
    '<'                  n  open-paren-lookbehind
    '#'                  n  paren-comment         ^term
    'i'                     paren-flag                              doBeginMatchMode
    'd'                     paren-flag                              doBeginMatchMode
    'm'                     paren-flag                              doBeginMatchMode
    's'                     paren-flag                              doBeginMatchMode
    'u'                     paren-flag                              doBeginMatchMode
    'w'                     paren-flag                              doBeginMatchMode
    'x'                     paren-flag                              doBeginMatchMode
    '-'                     paren-flag                              doBeginMatchMode
    '('                  n  errorDeath                              doConditionalExpr        #  (?(
    '{'                  n  errorDeath                              doPerlInline             #  (?{
    default                 errorDeath                              doBadOpenParenType

#
#   open-paren-lookbehind   Have scanned "(?<".  Look-behind, or the start of a named capture.
#                           A leading name letter is not consumed here (no 'n').
#
open-paren-lookbehind:
    '='                  n  term                  ^expr-cont        doOpenLookBehind         #  (?<=
    '!'                  n  term                  ^expr-cont        doOpenLookBehindNeg      #  (?<!
    ascii_letter            named-capture                           doBeginNamedCapture      #  (?<name
    default                 errorDeath                              doBadOpenParenType


#
#   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
#
paren-comment:
    ')'                  n  pop
    eof                     errorDeath                              doMismatchedParenErr
    default              n  paren-comment

#
#  paren-flag    Scanned a (?ismx-ismx  flag setting
#
paren-flag:
    'i'                  n  paren-flag                              doMatchMode
    'd'                  n  paren-flag                              doMatchMode
    'm'                  n  paren-flag                              doMatchMode
    's'                  n  paren-flag                              doMatchMode
    'u'                  n  paren-flag                              doMatchMode
    'w'                  n  paren-flag                              doMatchMode
    'x'                  n  paren-flag                              doMatchMode
    '-'                  n  paren-flag                              doMatchMode
    ')'                  n  term                                    doSetMatchMode
    ':'                  n  term                  ^expr-quant       doMatchModeParen
    default                 errorDeath                              doBadModeFlag

#
#  named-capture    (?<name> ... ), position currently on the name.
#
named-capture:
    ascii_letter         n  named-capture                           doContinueNamedCapture
    digit_char           n  named-capture                           doContinueNamedCapture
    '>'                  n  term                  ^expr-quant       doOpenCaptureParen       # common w non-named capture.
    default                 errorDeath                              doBadNamedCapture

#
#  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
#                 between plain '*', '*?', '*+'
#
quant-star:
    '?'                  n  expr-cont                               doNGStar                 #  *?
    '+'                  n  expr-cont                               doPossessiveStar         #  *+
    default                 expr-cont                               doStar                   #  *


#
#  quant-plus     Scanning a '+' quantifier.  Need to look ahead to decide
#                 between plain '+', '+?', '++'
#
quant-plus:
    '?'                  n  expr-cont                               doNGPlus                 #  +?
    '+'                  n  expr-cont                               doPossessivePlus         #  ++
    default                 expr-cont                               doPlus                   #  +


#
#  quant-opt      Scanning a '?' quantifier.  Need to look ahead to decide
#                 between plain '?', '??', '?+'
#
quant-opt:
    '?'                  n  expr-cont                               doNGOpt                  #  ??
    '+'                  n  expr-cont                               doPossessiveOpt          #  ?+
    default                 expr-cont                               doOpt                    #  ?


#
#   Interval      scanning a '{', the opening delimiter for an interval specification
#                 {number}  or  {min, max}  or  {min,}
#
#   interval-open requires a digit but does not consume it (no 'n'); interval-lower
#   then consumes the digits.
#
#   NOTE: the action name "doIntevalLowerDigit" is misspelled but is reproduced exactly;
#         presumably it must match the action name used by the C++ side - verify before renaming.
#
interval-open:
    digit_char              interval-lower
    default                 errorDeath                              doIntervalError

interval-lower:
    digit_char           n  interval-lower                          doIntevalLowerDigit
    ','                  n  interval-upper
    '}'                  n  interval-type                           doIntervalSame           # {n}
    default                 errorDeath                              doIntervalError

interval-upper:
    digit_char           n  interval-upper                          doIntervalUpperDigit
    '}'                  n  interval-type
    default                 errorDeath                              doIntervalError

interval-type:
    '?'                  n  expr-cont                               doNGInterval             # {n,m}?
    '+'                  n  expr-cont                               doPossessiveInterval     # {n,m}+
    default                 expr-cont                               doInterval               # {n,m}


#
#  backslash        #  Backslash.  Figure out which of the \thingies we have encountered.
#                      The low level next-char function will have preprocessed
#                      some of them already; those won't come here.
#
#  Note: 'N', 'p' and 'P' are not consumed here (no 'n' on those lines).
#
backslash:
    'A'                  n  term                                    doBackslashA
    'B'                  n  term                                    doBackslashB
    'b'                  n  term                                    doBackslashb
    'd'                  n  expr-quant                              doBackslashd
    'D'                  n  expr-quant                              doBackslashD
    'G'                  n  term                                    doBackslashG
    'h'                  n  expr-quant                              doBackslashh
    'H'                  n  expr-quant                              doBackslashH
    'k'                  n  named-backref
    'N'                     expr-quant                              doNamedChar              #   \N{NAME}  named char
    'p'                     expr-quant                              doProperty               #   \p{Lu}  style property
    'P'                     expr-quant                              doProperty
    'R'                  n  expr-quant                              doBackslashR
    'Q'                  n  term                                    doEnterQuoteMode
    'S'                  n  expr-quant                              doBackslashS
    's'                  n  expr-quant                              doBackslashs
    'v'                  n  expr-quant                              doBackslashv
    'V'                  n  expr-quant                              doBackslashV
    'W'                  n  expr-quant                              doBackslashW
    'w'                  n  expr-quant                              doBackslashw
    'X'                  n  expr-quant                              doBackslashX
    'Z'                  n  term                                    doBackslashZ
    'z'                  n  term                                    doBackslashz
    digit_char           n  expr-quant                              doBackRef                #  Will scan multiple digits
    eof                     errorDeath                              doEscapeError
    default              n  expr-quant                              doEscapedLiteralChar


#
# named-backref   Scanned \k
#                 Leading to \k<captureName>
#                 Failure to get the full sequence is an error.
#                 The name must start with a letter (named-backref-2); later
#                 characters may be letters or digits (named-backref-3).
#
named-backref:
    '<'                  n  named-backref-2                         doBeginNamedBackRef
    default                 errorDeath                              doBadNamedCapture

named-backref-2:
    ascii_letter         n  named-backref-3                         doContinueNamedBackRef
    default                 errorDeath                              doBadNamedCapture

named-backref-3:
    ascii_letter         n  named-backref-3                         doContinueNamedBackRef
    digit_char           n  named-backref-3                         doContinueNamedBackRef
    '>'                  n  expr-quant                              doCompleteNamedBackRef
    default                 errorDeath                              doBadNamedCapture


#
#   [set expression] parsing,
#      All states involved in parsing set expressions have names beginning with "set-"
#

#
#   set-open     Just after the opening '['.  A leading '^' negates the set.
#                A ':' is not consumed here (no 'n'); see set-posix.
#
set-open:
    '^'                  n  set-open2                               doSetNegate
    ':'                     set-posix                               doSetPosixProp
    default                 set-open2

#
#   set-open2    After '[' or '[^'.  A ']' in this position is a literal, not the end of the set.
#
set-open2:
    ']'                  n  set-after-lit                           doSetLiteral
    default                 set-start

#  set-posix:
#                  scanned a '[:'  If it really is a [:property:], doSetPosixProp will have
#                  moved the scan to the closing ']'.
#                  If it wasn't a property
#                  expression, the scan will still be at the opening ':', which should
#                  be interpreted as a normal set expression.
set-posix:
    ']'                  n  pop                                     doSetEnd
    ':'                     set-start
    default                 errorDeath                              doRuleError              # should not be possible.

#
#   set-start   after the [ and special case leading characters (^ and/or ]) but before
#               everything else.   A '-' is literal at this point.
#
set-start:
    ']'                  n  pop                                     doSetEnd
    '['                  n  set-open              ^set-after-set    doSetBeginUnion
    '\'                  n  set-escape
    '-'                  n  set-start-dash
    '&'                  n  set-start-amp
    default              n  set-after-lit                           doSetLiteral

#  set-start-dash    Turn "[--" into a syntax error.
#                    "[-x" is good, - and x are literals.
#                    The character after the '-' is not consumed here (no 'n').
#
set-start-dash:
    '-'                     errorDeath                              doRuleError
    default                 set-after-lit                           doSetAddDash

#  set-start-amp     Turn "[&&" into a syntax error.
#                    "[&x" is good, & and x are literals.
#                    The character after the '&' is not consumed here (no 'n').
#
set-start-amp:
    '&'                     errorDeath                              doRuleError
    default                 set-after-lit                           doSetAddAmp

#
#   set-after-lit    The last thing scanned was a literal character within a set.
#                    Can be followed by anything.  Single '-' or '&' are
#                    literals in this context, not operators.
set-after-lit:
    ']'                  n  pop                                     doSetEnd
    '['                  n  set-open              ^set-after-set    doSetBeginUnion
    '-'                  n  set-lit-dash
    '&'                  n  set-lit-amp
    '\'                  n  set-escape
    eof                     errorDeath                              doSetNoCloseError
    default              n  set-after-lit                           doSetLiteral

#
#   set-after-set    Reached after a nested [set] is popped (it is the state pushed when a
#                    nested '[' is opened) and after a \p or \P in set-escape.
#                    A following '-' or '&' goes to set-set-dash / set-set-amp.
#
set-after-set:
    ']'                  n  pop                                     doSetEnd
    '['                  n  set-open              ^set-after-set    doSetBeginUnion
    '-'                  n  set-set-dash
    '&'                  n  set-set-amp
    '\'                  n  set-escape
    eof                     errorDeath                              doSetNoCloseError
    default              n  set-after-lit                           doSetLiteral

#
#   set-after-range  Reached after a range (doSetRange / doSetNamedRange) and after the
#                    \s \S \w \W \d \D \h \H \v \V escapes in set-escape.
#                    A following '-' or '&' goes to set-range-dash / set-range-amp.
#
set-after-range:
    ']'                  n  pop                                     doSetEnd
    '['                  n  set-open              ^set-after-set    doSetBeginUnion
    '-'                  n  set-range-dash
    '&'                  n  set-range-amp
    '\'                  n  set-escape
    eof                     errorDeath                              doSetNoCloseError
    default              n  set-after-lit                           doSetLiteral


# set-after-op
#     After a --  or &&
#     It is an error to close a set at this point.
#
set-after-op:
    '['                  n  set-open              ^set-after-set    doSetBeginUnion
    ']'                     errorDeath                              doSetOpError
    '\'                  n  set-escape
    default              n  set-after-lit                           doSetLiteral

#
#   set-set-amp
#     Have scanned [[set]&
#     Could be a '&' intersection operator, if a set follows.
#     Could be the start of a '&&' operator.
#     Otherwise is a literal.
set-set-amp:
    '['                  n  set-open              ^set-after-set    doSetBeginIntersection1
    '&'                  n  set-after-op                            doSetIntersection2
    default                 set-after-lit                           doSetAddAmp


# set-lit-amp   Have scanned "[literals&"
#               Could be a start of "&&" operator or a literal
#               In [abc&[def]],   the '&' is a literal
#
set-lit-amp:
    '&'                  n  set-after-op                            doSetIntersection2
    default                 set-after-lit                           doSetAddAmp


#
#  set-set-dash
#      Have scanned [set]-
#      Could be a '-' difference operator, if a [set] follows.
#      Could be the start of a '--' operator.
#      Otherwise is a literal.
set-set-dash:
    '['                  n  set-open              ^set-after-set    doSetBeginDifference1
    '-'                  n  set-after-op                            doSetDifference2
    default                 set-after-lit                           doSetAddDash


#
#  set-range-dash
#      scanned  a-b-  or \w-
#         any set or range like item where the trailing single '-' should
#         be literal, not a set difference operation.
#         A trailing "--" is still a difference operator.
set-range-dash:
    '-'                  n  set-after-op                            doSetDifference2
    default                 set-after-lit                           doSetAddDash


#
#  set-range-amp
#      Reached from set-after-range on '&'.  A second '&' makes the "&&" operator;
#      otherwise the single '&' is added as a literal (the next character is not consumed).
set-range-amp:
    '&'                  n  set-after-op                            doSetIntersection2
    default                 set-after-lit                           doSetAddAmp


#  set-lit-dash
#     Have scanned "[literals-"  Could be a range or a -- operator or a literal
#     In [abc-[def]], the '-' is a literal (confirmed with a Java test)
#        [abc-\p{xx}  the '-' is an error
#        [abc-]       the '-' is a literal
#        [ab-xy]      the '-' is a range
#
set-lit-dash:
    '-'                  n  set-after-op                            doSetDifference2
    '['                     set-after-lit                           doSetAddDash
    ']'                     set-after-lit                           doSetAddDash
    '\'                  n  set-lit-dash-escape
    default              n  set-after-range                         doSetRange

# set-lit-dash-escape
#
#    scanned "[literal-\"
#    Could be a range, if the \ introduces an escaped literal char or a named char.
#    Otherwise it is an error.
#
#    The rejected escapes and 'N' are not consumed here (no 'n' on those lines).
#
set-lit-dash-escape:
    's'                     errorDeath                              doSetOpError
    'S'                     errorDeath                              doSetOpError
    'w'                     errorDeath                              doSetOpError
    'W'                     errorDeath                              doSetOpError
    'd'                     errorDeath                              doSetOpError
    'D'                     errorDeath                              doSetOpError
    'N'                     set-after-range                         doSetNamedRange
    default              n  set-after-range                         doSetRange


#
#  set-escape
#       Common back-slash escape processing within set expressions
#       'p', 'P' and 'N' are not consumed here (no 'n' on those lines).
#
set-escape:
    'p'                     set-after-set                           doSetProp
    'P'                     set-after-set                           doSetProp
    'N'                     set-after-lit                           doSetNamedChar
    's'                  n  set-after-range                         doSetBackslash_s
    'S'                  n  set-after-range                         doSetBackslash_S
    'w'                  n  set-after-range                         doSetBackslash_w
    'W'                  n  set-after-range                         doSetBackslash_W
    'd'                  n  set-after-range                         doSetBackslash_d
    'D'                  n  set-after-range                         doSetBackslash_D
    'h'                  n  set-after-range                         doSetBackslash_h
    'H'                  n  set-after-range                         doSetBackslash_H
    'v'                  n  set-after-range                         doSetBackslash_v
    'V'                  n  set-after-range                         doSetBackslash_V
    default              n  set-after-lit                           doSetLiteralEscaped

#
# set-finish
#     Have just encountered the final ']' that completes a [set], and
#     arrived here via a pop.  From here, we exit the set parsing world, and go
#     back to generic regular expression parsing.
#
set-finish:
    default                 expr-quant                              doSetFinish


#
# errorDeath.   This state is specified as the next state whenever a syntax error
#               in the source rules is detected.  Barring bugs, the state machine will never
#               actually get here, but will stop because of the action associated with the error.
#               But, just in case, this state asks the state machine to exit.
errorDeath:
    default              n  errorDeath                              doExit

