Home | History | Annotate | Download | only in i18n
      1 # Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 # License & terms of use: http://www.unicode.org/copyright.html
      3 #*****************************************************************************
      4 #
      5 #   Copyright (C) 2002-2015, International Business Machines Corporation and others.
      6 #   All Rights Reserved.
      7 #
      8 #*****************************************************************************
      9 #
     10 #  file:  regexcst.txt
     11 #  ICU Regular Expression Parser State Table
     12 #
     13 #     This state table is used when reading and parsing a regular expression pattern
     14 #     The pattern parser uses a state machine; the data in this file define the
     15 #     state transitions that occur for each input character.
     16 #
     17 #     *** This file defines the regex pattern grammar.   This is it.
     18 #     *** The determination of what is accepted is here.
     19 #
     20 #     This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
     21 #     that are then built with the rule parser.
     22 #
     23 
     24 #
     25 # Here is the syntax of the state definitions in this file:
     26 #
     27 #
     28 #StateName:
     29 #   input-char           n next-state           ^push-state     action
     30 #   input-char           n next-state           ^push-state     action
     31 #       |                |   |                      |             |
     32 #       |                |   |                      |             |--- action to be performed by state machine
     33 #       |                |   |                      |                  See function RegexCompile::doParseActions()
     34 #       |                |   |                      |
     35 #       |                |   |                      |--- Push this named state onto the state stack.
     36 #       |                |   |                           Later, when next state is specified as "pop",
     37 #       |                |   |                           the pushed state will become the current state.
     38 #       |                |   |
     39 #       |                |   |--- Transition to this state if the current input character matches the input
     40 #       |                |        character or char class in the left hand column.  "pop" causes the next
     41 #       |                |        state to be popped from the state stack.
     42 #       |                |
     43 #       |                |--- When making the state transition specified on this line, advance to the next
     44 #       |                     character from the input only if 'n' appears here.
     45 #       |
     46 #       |--- Character or named character classes to test for.  If the current character being scanned
     47 #            matches, perform the actions and go to the state specified on this line.
     48 #            The input character is tested sequentially, in the order written.  The characters and
     49 #            character classes tested for do not need to be mutually exclusive.  The first match wins.
     50 #
     51 
     52 
     53 
     54 
     55 #
     56 #  start state, scan position is at the beginning of the pattern.
     57 #
     58 start:
     59    default                 term                                     doPatStart      # always taken; input is not advanced
     60 
     61 
     62 
     63 
     64 #
     65 #  term.  At a position where we can accept the start of most items in a pattern.
     66 #
     67 term:
     68     quoted               n expr-quant                               doLiteralChar   # literal char; a quantifier may follow
     69     rule_char            n expr-quant                               doLiteralChar   # literal char; a quantifier may follow
     70     '['                  n set-open       ^set-finish               doSetBegin      # [set]; its final ']' pops to set-finish
     71     '('                  n open-paren                                               # group; kind is decided in open-paren
     72     '.'                  n expr-quant                               doDotAny
     73     '^'                  n expr-quant                               doCaret
     74     '$'                  n expr-quant                               doDollar
     75     '\'                  n backslash                                                # escape; dispatched in backslash
     76     '|'                  n  term                                    doOrOperator    # alternation; begin a new term
     77     ')'                  n  pop                                     doCloseParen    # resume at the state pushed by the opener
     78     eof	                   term                                     doPatFinish     # end of pattern; input is not advanced
     79     default                errorDeath                               doRuleError     # nothing else may start a term
     80 
     81 
     82 
     83 #
     84 #   expr-quant    We've just finished scanning a term, now look for the optional
     85 #                 trailing quantifier - *, +, ?, *?,  etc.
     86 #
     87 expr-quant:
     88     '*'                  n  quant-star                                              # *, *? or *+
     89     '+'                  n  quant-plus                                              # +, +? or ++
     90     '?'                  n  quant-opt                                               # ?, ?? or ?+
     91     '{'                  n  interval-open                          doIntervalInit   # {n}, {n,} or {n,m}
     92     '('                  n  open-paren-quant                                        # may be a (?# comment ) sitting before the quantifier
     93     default                 expr-cont                                               # no quantifier here; input is not advanced
     94 
     95 
     96 #
     97 #  expr-cont      Expression, continuation.  At a point where additional terms are
     98 #                                            allowed, but not required.  No Quantifiers
     99 #
    100 expr-cont:
    101     '|'                  n  term                                    doOrOperator    # alternation; begin a new term
    102     ')'                  n  pop                                     doCloseParen    # resume at the state pushed by the opener
    103     default                 term                                                    # anything else is re-examined by term (not advanced)
    104 
    105 
    106 #
    107 #   open-paren-quant   Special case handling for comments appearing before a quantifier,
    108 #                        e.g.   x(?#comment )*
    109 #                      Open parens from expr-quant come here; anything but a (?# comment
    110 #                      branches into the normal parenthesis sequence as quickly as possible.
    111 #
    112 open-paren-quant:
    113     '?'                  n  open-paren-quant2                      doSuppressComments   # "(?" seen; next char tells comment from extended group
    114     default                 open-paren                                                  # plain "(" : normal paren handling, char not advanced
    115 
    116 open-paren-quant2:
    117     '#'                  n  paren-comment   ^expr-quant                             # (?# comment; its ')' pops back to expr-quant so a quantifier can still follow
    118     default                 open-paren-extended                                     # any other (?x form: normal extended-paren handling
    119 
    120 
    121 #
    122 #   open-paren    We've got an open paren.  We need to scan further to
    123 #                 determine what kind of group it is - plain (, (?:, (?>, or whatever.
    124 #
    125 open-paren:
    126     '?'                  n  open-paren-extended                     doSuppressComments     #  (?   some extended group type
    127     default                 term            ^expr-quant             doOpenCaptureParen     #  (    capturing group; its ')' pops to expr-quant
    128 
    129 open-paren-extended:
    130     ':'                  n  term            ^expr-quant             doOpenNonCaptureParen  #  (?:
    131     '>'                  n  term            ^expr-quant             doOpenAtomicParen      #  (?>
    132     '='                  n  term            ^expr-cont              doOpenLookAhead        #  (?=   ')' pops to expr-cont, not expr-quant
    133     '!'                  n  term            ^expr-cont              doOpenLookAheadNeg     #  (?!   ')' pops to expr-cont, not expr-quant
    134     '<'                  n  open-paren-lookbehind                                          #  (?<   look-behind or named capture
    135     '#'                  n  paren-comment   ^term                                          #  (?#   comment; its ')' pops back to term
    136     'i'                     paren-flag                              doBeginMatchMode       #  (?i   flag char is not advanced here; paren-flag re-reads it
    137     'd'                     paren-flag                              doBeginMatchMode       #  (?d
    138     'm'                     paren-flag                              doBeginMatchMode       #  (?m
    139     's'                     paren-flag                              doBeginMatchMode       #  (?s
    140     'u'                     paren-flag                              doBeginMatchMode       #  (?u
    141     'w'                     paren-flag                              doBeginMatchMode       #  (?w
    142     'x'                     paren-flag                              doBeginMatchMode       #  (?x
    143     '-'                     paren-flag                              doBeginMatchMode       #  (?-
    144     '('                  n  errorDeath                              doConditionalExpr      #  (?(   always leads to errorDeath
    145     '{'                  n  errorDeath                              doPerlInline           #  (?{   always leads to errorDeath
    146     default                 errorDeath                              doBadOpenParenType     #  any other char after (? is an error
    147 
    148 open-paren-lookbehind:
    149     '='                  n  term            ^expr-cont              doOpenLookBehind       #  (?<=
    150     '!'                  n  term            ^expr-cont              doOpenLookBehindNeg    #  (?<!
    151     ascii_letter            named-capture                           doBeginNamedCapture    #  (?<name   first letter is left for named-capture to consume
    152     default                 errorDeath                              doBadOpenParenType     #  any other char after (?< is an error
    153 
    154 
    155 #
    156 #   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
    157 #
    158 paren-comment:
    159     ')'                  n  pop                                                             # end of comment; resume at the pushed state
    160     eof		                errorDeath                              doMismatchedParenErr   # pattern ended inside the comment
    161     default              n  paren-comment                                                   # skip one char of comment text
    162 
    163 #
    164 #  paren-flag    Scanned a (?ismx-ismx  flag setting
    165 #
    166 paren-flag:
    167     'i'                  n  paren-flag                              doMatchMode
    168     'd'                  n  paren-flag                              doMatchMode
    169     'm'                  n  paren-flag                              doMatchMode
    170     's'                  n  paren-flag                              doMatchMode
    171     'u'                  n  paren-flag                              doMatchMode
    172     'w'                  n  paren-flag                              doMatchMode
    173     'x'                  n  paren-flag                              doMatchMode
    174     '-'                  n  paren-flag                              doMatchMode         #  '-' goes through the same action as the flag letters
    175     ')'                  n  term                                    doSetMatchMode      #  (?flags)   no state is pushed; continue at term
    176     ':'                  n  term              ^expr-quant           doMatchModeParen    #  (?flags:   opens a group; its ')' pops to expr-quant
    177     default                 errorDeath                              doBadModeFlag       #  any other char in the flag list is an error
    178 
    179 #
    180 #  named-capture    (?<name> ... ), position currently on the name.
    181 #
    182 named-capture:
    183     ascii_letter         n  named-capture                           doContinueNamedCapture  # next char of the name
    184     digit_char           n  named-capture                           doContinueNamedCapture  # digits allowed here; the first char was a letter (see open-paren-lookbehind)
    185     '>'                  n  term               ^expr-quant          doOpenCaptureParen      # common w non-named capture.
    186     default                 errorDeath                              doBadNamedCapture       # any other char inside the name is an error
    187 
    188 #
    189 #  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
    190 #                 between plain '*', '*?', '*+'
    191 #
    192 quant-star:
    193      '?'                 n  expr-cont                               doNGStar               #  *?   non-greedy
    194      '+'                 n  expr-cont                               doPossessiveStar       #  *+   possessive
    195      default                expr-cont                               doStar                 #  *    following char is not advanced over
    196 
    197 
    198 #
    199 #  quant-plus     Scanning a '+' quantifier.  Need to look ahead to decide
    200 #                 between plain '+', '+?', '++'
    201 #
    202 quant-plus:
    203      '?'                 n  expr-cont                               doNGPlus               #  +?   non-greedy
    204      '+'                 n  expr-cont                               doPossessivePlus       #  ++   possessive
    205      default                expr-cont                               doPlus                 #  +    following char is not advanced over
    206 
    207 
    208 #
    209 #  quant-opt  Scanning a '?' quantifier.  Need to look ahead to decide
    210 #                  between plain '?', '??', '?+'
    211 #
    212 quant-opt:
    213      '?'                 n  expr-cont                               doNGOpt                 #  ??   non-greedy
    214      '+'                 n  expr-cont                               doPossessiveOpt         #  ?+   possessive
    215      default                expr-cont                               doOpt                   #  ?    following char is not advanced over
    216 
    217 
    218 #
    219 #   Interval         scanning a '{', the opening delimiter for an interval specification
    220 #                                   {number} or {min, max} or {min,}
    221 #
    222 interval-open:
    223     digit_char              interval-lower                                                  # first digit is left for interval-lower to consume
    224     default                 errorDeath                              doIntervalError         # '{' must be followed by a digit
    225 
    226 interval-lower:
    227     digit_char           n  interval-lower                          doIntevalLowerDigit        # NOTE(review): "Inteval" spelling is part of the action name; presumably it must match the implementation - do not correct it here alone
    228     ','			         n  interval-upper                                                     # {n,  upper bound (possibly empty) follows
    229     '}'                  n  interval-type                           doIntervalSame             # {n}
    230     default                 errorDeath                              doIntervalError            # anything else inside the braces is an error
    231 
    232 interval-upper:
    233     digit_char           n  interval-upper                          doIntervalUpperDigit
    234     '}'                  n  interval-type                                                      # {n,m}, or {n,} when no upper digits were seen
    235     default                 errorDeath                              doIntervalError            # anything else inside the braces is an error
    236 
    237 interval-type:
    238     '?'                  n  expr-cont                               doNGInterval                # {n,m}?
    239     '+'                  n  expr-cont                               doPossessiveInterval        # {n,m}+
    240     default                 expr-cont                               doInterval                  # {n,m}    following char is not advanced over
    241 
    242 
    243 #
    244 #  backslash        #  Backslash.  Figure out which of the \thingies we have encountered.
    245 #                                  The low level next-char function will have preprocessed
    246 #                                  some of them already; those won't come here.
    247 backslash:
    248    'A'                   n  term                                    doBackslashA     #   \A   continues at term, not expr-quant
    249    'B'                   n  term                                    doBackslashB     #   \B   continues at term, not expr-quant
    250    'b'                   n  term                                    doBackslashb     #   \b   continues at term, not expr-quant
    251    'd'                   n  expr-quant                              doBackslashd
    252    'D'                   n  expr-quant                              doBackslashD
    253    'G'                   n  term                                    doBackslashG     #   \G   continues at term, not expr-quant
    254    'h'                   n  expr-quant                              doBackslashh
    255    'H'                   n  expr-quant                              doBackslashH
    256    'k'                   n  named-backref                                            #   \k<name>  named back reference
    257    'N'                      expr-quant                              doNamedChar      #   \N{NAME}  named char
    258    'p'                      expr-quant                              doProperty       #   \p{Lu}  style property
    259    'P'                      expr-quant                              doProperty       #   \P{Lu}  same action as \p; the letter is not advanced over here
    260    'R'                   n  expr-quant                              doBackslashR
    261    'Q'                   n  term                                    doEnterQuoteMode #   \Q   continues at term
    262    'S'                   n  expr-quant                              doBackslashS
    263    's'                   n  expr-quant                              doBackslashs
    264    'v'                   n  expr-quant                              doBackslashv
    265    'V'                   n  expr-quant                              doBackslashV
    266    'W'                   n  expr-quant                              doBackslashW
    267    'w'                   n  expr-quant                              doBackslashw
    268    'X'                   n  expr-quant                              doBackslashX
    269    'Z'                   n  term                                    doBackslashZ     #   \Z   continues at term, not expr-quant
    270    'z'                   n  term                                    doBackslashz     #   \z   continues at term, not expr-quant
    271    digit_char            n  expr-quant                              doBackRef         #  Will scan multiple digits
    272    eof                      errorDeath                              doEscapeError     #  pattern ends right after the backslash
    273    default               n  expr-quant                              doEscapedLiteralChar   #  any other escaped char; a quantifier may follow
    274 
    275 
    276 # named-backref   Scanned \k
    277 #                 Leading to \k<captureName>
    278 #                 Failure to get the full sequence is an error.
    279 #
    280 named-backref:
    281     '<'                  n  named-backref-2                         doBeginNamedBackRef     # \k<
    282     default                 errorDeath                              doBadNamedCapture       # \k must be followed by '<'
    283 
    284 named-backref-2:
    285     ascii_letter         n  named-backref-3                         doContinueNamedBackRef  # first char of the name must be a letter
    286     default                 errorDeath                              doBadNamedCapture       # empty name or non-letter first char
    287 
    288 named-backref-3:
    289     ascii_letter         n  named-backref-3                         doContinueNamedBackRef  # later name chars may be letters ...
    290     digit_char           n  named-backref-3                         doContinueNamedBackRef  # ... or digits
    291     '>'                  n  expr-quant                              doCompleteNamedBackRef  # \k<name> complete; a quantifier may follow
    292     default                 errorDeath                              doBadNamedCapture       # any other char inside the name is an error
    293 
    294 
    295 #
    296 # [set expression] parsing,
    297 #    All states involved in parsing set expressions have names beginning with "set-"
    298 #
    299 
    300 set-open:
    301    '^'                   n  set-open2                               doSetNegate      # [^   negated set
    302    ':'                      set-posix                               doSetPosixProp   # [:   possibly a [:property:]; ':' is not advanced over here
    303    default                  set-open2                                                # anything else is re-examined by set-open2
    304 
    305 set-open2:
    306    ']'                   n  set-after-lit                           doSetLiteral     # a ']' immediately after '[' or '[^' is a literal
    307    default                  set-start                                                # anything else is re-examined by set-start
    308 
    309 #  set-posix:
    310 #                  scanned a '[:'  If it really is a [:property:], doSetPosixProp will have
    311 #                  moved the scan to the closing ']'.  If it wasn't a property
    312 #                  expression, the scan will still be at the opening ':', which should
    313 #                  be interpreted as a normal set expression.
    314 set-posix:
    315     ']'                  n   pop                                    doSetEnd         # it was a [:property:]; scan is on its closing ']'
    316     ':'                      set-start                                               # not a property; treat the ':' as ordinary set content
    317     default                  errorDeath                             doRuleError  # should not be possible.
    318 
    319 #
    320 #   set-start   after the [ and special case leading characters (^ and/or ]) but before
    321 #               everything else.   A '-' is literal at this point.
    322 #
    323 set-start:
    324     ']'                  n  pop                                     doSetEnd         # end of this set; resume at the pushed state
    325     '['                  n  set-open      ^set-after-set            doSetBeginUnion  # nested set; its ']' pops to set-after-set
    326     '\'                  n  set-escape
    327     '-'                  n  set-start-dash                                           # leading '-': literal unless it begins "--"
    328     '&'                  n  set-start-amp                                            # leading '&': literal unless it begins "&&"
    329     default              n  set-after-lit                           doSetLiteral     # NOTE(review): no explicit eof row here (unlike set-after-lit); presumably eof falls to this default - confirm it is rejected downstream
    330 
    331 #    set-start-dash    Turn "[--" into a syntax error.
    332 #                           "[-x" is good, - and x are literals.
    333 #
    334 set-start-dash:
    335     '-'                     errorDeath                              doRuleError      # "[--" is a syntax error
    336     default                 set-after-lit                           doSetAddDash     # '-' was a literal; next char is re-examined by set-after-lit
    337 
    338 #    set-start-amp     Turn "[&&" into a syntax error.
    339 #                           "[&x" is good, & and x are literals.
    340 #
    341 set-start-amp:
    342     '&'                     errorDeath                              doRuleError      # "[&&" is a syntax error
    343     default                 set-after-lit                           doSetAddAmp      # '&' was a literal; next char is re-examined by set-after-lit
    344 
    345 #
    346 #   set-after-lit    The last thing scanned was a literal character within a set.
    347 #                    Can be followed by anything.  Single '-' or '&' are
    348 #                    literals in this context, not operators.
    349 set-after-lit:
    350     ']'                  n  pop                                     doSetEnd             # end of this set; resume at the pushed state
    351     '['                  n  set-open      ^set-after-set            doSetBeginUnion      # nested set; its ']' pops to set-after-set
    352     '-'                  n  set-lit-dash                                                 # range, "--" operator, or literal '-'; decided in set-lit-dash
    353     '&'                  n  set-lit-amp                                                  # "&&" operator or literal '&'; decided in set-lit-amp
    354     '\'                  n  set-escape
    355     eof                     errorDeath                              doSetNoCloseError    # pattern ended before the closing ']'
    356     default              n  set-after-lit                           doSetLiteral         # another literal member
    357 
    358 set-after-set:
    359     ']'                  n  pop                                     doSetEnd             # end of this set; resume at the pushed state
    360     '['                  n  set-open      ^set-after-set            doSetBeginUnion      # nested set; its ']' pops to set-after-set
    361     '-'                  n  set-set-dash                                                 # [set]-  difference, "--", or literal; decided in set-set-dash
    362     '&'                  n  set-set-amp                                                  # [set]&  intersection, "&&", or literal; decided in set-set-amp
    363     '\'                  n  set-escape
    364     eof                     errorDeath                              doSetNoCloseError    # pattern ended before the closing ']'
    365     default              n  set-after-lit                           doSetLiteral         # a literal member follows the nested set
    366 
    367 set-after-range:
    368     ']'                  n  pop                                     doSetEnd             # end of this set; resume at the pushed state
    369     '['                  n  set-open      ^set-after-set            doSetBeginUnion      # nested set; its ']' pops to set-after-set
    370     '-'                  n  set-range-dash                                               # single '-' is literal here, "--" is still an operator
    371     '&'                  n  set-range-amp                                                # single '&' is literal here, "&&" is still an operator
    372     '\'                  n  set-escape
    373     eof                     errorDeath                              doSetNoCloseError    # pattern ended before the closing ']'
    374     default              n  set-after-lit                           doSetLiteral         # a literal member follows the range
    375     
    376 
    377 # set-after-op
    378 #     After a --  or &&
    379 #     It is an error to close a set at this point.
    380 #
    381 set-after-op:
    382     '['                  n  set-open         ^set-after-set         doSetBeginUnion      # operand is a nested set
    383     ']'                     errorDeath                              doSetOpError         # operator with no right-hand operand
    384     '\'                  n  set-escape                                                   # operand starts with an escape
    385     default              n  set-after-lit                           doSetLiteral         # operand starts with a literal.  NOTE(review): no explicit eof row; presumably eof falls to this default - confirm
    386 
    387 #
    388 #   set-set-amp
    389 #      Have scanned [[set]&
    390 #      Could be a '&' intersection operator, if a set follows.
    391 #      Could be the start of a '&&' operator.
    392 #      Otherwise is a literal.
    393 set-set-amp:
    394     '['                  n  set-open      ^set-after-set           doSetBeginIntersection1   # [set]&[set]   single-'&' intersection
    395     '&'                  n  set-after-op                           doSetIntersection2        # [set]&&       operand must follow
    396     default                 set-after-lit                          doSetAddAmp               # '&' was a literal; next char is re-examined by set-after-lit
    397 
    398 
    399 # set-lit-amp   Have scanned "[literals&"
    400 #               Could be a start of "&&" operator or a literal
    401 #               In [abc&[def]],   the '&' is a literal
    402 #
    403 set-lit-amp:
    404     '&'                  n  set-after-op                            doSetIntersection2       # "&&" operator; operand must follow
    405     default                 set-after-lit                           doSetAddAmp              # '&' was a literal (this includes a following '[')
    406 
    407 
    408 #
    409 #  set-set-dash
    410 #      Have scanned [set]-
    411 #      Could be a '-' difference operator, if a [set] follows.
    412 #      Could be the start of a '--' operator.
    413 #      Otherwise is a literal.
    414 set-set-dash:
    415     '['                  n  set-open      ^set-after-set           doSetBeginDifference1     # [set]-[set]   single-'-' difference
    416     '-'                  n  set-after-op                           doSetDifference2          # [set]--       operand must follow
    417     default                 set-after-lit                          doSetAddDash              # '-' was a literal; next char is re-examined by set-after-lit
    418 
    419 
    420 #
    421 #  set-range-dash
    422 #      scanned  a-b-  or \w-
    423 #         any set or range like item where the trailing single '-' should
    424 #         be literal, not a set difference operation.
    425 #         A trailing "--" is still a difference operator.
    426 set-range-dash:
    427     '-'                  n  set-after-op                           doSetDifference2          # "--" operator; operand must follow
    428     default                 set-after-lit                          doSetAddDash              # single '-' after a range or class is a literal
    429 
    430 
    431 set-range-amp:
    432     '&'                  n  set-after-op                           doSetIntersection2        # "&&" operator; operand must follow
    433     default                 set-after-lit                          doSetAddAmp               # single '&' after a range or class is a literal
    434 
    435 
    436 #  set-lit-dash
    437 #     Have scanned "[literals-" Could be a range or a -- operator or a literal
    438 #     In [abc-[def]], the '-' is a literal (confirmed with a Java test)
    439 #        [abc-\p{xx}  the '-' is an error
    440 #        [abc-]       the '-' is a literal
    441 #        [ab-xy]      the '-' is a range
    442 #
    443 set-lit-dash:
    444     '-'                  n  set-after-op                            doSetDifference2         # "--" operator; operand must follow
    445     '['                     set-after-lit                           doSetAddDash             # [abc-[  '-' is a literal; '[' is re-examined by set-after-lit
    446     ']'                     set-after-lit                           doSetAddDash             # [abc-]  '-' is a literal; ']' is re-examined by set-after-lit
    447     '\'                  n  set-lit-dash-escape                                              # range end may be an escaped char; see set-lit-dash-escape
    448     default              n  set-after-range                         doSetRange               # [ab-xy] this char is the upper end of a range
    449 
    450 # set-lit-dash-escape
    451 #
    452 #    scanned "[literal-\"
    453 #    Could be a range, if the \ introduces an escaped literal char or a named char.
    454 #    Otherwise it is an error.
    455 #
    456 set-lit-dash-escape:
    457    's'                      errorDeath                             doSetOpError      # [a-\s]  a class escape cannot end a range
    458    'S'                      errorDeath                             doSetOpError
    459    'w'                      errorDeath                             doSetOpError
    460    'W'                      errorDeath                             doSetOpError
    461    'd'                      errorDeath                             doSetOpError
    462    'D'                      errorDeath                             doSetOpError
    463    'N'                      set-after-range                        doSetNamedRange   # [a-\N{NAME}]  'N' is not advanced over here
    464    default               n  set-after-range                        doSetRange        # range ending at an escaped literal char
    465 
    466    
    467 #
    468 #  set-escape
    469 #       Common back-slash escape processing within set expressions
    470 #
    471 set-escape:
    472    'p'                      set-after-set                           doSetProp            # \p{..}  continues in set-after-set; 'p' is not advanced over here
    473    'P'                      set-after-set                           doSetProp            # \P{..}  same action as \p
    474    'N'                      set-after-lit                           doSetNamedChar       # \N{NAME}  continues in set-after-lit, like a literal
    475    's'                   n  set-after-range                         doSetBackslash_s     # class escapes continue in set-after-range,
    476    'S'                   n  set-after-range                         doSetBackslash_S     #   where a following single '-' or '&' is a literal
    477    'w'                   n  set-after-range                         doSetBackslash_w
    478    'W'                   n  set-after-range                         doSetBackslash_W
    479    'd'                   n  set-after-range                         doSetBackslash_d
    480    'D'                   n  set-after-range                         doSetBackslash_D
    481    'h'                   n  set-after-range                         doSetBackslash_h
    482    'H'                   n  set-after-range                         doSetBackslash_H
    483    'v'                   n  set-after-range                         doSetBackslash_v
    484    'V'                   n  set-after-range                         doSetBackslash_V
    485    default               n  set-after-lit                           doSetLiteralEscaped  # any other escaped char is a literal member
    486 
    487 #
    488 # set-finish
    489 #     Have just encountered the final ']' that completes a [set], and
    490 #     arrived here via a pop.  From here, we exit the set parsing world, and go
    491 #     back to generic regular expression parsing.
    492 #
    493 set-finish:
    494     default                 expr-quant                              doSetFinish      # always taken; input is not advanced; a quantifier may follow the set
    495 
    496 
    497 #
    498 # errorDeath.   This state is specified as the next state whenever a syntax error
    499 #               in the pattern is detected.  Barring bugs, the state machine will never
    500 #               actually get here, but will stop because of the action associated with the error.
    501 #               But, just in case, this state asks the state machine to exit.
    502 errorDeath:
    503     default              n errorDeath                               doExit           # loops on itself for every input char, requesting exit each time
    504 
    505 
    506