Home | History | Annotate | Download | only in i18n
      1 
      2 #*****************************************************************************
      3 #
      4 #   Copyright (C) 2002-2007, International Business Machines Corporation and others.
      5 #   All Rights Reserved.
      6 #
      7 #*****************************************************************************
      8 #
      9 #  file:  regexcst.txt
     10 #  ICU Regular Expression Parser State Table
     11 #
     12 #     This state table is used when reading and parsing a regular expression pattern
     13 #     The pattern parser uses a state machine; the data in this file define the
     14 #     state transitions that occur for each input character.
     15 #
     16 #     *** This file defines the regex pattern grammar.   This is it.
     17 #     *** The determination of what is accepted is here.
     18 #
     19 #     This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
     20 #     that are then built with the rule parser.
     21 #
     22 
     23 #
     24 # Here is the syntax of the state definitions in this file:
     25 #
     26 #
     27 #StateName:
     28 #   input-char           n next-state           ^push-state     action
     29 #   input-char           n next-state           ^push-state     action
     30 #       |                |   |                      |             |
     31 #       |                |   |                      |             |--- action to be performed by state machine
     32 #       |                |   |                      |                  See function RegexCompile::doParseActions()
     33 #       |                |   |                      |
     34 #       |                |   |                      |--- Push this named state onto the state stack.
     35 #       |                |   |                           Later, when next state is specified as "pop",
     36 #       |                |   |                           the pushed state will become the current state.
     37 #       |                |   |
     38 #       |                |   |--- Transition to this state if the current input character matches the input
     39 #       |                |        character or char class in the left hand column.  "pop" causes the next
     40 #       |                |        state to be popped from the state stack.
     41 #       |                |
     42 #       |                |--- When making the state transition specified on this line, advance to the next
     43 #       |                     character from the input only if 'n' appears here.
     44 #       |
     45 #       |--- Character or named character classes to test for.  If the current character being scanned
     46 #            matches, perform the actions and go to the state specified on this line.
     47 #            The input character is tested sequentially, in the order written.  The characters and
     48 #            character classes tested for do not need to be mutually exclusive.  The first match wins.
     49 #
     50 
     51 
     52 
     53 
     54 #
     55 #  start     Initial state; the scan position is at the beginning of the pattern.
     56 #            Without consuming any input, perform doPatStart and move on to "term".
     57 start:
     58    default                 term                                     doPatStart
     59 
     60 
     61 
     62 
     63 #
     64 #  term.  At a position where we can accept the start of most items in a pattern.
     65 #         Rules are tried top to bottom and the first match wins.
     66 term:
     67     quoted               n expr-quant                               doLiteralChar
     68     rule_char            n expr-quant                               doLiteralChar
     69     '['                  n set-open       ^set-finish               doSetBegin      #  [set]; the closing ']' pops to set-finish
     70     '('                  n open-paren
     71     '.'                  n expr-quant                               doDotAny
     72     '^'                  n expr-quant                               doCaret
     73     '$'                  n expr-quant                               doDollar
     74     '\'                  n backslash
     75     '|'                  n  term                                    doOrOperator
     76     ')'                  n  pop                                     doCloseParen    #  continue in whatever state is on top of the state stack
     77     eof	                   term                                     doPatFinish
     78     default                errorDeath                               doRuleError     #  no rule above matched
     79 
     80 
     81 
     82 #
     83 #   expr-quant    We've just finished scanning a term, now look for the optional
     84 #                 trailing quantifier - *, +, ?, *?,  etc.
     85 #                 A '(' goes to open-paren-quant (a (?#comment) may precede the quantifier); anything else to expr-cont.
     86 expr-quant:
     87     '*'                  n  quant-star
     88     '+'                  n  quant-plus
     89     '?'                  n  quant-opt
     90     '{'                  n  interval-open                          doIntervalInit
     91     '('                  n  open-paren-quant
     92     default                 expr-cont
     93 
     94 
     95 #
     96 #  expr-cont      Expression, continuation.  At a point where additional terms are
     97 #                                            allowed, but not required.  No Quantifiers
     98 #                 Anything other than '|' or ')' is handed to term without being consumed.
     99 expr-cont:
    100     '|'                  n  term                                    doOrOperator
    101     ')'                  n  pop                                     doCloseParen
    102     default                 term
    103 
    104 
    105 #
    106 #   open-paren-quant   Special case handling for comments appearing before a quantifier,
    107 #                        e.g.   x(?#comment )*
    108 #                      Open parens from expr-quant come here; anything but a (?# comment
    109 #                      branches into the normal parenthesis sequence as quickly as possible.
    110 #   open-paren-quant2  Reached after "(?".  On '#' skip the comment, then pop back to expr-quant.
    111 open-paren-quant:
    112     '?'                  n  open-paren-quant2                      doSuppressComments
    113     default                 open-paren
    114 
    115 open-paren-quant2:
    116     '#'                  n  paren-comment   ^expr-quant
    117     default                 open-paren-extended
    118 
    119 
    120 #
    121 #   open-paren    We've got an open paren.  We need to scan further to
    122 #                 determine what kind of parenthesis it is - plain (, (?:, (?>, or whatever.
    123 #
    124 open-paren:
    125     '?'                  n  open-paren-extended                     doSuppressComments
    126     default                 term            ^expr-quant             doOpenCaptureParen     #  plain (   The following char is not consumed here.
    127 
    128 open-paren-extended:
    129     ':'                  n  term            ^expr-quant             doOpenNonCaptureParen  #  (?:
    130     '>'                  n  term            ^expr-quant             doOpenAtomicParen      #  (?>
    131     '='                  n  term            ^expr-cont              doOpenLookAhead        #  (?=
    132     '!'                  n  term            ^expr-cont              doOpenLookAheadNeg     #  (?!
    133     '<'                  n  open-paren-lookbehind
    134     '#'                  n  paren-comment   ^term
    135     'i'                     paren-flag                              doBeginMatchMode       #  (?i  The flag char is not consumed here; paren-flag sees it again.
    136     'd'                     paren-flag                              doBeginMatchMode       #  (?d
    137     'm'                     paren-flag                              doBeginMatchMode       #  (?m
    138     's'                     paren-flag                              doBeginMatchMode       #  (?s
    139     'u'                     paren-flag                              doBeginMatchMode       #  (?u
    140     'w'                     paren-flag                              doBeginMatchMode       #  (?w
    141     'x'                     paren-flag                              doBeginMatchMode       #  (?x
    142     '-'                     paren-flag                              doBeginMatchMode       #  (?-
    143     '('                  n  errorDeath                              doConditionalExpr      #  (?(   goes to errorDeath
    144     '{'                  n  errorDeath                              doPerlInline           #  (?{   goes to errorDeath
    145     default                 errorDeath                              doBadOpenParenType     #  (? followed by anything else
    146 
    147 open-paren-lookbehind:
    148     '='                  n  term            ^expr-cont              doOpenLookBehind       #  (?<=
    149     '!'                  n  term            ^expr-cont              doOpenLookBehindNeg    #  (?<!
    150     default                 errorDeath                              doBadOpenParenType     #  (?< followed by anything else
    151 
    152 
    153 #
    154 #   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
    155 #                    The ')' pops to whichever state was pushed on entry; eof inside the comment is an error.
    156 paren-comment:
    157     ')'                  n  pop
    158     eof		                errorDeath                              doMismatchedParenErr
    159     default              n  paren-comment
    160 
    161 #
    162 #  paren-flag    Scanned a (?ismx-ismx  flag setting
    163 #                Flag chars are consumed one at a time until a ')' or ':' ends the flag list.
    164 paren-flag:
    165     'i'                  n  paren-flag                              doMatchMode
    166     'd'                  n  paren-flag                              doMatchMode
    167     'm'                  n  paren-flag                              doMatchMode
    168     's'                  n  paren-flag                              doMatchMode
    169     'u'                  n  paren-flag                              doMatchMode
    170     'w'                  n  paren-flag                              doMatchMode
    171     'x'                  n  paren-flag                              doMatchMode
    172     '-'                  n  paren-flag                              doMatchMode
    173     ')'                  n  term                                    doSetMatchMode       #  (?flags)
    174     ':'                  n  term              ^expr-quant           doMatchModeParen     #  (?flags:
    175     default                 errorDeath                              doBadModeFlag        #  any other char in the flag list
    176 
    177 
    178 #
    179 #  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
    180 #                 between plain '*', '*?', '*+'
    181 #
    182 quant-star:
    183      '?'                 n  expr-cont                               doNGStar               #  *?
    184      '+'                 n  expr-cont                               doPossessiveStar       #  *+
    185      default                expr-cont                               doStar                 #  *   (the following char is not consumed)
    186 
    187 
    188 #
    189 #  quant-plus     Scanning a '+' quantifier.  Need to look ahead to decide
    190 #                 between plain '+', '+?', '++'
    191 #
    192 quant-plus:
    193      '?'                 n  expr-cont                               doNGPlus               #  +?
    194      '+'                 n  expr-cont                               doPossessivePlus       #  ++
    195      default                expr-cont                               doPlus                 #  +   (the following char is not consumed)
    196 
    197 
    198 #
    199 #  quant-opt  Scanning a '?' quantifier.  Need to look ahead to decide
    200 #                  between plain '?', '??', '?+'
    201 #             All three alternatives continue in expr-cont.
    202 quant-opt:
    203      '?'                 n  expr-cont                               doNGOpt                 #  ??
    204      '+'                 n  expr-cont                               doPossessiveOpt         #  ?+
    205      default                expr-cont                               doOpt                   #  ?   (the following char is not consumed)
    206 
    207 
    208 #
    209 #   Interval         scanning a '{', the opening delimiter for an interval specification
    210 #                                   {number} or {min, max} or {min,}
    211 #                    The first digit is not consumed here; interval-lower sees it again.
    212 interval-open:
    213     digit_char              interval-lower
    214     default                 errorDeath                              doIntervalError
    215 
    216 interval-lower:
    217     digit_char           n  interval-lower                          doIntevalLowerDigit        # stay here while digits continue
    218     ','			         n  interval-upper
    219     '}'                  n  interval-type                           doIntervalSame             # {n}
    220     default                 errorDeath                              doIntervalError            # anything else after the digits
    221 
    222 interval-upper:
    223     digit_char           n  interval-upper                          doIntervalUpperDigit       # stay here while digits continue
    224     '}'                  n  interval-type
    225     default                 errorDeath                              doIntervalError            # anything else after the ','
    226 
    227 interval-type:
    228     '?'                  n  expr-cont                               doNGInterval                # {n,m}?
    229     '+'                  n  expr-cont                               doPossessiveInterval        # {n,m}+
    230     default                 expr-cont                               doInterval                  # {n,m}
    231 
    232 
    233 #
    234 #  backslash        #  Backslash.  Figure out which of the \thingies we have encountered.
    235 #                                  The low level next-char function will have preprocessed
    236 #                                  some of them already; those won't come here.
    237 backslash:
    238    'A'                   n  term                                    doBackslashA
    239    'B'                   n  term                                    doBackslashB
    240    'b'                   n  term                                    doBackslashb
    241    'd'                   n  expr-quant                              doBackslashd
    242    'D'                   n  expr-quant                              doBackslashD
    243    'G'                   n  term                                    doBackslashG
    244    'N'                      expr-quant                              doNamedChar      #   \N{NAME}  named char.  The 'N' is not consumed by this transition.
    245    'p'                      expr-quant                              doProperty       #   \p{Lu}  style property.  The 'p' is not consumed by this transition.
    246    'P'                      expr-quant                              doProperty       #   \P{Lu}  The 'P' is not consumed by this transition.
    247    'Q'                   n  term                                    doEnterQuoteMode
    248    'S'                   n  expr-quant                              doBackslashS
    249    's'                   n  expr-quant                              doBackslashs
    250    'W'                   n  expr-quant                              doBackslashW
    251    'w'                   n  expr-quant                              doBackslashw
    252    'X'                   n  expr-quant                              doBackslashX
    253    'Z'                   n  term                                    doBackslashZ
    254    'z'                   n  term                                    doBackslashz
    255    digit_char            n  expr-quant                              doBackRef         #  Will scan multiple digits
    256    eof                      errorDeath                              doEscapeError     #  pattern ends right after the backslash
    257    default               n  expr-quant                              doEscapedLiteralChar   #  any other char following the backslash
    258 
    259 
    260 
    261 #
    262 # [set expression] parsing,
    263 #    All states involved in parsing set expressions have names beginning with "set-"
    264 #
    265 
    266 set-open:
    267    '^'                   n  set-open2                               doSetNegate       #  [^
    268    ':'                      set-posix                               doSetPosixProp    #  [:   The ':' is not consumed by this transition.
    269    default                  set-open2
    270 
    271 set-open2:
    272    ']'                   n  set-after-lit                           doSetLiteral      #  a ']' directly after '[' or '[^' is taken as a literal
    273    default                  set-start
    274 
    275 #  set-posix:
    276 #                  scanned a '[:'  If it really is a [:property:], doSetPosixProp will have
    277 #                  moved the scan to the closing ']'.  If it wasn't a property
    278 #                  expression, the scan will still be at the opening ':', which should
    279 #                  be interpreted as a normal set expression.
    280 set-posix:
    281     ']'                  n   pop                                    doSetEnd     # it was a [:property:]; now at its closing ']'
    282     ':'                      set-start
    283     default                  errorDeath                             doRuleError  # should not be possible.
    284 
    285 #
    286 #
    287 #   set-start   after the [ and special case leading characters (^ and/or ]) but before
    288 #               everything else.   A '-' is literal at this point.
    289 #               A leading '-' or '&' gets one character of look-ahead in set-start-dash / set-start-amp.
    290 set-start:
    291     ']'                  n  pop                                     doSetEnd
    292     '['                  n  set-open      ^set-after-set            doSetBeginUnion    #  nested [set]; its closing ']' pops to set-after-set
    293     '\'                  n  set-escape
    294     '-'                  n  set-start-dash
    295     '&'                  n  set-start-amp
    296     default              n  set-after-lit                           doSetLiteral       #  any other char, handled as a literal
    296 
    297 #    set-start-dash    Turn "[--" into a syntax error.
    298 #                           "[-x" is good, - and x are literals.
    299 #
    300 set-start-dash:
    301     '-'                     errorDeath                              doRuleError     #  [--
    302     default                 set-after-lit                           doSetAddDash    #  [-x   x is not consumed here; set-after-lit sees it again
    303 
    304 #    set-start-amp     Turn "[&&" into a syntax error.
    305 #                           "[&x" is good, & and x are literals.
    306 #
    307 set-start-amp:
    308     '&'                     errorDeath                              doRuleError     #  [&&
    309     default                 set-after-lit                           doSetAddAmp     #  [&x   x is not consumed here; set-after-lit sees it again
    310 
    311 #
    312 #   set-after-lit    The last thing scanned was a literal character within a set.
    313 #                    Can be followed by anything.  Single '-' or '&' are
    314 #                    literals in this context, not operators.
    315 set-after-lit:
    316     ']'                  n  pop                                     doSetEnd
    317     '['                  n  set-open      ^set-after-set            doSetBeginUnion
    318     '-'                  n  set-lit-dash
    319     '&'                  n  set-lit-amp
    320     '\'                  n  set-escape
    321     eof                     errorDeath                              doSetNoCloseError    #  pattern ended before the closing ']'
    322     default              n  set-after-lit                           doSetLiteral         #  another literal; stay in this state
    323 
    324 set-after-set:
    325     ']'                  n  pop                                     doSetEnd
    326     '['                  n  set-open      ^set-after-set            doSetBeginUnion      #  [set][set
    327     '-'                  n  set-set-dash
    328     '&'                  n  set-set-amp
    329     '\'                  n  set-escape
    330     eof                     errorDeath                              doSetNoCloseError    #  pattern ended before the closing ']'
    331     default              n  set-after-lit                           doSetLiteral         #  [set]x
    332 
    333 set-after-range:
    334     ']'                  n  pop                                     doSetEnd
    335     '['                  n  set-open      ^set-after-set            doSetBeginUnion
    336     '-'                  n  set-range-dash
    337     '&'                  n  set-range-amp
    338     '\'                  n  set-escape
    339     eof                     errorDeath                              doSetNoCloseError    #  pattern ended before the closing ']'
    340     default              n  set-after-lit                           doSetLiteral         #  a literal following the range-like item
    341     
    342 
    343 # set-after-op
    344 #     After a --  or &&
    345 #     It is an error to close a set at this point.
    346 #
    347 set-after-op:
    348     '['                  n  set-open         ^set-after-set         doSetBeginUnion
    349     ']'                     errorDeath                              doSetOpError     #  the operator has no right-hand operand
    350     '\'                  n  set-escape
    351     default              n  set-after-lit                           doSetLiteral
    352 
    353 #
    354 #   set-set-amp
    355 #      Have scanned [[set]&
    356 #      Could be a '&' intersection operator, if a set follows.
    357 #      Could be the start of a '&&' operator.
    358 #      Otherwise it is a literal.
    359 set-set-amp:
    360     '['                  n  set-open      ^set-after-set           doSetBeginIntersection1
    361     '&'                  n  set-after-op                           doSetIntersection2
    362     default                 set-after-lit                          doSetAddAmp
    363 
    364 
    365 # set-lit-amp   Have scanned "[literals&"
    366 #               Could be a start of "&&" operator or a literal
    367 #               In [abc&[def]],   the '&' is a literal
    368 #
    369 set-lit-amp:
    370     '&'                  n  set-after-op                            doSetIntersection2   #  &&
    371     default                 set-after-lit                           doSetAddAmp          #  single '&'; the following char is not consumed here
    372 
    373 
    374 #
    375 #  set-set-dash
    376 #      Have scanned [set]-
    377 #      Could be a '-' difference operator, if a [set] follows.
    378 #      Could be the start of a '--' operator.
    379 #      Otherwise it is a literal.
    380 set-set-dash:
    381     '['                  n  set-open      ^set-after-set           doSetBeginDifference1
    382     '-'                  n  set-after-op                           doSetDifference2
    383     default                 set-after-lit                          doSetAddDash
    384 
    385 
    386 #
    387 #  set-range-dash
    388 #      scanned  a-b-  or \w-
    389 #         any set or range like item where the trailing single '-' should
    390 #         be literal, not a set difference operation.
    391 #         A trailing "--" is still a difference operator.
    392 set-range-dash:
    393     '-'                  n  set-after-op                           doSetDifference2     #  --
    394     default                 set-after-lit                          doSetAddDash         #  single '-'; the following char is not consumed here
    395 
    396 
    397 set-range-amp:
    398     '&'                  n  set-after-op                           doSetIntersection2   #  &&
    399     default                 set-after-lit                          doSetAddAmp          #  single '&'; the following char is not consumed here
    400 
    401 
    402 #  set-lit-dash
    403 #     Have scanned "[literals-" Could be a range or a -- operator or a literal
    404 #     In [abc-[def]], the '-' is a literal (confirmed with a Java test)
    405 #        [abc-\p{xx}  the '-' is an error
    406 #        [abc-]       the '-' is a literal
    407 #        [ab-xy]      the '-' is a range
    408 #
    409 set-lit-dash:
    410     '-'                  n  set-after-op                            doSetDifference2   #  --
    411     '['                     set-after-lit                           doSetAddDash       #  the '[' is not consumed here; set-after-lit sees it again
    412     ']'                     set-after-lit                           doSetAddDash       #  the ']' is not consumed here; set-after-lit sees it again
    413     '\'                  n  set-lit-dash-escape
    414     default              n  set-after-range                         doSetRange         #  a-b
    415 
    416 # set-lit-dash-escape
    417 #
    418 #    scanned "[literal-\"
    419 #    Could be a range, if the \ introduces an escaped literal char or a named char.
    420 #    Otherwise it is an error.
    421 #
    422 set-lit-dash-escape:
    423    's'                      errorDeath                             doSetOpError       #  [a-\s
    424    'S'                      errorDeath                             doSetOpError       #  [a-\S
    425    'w'                      errorDeath                             doSetOpError       #  [a-\w
    426    'W'                      errorDeath                             doSetOpError       #  [a-\W
    427    'd'                      errorDeath                             doSetOpError       #  [a-\d
    428    'D'                      errorDeath                             doSetOpError       #  [a-\D
    429    'N'                      set-after-range                        doSetNamedRange    #  [a-\N{NAME}   The 'N' is not consumed by this transition.
    430    default               n  set-after-range                        doSetRange         #  [a-\x   escaped literal as the range end
    431 
    432    
    433 #
    434 #  set-escape
    435 #       Common back-slash escape processing within set expressions
    436 #       The next state records what kind of item the escape produced (set, range-like item, or literal).
    437 set-escape:
    438    'p'                      set-after-set                           doSetProp           #  \p{..}  The 'p' is not consumed by this transition.
    439    'P'                      set-after-set                           doSetProp           #  \P{..}  The 'P' is not consumed by this transition.
    440    'N'                      set-after-lit                           doSetNamedChar      #  \N{NAME}  The 'N' is not consumed by this transition.
    441    's'                   n  set-after-range                         doSetBackslash_s
    442    'S'                   n  set-after-range                         doSetBackslash_S
    443    'w'                   n  set-after-range                         doSetBackslash_w
    444    'W'                   n  set-after-range                         doSetBackslash_W
    445    'd'                   n  set-after-range                         doSetBackslash_d
    446    'D'                   n  set-after-range                         doSetBackslash_D
    447    default               n  set-after-lit                           doSetLiteralEscaped 
    448 
    449 #
    450 # set-finish
    451 #     Have just encountered the final ']' that completes a [set], and
    452 #     arrived here via a pop.  From here, we exit the set parsing world, and go
    453 #     back to generic regular expression parsing.
    454 #     No input is consumed here; expr-quant looks at the character after the ']'.
    455 set-finish:
    456     default                 expr-quant                              doSetFinish
    457 
    458 
    459 #
    460 # errorDeath.   This state is specified as the next state whenever a syntax error
    461 #               in the pattern is detected.  Barring bugs, the state machine will never
    462 #               actually get here, but will stop because of the action associated with the error.
    463 #               But, just in case, this state asks the state machine to exit.
    464 errorDeath:
    465     default              n errorDeath                               doExit
    466 
    467 
    468