Home | History | Annotate | Download | only in common
      1 
      2 #*****************************************************************************
      3 #
      4 #   Copyright (C) 2016 and later: Unicode, Inc. and others.
      5 #   License & terms of use: http://www.unicode.org/copyright.html#License
      6 #
      7 #*****************************************************************************
      8 #*****************************************************************************
      9 #
     10 #   Copyright (C) 2002-2016, International Business Machines Corporation and others.
     11 #   All Rights Reserved.
     12 #
     13 #*****************************************************************************
     14 #
     15 #  file:  rbbirpt.txt
     16 #  ICU Break Iterator Rule Parser State Table
     17 #
     18 #     This state table is used when reading and parsing a set of RBBI rules.
     19 #     The rule parser uses a state machine; the data in this file define the
     20 #     state transitions that occur for each input character.
     21 #
     22 #     *** This file defines the RBBI rule grammar.   This is it.
     23 #     *** The determination of what is accepted is here.
     24 #
     25 #     This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
     26 #     that are then built with the rule parser.
     27 #
     28 #    perl rbbicst.pl    < rbbirpt.txt > rbbirpt.h
     29 
     30 #
     31 # Here is the syntax of the state definitions in this file:
     32 #
     33 #
     34 #StateName:
     35 #   input-char           n next-state           ^push-state     action    
     36 #   input-char           n next-state           ^push-state     action    
     37 #       |                |   |                      |             |
     38 #       |                |   |                      |             |--- action to be performed by state machine
     39 #       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
     40 #       |                |   |                      |
     41 #       |                |   |                      |--- Push this named state onto the state stack.
     42 #       |                |   |                           Later, when next state is specified as "pop",
     43 #       |                |   |                           the pushed state will become the current state.
     44 #       |                |   |
     45 #       |                |   |--- Transition to this state if the current input character matches the input
     46 #       |                |        character or char class in the left hand column.  "pop" causes the next
     47 #       |                |        state to be popped from the state stack.
     48 #       |                |
     49 #       |                |--- When making the state transition specified on this line, advance to the next
     50 #       |                     character from the input only if 'n' appears here.
     51 #       |
     52 #       |--- Character or named character classes to test for.  If the current character being scanned
     53 #            matches, perform the actions and go to the state specified on this line.
     54 #            The input character is tested sequentially, in the order written.  The characters and
     55 #            character classes tested for do not need to be mutually exclusive.  The first match wins.
     56 #            
     57 
     58 
     59 
     60 
     61 #
     62 #  start state.  Scan position is at the beginning of the rules file, or in between two rules.
     63 #
     64 start:
     65     escaped                term                  ^break-rule-end    doExprStart    # not consumed here; term consumes it.
     66     white_space          n start                                                  # skip white space between rules.
     67     '^'                  n start-after-caret     ^break-rule-end    doNoChain      # leading '^' inhibits chaining into this rule.
     68     '$'                    scan-var-name         ^assign-or-rule    doExprStart    # assignment or rule; assign-or-rule decides.
     69     '!'                  n rev-option                                             # '!!' option keyword or '!' reverse rule.
     70     ';'                  n start                                                  # ignore empty rules.
     71     eof                    exit                                                   # NOTE(review): 'exit' is not defined in this table; presumably built in to the parser - confirm.
     72     default                term                  ^break-rule-end    doExprStart    # anything else must begin a rule expression.
     73     
     74 #
     75 #  break-rule-end:  Returned to (by a pop) after scanning a break-rule expression.
     76 #
     77 break-rule-end:
     78     ';'	                 n start                                    doEndOfRule    # rule is complete; go look for the next one.
     79     white_space          n break-rule-end
     80     default                errorDeath                               doRuleError    # only ';' may follow the expression.
     81      
     82 #
     83 # start of a rule, after having seen a '^' (inhibits rule chain in).
     84 #     Similar to the main 'start' state in most respects, except
     85 #          - empty rule is an error.
     86 #          - A second '^' is an error.
     87 #
     88 start-after-caret:
     89     escaped                term                                     doExprStart    # nothing pushed here; 'start' pushed break-rule-end on the '^'.
     90     white_space          n start-after-caret
     91     '^'                    errorDeath                               doRuleError    # two '^'s
     92     '$'                    scan-var-name         ^term-var-ref      doExprStart    # treated as a variable reference, not an assignment target.
     93     ';'                    errorDeath                               doRuleError    # ^ ;
     94     eof                    errorDeath                               doRuleError    # input ends right after the '^'.
     95     default                term                                     doExprStart
     96  
     97 #
     98 #   rev-option      We've just scanned a '!', indicating either a !!keyword option flag or a
     99 #                   !Reverse rule.
    100 #
    101 rev-option:
    102     '!'                  n option-scan1                                              # second '!': an option keyword follows.
    103     default                reverse-rule           ^break-rule-end   doReverseDir     # single '!': a reverse rule follows.
    104     
    105 option-scan1:
    106     name_start_char      n option-scan2                             doOptionStart    # first character of the keyword after "!!".
    107     default                errorDeath                               doRuleError      # "!!" must be followed by a keyword.
    108     
    109 option-scan2:
    110     name_char            n option-scan2                                              # consume the rest of the keyword.
    111     default                option-scan3                             doOptionEnd      # first non-name character ends the keyword; it is not consumed.
    112     
    113 option-scan3:
    114     ';'                  n start                                                     # option statement is complete.
    115     white_space          n option-scan3 
    116     default                errorDeath                               doRuleError      # only white space and ';' may follow the keyword.
    117     
    118 
    119 reverse-rule:
    120     default                term                   ^break-rule-end   doExprStart      # NOTE(review): rev-option already pushed break-rule-end on the way here, so it is pushed twice per reverse rule - confirm this is intended.
    121     
    122     
    123 #
    124 #  term.  Eat through a single rule character, or a composite thing, which
    125 #         could be a parenthesized expression, a variable name, or a Unicode Set.
    126 #
    127 term:
    128     escaped              n expr-mod                                 doRuleChar
    129     white_space          n term
    130     rule_char            n expr-mod                                 doRuleChar
    131     '['                    scan-unicode-set      ^expr-mod                         # '[' is left for scan-unicode-set to consume.
    132     '('                  n term                  ^expr-mod          doLParen       # nested expression; expr-mod is resumed by a later pop.
    133     '$'                    scan-var-name         ^term-var-ref                     # '$' is consumed by scan-var-name; term-var-ref then checks the definition.
    134     '.'                  n expr-mod                                 doDotAny
    135     default                errorDeath                               doRuleError    # a term is required at this point.
    136     
    137     
    138 
    139 #
    140 #  term-var-ref   We've just finished scanning a reference to a $variable.
    141 #                 Check that the variable was defined.
    142 #                 The variable name scanning is in common with assignment statements,
    143 #                 so the check can't be done there.
    144 term-var-ref:
    145     default                expr-mod                                 doCheckVarDef  # taken for any input; no character is consumed.
    146     
    147     
    148 #
    149 #   expr-mod      We've just finished scanning a term, now look for the optional
    150 #                 trailing '*', '?', '+'
    151 #
    152 expr-mod:
    153     white_space          n  expr-mod
    154     '*'                  n  expr-cont                               doUnaryOpStar
    155     '+'                  n  expr-cont                               doUnaryOpPlus
    156     '?'                  n  expr-cont                               doUnaryOpQuestion
    157     default                 expr-cont                                                # no trailing operator; the character is left for expr-cont.
    158     
    159     
    160 #
    161 #  expr-cont      Expression, continuation.  At a point where additional terms are
    162 #                                            allowed, but not required.
    163 #
    164 expr-cont:
    165     escaped                 term                                    doExprCatOperator    # another term follows; the character is left for term.
    166     white_space          n  expr-cont
    167     rule_char               term                                    doExprCatOperator
    168     '['                     term                                    doExprCatOperator
    169     '('                     term                                    doExprCatOperator
    170     '$'                     term                                    doExprCatOperator
    171     '.'                     term                                    doExprCatOperator
    172     '/'                     look-ahead                              doExprCatOperator    # the '/' itself is consumed in look-ahead.
    173     '{'                  n  tag-open                                doExprCatOperator    # start of a {ddd} tag.
    174     '|'                  n  term                                    doExprOrOperator
    175     ')'                  n  pop                                     doExprRParen         # consume ')' and return to the pushed state.
    176     default                 pop                                     doExprFinished       # not part of an expression; return to the pushed state without consuming.
    177     
    178 
    179 #
    180 #   look-ahead    Scanning a '/', which identifies a break point, assuming that the
    181 #                 remainder of the expression matches.
    182 #
    183 #                 Generate a parse tree as if this was a special kind of input symbol
    184 #                 appearing in an otherwise normal concatenation expression.
    185 #
    186 look-ahead:
    187     '/'                   n expr-cont-no-slash                      doSlash        # consume the '/'; the next state has no '/' row.
    188     default                 errorDeath                                             # every row in this table that branches here does so on '/', so this is not expected to be taken.
    189 
    190 
    191 #
    192 #  expr-cont-no-slash    Expression, continuation.  At a point where additional terms are
    193 #                                            allowed, but not required.  Just like
    194 #                                            expr-cont, above, except that no '/'
    195 #                                            look-ahead symbol is permitted.
    196 #                                            (There is no '{' tag row in this state either.)
    197 expr-cont-no-slash:
    198     escaped                 term                                    doExprCatOperator    # another term follows; the character is left for term.
    199     white_space          n  expr-cont-no-slash                                           # stay in this state, so white space does not re-enable '/'.
    200     rule_char               term                                    doExprCatOperator
    201     '['                     term                                    doExprCatOperator
    202     '('                     term                                    doExprCatOperator
    203     '$'                     term                                    doExprCatOperator
    204     '.'                     term                                    doExprCatOperator
    205     '|'                  n  term                                    doExprOrOperator
    206     ')'                  n  pop                                     doExprRParen         # consume ')' and return to the pushed state.
    207     default                 pop                                     doExprFinished       # includes '/': return to the pushed state without consuming.
    208 
    209 
    210 #
    211 #   tags             scanning a '{', the opening delimiter for a tag that identifies
    212 #                    the kind of match.  Scan the whole {dddd} tag, where d=digit
    213 #
    214 tag-open:
    215     white_space          n  tag-open
    216     digit_char              tag-value                               doStartTagValue      # first digit is not consumed here; tag-value consumes it.
    217     default                 errorDeath                              doTagExpectedError   # '{' must be followed by a digit.
    218     
    219 tag-value:
    220     white_space          n  tag-close                                                    # white space ends the digits.
    221     '}'                     tag-close                                                    # '}' is left for tag-close to consume.
    222     digit_char           n  tag-value                               doTagDigit           # consume one digit of the tag value.
    223     default                 errorDeath                              doTagExpectedError
    224     
    225 tag-close:
    226     white_space          n  tag-close
    227     '}'                  n  expr-cont-no-tag                        doTagValue           # tag is complete; the next state has no '{' row.
    228     default                 errorDeath                              doTagExpectedError   # only white space and '}' may follow the digits.
    229     
    230     
    231     
    232 #
    233 #  expr-cont-no-tag    Expression, continuation.  At a point where additional terms are
    234 #                                            allowed, but not required.  Just like
    235 #                                            expr-cont, above, except that no "{ddd}"
    236 #                                            tagging is permitted.
    237 #
    238 expr-cont-no-tag:
    239     escaped                 term                                    doExprCatOperator    # another term follows; the character is left for term.
    240     white_space          n  expr-cont-no-tag                                             # loops on this state, so white space does not re-enable '{'.
    241     rule_char               term                                    doExprCatOperator
    242     '['                     term                                    doExprCatOperator
    243     '('                     term                                    doExprCatOperator
    244     '$'                     term                                    doExprCatOperator
    245     '.'                     term                                    doExprCatOperator
    246     '/'                     look-ahead                              doExprCatOperator    # the '/' itself is consumed in look-ahead.
    247     '|'                  n  term                                    doExprOrOperator
    248     ')'                  n  pop                                     doExprRParen         # consume ')' and return to the pushed state.
    249     default                 pop                                     doExprFinished       # includes '{': return to the pushed state without consuming.
    250     
    251     
    252 
    253 
    254 #
    255 #   Variable Name Scanning.
    256 #
    257 #                    The state that branched to here must have pushed a return state
    258 #                    to go to after completion of the variable name scanning.
    259 #
    260 #                    The current input character must be the $ that introduces the name.
    261 #                    The $ is consumed here rather than in the state that first detected it
    262 #                    so that the doStartVariableName action only needs to happen in one
    263 #                    place (here), and the other states don't need to worry about it.
    264 #
    265 scan-var-name:
    266    '$'                  n scan-var-start                            doStartVariableName
    267    default                errorDeath                                                     # every row in this table that branches here does so on '$', so this is not expected to be taken.
    268 
    269 
    270 scan-var-start:
    271     name_start_char      n scan-var-body                                                 # first character of the name after '$'.
    272     default                errorDeath                               doVariableNameExpectedErr    # '$' was not followed by a name.
    273     
    274 scan-var-body:
    275     name_char            n scan-var-body                                                 # consume the rest of the name.
    276     default                pop                                      doEndVariableName    # first non-name character ends the name (not consumed); return to the pushed state.
    277     
    278     
    279     
    280 #
    281 #  scan-unicode-set   Unicode Sets are parsed by the UnicodeSet class.
    282 #                     Within the RBBI parser, after finding the first character
    283 #                     of a Unicode Set, we just hand the rule input at that
    284 #                     point off to the Unicode Set constructor, then pick
    285 #                     up parsing after the close of the set.
    286 #
    287 #                     The action for this state invokes the UnicodeSet parser.
    288 #
    289 scan-unicode-set:
    290     '['                   n pop                                      doScanUnicodeSet
    291     'p'                   n pop                                      doScanUnicodeSet    # NOTE(review): no row in this table branches here on 'p' or 'P' (term only does so on '['); presumably meant for \p / \P sets - confirm.
    292     'P'                   n pop                                      doScanUnicodeSet
    293     default		    errorDeath 
    294     
    295     
    296 
    297 
    298 
    299 
    300 
    301 #
    302 #  assign-or-rule.   A $variable was encountered at the start of something, could be
    303 #                    either an assignment statement or a rule, depending on whether an '='
    304 #                    follows the variable name.  We get to this state when the variable name
    305 #                    scanning does a return.
    306 #
    307 assign-or-rule:
    308     white_space          n assign-or-rule                                           # white space may separate the name from the '='.
    309     '='                  n term                  ^assign-end        doStartAssign   # variable was target of assignment
    310     default                term-var-ref          ^break-rule-end                    # variable was a term in a rule
    311 
    312 
    313 
    314 #
    315 #  assign-end        This state is entered when the end of the expression on the
    316 #                    right hand side of an assignment is found.  We get here via
    317 #                    a pop; this state is pushed when the '=' in an assignment is found.
    318 #
    319 #                    The only thing allowed at this point is a ';'.  The RHS of an
    320 #                    assignment must look like a rule expression, and we come here
    321 #                    when what is being scanned no longer looks like an expression.
    322 #
    323 assign-end:
    324     ';'                  n start                                    doEndAssign             # assignment statement is complete.
    325     default                errorDeath                               doRuleErrorAssignExpr   # anything other than ';' here is an error.
    326     
    327     
    328     
    329 #
    330 # errorDeath.   This state is specified as the next state whenever a syntax error
    331 #               in the source rules is detected.  Barring bugs, the state machine will never
    332 #               actually get here, but will stop because of the action associated with the error.
    333 #               But, just in case, this state asks the state machine to exit.
    334 errorDeath:
    335     default              n errorDeath                               doExit         # matches any input, consumes it and stays in this state.
    336 
    337 
    338