Home | History | Annotate | Download | only in common
      1 
      2 #*****************************************************************************
      3 #
      4 #   Copyright (C) 2002-2003, International Business Machines Corporation and others.
      5 #   All Rights Reserved.
      6 #
      7 #*****************************************************************************
      8 #
      9 #  file:  rbbirpt.txt
     10 #  ICU Break Iterator Rule Parser State Table
     11 #
     12 #     This state table is used when reading and parsing a set of RBBI rules
     13 #     The rule parser uses a state machine; the data in this file define the
     14 #     state transitions that occur for each input character.
     15 #
     16 #     *** This file defines the RBBI rule grammar.   This is it.
     17 #     *** The determination of what is accepted is here.
     18 #
     19 #     This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
     20 #     that are then built with the rule parser.
     21 #
     22 
     23 #
     24 # Here is the syntax of the state definitions in this file:
     25 #
     26 #
     27 #StateName:
     28 #   input-char           n next-state           ^push-state     action    
     29 #   input-char           n next-state           ^push-state     action    
     30 #       |                |   |                      |             |
     31 #       |                |   |                      |             |--- action to be performed by state machine
     32 #       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
     33 #       |                |   |                      |
     34 #       |                |   |                      |--- Push this named state onto the state stack.
     35 #       |                |   |                           Later, when next state is specified as "pop",
     36 #       |                |   |                           the pushed state will become the current state.
     37 #       |                |   |
     38 #       |                |   |--- Transition to this state if the current input character matches the input
     39 #       |                |        character or char class in the left hand column.  "pop" causes the next
     40 #       |                |        state to be popped from the state stack.
     41 #       |                |
     42 #       |                |--- When making the state transition specified on this line, advance to the next
     43 #       |                     character from the input only if 'n' appears here.
     44 #       |
     45 #       |--- Character or named character classes to test for.  If the current character being scanned
     46 #            matches, perform the actions and go to the state specified on this line.
     47 #            The input character is tested sequentially, in the order written.  The characters and
     48 #            character classes tested for do not need to be mutually exclusive.  The first match wins.
     49 #            
     50 
     51 
     52 
     53 
     54 #
     55 #  start state, scan position is at the beginning of the rules file, or in between two rules.
     56 #
     57 start:
     58     escaped                term                  ^break-rule-end    doExprStart       # rule starts with an escaped char; term consumes it
     59     white_space          n start                                                      # skip white space between rules
     60     '$'                    scan-var-name         ^assign-or-rule    doExprStart       # '$' is consumed by scan-var-name, not here
     61     '!'                  n rev-option                                                 # '!' consumed; rev-option tells !!option from !Reverse rule
     62     ';'                  n start                                                  # ignore empty rules.
     63     eof                    exit                                                       # end of input
     64     default                term                  ^break-rule-end    doExprStart       # any other char is left for term to classify
     65     
     66 #
     67 #  break-rule-end:  Returned from doing a break-rule expression.
     68 #
     69 break-rule-end:
     70     ';'	                 n start                                    doEndOfRule       # ';' terminates the rule; go scan the next one
     71     white_space          n break-rule-end                                             # skip white space before the ';'
     72     default                errorDeath                               doRuleError       # anything else after the expression is a syntax error
     73      
     74 
     75 #
     76 #   !               We've just scanned a '!', indicating either a !!key word flag or a
     77 #                   !Reverse rule.
     78 #
     79 rev-option:
     80     '!'                  n option-scan1                                               # second '!' consumed: this is a !!key word
     81     default                reverse-rule           ^break-rule-end   doReverseDir      # single '!': a !Reverse rule; char is not consumed
     82     
     83 option-scan1:
     84     name_start_char      n option-scan2                             doOptionStart     # first character of the key word following "!!"
     85     default                errorDeath                               doRuleError       # "!!" not followed by a name
     86     
     87 option-scan2:
     88     name_char            n option-scan2                                               # consume the remaining characters of the key word
     89     default                option-scan3                             doOptionEnd       # first non-name char ends the key word; it is not consumed
     90     
     91 option-scan3:
     92     ';'                  n start                                                      # ';' ends the !!key word statement
     93     white_space          n option-scan3                                               # skip white space before the ';'
     94     default                errorDeath                               doRuleError       # anything else after the key word is an error
     95     
     96 
     97 reverse-rule:
     98     default                term                                     doExprStart       # always taken, no input consumed.  No push here: rev-option already pushed break-rule-end, and a second push would never be popped.
     99     
    100     
    101 #
    102 #  term.  Eat through a single rule character, or a composite thing, which
    103 #         could be a parenthesized expression, a variable name, or a Unicode Set.
    104 #
    105 term:
    106     escaped              n expr-mod                                 doRuleChar        # escaped character is consumed as a single-character term
    107     white_space          n term                                                       # skip white space before a term
    108     rule_char            n expr-mod                                 doRuleChar        # ordinary rule character is consumed as a single-character term
    109     '['                    scan-unicode-set      ^expr-mod                            # '[' is left for scan-unicode-set; return to expr-mod afterwards
    110     '('                  n term                  ^expr-mod          doLParen          # parenthesized expression; expr-mod resumes when the pushed state is popped
    111     '$'                    scan-var-name         ^term-var-ref                        # '$' is left for scan-var-name; return via term-var-ref
    112     '.'                  n expr-mod                                 doDotAny          # '.' is consumed as a term by itself
    113     default                errorDeath                               doRuleError       # current char cannot start a term
    114     
    115     
    116 
    117 #
    118 #  term-var-ref   We've just finished scanning a reference to a $variable.
    119 #                 Check that the variable was defined.
    120 #                 The variable name scanning is in common with assignment statements,
    121 #                 so the check can't be done there.
    122 term-var-ref:
    123     default                expr-mod                                 doCheckVarDef     # always taken; no input is consumed
    124     
    125     
    126 #
    127 #   expr-mod      We've just finished scanning a term, now look for the optional
    128 #                 trailing '*', '?', '+'
    129 #
    130 expr-mod:
    131     white_space          n  expr-mod                                                   # skip white space after the term
    132     '*'                  n  expr-cont                               doUnaryOpStar      # consume trailing '*'
    133     '+'                  n  expr-cont                               doUnaryOpPlus      # consume trailing '+'
    134     '?'                  n  expr-cont                               doUnaryOpQuestion  # consume trailing '?'
    135     default                 expr-cont                                                  # no trailing operator; char is left for expr-cont
    136     
    137     
    138 #
    139 #  expr-cont      Expression, continuation.  At a point where additional terms are
    140 #                                            allowed, but not required.
    141 #
    142 expr-cont:
    143     escaped                 term                                    doExprCatOperator  # this row through '.': char can start another term; it is left for term to handle
    144     white_space          n  expr-cont                                                  # skip white space between terms
    145     rule_char               term                                    doExprCatOperator
    146     '['                     term                                    doExprCatOperator
    147     '('                     term                                    doExprCatOperator
    148     '$'                     term                                    doExprCatOperator
    149     '.'                     term                                    doExprCatOperator
    150     '/'                     look-ahead                              doExprCatOperator  # '/' is left for look-ahead to consume
    151     '{'                  n  tag-open                                doExprCatOperator  # consume '{' and scan the {dddd} tag
    152     '|'                  n  term                                    doExprOrOperator   # consume '|'; a term must follow
    153     ')'                  n  pop                                     doExprRParen       # consume ')' and return to the pushed state
    154     default                 pop                                     doExprFinished     # no longer looks like an expression; return to the pushed state, char not consumed
    155     
    156 
    157 #
    158 #   look-ahead    Scanning a '/', which identifies a break point, assuming that the
    159 #                 remainder of the expression matches.
    160 #
    161 #                 Generate a parse tree as if this was a special kind of input symbol
    162 #                 appearing in an otherwise normal concatenation expression.
    163 #
    164 look-ahead:
    165     '/'                   n expr-cont-no-slash                      doSlash            # consume the '/'; another '/' may not follow immediately
    166     default                 errorDeath                                                 # not expected: within this file this state is only entered with '/' as the current char
    167 
    168 
    169 #
    170 #  expr-cont-no-slash    Expression, continuation.  At a point where additional terms are
    171 #                                            allowed, but not required.  Just like
    172 #                                            expr-cont, above, except that no '/'
    173 #                                            look-ahead symbol is permitted.
    174 #
    175 expr-cont-no-slash:
    176     escaped                 term                                    doExprCatOperator  # this row through '.': char can start another term; it is left for term to handle
    177     white_space          n  expr-cont-no-slash                                         # stay in this state so that '/' remains disallowed after white space
    178     rule_char               term                                    doExprCatOperator
    179     '['                     term                                    doExprCatOperator
    180     '('                     term                                    doExprCatOperator
    181     '$'                     term                                    doExprCatOperator
    182     '.'                     term                                    doExprCatOperator
    183     '|'                  n  term                                    doExprOrOperator   # consume '|'; a term must follow
    184     ')'                  n  pop                                     doExprRParen       # consume ')' and return to the pushed state
    185     default                 pop                                     doExprFinished     # includes '/': return to the pushed state, char not consumed
    186 
    187 
    188 #
    189 #   tags             scanning a '{', the opening delimiter for a tag that identifies
    190 #                    the kind of match.  Scan the whole {dddd} tag, where d=digit
    191 #
    192 tag-open:
    193     white_space          n  tag-open                                                   # skip white space after the '{'
    194     digit_char              tag-value                               doStartTagValue    # first digit is not consumed here; tag-value consumes it
    195     default                 errorDeath                              doTagExpectedError # '{' must be followed by a digit
    196     
    197 tag-value:
    198     white_space          n  tag-close                                                  # white space ends the run of digits
    199     '}'                     tag-close                                                  # '}' is left for tag-close to consume
    200     digit_char           n  tag-value                               doTagDigit         # consume one digit of the tag value
    201     default                 errorDeath                              doTagExpectedError # any other char inside the tag is an error
    202     
    203 tag-close:
    204     white_space          n  tag-close                                                  # skip white space before the '}'
    205     '}'                  n  expr-cont-no-tag                        doTagValue         # consume '}'; another tag may not follow immediately
    206     default                 errorDeath                              doTagExpectedError # only white space or '}' may follow the digits
    207     
    208     
    209     
    210 #
    211 #  expr-cont-no-tag    Expression, continuation.  At a point where additional terms are
    212 #                                            allowed, but not required.  Just like
    213 #                                            expr-cont, above, except that no "{ddd}"
    214 #                                            tagging is permitted.
    215 #
    216 expr-cont-no-tag:
    217     escaped                 term                                    doExprCatOperator  # this row through '.': char can start another term; it is left for term to handle
    218     white_space          n  expr-cont-no-tag                                           # stay in this state so that '{' remains disallowed after white space
    219     rule_char               term                                    doExprCatOperator
    220     '['                     term                                    doExprCatOperator
    221     '('                     term                                    doExprCatOperator
    222     '$'                     term                                    doExprCatOperator
    223     '.'                     term                                    doExprCatOperator
    224     '/'                     look-ahead                              doExprCatOperator  # '/' is left for look-ahead to consume
    225     '|'                  n  term                                    doExprOrOperator   # consume '|'; a term must follow
    226     ')'                  n  pop                                     doExprRParen       # consume ')' and return to the pushed state
    227     default                 pop                                     doExprFinished     # includes '{': return to the pushed state, char not consumed
    228     
    229     
    230 
    231 
    232 #
    233 #   Variable Name Scanning.
    234 #
    235 #                    The state that branched to here must have pushed a return state
    236 #                    to go to after completion of the variable name scanning.
    237 #
    238 #                    The current input character must be the $ that introduces the name.
    239 #                    The $ is consumed here rather than in the state that first detected it
    240 #                    so that the doStartVariableName action only needs to happen in one
    241 #                    place (here), and the other states don't need to worry about it.
    242 #
    243 scan-var-name:
    244    '$'                  n scan-var-start                            doStartVariableName  # consume the '$' that introduces the name
    245    default                errorDeath                                                     # not expected: within this file this state is only entered with '$' as the current char
    246 
    247 
    248 scan-var-start:
    249     name_start_char      n scan-var-body                                                    # consume the first character of the variable name
    250     default                errorDeath                               doVariableNameExpectedErr  # '$' not followed by a name
    251     
    252 scan-var-body:
    253     name_char            n scan-var-body                                              # consume the remaining characters of the name
    254     default                pop                                      doEndVariableName # name ends; return to the state pushed by the caller, char not consumed
    255     
    256     
    257     
    258 #
    259 #  scan-unicode-set   Unicode Sets are parsed by the UnicodeSet class.
    260 #                     Within the RBBI parser, after finding the first character
    261 #                     of a Unicode Set, we just hand the rule input at that
    262 #                     point off to the Unicode Set constructor, then pick
    263 #                     up parsing after the close of the set.
    264 #
    265 #                     The action for this state invokes the UnicodeSet parser.
    266 #
    267 scan-unicode-set:
    268     '['                   n pop                                      doScanUnicodeSet  # return to the pushed state once the action has run
    269     'p'                   n pop                                      doScanUnicodeSet  # NOTE(review): no state in this file branches here on 'p' or 'P';
    270     'P'                   n pop                                      doScanUnicodeSet  #   presumably intended for \p{...} / \P{...} sets -- confirm against the scanner
    271     default		    errorDeath                                                   # not expected: within this file this state is only entered with '[' as the current char
    272     
    273     
    274 
    275 
    276 
    277 
    278 
    279 #
    280 #  assign-or-rule.   A $variable was encountered at the start of something, could be
    281 #                    either an assignment statement or a rule, depending on whether an '='
    282 #                    follows the variable name.  We get to this state when the variable name
    283 #                    scanning does a return.
    284 #
    285 assign-or-rule:
    286     white_space          n assign-or-rule                                           # skip white space after the variable name
    287     '='                  n term                  ^assign-end        doStartAssign   # variable was target of assignment
    288     default                term-var-ref          ^break-rule-end                    # variable was a term in a rule
    289 
    290 
    291 
    292 #
    293 #  assign-end        This state is entered when the end of the expression on the
    294 #                    right hand side of an assignment is found.  We get here via
    295 #                    a pop; this state is pushed when the '=' in an assignment is found.
    296 #
    297 #                    The only thing allowed at this point is a ';'.  The RHS of an
    298 #                    assignment must look like a rule expression, and we come here
    299 #                    when what is being scanned no longer looks like an expression.
    300 #
    301 assign-end:
    302     ';'                  n start                                    doEndAssign            # ';' ends the assignment statement
    303     default                errorDeath                               doRuleErrorAssignExpr  # right hand side ended on something other than ';'
    304     
    305     
    306     
    307 #
    308 # errorDeath.   This state is specified as the next state whenever a syntax error
    309 #               in the source rules is detected.  Barring bugs, the state machine will never
    310 #               actually get here, but will stop because of the action associated with the error.
    311 #               But, just in case, this state asks the state machine to exit.
    312 errorDeath:
    313     default              n errorDeath                               doExit            # matches any input and stays in this state; the action asks the state machine to exit
    314 
    315 
    316