Home | History | Annotate | Download | only in common
      1 #**************************************************************************
      2 #   Copyright (C) 2002-2005 International Business Machines Corporation   *
      3 #   and others. All rights reserved.                                      *
      4 #**************************************************************************
      5 #
      6 #  rbbicst   Compile the RBBI rule parser state table data into initialized C data.
      7 #            Usage:
      8 #                   cd icu/source/common
      9 #                   perl rbbicst.pl    < rbbirpt.txt > rbbirpt.h
     10 #                   perl rbbicst.pl -j < rbbirpt.txt > RBBIRuleParseTable.java
     11 #
     12 #             The output file, rbbirpt.h, is included by some of the .cpp rbbi
     13 #             implementation files.   This perl script is NOT run as part
     14 #             of a normal ICU build.  It is run by hand when needed, and the
     15 #             rbbirpt.h generated file is put back into cvs.
     16 #
     17 #             See rbbirpt.txt for a description of the input format for this script.
     18 #
     19 
     20 if (@ARGV > 0 && $ARGV[0] eq "-j") {     # an optional leading -j selects Java output
     21     shift @ARGV;                          # drop the flag so <> does not take it as a file name
     22     $javaOutput = 1;
     23 }
     24 
     25 # Bookkeeping shared by the input loop and by the output code further down.
     26 $line_num   = 0;     # Number of the input line most recently read.
     27 $num_states = 1;     # Number of the table row being compiled; row 0 is never used.
     28 
     29 %states = ("pop" => 255);   # State name -> row number.  Seeded with the reserved
     30                             #  name "pop", so that "pop" in a next-state field
     31                             #  resolves to 255.
     32 
     33 line_loop: while (<>) {      # Pass 1: every transition line read becomes one table row.
     34     chomp();
     35     $line = $_;              # raw text of the line; not referenced again in this loop.
     36     @fields = split();
     37     $line_num++;
     38 
     39     # Remove # comments, which are any fields beginning with a #, plus all
     40     #  that follow on the line.
     41     for ($i=0; $i<@fields; $i++) {
     42         if ($fields[$i] =~ /^#/) {
     43             @fields = @fields[0 .. $i-1];
     44             last;
     45         }
     46     }
     47     # ignore blank lines, and those with no fields left after stripping comments..
     48     if (@fields == 0) {
     49         next;
     50     }
     51 
     52     #
     53     # State Label:  handling.
     54     #    Does the first token end with a ":"?  If so, it's the name of a state.
     55     #    Record name -> current row number in %states (used later to resolve
     56     #    next-state and push-state names) and row number -> name in @stateNames.
     57     #    A name that already has a number is reported, then overwritten anyway.
     58     if ($fields[0] =~ /.*:$/) {
     59         $state_name = $fields[0];
     60         $state_name =~ s/:$//;       # strip off the trailing colon from the state name.
     61 
     62         if ($states{$state_name} != 0) {
     63             print "  rbbicst: at line $line_num duplicate definition of state $state_name\n";
     64         }
     65         $states{$state_name} = $num_states;
     66         $stateNames[$num_states] = $state_name;
     67 
     68         # if the label was the only thing on this line, go on to the next line,
     69         # otherwise assume that a state definition is on the same line and fall through.
     70         if (@fields == 1) {
     71             next line_loop;
     72         }
     73         shift @fields;                       # shift off label field in preparation
     74                                              #  for handling the rest of the line.
     75     }
     76 
     77     #
     78     # State Transition line.
     79     #   syntax is this,
     80     #       character   [n]  target-state  [^push-state]  [function-name]
     81     #   where
     82     #      [something]   is an optional something
     83     #      character     is either a single quoted character e.g. '['
     84     #                       or a name of a character class, e.g. white_space
     85     #
     86 
     87     $state_line_num[$num_states] = $line_num;   # remember line number with each state
     88                                                 #  so we can make better error messages later.
     89     #
     90     # First field, character class or literal character for this transition.
     91     #
     92     if ($fields[0] =~ /^'.'$/) {
     93         # We've got a quoted literal character.
     94         $state_literal_chars[$num_states] = $fields[0];
     95         $state_literal_chars[$num_states] =~ s/'//g;
     96     } else {
     97         # We've got the name of a character class.
     98         $state_char_class[$num_states] = $fields[0];
     99         if ($fields[0] =~ /[\W]/) {
    100             print "  rbbicsts:  at line $line_num, bad character literal or character class name.\n";
    101             print "     scanning $fields[0]\n";
    102             exit(-1);
    103         }
    104     }
    105     shift @fields;
    106 
    107     #
    108     # do the 'n' flag
    109     #
    110     $state_flag[$num_states] = $javaOutput? "false" : "FALSE";
    111     if ($fields[0] eq "n") {
    112         $state_flag[$num_states] = $javaOutput? "true": "TRUE";
    113         shift @fields;
    114     }
    115 
    116     #
    117     # do the destination state.
    118     #
    119     $state_dest_state[$num_states] = $fields[0];
    120     if ($fields[0] eq "") {
    121         print "  rbbicsts:  at line $line_num, destination state missing.\n";
    122         exit(-1);
    123     }
    124     shift @fields;
    125 
    126     #
    127     # do the push state, if present.
    128     #
    129     if ($fields[0] =~ /^\^/) {
    130         $fields[0] =~ s/^\^//;
    131         $state_push_state[$num_states] = $fields[0];
    132         if ($fields[0] eq "" ) {
    133             print "  rbbicsts:  at line $line_num, expected state after ^ (no spaces).\n";
    134             exit(-1);
    135         }
    136         shift @fields;
    137     }
    138 
    139     #
    140     # Lastly, do the optional action name.
    141     #
    142     if ($fields[0] ne "") {
    143         $state_func_name[$num_states] = $fields[0];
    144         shift @fields;
    145     }
    146 
    147     #
    148     #  There should be no fields left on the line at this point.
    149     #
    150     if (@fields > 0) {
    151        print "  rbbicsts:  at line $line_num, unexpected extra stuff on input line.\n";
    152        print "     scanning $fields[0]\n";
    153    }
    154    $num_states++;
    155 }
    156 
    157 #
    158 # We've read in the whole file, now go back and output the
    159 #   C source code for the state transition table.
    160 #
    161 # We read all states first, before writing anything,  so that the state numbers
    162 # for the destination states are all available to be written.
    163 #
    164 
    165 #
    166 # Collect the set of character class names and the set of action (function)
    167 #      names used by any row.  A row with no action name is given "doNOP".
    168 #
    169 for ($state=1; $state < $num_states; $state++) {
    170     if ($state_char_class[$state] ne "") {
    171         if ($charClasses{$state_char_class[$state]} == 0) {
    172             $charClasses{$state_char_class[$state]} = 1;
    173         }
    174     }
    175     if ($state_func_name[$state] eq "") {
    176         $state_func_name[$state] = "doNOP";
    177     }
    178     if ($actions{$state_func_name[$state]} == 0) {      # key must be the row's action name
    179         $actions{$state_func_name[$state]} = 1;
    180     }
    181 }
    182 
    183 #
    184 # Check that every destination state and every push state named by a row
    185 #   has been defined.  All problems are reported before giving up, each
    186 #   with the input line number remembered for that row.
    187 $states{"exit"} = 0;              # Predefined state name, terminates state machine.
    188 for ($state=1; $state<$num_states; $state++) {
    189    if ($states{$state_dest_state[$state]} == 0 && $state_dest_state[$state] ne "exit") {
    190        print "Error at line $state_line_num[$state]: target state \"$state_dest_state[$state]\" is not defined.\n";
    191        $errors++;
    192    }
    193    if ($state_push_state[$state] ne "" && $states{$state_push_state[$state]} == 0) {
    194        print "Error at line $state_line_num[$state]: push state \"$state_push_state[$state]\" is not defined.\n";
    195        $errors++;
    196    }
    197 }
    198 
    199 die "rbbicst: $errors undefined state name(s), stopping.\n" if ($errors>0);
    200 
    201 #
    202 # Give every character class that was used a number.
    203 #   Ordinary classes are numbered upward from 128, in sorted name order.
    204 #   The values 0-127 in the state table are used for matching
    205 #     individual ASCII characters (the only thing that can appear in the rules.)
    206 #   The names default, escaped, escapedP and eof get the fixed values below
    207 #     because they do not correspond to a normal set of characters,
    208 #     but trigger special handling by code in the state machine.
    209 #
    210 # Names with a fixed, reserved number.
    211 my %reservedSetNumber = (
    212     "default"  => 255,
    213     "escaped"  => 254,
    214     "escapedP" => 253,
    215     "eof"      => 252,
    216 );
    217 
    218 $i = 128;                                # next number for an ordinary class
    219 foreach $setName (sort keys %charClasses) {
    220     if (exists $reservedSetNumber{$setName}) {
    221         $charClasses{$setName} = $reservedSetNumber{$setName};
    222     } else {
    223         $charClasses{$setName} = $i++;   # Normal (single) character class.
    224     }
    225 }
    226 
    227 
    228 my $year = (localtime)[5];      # element 5 of localtime's list is years since 1900
    229 $year   += 1900;                # four-digit year for the generated copyright lines
    230 
    231 if ($javaOutput) {      # Java output: the whole generated class is printed to stdout.
    232     print "/*\n";
    233     print " *******************************************************************************\n";
    234     print " * Copyright (C) 2003-$year,\n";
    235     print " * International Business Machines Corporation and others. All Rights Reserved.\n";
    236     print " *******************************************************************************\n";
    237     print " */\n";
    238     print " \n";
    239     print "package com.ibm.icu.text;\n";
    240     print " \n";
    241     print "/**\n";
    242     print " * Generated Java File.  Do not edit by hand.\n";
    243     print " * This file contains the state table for the ICU Rule Based Break Iterator\n";
    244     print " * rule parser.\n";
    245     print " * It is generated by the Perl script \"rbbicst.pl\" from\n";
    246     print " * the rule parser state definitions file \"rbbirpt.txt\".\n";
    247     print " * \@internal \n";
    248     print " *\n";
    249     print " */\n";
    250 
    251     print "class RBBIRuleParseTable\n";
    252     print "{\n";
    253 
    254     #
    255     # Emit the constants for the actions to be performed.
    256     #   Actions are numbered from 1, in sorted name order.
    257     $n = 1;
    258     foreach $act (sort keys %actions) {
    259         print "     static final short $act = $n;\n";
    260         $n++;
    261     }
    262     print " \n";
    263     
    264     #
    265     # Emit constants for char class names
    266     #   (every class in %charClasses, including the reserved values 252-255).
    267     foreach $setName (sort keys %charClasses) {
    268        print "     static final short kRuleSet_$setName = $charClasses{$setName};\n";
    269     }
    270     print "\n\n";
    271     
    272     
    273     print "   static class RBBIRuleTableElement { \n";
    274     print "      short      fAction; \n";
    275     print "      short      fCharClass; \n";
    276     print "      short      fNextState; \n";
    277     print "      short      fPushState; \n";
    278     print "      boolean    fNextChar;  \n";
    279     print "      String     fStateName; \n";
    280     print "      RBBIRuleTableElement(short a, int cc, int ns, int ps, boolean nc, String sn) {  \n";
    281     print "      fAction = a; \n";
    282     print "      fCharClass = (short)cc; \n";
    283     print "      fNextState = (short)ns; \n";
    284     print "      fPushState = (short)ps; \n";
    285     print "      fNextChar  = nc; \n";
    286     print "      fStateName = sn; \n";
    287     print "   } \n";
    288     print "   }; \n";
    289     print "  \n";
    290     
    291     
    292     print "    static RBBIRuleTableElement[] gRuleParseStateTable = { \n ";
    293     print "      new RBBIRuleTableElement(doNOP, 0, 0,0,  true,   null )     //  0 \n";  # placeholder for the unused state 0.
    294     for ($state=1; $state < $num_states; $state++) {
    295         print "     , new RBBIRuleTableElement($state_func_name[$state],";
    296         if ($state_literal_chars[$state] ne "") {
    297             $c = $state_literal_chars[$state];
    298             print("'$c', ");     # NOTE(review): $c is emitted unescaped; a backslash literal would not form a valid Java char - confirm input never has one.
    299         }else {
    300             print " $charClasses{$state_char_class[$state]},";
    301         }
    302         print " $states{$state_dest_state[$state]},";
    303  
    304         # The push-state field is optional.  If omitted, fill field with a zero, which flags
    305         #   the state machine that there is no push state.
    306         if ($state_push_state[$state] eq "") {
    307             print "0, ";
    308         } else {
    309             print " $states{$state_push_state[$state]},";
    310         }
    311         print " $state_flag[$state], ";
    312  
    313         # if this is the first row of the table for this state, put out the state name.
    314         if ($stateNames[$state] ne "") {
    315             print "  \"$stateNames[$state]\") ";
    316         } else {
    317             print "  null ) ";
    318         }
    319             
    320         # Put out a comment showing the number (index) of this state row,
    321         print "    //  $state ";
    322         print "\n";
    323     }
    324     print " };\n";
    325 
    326     print "}; \n";
    327     
    328 }
    329 else
    330 {
    331     #
    332     #  C++ Output:  the generated header (rbbirpt.h) is printed to stdout.
    333     #
    334 
    335 
    336     print "//---------------------------------------------------------------------------------\n";
    337     print "//\n";
    338     print "// Generated Header File.  Do not edit by hand.\n";
    339     print "//    This file contains the state table for the ICU Rule Based Break Iterator\n";
    340     print "//    rule parser.\n";
    341     print "//    It is generated by the Perl script \"rbbicst.pl\" from\n";
    342     print "//    the rule parser state definitions file \"rbbirpt.txt\".\n";
    343     print "//\n";
    344     print "//   Copyright (C) 2002-$year International Business Machines Corporation \n";
    345     print "//   and others. All rights reserved.  \n";
    346     print "//\n";
    347     print "//---------------------------------------------------------------------------------\n";
    348     print "#ifndef RBBIRPT_H\n";
    349     print "#define RBBIRPT_H\n";
    350     print "\n";
    351     print "U_NAMESPACE_BEGIN\n";
    352 
    353     #
    354     # Emit the constants for indices of Unicode Sets
    355     #   Define one constant for each of the character classes encountered.
    356     #   Only classes numbered below 250 get a constant; the reserved values do not.
    357     #
    358     print "//\n";
    359     print "// Character classes for RBBI rule scanning.\n";
    360     print "//\n";
    361     foreach $setName (sort keys %charClasses) {
    362         if ($charClasses{$setName} < 250) {
    363            # Normal character class.
    364            print "    static const uint8_t kRuleSet_$setName = $charClasses{$setName};\n";
    365         }
    366     }
    367     print "\n\n";
    368 
    369     #
    370     # Emit the enum for the actions to be performed.
    371     #
    372     print "enum RBBI_RuleParseAction {\n";
    373     foreach $act (sort keys %actions) {
    374         print "    $act,\n";
    375     }
    376     print "    rbbiLastAction};\n\n";
    377 
    378     #
    379     # Emit the struct definition for transition table elements.
    380     #
    381     print "//-------------------------------------------------------------------------------\n";
    382     print "//\n";
    383     print "//  RBBIRuleTableEl    represents the structure of a row in the transition table\n";
    384     print "//                     for the rule parser state machine.\n";
    385     print "//-------------------------------------------------------------------------------\n";
    386     print "struct RBBIRuleTableEl {\n";
    387     print "    RBBI_RuleParseAction          fAction;\n";
    388     print "    uint8_t                       fCharClass;       // 0-127:    an individual ASCII character\n";
    389     print "                                                    // 128-255:  character class index\n";
    390     print "    uint8_t                       fNextState;       // 0-250:    normal next-stat numbers\n";
    391     print "                                                    // 255:      pop next-state from stack.\n";
    392     print "    uint8_t                       fPushState;\n";
    393     print "    UBool                         fNextChar;\n";
    394     print "};\n\n";
    395 
    396     #
    397     # emit the state transition table
    398     #
    399     print "static const struct RBBIRuleTableEl gRuleParseStateTable[] = {\n";
    400     print "    {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
    401     for ($state=1; $state < $num_states; $state++) {
    402         print "    , {$state_func_name[$state],";
    403         if ($state_literal_chars[$state] ne "") {
    404             $c = $state_literal_chars[$state];
    405             printf(" %d /* %s */,", ord($c), $c);   #  numeric value, so EBCDIC machines are ok; $c is an argument, so a '%' literal is safe.
    406         }else {
    407             print " $charClasses{$state_char_class[$state]},";
    408         }
    409         print " $states{$state_dest_state[$state]},";
    410 
    411         # The push-state field is optional.  If omitted, fill field with a zero, which flags
    412         #   the state machine that there is no push state.
    413         if ($state_push_state[$state] eq "") {
    414             print "0, ";
    415         } else {
    416             print " $states{$state_push_state[$state]},";
    417         }
    418         print " $state_flag[$state]} ";
    419 
    420         # Put out a C++ comment showing the number (index) of this state row,
    421         #   and, if this is the first row of the table for this state, the state name.
    422         print "    //  $state ";
    423         if ($stateNames[$state] ne "") {
    424             print "     $stateNames[$state]";
    425         }
    426         print "\n";
    427     };
    428     print " };\n";
    429 
    430 
    431     #
    432     # emit a mapping array from state numbers to state names.
    433     #   Rows that do not start a named state get a 0 entry.
    434     #    This array is used for producing debugging output from the rule parser.
    435     #
    436     print "#ifdef RBBI_DEBUG\n";
    437     print "static const char * const RBBIRuleStateNames[] = {";
    438     for ($state=0; $state<$num_states; $state++) {
    439         if ($stateNames[$state] ne "") {
    440             print "     \"$stateNames[$state]\",\n";
    441         } else {
    442             print "    0,\n";
    443         }
    444     }
    445     print "    0};\n";
    446     print "#endif\n\n";
    447 
    448     print "U_NAMESPACE_END\n";
    449     print "#endif\n";
    450 }
    451 
    452 
    453 
    454