#!/usr/bin/perl
#  ********************************************************************
#  * COPYRIGHT:
#  * Copyright (c) 2002-2015, International Business Machines Corporation and
#  * others. All Rights Reserved.
#  ********************************************************************
#
#  regexcst.pl
#            Compile the regular expression parser state table data into initialized C data.
#            Usage:
#                   cd icu/source/i18n
#                   perl regexcst.pl < regexcst.txt > regexcst.h
#
#             The output file, regexcst.h, is included by some of the .cpp regex
#             implementation files.   This perl script is NOT run as part
#             of a normal ICU build.  It is run by hand when needed, and the
#             regexcst.h generated file is put back into cvs.
#
#             See regexcst.txt for a description of the input format for this script.
#
#             This script is derived from rbbicst.pl, which performs the same function
#             for the Rule Based Break Iterator Rule Parser.  Perhaps they could be
#             merged?
#
#  Diagnostics raised while parsing are written to STDERR: per the usage line
#  above, STDOUT is redirected into the generated header, so anything printed
#  there would end up inside regexcst.h instead of in front of the user.
#


$num_states = 1;         # Always the state number for the line being compiled.
$line_num   = 0;         # The line number in the input file.

$states{"pop"} = 255;    # Add the "pop" to the list of defined state names.
                         # This prevents any state from being labelled with "pop",
                         #  and resolves references to "pop" in the next state field.

line_loop: while (<>) {
    chomp();
    my @fields = split();
    $line_num++;

    # Remove # comments, which are any fields beginning with a #, plus all
    #  that follow on the line.
    for my $idx (0 .. $#fields) {
        if ($fields[$idx] =~ /^#/) {
            splice(@fields, $idx);     # drop this field and everything after it
            last;
        }
    }

    # Ignore blank lines, and those with no fields left after stripping comments.
    next line_loop if (@fields == 0);

    #
    # State Label:  handling.
    #    Does the first token end with a ":"?  If so, it's the name of a state.
    #    Put in a hash, together with the current state number,
    #        so that we can later look up the number from the name.
    #
    if ($fields[0] =~ /:$/) {
        my $state_name = $fields[0];
        $state_name =~ s/:$//;        # strip off the colon from the state name.

        # Every real state number is >= 1 and "pop" is pre-registered, so any
        # existing entry means the label was already used.
        if (exists $states{$state_name}) {
            print STDERR " regexcst: at line $line_num duplicate definition of state $state_name\n";
        }
        $states{$state_name}     = $num_states;
        $stateNames[$num_states] = $state_name;

        # If the label was the only thing on this line, go on to the next line,
        # otherwise assume that a state definition is on the same line and fall through.
        next line_loop if (@fields == 1);

        shift @fields;                # shift off label field in preparation
                                      #  for handling the rest of the line.
    }

    #
    # State Transition line.
    #   syntax is this,
    #       character   [n]  target-state  [^push-state]  [function-name]
    #   where
    #      [something]   is an optional something
    #      character     is either a single quoted character e.g. '['
    #                       or a name of a character class, e.g. white_space
    #

    $state_line_num[$num_states] = $line_num;   # remember line number with each state
                                                #  so we can make better error messages later.
    #
    # First field, character class or literal character for this transition.
    #
    if ($fields[0] =~ /^'(.)'$/) {
        # We've got a quoted literal character.  Capture the middle character
        # rather than deleting every quote, so that ''' (a quoted quote)
        # survives as a one-character literal.
        $state_literal_chars[$num_states] = $1;
    } else {
        # We've got the name of a character class.
        $state_char_class[$num_states] = $fields[0];
        if ($fields[0] =~ /\W/) {
            print STDERR " regexcst: at line $line_num, bad character literal or character class name.\n";
            print STDERR " scanning $fields[0]\n";
            exit(-1);
        }
    }
    shift @fields;

    #
    # do the 'n' flag
    #
    $state_flag[$num_states] = "FALSE";
    if (@fields > 0 && $fields[0] eq "n") {
        $state_flag[$num_states] = "TRUE";
        shift @fields;
    }

    #
    # do the destination state.
    #
    if (@fields == 0) {
        print STDERR " regexcst: at line $line_num, destination state missing.\n";
        exit(-1);
    }
    $state_dest_state[$num_states] = shift @fields;

    #
    # do the push state, if present.
    #
    if (@fields > 0 && $fields[0] =~ /^\^(.*)$/) {
        my $push_state = $1;
        if ($push_state eq "") {
            print STDERR " regexcst: at line $line_num, expected state after ^ (no spaces).\n";
            exit(-1);
        }
        $state_push_state[$num_states] = $push_state;
        shift @fields;
    }

    #
    # Lastly, do the optional action name.
    #
    if (@fields > 0) {
        $state_func_name[$num_states] = shift @fields;
    }

    #
    #  There should be no fields left on the line at this point.
    #  (Reported, but not fatal: the transition parsed so far is still recorded.)
    #
    if (@fields > 0) {
        print STDERR " regexcst: at line $line_num, unexpected extra stuff on input line.\n";
        print STDERR " scanning $fields[0]\n";
    }
    $num_states++;
}

#
# We've read in the whole file, now go back and output the
#   C source code for the state transition table.
#
# We read all states first, before writing anything,  so that the state numbers
# for the destination states are all available to be written.
#

#
# Make hashes for the names of the character classes and
#      for the names of the actions that appeared.
# Transitions with no explicit action get the default action "doNOP".
#
for ($state = 1; $state < $num_states; $state++) {
    my $class_name = $state_char_class[$state];
    if (defined($class_name) && $class_name ne "") {
        $charClasses{$class_name} = 1;
    }
    if (!defined($state_func_name[$state]) || $state_func_name[$state] eq "") {
        $state_func_name[$state] = "doNOP";
    }
    $actions{$state_func_name[$state]} = 1;
}

#
# Check that all of the destination states have been defined
#
#
$states{"exit"} = 0;              # Predefined state name, terminates state machine.
#
# Every destination state and push state named by a transition must resolve
# to a non-zero state number in %states.  "exit" is the one destination that
# is allowed to map to zero.  Problems are reported on STDERR (STDOUT is the
# generated header) and generation is abandoned before any output is written.
#
$errors = 0;
for ($state = 1; $state < $num_states; $state++) {
    my $dest_state = $state_dest_state[$state];
    my $push_state = $state_push_state[$state];

    if ($states{$dest_state} == 0 && $dest_state ne "exit") {
        print STDERR "Error at line $state_line_num[$state]: target state \"$dest_state\" is not defined.\n";
        $errors++;
    }
    if ($push_state ne "" && $states{$push_state} == 0) {
        print STDERR "Error at line $state_line_num[$state]: target state \"$push_state\" is not defined.\n";
        $errors++;
    }
}

die "regexcst.pl: $errors undefined state reference(s), no output generated.\n" if ($errors > 0);

#
# Banner and opening boilerplate of the generated header.
#
# NOTE(review): the include guard RBBIRPT_H looks like a leftover from
# rbbicst.pl (this script generates regexcst.h).  It is left unchanged here so
# that the generated file stays stable; confirm before renaming.
#
print "//---------------------------------------------------------------------------------\n";
print "//\n";
print "// Generated Header File. Do not edit by hand.\n";
print "// This file contains the state table for the ICU Regular Expression Pattern Parser\n";
print "// It is generated by the Perl script \"regexcst.pl\" from\n";
print "// the rule parser state definitions file \"regexcst.txt\".\n";
print "//\n";
print "// Copyright (C) 2002-2015 International Business Machines Corporation \n";
print "// and others. All rights reserved. \n";
print "//\n";
print "//---------------------------------------------------------------------------------\n";
print "#ifndef RBBIRPT_H\n";
print "#define RBBIRPT_H\n";
print "\n";
print "U_NAMESPACE_BEGIN\n";

#
# Emit the constants for indices of Unicode Sets
#   Define one constant for each of the character classes encountered.
#   At the same time, store the index corresponding to the set name back into hash.
#
print "//\n";
print "// Character classes for regex pattern scanning.\n";
print "//\n";
$i = 128;                   # State Table values for Unicode char sets range from 128-250.
                            # Sets "default", "quoted", etc. get special handling.
                            #  They have no corresponding UnicodeSet object in the state machine,
                            #    but are handled by special case code.  So we emit no reference
                            #    to a UnicodeSet object to them here.
#
# Assign a table value to every character class that appeared in the input.
# "default", "quoted" and "eof" get the fixed values 255, 254 and 253 and no
# constant is emitted for them; every other class gets the next value of $i
# and a kRuleSet_<name> constant.
#
# The names are visited in sorted order: plain hash order varies from run to
# run, which would renumber the constants each time the header is regenerated.
#
foreach $setName (sort keys %charClasses) {
    if ($setName eq "default") {
        $charClasses{$setName} = 255;
    }
    elsif ($setName eq "quoted") {
        $charClasses{$setName} = 254;
    }
    elsif ($setName eq "eof") {
        $charClasses{$setName} = 253;
    }
    else {
        # Normal character class.  Fill in array with a ptr to the corresponding UnicodeSet in the state machine.
        print " static const uint8_t kRuleSet_$setName = $i;\n";
        $charClasses{$setName} = $i;
        $i++;
    }
}
print "\n\n";

#
# Emit the enum for the actions to be performed.
# Sorted for the same reason as the character classes: a stable header.
#
print "enum Regex_PatternParseAction {\n";
foreach $act (sort keys %actions) {
    print " $act,\n";
}
print " rbbiLastAction};\n\n";

#
# Emit the struct definition for transition table elements.
#
print "//-------------------------------------------------------------------------------\n";
print "//\n";
print "// RegexTableEl represents the structure of a row in the transition table\n";
print "// for the pattern parser state machine.\n";
print "//-------------------------------------------------------------------------------\n";
print "struct RegexTableEl {\n";
print " Regex_PatternParseAction fAction;\n";
print " uint8_t fCharClass; // 0-127: an individual ASCII character\n";
print " // 128-255: character class index\n";
print " uint8_t fNextState; // 0-250: normal next-state numbers\n";
print " // 255: pop next-state from stack.\n";
print " uint8_t fPushState;\n";
print " UBool fNextChar;\n";
print "};\n\n";

#
# emit the state transition table
#
print "static const struct RegexTableEl gRuleParseStateTable[] = {\n";
print " {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
#
# Emit one table row per transition: action, character (literal code point
# or character class value), next state, push state and the next-char flag.
#
for ($state = 1; $state < $num_states; $state++) {
    print " , {$state_func_name[$state],";
    if ($state_literal_chars[$state] ne "") {
        my $c = $state_literal_chars[$state];
        # Use numeric value, so EBCDIC machines are ok.  The character is
        # passed as an argument rather than interpolated into the format, so
        # that a literal '%' is not mistaken for a conversion specifier.
        printf(" %d /* %s */,", ord($c), $c);
    } else {
        print " $charClasses{$state_char_class[$state]},";
    }
    print " $states{$state_dest_state[$state]},";

    # The push-state field is optional.  If omitted, fill field with a zero, which flags
    #   the state machine that there is no push state.
    if ($state_push_state[$state] eq "") {
        print "0, ";
    } else {
        print " $states{$state_push_state[$state]},";
    }
    print " $state_flag[$state]} ";

    # Put out a C++ comment showing the number (index) of this state row,
    #   and, if this is the first row of the table for this state, the state name.
    print " // $state ";
    if ($stateNames[$state] ne "") {
        print " $stateNames[$state]";
    }
    print "\n";
}
print " };\n";


#
# emit a mapping array from state numbers to state names.
#
#    This array is used for producing debugging output from the pattern parser.
#    Rows that do not start a named state get a 0 entry.
#
print "static const char * const RegexStateNames[] = {";
for ($state = 0; $state < $num_states; $state++) {
    if ($stateNames[$state] ne "") {
        print " \"$stateNames[$state]\",\n";
    } else {
        print " 0,\n";
    }
}
print " 0};\n\n";

print "U_NAMESPACE_END\n";
print "#endif\n";