#**************************************************************************
#   Copyright (C) 2016 and later: Unicode, Inc. and others.
#   License & terms of use: http://www.unicode.org/copyright.html#License
#**************************************************************************
#**************************************************************************
#   Copyright (C) 2002-2016 International Business Machines Corporation
#   and others. All rights reserved.
#**************************************************************************
#
#  rbbicst   Compile the RBBI rule parser state table data into initialized C data.
#            Usage:
#                   cd icu/source/common
#                   perl rbbicst.pl    < rbbirpt.txt > rbbirpt.h
#                   perl rbbicst.pl -j < rbbirpt.txt > RBBIRuleParseTable.java
#
#             The output file, rbbirpt.h, is included by some of the .cpp rbbi
#             implementation files.   This perl script is NOT run as part
#             of a normal ICU build.  It is run by hand when needed, and the
#             rbbirpt.h generated file is put back into cvs.
#
#             See rbbirpt.txt for a description of the input format for this script.
#
#  Processing happens in three phases:
#    1. Read every input line, recording one table row per state transition
#       and remembering the row number at which each state label appears.
#    2. Collect the character class and action names that were used, verify
#       that every referenced state exists, and number the character classes.
#    3. Print the table, either as a Java class (-j) or as a C++ header.
#
#  The generated file goes to STDOUT; all diagnostics go to STDERR so that
#  they can never end up inside the generated file.
#

use strict;
use warnings;

# -j selects Java output; the default is the C++ header.
my $javaOutput = 0;
if (@ARGV && $ARGV[0] eq "-j") {
    $javaOutput = 1;
    shift @ARGV;
}


my $num_states = 1;          # Always the state number for the line being compiled.
my $line_num   = 0;          # The line number in the input file.

my %states;                  # state name -> row number of its first transition
my @stateNames;              # row number -> state name (only for a state's first row)
my @state_line_num;          # row number -> input line number, for error messages
my @state_literal_chars;     # row number -> literal character to match, if any
my @state_char_class;        # row number -> character class name to match, if any
my @state_flag;              # row number -> "advance to next char" flag, as output text
my @state_dest_state;        # row number -> name of the next state
my @state_push_state;        # row number -> name of the state to push, if any
my @state_func_name;         # row number -> name of the action to perform

$states{"pop"} = 255;        # Add the "pop" to the list of defined state names.
                             # This prevents any state from being labelled with "pop",
                             #  and resolves references to "pop" in the next state field.

LINE: while (my $line = <>) {
    chomp($line);
    my @fields = split(' ', $line);
    $line_num++;

    # Remove # comments, which are any fields beginning with a #, plus all
    #  that follow on the line.
    for my $i (0 .. $#fields) {
        if ($fields[$i] =~ /^#/) {
            splice(@fields, $i);
            last;
        }
    }
    # ignore blank lines, and those with no fields left after stripping comments.
    next LINE if @fields == 0;

    #
    # State Label:  handling.
    #    Does the first token end with a ":"?  If so, it's the name of a state.
    #    Put in a hash, together with the current state number,
    #        so that we can later look up the number from the name.
    #
    if ($fields[0] =~ /:$/) {
        my $state_name = $fields[0];
        $state_name =~ s/:$//;        # strip off the trailing colon from the state name.

        if ($states{$state_name}) {
            print STDERR " rbbicst: at line $line_num duplicate definition of state $state_name\n";
        }
        $states{$state_name}     = $num_states;
        $stateNames[$num_states] = $state_name;

        # if the label was the only thing on this line, go on to the next line,
        # otherwise assume that a state definition is on the same line and fall through.
        next LINE if @fields == 1;
        shift @fields;                # shift off label field in preparation
                                      #  for handling the rest of the line.
    }

    #
    # State Transition line.
    #   syntax is this,
    #       character   [n]  target-state  [^push-state]  [function-name]
    #   where
    #      [something]   is an optional something
    #      character     is either a single quoted character e.g.  '['
    #                       or a name of a character class, e.g. white_space
    #

    $state_line_num[$num_states] = $line_num;   # remember line number with each state
                                                #  so we can make better error messages later.
    #
    # First field, character class or literal character for this transition.
    #
    if ($fields[0] =~ /^'(.)'$/) {
        # We've got a quoted literal character.  Capturing the middle character
        # keeps a quoted apostrophe (''') intact.
        $state_literal_chars[$num_states] = $1;
    } else {
        # We've got the name of a character class.
        $state_char_class[$num_states] = $fields[0];
        if ($fields[0] =~ /\W/) {
            print STDERR " rbbicsts: at line $line_num, bad character literal or character class name.\n";
            print STDERR " scanning $fields[0]\n";
            exit(-1);
        }
    }
    shift @fields;

    #
    # do the 'n' flag
    #
    $state_flag[$num_states] = $javaOutput ? "false" : "FALSE";
    if (@fields && $fields[0] eq "n") {
        $state_flag[$num_states] = $javaOutput ? "true" : "TRUE";
        shift @fields;
    }

    #
    # do the destination state.
    #
    if (@fields == 0) {
        print STDERR " rbbicsts: at line $line_num, destination state missing.\n";
        exit(-1);
    }
    $state_dest_state[$num_states] = shift @fields;

    #
    # do the push state, if present.
    #
    if (@fields && $fields[0] =~ /^\^/) {
        my $push_state = shift @fields;
        $push_state =~ s/^\^//;
        if ($push_state eq "") {
            print STDERR " rbbicsts: at line $line_num, expected state after ^ (no spaces).\n";
            exit(-1);
        }
        $state_push_state[$num_states] = $push_state;
    }

    #
    # Lastly, do the optional action name.
    #
    if (@fields) {
        $state_func_name[$num_states] = shift @fields;
    }

    #
    #  There should be no fields left on the line at this point.
    #
    if (@fields > 0) {
        print STDERR " rbbicsts: at line $line_num, unexpected extra stuff on input line.\n";
        print STDERR " scanning $fields[0]\n";
    }
    $num_states++;
}

#
# We've read in the whole file, now go back and output the
#   C source code for the state transition table.
#
# We read all states first, before writing anything, so that the state numbers
# for the destination states are all available to be written.
#

#
# Make hashes for the names of the character classes and
#      for the names of the actions that appeared.
#
my %charClasses;
my %actions;
for my $state (1 .. $num_states - 1) {
    if (defined $state_char_class[$state]) {
        $charClasses{$state_char_class[$state]} = 1;
    }
    # Rows without an explicit action get the do-nothing action.
    if (!defined $state_func_name[$state]) {
        $state_func_name[$state] = "doNOP";
    }
    $actions{$state_func_name[$state]} = 1;
}

#
# Check that all of the destination states have been defined
#
$states{"exit"} = 0;              # Predefined state name, terminates state machine.
my $errors = 0;
for my $state (1 .. $num_states - 1) {
    my $dest_state = $state_dest_state[$state];
    if (!$states{$dest_state} && $dest_state ne "exit") {
        print STDERR "Error at line $state_line_num[$state]: target state \"$dest_state\" is not defined.\n";
        $errors++;
    }
    my $push_state = $state_push_state[$state];
    if (defined $push_state && !$states{$push_state}) {
        print STDERR "Error at line $state_line_num[$state]: target state \"$push_state\" is not defined.\n";
        $errors++;
    }
}

die "rbbicst: $errors undefined state reference(s); no output generated.\n" if $errors > 0;

#
# Assign numbers to each of the character classes used.
#   Sets are numbered from 128 - 250
#   The values 0-127 in the state table are used for matching
#     individual ASCII characters (the only thing that can appear in the rules.)
#   The "set" names appearing in the code below (default, etc.)  need special
#     handling because they do not correspond to a normal set of characters,
#     but trigger special handling by code in the state machine.
#
my %special_class_number = (
    default  => 255,
    escaped  => 254,
    escapedP => 253,
    eof      => 252,
);
my $next_class_number = 128;
foreach my $setName (sort keys %charClasses) {
    if (exists $special_class_number{$setName}) {
        $charClasses{$setName} = $special_class_number{$setName};
    } else {
        # Normal (single) character class.  Number them.
        $charClasses{$setName} = $next_class_number;
        $next_class_number++;
    }
}


# Current year, used in the generated copyright notices.
my $year = (localtime)[5] + 1900;

if ($javaOutput) {
    print "/*\n";
    print " *******************************************************************************\n";
    print " * Copyright (C) 2003-$year,\n";
    print " * International Business Machines Corporation and others. All Rights Reserved.\n";
    print " *******************************************************************************\n";
    print " */\n";
    print " \n";
    print "package com.ibm.icu.text;\n";
    print " \n";
    print "/**\n";
    print " * Generated Java File. Do not edit by hand.\n";
    print " * This file contains the state table for the ICU Rule Based Break Iterator\n";
    print " * rule parser.\n";
    print " * It is generated by the Perl script \"rbbicst.pl\" from\n";
    print " * the rule parser state definitions file \"rbbirpt.txt\".\n";
    print " * \@internal \n";
    print " *\n";
    print " */\n";

    print "class RBBIRuleParseTable\n";
    print "{\n";

    #
    # Emit the constants for the actions to be performed.
    #
    my $action_number = 1;
    foreach my $act (sort keys %actions) {
        print " static final short $act = $action_number;\n";
        $action_number++;
    }
    print " \n";

    #
    # Emit constants for char class names
    #
    foreach my $setName (sort keys %charClasses) {
        print " static final short kRuleSet_$setName = $charClasses{$setName};\n";
    }
    print "\n\n";


    print " static class RBBIRuleTableElement { \n";
    print " short fAction; \n";
    print " short fCharClass; \n";
    print " short fNextState; \n";
    print " short fPushState; \n";
    print " boolean fNextChar; \n";
    print " String fStateName; \n";
    print " RBBIRuleTableElement(short a, int cc, int ns, int ps, boolean nc, String sn) { \n";
    print " fAction = a; \n";
    print " fCharClass = (short)cc; \n";
    print " fNextState = (short)ns; \n";
    print " fPushState = (short)ps; \n";
    print " fNextChar = nc; \n";
    print " fStateName = sn; \n";
    print " } \n";
    print " }; \n";
    print " \n";


    print " static RBBIRuleTableElement[] gRuleParseStateTable = { \n ";
    print " new RBBIRuleTableElement(doNOP, 0, 0,0, true, null ) // 0 \n";   # output the unused state 0.
    for my $state (1 .. $num_states - 1) {
        print " , new RBBIRuleTableElement($state_func_name[$state],";
        if (defined $state_literal_chars[$state]) {
            my $c = $state_literal_chars[$state];
            $c =~ s/([\\'])/\\$1/;    # a quote or backslash must be escaped in a Java char literal.
            print("'$c', ");
        } else {
            print " $charClasses{$state_char_class[$state]},";
        }
        print " $states{$state_dest_state[$state]},";

        # The push-state field is optional.  If omitted, fill field with a zero, which flags
        #   the state machine that there is no push state.
        if (!defined $state_push_state[$state]) {
            print "0, ";
        } else {
            print " $states{$state_push_state[$state]},";
        }
        print " $state_flag[$state], ";

        # if this is the first row of the table for this state, put out the state name.
        if (defined $stateNames[$state]) {
            print " \"$stateNames[$state]\") ";
        } else {
            print " null ) ";
        }

        # Put out a comment showing the number (index) of this state row,
        print " // $state ";
        print "\n";
    }
    print " };\n";

    print "}; \n";

}
else
{
    #
    #  C++ Output ...
    #


    print "//---------------------------------------------------------------------------------\n";
    print "//\n";
    print "// Generated Header File. Do not edit by hand.\n";
    print "// This file contains the state table for the ICU Rule Based Break Iterator\n";
    print "// rule parser.\n";
    print "// It is generated by the Perl script \"rbbicst.pl\" from\n";
    print "// the rule parser state definitions file \"rbbirpt.txt\".\n";
    print "//\n";
    print "// Copyright (C) 2002-$year International Business Machines Corporation \n";
    print "// and others. All rights reserved. \n";
    print "//\n";
    print "//---------------------------------------------------------------------------------\n";
    print "#ifndef RBBIRPT_H\n";
    print "#define RBBIRPT_H\n";
    print "\n";
    print "#include \"unicode/utypes.h\"\n";
    print "\n";
    print "U_NAMESPACE_BEGIN\n";

    #
    # Emit the constants for indices of Unicode Sets
    #   Define one constant for each of the character classes encountered.
    #   The specially handled classes (numbered 250 and up) get no constant.
    #
    print "//\n";
    print "// Character classes for RBBI rule scanning.\n";
    print "//\n";
    foreach my $setName (sort keys %charClasses) {
        if ($charClasses{$setName} < 250) {
            # Normal character class.
            print " static const uint8_t kRuleSet_$setName = $charClasses{$setName};\n";
        }
    }
    print "\n\n";

    #
    # Emit the enum for the actions to be performed.
    #
    print "enum RBBI_RuleParseAction {\n";
    foreach my $act (sort keys %actions) {
        print " $act,\n";
    }
    print " rbbiLastAction};\n\n";

    #
    # Emit the struct definition for transition table elements.
    #
    print "//-------------------------------------------------------------------------------\n";
    print "//\n";
    print "// RBBIRuleTableEl represents the structure of a row in the transition table\n";
    print "// for the rule parser state machine.\n";
    print "//-------------------------------------------------------------------------------\n";
    print "struct RBBIRuleTableEl {\n";
    print " RBBI_RuleParseAction fAction;\n";
    print " uint8_t fCharClass; // 0-127: an individual ASCII character\n";
    print " // 128-255: character class index\n";
    print " uint8_t fNextState; // 0-250: normal next-stat numbers\n";
    print " // 255: pop next-state from stack.\n";
    print " uint8_t fPushState;\n";
    print " UBool fNextChar;\n";
    print "};\n\n";

    #
    # emit the state transition table
    #
    print "static const struct RBBIRuleTableEl gRuleParseStateTable[] = {\n";
    print " {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
    for my $state (1 .. $num_states - 1) {
        print " , {$state_func_name[$state],";
        if (defined $state_literal_chars[$state]) {
            my $c = $state_literal_chars[$state];
            # use numeric value, so EBCDIC machines are ok.  The character is passed
            # as an argument, never spliced into the format, so '%' is safe.
            printf(" %d /* %s */,", ord($c), $c);
        } else {
            print " $charClasses{$state_char_class[$state]},";
        }
        print " $states{$state_dest_state[$state]},";

        # The push-state field is optional.  If omitted, fill field with a zero, which flags
        #   the state machine that there is no push state.
        if (!defined $state_push_state[$state]) {
            print "0, ";
        } else {
            print " $states{$state_push_state[$state]},";
        }
        print " $state_flag[$state]} ";

        # Put out a C++ comment showing the number (index) of this state row,
        #   and, if this is the first row of the table for this state, the state name.
        print " // $state ";
        if (defined $stateNames[$state]) {
            print " $stateNames[$state]";
        }
        print "\n";
    }
    print " };\n";


    #
    # emit a mapping array from state numbers to state names.
    #
    #    This array is used for producing debugging output from the rule parser.
    #
    print "#ifdef RBBI_DEBUG\n";
    print "static const char * const RBBIRuleStateNames[] = {";
    for my $state (0 .. $num_states - 1) {
        if (defined $stateNames[$state]) {
            print " \"$stateNames[$state]\",\n";
        } else {
            print " 0,\n";
        }
    }
    print " 0};\n";
    print "#endif\n\n";

    print "U_NAMESPACE_END\n";
    print "#endif\n";
}