1 #! /usr/bin/perl -w 2 # -*- Perl -*- 3 # 4 # afblue.pl 5 # 6 # Process a blue zone character data file. 7 # 8 # Copyright 2013-2015 by 9 # David Turner, Robert Wilhelm, and Werner Lemberg. 10 # 11 # This file is part of the FreeType project, and may only be used, 12 # modified, and distributed under the terms of the FreeType project 13 # license, LICENSE.TXT. By continuing to use, modify, or distribute 14 # this file you indicate that you have read the license and 15 # understand and accept it fully. 16 17 use strict; 18 use warnings; 19 use English '-no_match_vars'; 20 use open ':std', ':encoding(UTF-8)'; 21 22 23 my $prog = $PROGRAM_NAME; 24 $prog =~ s| .* / ||x; # Remove path. 25 26 die "usage: $prog datafile < infile > outfile\n" if $#ARGV != 0; 27 28 29 my $datafile = $ARGV[0]; 30 31 my %diversions; # The extracted and massaged data from `datafile'. 32 my @else_stack; # Booleans to track else-clauses. 33 my @name_stack; # Stack of integers used for names of aux. variables. 34 35 my $curr_enum; # Name of the current enumeration. 36 my $curr_array; # Name of the current array. 37 my $curr_max; # Name of the current maximum value. 38 39 my $curr_enum_element; # Name of the current enumeration element. 40 my $curr_offset; # The offset relative to current aux. variable. 41 my $curr_elem_size; # The size of the current string or block. 42 43 my $have_sections = 0; # Boolean; set if start of a section has been seen. 44 my $have_strings; # Boolean; set if current section contains strings. 45 my $have_blocks; # Boolean; set if current section contains blocks. 46 47 my $have_enum_element; # Boolean; set if we have an enumeration element. 48 my $in_string; # Boolean; set if a string has been parsed. 49 50 my $num_sections = 0; # Number of sections seen so far. 51 52 my $last_aux; # Name of last auxiliary variable. 53 54 55 # Regular expressions. 56 57 # [<ws>] <enum_name> <ws> <array_name> <ws> <max_name> [<ws>] ':' [<ws>] '\n' 58 my $section_re = qr/ ^ \s* (\S+) \s+ (\S+) \s+ (\S+) \s* : \s* $ /x; 59 60 # [<ws>] <enum_element_name> [<ws>] '\n' 61 my $enum_element_re = qr/ ^ \s* ( [A-Za-z0-9_]+ ) \s* $ /x; 62 63 # '#' <preprocessor directive> '\n' 64 my $preprocessor_re = qr/ ^ \# /x; 65 66 # [<ws>] '/' '/' <comment> '\n' 67 my $comment_re = qr| ^ \s* // |x; 68 69 # empty line 70 my $whitespace_only_re = qr/ ^ \s* $ /x; 71 72 # [<ws>] '"' <string> '"' [<ws>] '\n' (<string> doesn't contain newlines) 73 my $string_re = qr/ ^ \s* 74 " ( (?> (?: (?> [^"\\]+ ) | \\. )* ) ) " 75 \s* $ /x; 76 77 # [<ws>] '{' <block> '}' [<ws>] '\n' (<block> can contain newlines) 78 my $block_start_re = qr/ ^ \s* \{ /x; 79 80 # We need the capturing group for `split' to make it return the separator 81 # tokens (i.e., the opening and closing brace) also. 82 my $brace_re = qr/ ( [{}] ) /x; 83 84 85 sub Warn 86 { 87 my $message = shift; 88 warn "$datafile:$INPUT_LINE_NUMBER: warning: $message\n"; 89 } 90 91 92 sub Die 93 { 94 my $message = shift; 95 die "$datafile:$INPUT_LINE_NUMBER: error: $message\n"; 96 } 97 98 99 my $warned_before = 0; 100 101 sub warn_before 102 { 103 Warn("data before first section gets ignored") unless $warned_before; 104 $warned_before = 1; 105 } 106 107 108 sub strip_newline 109 { 110 chomp; 111 s/ \x0D $ //x; 112 } 113 114 115 sub end_curr_string 116 { 117 # Append final null byte to string. 118 if ($have_strings) 119 { 120 push @{$diversions{$curr_array}}, " '\\0',\n" if $in_string; 121 122 $curr_offset++; 123 $in_string = 0; 124 } 125 } 126 127 128 sub update_max_elem_size 129 { 130 if ($curr_elem_size) 131 { 132 my $max = pop @{$diversions{$curr_max}}; 133 $max = $curr_elem_size if $curr_elem_size > $max; 134 push @{$diversions{$curr_max}}, $max; 135 } 136 } 137 138 139 sub convert_non_ascii_char 140 { 141 # A UTF-8 character outside of the printable ASCII range, with possibly a 142 # leading backslash character. 143 my $s = shift; 144 145 # Here we count characters, not bytes. 146 $curr_elem_size += length $s; 147 148 utf8::encode($s); 149 $s = uc unpack 'H*', $s; 150 151 $curr_offset += $s =~ s/\G(..)/'\\x$1', /sg; 152 153 return $s; 154 } 155 156 157 sub convert_ascii_chars 158 { 159 # A series of ASCII characters in the printable range. 160 my $s = shift; 161 162 # We ignore spaces. 163 $s =~ s/ //g; 164 165 my $count = $s =~ s/\G(.)/'$1', /g; 166 $curr_offset += $count; 167 $curr_elem_size += $count; 168 169 return $s; 170 } 171 172 173 sub convert_literal 174 { 175 my $s = shift; 176 my $orig = $s; 177 178 # ASCII printables and space 179 my $safe_re = '\x20-\x7E'; 180 # ASCII printables and space, no backslash 181 my $safe_no_backslash_re = '\x20-\x5B\x5D-\x7E'; 182 183 $s =~ s{ 184 (?: \\? ( [^$safe_re] ) 185 | ( (?: [$safe_no_backslash_re] 186 | \\ [$safe_re] )+ ) ) 187 } 188 { 189 defined($1) ? convert_non_ascii_char($1) 190 : convert_ascii_chars($2) 191 }egx; 192 193 # We assume that `$orig' doesn't contain `*/' 194 return $s . " /* $orig */"; 195 } 196 197 198 sub aux_name 199 { 200 return "af_blue_" . $num_sections. "_" . join('_', @name_stack); 201 } 202 203 204 sub aux_name_next 205 { 206 $name_stack[$#name_stack]++; 207 my $name = aux_name(); 208 $name_stack[$#name_stack]--; 209 210 return $name; 211 } 212 213 214 sub enum_val_string 215 { 216 # Build string that holds code to save the current offset in an 217 # enumeration element. 218 my $aux = shift; 219 220 my $add = ($last_aux eq "af_blue_" . $num_sections . "_0" ) 221 ? "" 222 : "$last_aux + "; 223 224 return " $aux = $add$curr_offset,\n"; 225 } 226 227 228 229 # Process data file. 230 231 open(DATA, $datafile) || die "$prog: can't open \`$datafile': $OS_ERROR\n"; 232 233 while (<DATA>) 234 { 235 strip_newline(); 236 237 next if /$comment_re/; 238 next if /$whitespace_only_re/; 239 240 if (/$section_re/) 241 { 242 Warn("previous section is empty") if ($have_sections 243 && !$have_strings 244 && !$have_blocks); 245 246 end_curr_string(); 247 update_max_elem_size(); 248 249 # Save captured groups from `section_re'. 250 $curr_enum = $1; 251 $curr_array = $2; 252 $curr_max = $3; 253 254 $curr_enum_element = ""; 255 $curr_offset = 0; 256 257 Warn("overwriting already defined enumeration \`$curr_enum'") 258 if exists($diversions{$curr_enum}); 259 Warn("overwriting already defined array \`$curr_array'") 260 if exists($diversions{$curr_array}); 261 Warn("overwriting already defined maximum value \`$curr_max'") 262 if exists($diversions{$curr_max}); 263 264 $diversions{$curr_enum} = []; 265 $diversions{$curr_array} = []; 266 $diversions{$curr_max} = []; 267 268 push @{$diversions{$curr_max}}, 0; 269 270 @name_stack = (); 271 push @name_stack, 0; 272 273 $have_sections = 1; 274 $have_strings = 0; 275 $have_blocks = 0; 276 277 $have_enum_element = 0; 278 $in_string = 0; 279 280 $num_sections++; 281 $curr_elem_size = 0; 282 283 $last_aux = aux_name(); 284 285 next; 286 } 287 288 if (/$preprocessor_re/) 289 { 290 if ($have_sections) 291 { 292 # Having preprocessor conditionals complicates the computation of 293 # correct offset values. We have to introduce auxiliary enumeration 294 # elements with the name `af_blue_<s>_<n1>_<n2>_...' that store 295 # offsets to be used in conditional clauses. `<s>' is the number of 296 # sections seen so far, `<n1>' is the number of `#if' and `#endif' 297 # conditionals seen so far in the topmost level, `<n2>' the number of 298 # `#if' and `#endif' conditionals seen so far one level deeper, etc. 299 # As a consequence, uneven values are used within a clause, and even 300 # values after a clause, since the C standard doesn't allow the 301 # redefinition of an enumeration value. For example, the name 302 # `af_blue_5_1_6' is used to construct enumeration values in the fifth 303 # section after the third (second-level) if-clause within the first 304 # (top-level) if-clause. After the first top-level clause has 305 # finished, `af_blue_5_2' is used. The current offset is then 306 # relative to the value stored in the current auxiliary element. 307 308 if (/ ^ \# \s* if /x) 309 { 310 push @else_stack, 0; 311 312 $name_stack[$#name_stack]++; 313 314 push @{$diversions{$curr_enum}}, enum_val_string(aux_name()); 315 $last_aux = aux_name(); 316 317 push @name_stack, 0; 318 319 $curr_offset = 0; 320 } 321 elsif (/ ^ \# \s* elif /x) 322 { 323 Die("unbalanced #elif") unless @else_stack; 324 325 pop @name_stack; 326 327 push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next()); 328 $last_aux = aux_name(); 329 330 push @name_stack, 0; 331 332 $curr_offset = 0; 333 } 334 elsif (/ ^ \# \s* else /x) 335 { 336 my $prev_else = pop @else_stack; 337 Die("unbalanced #else") unless defined($prev_else); 338 Die("#else already seen") if $prev_else; 339 push @else_stack, 1; 340 341 pop @name_stack; 342 343 push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next()); 344 $last_aux = aux_name(); 345 346 push @name_stack, 0; 347 348 $curr_offset = 0; 349 } 350 elsif (/ ^ (\# \s*) endif /x) 351 { 352 my $prev_else = pop @else_stack; 353 Die("unbalanced #endif") unless defined($prev_else); 354 355 pop @name_stack; 356 357 # If there is no else-clause for an if-clause, we add one. This is 358 # necessary to have correct offsets. 359 if (!$prev_else) 360 { 361 # Use amount of whitespace from `endif'. 362 push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next()) 363 . $1 . "else\n"; 364 $last_aux = aux_name(); 365 366 $curr_offset = 0; 367 } 368 369 $name_stack[$#name_stack]++; 370 371 push @{$diversions{$curr_enum}}, enum_val_string(aux_name()); 372 $last_aux = aux_name(); 373 374 $curr_offset = 0; 375 } 376 377 # Handle (probably continued) preprocessor lines. 378 CONTINUED_LOOP: 379 { 380 do 381 { 382 strip_newline(); 383 384 push @{$diversions{$curr_enum}}, $ARG . "\n"; 385 push @{$diversions{$curr_array}}, $ARG . "\n"; 386 387 last CONTINUED_LOOP unless / \\ $ /x; 388 389 } while (<DATA>); 390 } 391 } 392 else 393 { 394 warn_before(); 395 } 396 397 next; 398 } 399 400 if (/$enum_element_re/) 401 { 402 end_curr_string(); 403 update_max_elem_size(); 404 405 $curr_enum_element = $1; 406 $have_enum_element = 1; 407 $curr_elem_size = 0; 408 409 next; 410 } 411 412 if (/$string_re/) 413 { 414 if ($have_sections) 415 { 416 Die("strings and blocks can't be mixed in a section") if $have_blocks; 417 418 # Save captured group from `string_re'. 419 my $string = $1; 420 421 if ($have_enum_element) 422 { 423 push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element); 424 $have_enum_element = 0; 425 } 426 427 $string = convert_literal($string); 428 429 push @{$diversions{$curr_array}}, " $string\n"; 430 431 $have_strings = 1; 432 $in_string = 1; 433 } 434 else 435 { 436 warn_before(); 437 } 438 439 next; 440 } 441 442 if (/$block_start_re/) 443 { 444 if ($have_sections) 445 { 446 Die("strings and blocks can't be mixed in a section") if $have_strings; 447 448 my $depth = 0; 449 my $block = ""; 450 my $block_end = 0; 451 452 # Count braces while getting the block. 453 BRACE_LOOP: 454 { 455 do 456 { 457 strip_newline(); 458 459 foreach my $substring (split(/$brace_re/)) 460 { 461 if ($block_end) 462 { 463 Die("invalid data after last matching closing brace") 464 if $substring !~ /$whitespace_only_re/; 465 } 466 467 $block .= $substring; 468 469 if ($substring eq '{') 470 { 471 $depth++; 472 } 473 elsif ($substring eq '}') 474 { 475 $depth--; 476 477 $block_end = 1 if $depth == 0; 478 } 479 } 480 481 # If we are here, we have run out of substrings, so get next line 482 # or exit. 483 last BRACE_LOOP if $block_end; 484 485 $block .= "\n"; 486 487 } while (<DATA>); 488 } 489 490 if ($have_enum_element) 491 { 492 push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element); 493 $have_enum_element = 0; 494 } 495 496 push @{$diversions{$curr_array}}, $block . ",\n"; 497 498 $curr_offset++; 499 $curr_elem_size++; 500 501 $have_blocks = 1; 502 } 503 else 504 { 505 warn_before(); 506 } 507 508 next; 509 } 510 511 # Garbage. We weren't able to parse the data. 512 Die("syntax error"); 513 } 514 515 # Finalize data. 516 end_curr_string(); 517 update_max_elem_size(); 518 519 520 # Filter stdin to stdout, replacing `@...@' templates. 521 522 sub emit_diversion 523 { 524 my $diversion_name = shift; 525 return (exists($diversions{$1})) ? "@{$diversions{$1}}" 526 : "@" . $diversion_name . "@"; 527 } 528 529 530 $LIST_SEPARATOR = ''; 531 532 my $s1 = "This file has been generated by the Perl script \`$prog',"; 533 my $s1len = length $s1; 534 my $s2 = "using data from file \`$datafile'."; 535 my $s2len = length $s2; 536 my $slen = ($s1len > $s2len) ? $s1len : $s2len; 537 538 print "/* " . $s1 . " " x ($slen - $s1len) . " */\n" 539 . "/* " . $s2 . " " x ($slen - $s2len) . " */\n" 540 . "\n"; 541 542 while (<STDIN>) 543 { 544 s/ @ ( [A-Za-z0-9_]+? ) @ / emit_diversion($1) /egx; 545 print; 546 } 547 548 # EOF 549