1 #! /usr/bin/perl -w 2 # -*- Perl -*- 3 # 4 # afblue.pl 5 # 6 # Process a blue zone character data file. 7 # 8 # Copyright 2013-2018 by 9 # David Turner, Robert Wilhelm, and Werner Lemberg. 10 # 11 # This file is part of the FreeType project, and may only be used, 12 # modified, and distributed under the terms of the FreeType project 13 # license, LICENSE.TXT. By continuing to use, modify, or distribute 14 # this file you indicate that you have read the license and 15 # understand and accept it fully. 16 17 use strict; 18 use warnings; 19 use English '-no_match_vars'; 20 use open ':std', ':encoding(UTF-8)'; 21 22 23 my $prog = $PROGRAM_NAME; 24 $prog =~ s| .* / ||x; # Remove path. 25 26 die "usage: $prog datafile < infile > outfile\n" if $#ARGV != 0; 27 28 29 my $datafile = $ARGV[0]; 30 31 my %diversions; # The extracted and massaged data from `datafile'. 32 my @else_stack; # Booleans to track else-clauses. 33 my @name_stack; # Stack of integers used for names of aux. variables. 34 35 my $curr_enum; # Name of the current enumeration. 36 my $curr_array; # Name of the current array. 37 my $curr_max; # Name of the current maximum value. 38 39 my $curr_enum_element; # Name of the current enumeration element. 40 my $curr_offset; # The offset relative to current aux. variable. 41 my $curr_elem_size; # The number of non-space characters in the current string or 42 # the number of elements in the current block. 43 44 my $have_sections = 0; # Boolean; set if start of a section has been seen. 45 my $have_strings; # Boolean; set if current section contains strings. 46 my $have_blocks; # Boolean; set if current section contains blocks. 47 48 my $have_enum_element; # Boolean; set if we have an enumeration element. 49 my $in_string; # Boolean; set if a string has been parsed. 50 51 my $num_sections = 0; # Number of sections seen so far. 52 53 my $last_aux; # Name of last auxiliary variable. 54 55 56 # Regular expressions. 57 58 # [<ws>] <enum_name> <ws> <array_name> <ws> <max_name> [<ws>] ':' [<ws>] '\n' 59 my $section_re = qr/ ^ \s* (\S+) \s+ (\S+) \s+ (\S+) \s* : \s* $ /x; 60 61 # [<ws>] <enum_element_name> [<ws>] '\n' 62 my $enum_element_re = qr/ ^ \s* ( [A-Za-z0-9_]+ ) \s* $ /x; 63 64 # '#' <preprocessor directive> '\n' 65 my $preprocessor_re = qr/ ^ \# /x; 66 67 # [<ws>] '/' '/' <comment> '\n' 68 my $comment_re = qr| ^ \s* // |x; 69 70 # empty line 71 my $whitespace_only_re = qr/ ^ \s* $ /x; 72 73 # [<ws>] '"' <string> '"' [<ws>] '\n' (<string> doesn't contain newlines) 74 my $string_re = qr/ ^ \s* 75 " ( (?> (?: (?> [^"\\]+ ) | \\. )* ) ) " 76 \s* $ /x; 77 78 # [<ws>] '{' <block> '}' [<ws>] '\n' (<block> can contain newlines) 79 my $block_start_re = qr/ ^ \s* \{ /x; 80 81 # We need the capturing group for `split' to make it return the separator 82 # tokens (i.e., the opening and closing brace) also. 83 my $brace_re = qr/ ( [{}] ) /x; 84 85 86 sub Warn 87 { 88 my $message = shift; 89 warn "$datafile:$INPUT_LINE_NUMBER: warning: $message\n"; 90 } 91 92 93 sub Die 94 { 95 my $message = shift; 96 die "$datafile:$INPUT_LINE_NUMBER: error: $message\n"; 97 } 98 99 100 my $warned_before = 0; 101 102 sub warn_before 103 { 104 Warn("data before first section gets ignored") unless $warned_before; 105 $warned_before = 1; 106 } 107 108 109 sub strip_newline 110 { 111 chomp; 112 s/ \x0D $ //x; 113 } 114 115 116 sub end_curr_string 117 { 118 # Append final null byte to string. 119 if ($have_strings) 120 { 121 push @{$diversions{$curr_array}}, " '\\0',\n" if $in_string; 122 123 $curr_offset++; 124 $in_string = 0; 125 } 126 } 127 128 129 sub update_max_elem_size 130 { 131 if ($curr_elem_size) 132 { 133 my $max = pop @{$diversions{$curr_max}}; 134 $max = $curr_elem_size if $curr_elem_size > $max; 135 push @{$diversions{$curr_max}}, $max; 136 } 137 } 138 139 140 sub convert_non_ascii_char 141 { 142 # A UTF-8 character outside of the printable ASCII range, with possibly a 143 # leading backslash character. 144 my $s = shift; 145 146 # Here we count characters, not bytes. 147 $curr_elem_size += length $s; 148 149 utf8::encode($s); 150 $s = uc unpack 'H*', $s; 151 152 $curr_offset += $s =~ s/\G(..)/'\\x$1', /sg; 153 154 return $s; 155 } 156 157 158 sub convert_ascii_chars 159 { 160 # A series of ASCII characters in the printable range. 161 my $s = shift; 162 163 # We reduce multiple space characters to a single one. 164 $s =~ s/ +/ /g; 165 166 # Count all non-space characters. Note that `()' applies a list context 167 # to the capture that is used to count the elements. 168 $curr_elem_size += () = $s =~ /[^ ]/g; 169 170 $curr_offset += $s =~ s/\G(.)/'$1', /g; 171 172 return $s; 173 } 174 175 176 sub convert_literal 177 { 178 my $s = shift; 179 my $orig = $s; 180 181 # ASCII printables and space 182 my $safe_re = '\x20-\x7E'; 183 # ASCII printables and space, no backslash 184 my $safe_no_backslash_re = '\x20-\x5B\x5D-\x7E'; 185 186 $s =~ s{ 187 (?: \\? ( [^$safe_re] ) 188 | ( (?: [$safe_no_backslash_re] 189 | \\ [$safe_re] )+ ) ) 190 } 191 { 192 defined($1) ? convert_non_ascii_char($1) 193 : convert_ascii_chars($2) 194 }egx; 195 196 # We assume that `$orig' doesn't contain `*/' 197 return $s . " /* $orig */"; 198 } 199 200 201 sub aux_name 202 { 203 return "af_blue_" . $num_sections. "_" . join('_', @name_stack); 204 } 205 206 207 sub aux_name_next 208 { 209 $name_stack[$#name_stack]++; 210 my $name = aux_name(); 211 $name_stack[$#name_stack]--; 212 213 return $name; 214 } 215 216 217 sub enum_val_string 218 { 219 # Build string that holds code to save the current offset in an 220 # enumeration element. 221 my $aux = shift; 222 223 my $add = ($last_aux eq "af_blue_" . $num_sections . "_0" ) 224 ? "" 225 : "$last_aux + "; 226 227 return " $aux = $add$curr_offset,\n"; 228 } 229 230 231 232 # Process data file. 233 234 open(DATA, $datafile) || die "$prog: can't open \`$datafile': $OS_ERROR\n"; 235 236 while (<DATA>) 237 { 238 strip_newline(); 239 240 next if /$comment_re/; 241 next if /$whitespace_only_re/; 242 243 if (/$section_re/) 244 { 245 Warn("previous section is empty") if ($have_sections 246 && !$have_strings 247 && !$have_blocks); 248 249 end_curr_string(); 250 update_max_elem_size(); 251 252 # Save captured groups from `section_re'. 253 $curr_enum = $1; 254 $curr_array = $2; 255 $curr_max = $3; 256 257 $curr_enum_element = ""; 258 $curr_offset = 0; 259 260 Warn("overwriting already defined enumeration \`$curr_enum'") 261 if exists($diversions{$curr_enum}); 262 Warn("overwriting already defined array \`$curr_array'") 263 if exists($diversions{$curr_array}); 264 Warn("overwriting already defined maximum value \`$curr_max'") 265 if exists($diversions{$curr_max}); 266 267 $diversions{$curr_enum} = []; 268 $diversions{$curr_array} = []; 269 $diversions{$curr_max} = []; 270 271 push @{$diversions{$curr_max}}, 0; 272 273 @name_stack = (); 274 push @name_stack, 0; 275 276 $have_sections = 1; 277 $have_strings = 0; 278 $have_blocks = 0; 279 280 $have_enum_element = 0; 281 $in_string = 0; 282 283 $num_sections++; 284 $curr_elem_size = 0; 285 286 $last_aux = aux_name(); 287 288 next; 289 } 290 291 if (/$preprocessor_re/) 292 { 293 if ($have_sections) 294 { 295 # Having preprocessor conditionals complicates the computation of 296 # correct offset values. We have to introduce auxiliary enumeration 297 # elements with the name `af_blue_<s>_<n1>_<n2>_...' that store 298 # offsets to be used in conditional clauses. `<s>' is the number of 299 # sections seen so far, `<n1>' is the number of `#if' and `#endif' 300 # conditionals seen so far in the topmost level, `<n2>' the number of 301 # `#if' and `#endif' conditionals seen so far one level deeper, etc. 302 # As a consequence, uneven values are used within a clause, and even 303 # values after a clause, since the C standard doesn't allow the 304 # redefinition of an enumeration value. For example, the name 305 # `af_blue_5_1_6' is used to construct enumeration values in the fifth 306 # section after the third (second-level) if-clause within the first 307 # (top-level) if-clause. After the first top-level clause has 308 # finished, `af_blue_5_2' is used. The current offset is then 309 # relative to the value stored in the current auxiliary element. 310 311 if (/ ^ \# \s* if /x) 312 { 313 push @else_stack, 0; 314 315 $name_stack[$#name_stack]++; 316 317 push @{$diversions{$curr_enum}}, enum_val_string(aux_name()); 318 $last_aux = aux_name(); 319 320 push @name_stack, 0; 321 322 $curr_offset = 0; 323 } 324 elsif (/ ^ \# \s* elif /x) 325 { 326 Die("unbalanced #elif") unless @else_stack; 327 328 pop @name_stack; 329 330 push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next()); 331 $last_aux = aux_name(); 332 333 push @name_stack, 0; 334 335 $curr_offset = 0; 336 } 337 elsif (/ ^ \# \s* else /x) 338 { 339 my $prev_else = pop @else_stack; 340 Die("unbalanced #else") unless defined($prev_else); 341 Die("#else already seen") if $prev_else; 342 push @else_stack, 1; 343 344 pop @name_stack; 345 346 push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next()); 347 $last_aux = aux_name(); 348 349 push @name_stack, 0; 350 351 $curr_offset = 0; 352 } 353 elsif (/ ^ (\# \s*) endif /x) 354 { 355 my $prev_else = pop @else_stack; 356 Die("unbalanced #endif") unless defined($prev_else); 357 358 pop @name_stack; 359 360 # If there is no else-clause for an if-clause, we add one. This is 361 # necessary to have correct offsets. 362 if (!$prev_else) 363 { 364 # Use amount of whitespace from `endif'. 365 push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next()) 366 . $1 . "else\n"; 367 $last_aux = aux_name(); 368 369 $curr_offset = 0; 370 } 371 372 $name_stack[$#name_stack]++; 373 374 push @{$diversions{$curr_enum}}, enum_val_string(aux_name()); 375 $last_aux = aux_name(); 376 377 $curr_offset = 0; 378 } 379 380 # Handle (probably continued) preprocessor lines. 381 CONTINUED_LOOP: 382 { 383 do 384 { 385 strip_newline(); 386 387 push @{$diversions{$curr_enum}}, $ARG . "\n"; 388 push @{$diversions{$curr_array}}, $ARG . "\n"; 389 390 last CONTINUED_LOOP unless / \\ $ /x; 391 392 } while (<DATA>); 393 } 394 } 395 else 396 { 397 warn_before(); 398 } 399 400 next; 401 } 402 403 if (/$enum_element_re/) 404 { 405 end_curr_string(); 406 update_max_elem_size(); 407 408 $curr_enum_element = $1; 409 $have_enum_element = 1; 410 $curr_elem_size = 0; 411 412 next; 413 } 414 415 if (/$string_re/) 416 { 417 if ($have_sections) 418 { 419 Die("strings and blocks can't be mixed in a section") if $have_blocks; 420 421 # Save captured group from `string_re'. 422 my $string = $1; 423 424 if ($have_enum_element) 425 { 426 push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element); 427 $have_enum_element = 0; 428 } 429 430 $string = convert_literal($string); 431 432 push @{$diversions{$curr_array}}, " $string\n"; 433 434 $have_strings = 1; 435 $in_string = 1; 436 } 437 else 438 { 439 warn_before(); 440 } 441 442 next; 443 } 444 445 if (/$block_start_re/) 446 { 447 if ($have_sections) 448 { 449 Die("strings and blocks can't be mixed in a section") if $have_strings; 450 451 my $depth = 0; 452 my $block = ""; 453 my $block_end = 0; 454 455 # Count braces while getting the block. 456 BRACE_LOOP: 457 { 458 do 459 { 460 strip_newline(); 461 462 foreach my $substring (split(/$brace_re/)) 463 { 464 if ($block_end) 465 { 466 Die("invalid data after last matching closing brace") 467 if $substring !~ /$whitespace_only_re/; 468 } 469 470 $block .= $substring; 471 472 if ($substring eq '{') 473 { 474 $depth++; 475 } 476 elsif ($substring eq '}') 477 { 478 $depth--; 479 480 $block_end = 1 if $depth == 0; 481 } 482 } 483 484 # If we are here, we have run out of substrings, so get next line 485 # or exit. 486 last BRACE_LOOP if $block_end; 487 488 $block .= "\n"; 489 490 } while (<DATA>); 491 } 492 493 if ($have_enum_element) 494 { 495 push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element); 496 $have_enum_element = 0; 497 } 498 499 push @{$diversions{$curr_array}}, $block . ",\n"; 500 501 $curr_offset++; 502 $curr_elem_size++; 503 504 $have_blocks = 1; 505 } 506 else 507 { 508 warn_before(); 509 } 510 511 next; 512 } 513 514 # Garbage. We weren't able to parse the data. 515 Die("syntax error"); 516 } 517 518 # Finalize data. 519 end_curr_string(); 520 update_max_elem_size(); 521 522 523 # Filter stdin to stdout, replacing `@...@' templates. 524 525 sub emit_diversion 526 { 527 my $diversion_name = shift; 528 return (exists($diversions{$1})) ? "@{$diversions{$1}}" 529 : "@" . $diversion_name . "@"; 530 } 531 532 533 $LIST_SEPARATOR = ''; 534 535 my $s1 = "This file has been generated by the Perl script \`$prog',"; 536 my $s1len = length $s1; 537 my $s2 = "using data from file \`$datafile'."; 538 my $s2len = length $s2; 539 my $slen = ($s1len > $s2len) ? $s1len : $s2len; 540 541 print "/* " . $s1 . " " x ($slen - $s1len) . " */\n" 542 . "/* " . $s2 . " " x ($slen - $s2len) . " */\n" 543 . "\n"; 544 545 while (<STDIN>) 546 { 547 s/ @ ( [A-Za-z0-9_]+? ) @ / emit_diversion($1) /egx; 548 print; 549 } 550 551 # EOF 552