Home | History | Annotate | Download | only in tools
      1 #! /usr/bin/perl -w
      2 # -*- Perl -*-
      3 #
      4 # afblue.pl
      5 #
      6 # Process a blue zone character data file.
      7 #
      8 # Copyright 2013-2015 by
      9 # David Turner, Robert Wilhelm, and Werner Lemberg.
     10 #
     11 # This file is part of the FreeType project, and may only be used,
     12 # modified, and distributed under the terms of the FreeType project
     13 # license, LICENSE.TXT.  By continuing to use, modify, or distribute
     14 # this file you indicate that you have read the license and
     15 # understand and accept it fully.
     16 
     17 use strict;
     18 use warnings;
     19 use English '-no_match_vars';
     20 use open ':std', ':encoding(UTF-8)';
     21 
     22 
     23 my $prog = $PROGRAM_NAME;
     24 $prog =~ s| .* / ||x;      # Remove path.
     25 
     26 die "usage: $prog datafile < infile > outfile\n" if $#ARGV != 0;
     27 
     28 
     29 my $datafile = $ARGV[0];
     30 
     31 my %diversions;        # The extracted and massaged data from `datafile'.
     32 my @else_stack;        # Booleans to track else-clauses.
     33 my @name_stack;        # Stack of integers used for names of aux. variables.
     34 
     35 my $curr_enum;         # Name of the current enumeration.
     36 my $curr_array;        # Name of the current array.
     37 my $curr_max;          # Name of the current maximum value.
     38 
     39 my $curr_enum_element; # Name of the current enumeration element.
     40 my $curr_offset;       # The offset relative to current aux. variable.
     41 my $curr_elem_size;    # The size of the current string or block.
     42 
     43 my $have_sections = 0; # Boolean; set if start of a section has been seen.
     44 my $have_strings;      # Boolean; set if current section contains strings.
     45 my $have_blocks;       # Boolean; set if current section contains blocks.
     46 
     47 my $have_enum_element; # Boolean; set if we have an enumeration element.
     48 my $in_string;         # Boolean; set if a string has been parsed.
     49 
     50 my $num_sections = 0;  # Number of sections seen so far.
     51 
     52 my $last_aux;          # Name of last auxiliary variable.
     53 
     54 
     55 # Regular expressions.
     56 
     57 # [<ws>] <enum_name> <ws> <array_name> <ws> <max_name> [<ws>] ':' [<ws>] '\n'
     58 my $section_re = qr/ ^ \s* (\S+) \s+ (\S+) \s+ (\S+) \s* : \s* $ /x;
     59 
     60 # [<ws>] <enum_element_name> [<ws>] '\n'
     61 my $enum_element_re = qr/ ^ \s* ( [A-Za-z0-9_]+ ) \s* $ /x;
     62 
     63 # '#' <preprocessor directive> '\n'
     64 my $preprocessor_re = qr/ ^ \# /x;
     65 
     66 # [<ws>] '/' '/' <comment> '\n'
     67 my $comment_re = qr| ^ \s* // |x;
     68 
     69 # empty line
     70 my $whitespace_only_re = qr/ ^ \s* $ /x;
     71 
     72 # [<ws>] '"' <string> '"' [<ws>] '\n'  (<string> doesn't contain newlines)
     73 my $string_re = qr/ ^ \s*
     74                        " ( (?> (?: (?> [^"\\]+ ) | \\. )* ) ) "
     75                        \s* $ /x;
     76 
     77 # [<ws>] '{' <block> '}' [<ws>] '\n'  (<block> can contain newlines)
     78 my $block_start_re = qr/ ^ \s* \{ /x;
     79 
     80 # We need the capturing group for `split' to make it return the separator
     81 # tokens (i.e., the opening and closing brace) also.
     82 my $brace_re = qr/ ( [{}] ) /x;
     83 
     84 
     85 sub Warn
     86 {
     87   my $message = shift;
     88   warn "$datafile:$INPUT_LINE_NUMBER: warning: $message\n";
     89 }
     90 
     91 
     92 sub Die
     93 {
     94   my $message = shift;
     95   die "$datafile:$INPUT_LINE_NUMBER: error: $message\n";
     96 }
     97 
     98 
     99 my $warned_before = 0;
    100 
    101 sub warn_before
    102 {
    103   Warn("data before first section gets ignored") unless $warned_before;
    104   $warned_before = 1;
    105 }
    106 
    107 
    108 sub strip_newline
    109 {
    110   chomp;
    111   s/ \x0D $ //x;
    112 }
    113 
    114 
    115 sub end_curr_string
    116 {
    117   # Append final null byte to string.
    118   if ($have_strings)
    119   {
    120     push @{$diversions{$curr_array}}, "    '\\0',\n" if $in_string;
    121 
    122     $curr_offset++;
    123     $in_string = 0;
    124   }
    125 }
    126 
    127 
    128 sub update_max_elem_size
    129 {
    130   if ($curr_elem_size)
    131   {
    132     my $max = pop @{$diversions{$curr_max}};
    133     $max = $curr_elem_size if $curr_elem_size > $max;
    134     push @{$diversions{$curr_max}}, $max;
    135   }
    136 }
    137 
    138 
    139 sub convert_non_ascii_char
    140 {
    141   # A UTF-8 character outside of the printable ASCII range, with possibly a
    142   # leading backslash character.
    143   my $s = shift;
    144 
    145   # Here we count characters, not bytes.
    146   $curr_elem_size += length $s;
    147 
    148   utf8::encode($s);
    149   $s = uc unpack 'H*', $s;
    150 
    151   $curr_offset += $s =~ s/\G(..)/'\\x$1', /sg;
    152 
    153   return $s;
    154 }
    155 
    156 
    157 sub convert_ascii_chars
    158 {
    159   # A series of ASCII characters in the printable range.
    160   my $s = shift;
    161 
    162   # We ignore spaces.
    163   $s =~ s/ //g;
    164 
    165   my $count = $s =~ s/\G(.)/'$1', /g;
    166   $curr_offset += $count;
    167   $curr_elem_size += $count;
    168 
    169   return $s;
    170 }
    171 
    172 
    173 sub convert_literal
    174 {
    175   my $s = shift;
    176   my $orig = $s;
    177 
    178   # ASCII printables and space
    179   my $safe_re = '\x20-\x7E';
    180   # ASCII printables and space, no backslash
    181   my $safe_no_backslash_re = '\x20-\x5B\x5D-\x7E';
    182 
    183   $s =~ s{
    184            (?: \\? ( [^$safe_re] )
    185                | ( (?: [$safe_no_backslash_re]
    186                        | \\ [$safe_re] )+ ) )
    187          }
    188          {
    189            defined($1) ? convert_non_ascii_char($1)
    190                        : convert_ascii_chars($2)
    191          }egx;
    192 
    193    # We assume that `$orig' doesn't contain `*/'
    194    return $s . " /* $orig */";
    195 }
    196 
    197 
    198 sub aux_name
    199 {
    200   return "af_blue_" . $num_sections. "_" . join('_', @name_stack);
    201 }
    202 
    203 
    204 sub aux_name_next
    205 {
    206   $name_stack[$#name_stack]++;
    207   my $name = aux_name();
    208   $name_stack[$#name_stack]--;
    209 
    210   return $name;
    211 }
    212 
    213 
    214 sub enum_val_string
    215 {
    216   # Build string that holds code to save the current offset in an
    217   # enumeration element.
    218   my $aux = shift;
    219 
    220   my $add = ($last_aux eq "af_blue_" . $num_sections . "_0" )
    221               ? ""
    222               : "$last_aux + ";
    223 
    224   return "    $aux = $add$curr_offset,\n";
    225 }
    226 
    227 
    228 
    229 # Process data file.
    230 
    231 open(DATA, $datafile) || die "$prog: can't open \`$datafile': $OS_ERROR\n";
    232 
    233 while (<DATA>)
    234 {
    235   strip_newline();
    236 
    237   next if /$comment_re/;
    238   next if /$whitespace_only_re/;
    239 
    240   if (/$section_re/)
    241   {
    242     Warn("previous section is empty") if ($have_sections
    243                                           && !$have_strings
    244                                           && !$have_blocks);
    245 
    246     end_curr_string();
    247     update_max_elem_size();
    248 
    249     # Save captured groups from `section_re'.
    250     $curr_enum = $1;
    251     $curr_array = $2;
    252     $curr_max = $3;
    253 
    254     $curr_enum_element = "";
    255     $curr_offset = 0;
    256 
    257     Warn("overwriting already defined enumeration \`$curr_enum'")
    258       if exists($diversions{$curr_enum});
    259     Warn("overwriting already defined array \`$curr_array'")
    260       if exists($diversions{$curr_array});
    261     Warn("overwriting already defined maximum value \`$curr_max'")
    262       if exists($diversions{$curr_max});
    263 
    264     $diversions{$curr_enum} = [];
    265     $diversions{$curr_array} = [];
    266     $diversions{$curr_max} = [];
    267 
    268     push @{$diversions{$curr_max}}, 0;
    269 
    270     @name_stack = ();
    271     push @name_stack, 0;
    272 
    273     $have_sections = 1;
    274     $have_strings = 0;
    275     $have_blocks = 0;
    276 
    277     $have_enum_element = 0;
    278     $in_string = 0;
    279 
    280     $num_sections++;
    281     $curr_elem_size = 0;
    282 
    283     $last_aux = aux_name();
    284 
    285     next;
    286   }
    287 
    288   if (/$preprocessor_re/)
    289   {
    290     if ($have_sections)
    291     {
    292       # Having preprocessor conditionals complicates the computation of
    293       # correct offset values.  We have to introduce auxiliary enumeration
    294       # elements with the name `af_blue_<s>_<n1>_<n2>_...' that store
    295       # offsets to be used in conditional clauses.  `<s>' is the number of
    296       # sections seen so far, `<n1>' is the number of `#if' and `#endif'
    297       # conditionals seen so far in the topmost level, `<n2>' the number of
    298       # `#if' and `#endif' conditionals seen so far one level deeper, etc.
    299       # As a consequence, uneven values are used within a clause, and even
    300       # values after a clause, since the C standard doesn't allow the
    301       # redefinition of an enumeration value.  For example, the name
    302       # `af_blue_5_1_6' is used to construct enumeration values in the fifth
    303       # section after the third (second-level) if-clause within the first
    304       # (top-level) if-clause.  After the first top-level clause has
    305       # finished, `af_blue_5_2' is used.  The current offset is then
    306       # relative to the value stored in the current auxiliary element.
    307 
    308       if (/ ^ \# \s* if /x)
    309       {
    310         push @else_stack, 0;
    311 
    312         $name_stack[$#name_stack]++;
    313 
    314         push @{$diversions{$curr_enum}}, enum_val_string(aux_name());
    315         $last_aux = aux_name();
    316 
    317         push @name_stack, 0;
    318 
    319         $curr_offset = 0;
    320       }
    321       elsif (/ ^ \# \s* elif /x)
    322       {
    323         Die("unbalanced #elif") unless @else_stack;
    324 
    325         pop @name_stack;
    326 
    327         push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next());
    328         $last_aux = aux_name();
    329 
    330         push @name_stack, 0;
    331 
    332         $curr_offset = 0;
    333       }
    334       elsif (/ ^ \# \s* else /x)
    335       {
    336         my $prev_else = pop @else_stack;
    337         Die("unbalanced #else") unless defined($prev_else);
    338         Die("#else already seen") if $prev_else;
    339         push @else_stack, 1;
    340 
    341         pop @name_stack;
    342 
    343         push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next());
    344         $last_aux = aux_name();
    345 
    346         push @name_stack, 0;
    347 
    348         $curr_offset = 0;
    349       }
    350       elsif (/ ^ (\# \s*) endif /x)
    351       {
    352         my $prev_else = pop @else_stack;
    353         Die("unbalanced #endif") unless defined($prev_else);
    354 
    355         pop @name_stack;
    356 
    357         # If there is no else-clause for an if-clause, we add one.  This is
    358         # necessary to have correct offsets.
    359         if (!$prev_else)
    360         {
    361           # Use amount of whitespace from `endif'.
    362           push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next())
    363                                            . $1 . "else\n";
    364           $last_aux = aux_name();
    365 
    366           $curr_offset = 0;
    367         }
    368 
    369         $name_stack[$#name_stack]++;
    370 
    371         push @{$diversions{$curr_enum}}, enum_val_string(aux_name());
    372         $last_aux = aux_name();
    373 
    374         $curr_offset = 0;
    375       }
    376 
    377       # Handle (probably continued) preprocessor lines.
    378     CONTINUED_LOOP:
    379       {
    380         do
    381         {
    382           strip_newline();
    383 
    384           push @{$diversions{$curr_enum}}, $ARG . "\n";
    385           push @{$diversions{$curr_array}}, $ARG . "\n";
    386 
    387           last CONTINUED_LOOP unless / \\ $ /x;
    388 
    389         } while (<DATA>);
    390       }
    391     }
    392     else
    393     {
    394       warn_before();
    395     }
    396 
    397     next;
    398   }
    399 
    400   if (/$enum_element_re/)
    401   {
    402     end_curr_string();
    403     update_max_elem_size();
    404 
    405     $curr_enum_element = $1;
    406     $have_enum_element = 1;
    407     $curr_elem_size = 0;
    408 
    409     next;
    410   }
    411 
    412   if (/$string_re/)
    413   {
    414     if ($have_sections)
    415     {
    416       Die("strings and blocks can't be mixed in a section") if $have_blocks;
    417 
    418       # Save captured group from `string_re'.
    419       my $string = $1;
    420 
    421       if ($have_enum_element)
    422       {
    423         push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element);
    424         $have_enum_element = 0;
    425       }
    426 
    427       $string = convert_literal($string);
    428 
    429       push @{$diversions{$curr_array}}, "    $string\n";
    430 
    431       $have_strings = 1;
    432       $in_string = 1;
    433     }
    434     else
    435     {
    436       warn_before();
    437     }
    438 
    439     next;
    440   }
    441 
    442   if (/$block_start_re/)
    443   {
    444     if ($have_sections)
    445     {
    446       Die("strings and blocks can't be mixed in a section") if $have_strings;
    447 
    448       my $depth = 0;
    449       my $block = "";
    450       my $block_end = 0;
    451 
    452       # Count braces while getting the block.
    453     BRACE_LOOP:
    454       {
    455         do
    456         {
    457           strip_newline();
    458 
    459           foreach my $substring (split(/$brace_re/))
    460           {
    461             if ($block_end)
    462             {
    463               Die("invalid data after last matching closing brace")
    464                 if $substring !~ /$whitespace_only_re/;
    465             }
    466 
    467             $block .= $substring;
    468 
    469             if ($substring eq '{')
    470             {
    471               $depth++;
    472             }
    473             elsif ($substring eq '}')
    474             {
    475               $depth--;
    476 
    477               $block_end = 1 if $depth == 0;
    478             }
    479           }
    480 
    481           # If we are here, we have run out of substrings, so get next line
    482           # or exit.
    483           last BRACE_LOOP if $block_end;
    484 
    485           $block .= "\n";
    486 
    487         } while (<DATA>);
    488       }
    489 
    490       if ($have_enum_element)
    491       {
    492         push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element);
    493         $have_enum_element = 0;
    494       }
    495 
    496       push @{$diversions{$curr_array}}, $block . ",\n";
    497 
    498       $curr_offset++;
    499       $curr_elem_size++;
    500 
    501       $have_blocks = 1;
    502     }
    503     else
    504     {
    505       warn_before();
    506     }
    507 
    508     next;
    509   }
    510 
    511   # Garbage.  We weren't able to parse the data.
    512   Die("syntax error");
    513 }
    514 
    515 # Finalize data.
    516 end_curr_string();
    517 update_max_elem_size();
    518 
    519 
    520 # Filter stdin to stdout, replacing `@...@' templates.
    521 
    522 sub emit_diversion
    523 {
    524   my $diversion_name = shift;
    525   return (exists($diversions{$1})) ? "@{$diversions{$1}}"
    526                                    : "@" . $diversion_name . "@";
    527 }
    528 
    529 
    530 $LIST_SEPARATOR = '';
    531 
    532 my $s1 = "This file has been generated by the Perl script \`$prog',";
    533 my $s1len = length $s1;
    534 my $s2 = "using data from file \`$datafile'.";
    535 my $s2len = length $s2;
    536 my $slen = ($s1len > $s2len) ? $s1len : $s2len;
    537 
    538 print "/* " . $s1 . " " x ($slen - $s1len) . " */\n"
    539       . "/* " . $s2 . " " x ($slen - $s2len) . " */\n"
    540       . "\n";
    541 
    542 while (<STDIN>)
    543 {
    544   s/ @ ( [A-Za-z0-9_]+? ) @ / emit_diversion($1) /egx;
    545   print;
    546 }
    547 
    548 # EOF
    549