Home | History | Annotate | Download | only in genpname
      1 #!/bin/perl -w
      2 #*******************************************************************
      3 # COPYRIGHT:
      4 # Copyright (c) 2002-2008, International Business Machines Corporation and
      5 # others. All Rights Reserved.
      6 #*******************************************************************
      7 
      8 # This script reads in UCD files PropertyAliases.txt and
      9 # PropertyValueAliases.txt and correlates them with ICU enums
     10 # defined in uchar.h and uscript.h.  It then outputs a header
     11 # file which contains all names and enums.  The header is included
     12 # by the genpname tool C++ source file, which produces the actual
     13 # binary data file.
     14 #
     15 # See usage note below.
     16 #
     17 # TODO: The Property[Value]Alias.txt files state that they can support
     18 # more than 2 names per property|value.  Currently (Unicode 3.2) there
     19 # are always 1 or 2 names.  If more names were supported, presumably
     20 # the format would be something like:
     21 #    nv        ; Numeric_Value
     22 #    nv        ; Value_Numerique
     23 # CURRENTLY, this script assumes that there are 1 or two names.  Any
     24 # duplicates it sees are flagged as an error.  If multiple aliases
     25 # appear in a future version of Unicode, modify this script to support
     26 # that.
     27 #
     28 # NOTE: As of ICU 2.6, this script has been modified to know about the
     29 # pseudo-property gcm/General_Category_Mask, which corresponds to the
     30 # uchar.h property UCHAR_GENERAL_CATEGORY_MASK.  This property
     31 # corresponds to General_Category but is a bitmask value.  It does not
     32 # exist in the UCD.  Therefore, I special case it in several places
     33 # (search for General_Category_Mask and gcm).
     34 #
     35 # NOTE: As of ICU 2.6, this script reads an auxiliary data file,
     36 # SyntheticPropertyAliases.txt, containing property aliases not
     37 # present in the UCD but present in ICU.  This file resides in the
     38 # same directory as this script.  Its contents are merged into those
     39 # of PropertyAliases.txt as if the two files were appended.
     40 #
     41 # NOTE: The following names are handled specially.  See script below
     42 # for details.
     43 #
     44 #   T/True
     45 #   F/False
     46 #   No_Block
     47 #
     48 # Author: Alan Liu
     49 # Created: October 14 2002
     50 # Since: ICU 2.4
     51 
     52 use FileHandle;
     53 use strict;
     54 use Dumpvalue;
     55 
     56 my $DEBUG = 1;
     57 my $DUMPER = new Dumpvalue;
     58 
     59 my $count = @ARGV;
     60 my $ICU_DIR = shift() || '';
     61 my $OUT_FILE = shift() || 'data.h';
     62 my $HEADER_DIR = "$ICU_DIR/source/common/unicode";
     63 my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata";
     64 
     65 # Get the current year from the system
     66 my $YEAR = 1900+@{[localtime]}[5]; # Get the current year
     67 
     68 # Used to make "n/a" property [value] aliases (Unicode or Synthetic) unique
     69 my $propNA = 0;
     70 my $valueNA = 0;
     71 
     72 #----------------------------------------------------------------------
     73 # Top level property keys for binary, enumerated, string, and double props
     74 my @TOP     = qw( _bp _ep _sp _dp _mp );
     75 
     76 # This hash governs how top level properties are grouped into output arrays.
     77 #my %TOP_PROPS = ( "VALUED"   => [ '_bp', '_ep' ],
     78 #                  "NO_VALUE" => [ '_sp', '_dp' ] );m
     79 #my %TOP_PROPS = ( "BINARY"   => [ '_bp' ],
     80 #                  "ENUMERATED" => [ '_ep' ],
     81 #                  "STRING" => [ '_sp' ],
     82 #                  "DOUBLE" => [ '_dp' ] );
     83 my %TOP_PROPS = ( ""   => [ '_bp', '_ep', '_sp', '_dp', '_mp' ] );
     84 
     85 my %PROP_TYPE = (Binary => "_bp",
     86                  String => "_sp",
     87                  Double => "_dp",
     88                  Enumerated => "_ep",
     89                  Bitmask => "_mp");
     90 #----------------------------------------------------------------------
     91 
     92 # Properties that are unsupported in ICU
     93 my %UNSUPPORTED = (Composition_Exclusion => 1,
     94                    Decomposition_Mapping => 1,
     95                    Expands_On_NFC => 1,
     96                    Expands_On_NFD => 1,
     97                    Expands_On_NFKC => 1,
     98                    Expands_On_NFKD => 1,
     99                    FC_NFKC_Closure => 1,
    100                    ID_Start_Exceptions => 1,
    101                    Special_Case_Condition => 1,
    102                    );
    103 
    104 # Short names of properties that weren't seen in uchar.h.  If the
    105 # properties weren't seen, don't complain about the property values
    106 # missing.
    107 my %MISSING_FROM_UCHAR;
    108 
    109 # Additional property aliases beyond short and long names,
    110 # like space in addition to WSpace and White_Space in Unicode 4.1.
    111 # Hashtable, maps long name to alias.
    112 # For example, maps White_Space->space.
    113 #
    114 # If multiple additional aliases are defined,
    115 # then they are separated in the value string with '|'.
    116 # For example, White_Space->space|outer_space
    117 my %additional_property_aliases;
    118 
    119 #----------------------------------------------------------------------
    120 
    121 # Emitted class names
    122 my ($STRING_CLASS, $ALIAS_CLASS, $PROPERTY_CLASS) = qw(AliasName Alias Property);
    123 
    124 if ($count < 1 || $count > 2 ||
    125     !-d $HEADER_DIR ||
    126     !-d $UNIDATA_DIR) {
    127     my $me = $0;
    128     $me =~ s|.+[/\\]||;
    129     my $lm = ' ' x length($me);
    130     print <<"END";
    131 
    132 $me: Reads ICU4C headers and Unicode data files and creates
    133 $lm  a C header file that is included by genpname.  The header
    134 $lm  file matches constants defined in the ICU4C headers with
    135 $lm  property|value aliases in the Unicode data files.
    136 
    137 Usage: $me <icu_dir> [<out_file>]
    138 
    139 <icu_dir>   ICU4C root directory, containing
    140                source/common/unicode/uchar.h
    141                source/common/unicode/uscript.h
    142                source/data/unidata/Blocks.txt
    143                source/data/unidata/PropertyAliases.txt
    144                source/data/unidata/PropertyValueAliases.txt
    145 <out_file>  File name of header to be written;
    146             default is 'data.h'.
    147 
    148 The Unicode versions of all input files must match.
    149 END
    150     exit(1);
    151 }
    152 
    153 my ($h, $version) = readAndMerge($HEADER_DIR, $UNIDATA_DIR);
    154 
    155 if ($DEBUG) {
    156     print "Merged hash:\n";
    157     for my $key (sort keys %$h) {
    158         my $hh = $h->{$key};
    159         for my $subkey (sort keys %$hh) {
    160             print "$key:$subkey:", $hh->{$subkey}, "\n";
    161         }
    162     }
    163 }
    164 
    165 my $out = new FileHandle($OUT_FILE, 'w');
    166 die "Error: Can't write to $OUT_FILE: $!" unless (defined $out);
    167 my $save = select($out);
    168 formatData($h, $version);
    169 select($save);
    170 $out->close();
    171 
    172 exit(0);
    173 
    174 #----------------------------------------------------------------------
    175 # From PropList.html: "The properties of the form Other_XXX
    176 # are used to generate properties in DerivedCoreProperties.txt.
    177 # They are not intended for general use, such as in APIs that
    178 # return property values.
    179 # Non_Break is not a valid property as of 3.2.
    180 sub isIgnoredProperty {
    181     local $_ = shift;
    182     /^Other_/i || /^Non_Break$/i;
    183 }
    184 
    185 # 'qc' is a pseudo-property matching any quick-check property
    186 # see PropertyValueAliases.txt file comments.  'binprop' is
    187 # a synthetic binary value alias "True"/"False", not present
    188 # in PropertyValueAliases.txt until Unicode 5.0.
    189 # Starting with Unicode 5.1, PropertyValueAliases.txt does have
    190 # explicit values for binary properties.
    191 sub isPseudoProperty {
    192     $_[0] eq 'qc' ||
    193         $_[0] eq 'binprop';
    194 }
    195 
    196 #----------------------------------------------------------------------
    197 # Emit the combined data from headers and the Unicode database as a
    198 # C source code header file.
    199 #
    200 # @param ref to hash with the data
    201 # @param Unicode version, as a string
    202 sub formatData {
    203     my $h = shift;
    204     my $version = shift;
    205 
    206     my $date = scalar localtime();
    207     print <<"END";
    208 /**
    209  * Copyright (C) 2002-$YEAR, International Business Machines Corporation and
    210  * others. All Rights Reserved.
    211  *
    212  * MACHINE GENERATED FILE.  !!! Do not edit manually !!!
    213  *
    214  * Generated from
    215  *   uchar.h
    216  *   uscript.h
    217  *   Blocks.txt
    218  *   PropertyAliases.txt
    219  *   PropertyValueAliases.txt
    220  *
    221  * Date: $date
    222  * Unicode version: $version
    223  * Script: $0
    224  */
    225 
    226 END
    227 
    228     #------------------------------------------------------------
    229     # Emit Unicode version
    230     print "/* Unicode version $version */\n";
    231     my @v = split(/\./, $version);
    232     push @v, '0' while (@v < 4);
    233     for (my $i=0; $i<@v; ++$i) {
    234         print "const uint8_t VERSION_$i = $v[$i];\n";
    235     }
    236     print "\n";
    237 
    238     #------------------------------------------------------------
    239     # Emit String table
    240     # [A table of all identifiers, that is, all long or short property
    241     # or value names.  The list need NOT be sorted; it will be sorted
    242     # by the C program.  Strings are referenced by their index into
    243     # this table.  After sorting, a REMAP[] array is used to map the
    244     # old position indices to the new positions.]
    245     my %strings;
    246     for my $prop (sort keys %$h) {
    247         my $hh = $h->{$prop};
    248         for my $enum (sort keys %$hh) {
    249             my @a = split(/\|/, $hh->{$enum});
    250             for (@a) {
    251                 $strings{$_} = 1 if (length($_));
    252             }
    253         }
    254     }
    255     my @strings = sort keys %strings;
    256     unshift @strings, "";
    257 
    258     print "const int32_t STRING_COUNT = ", scalar @strings, ";\n\n"; 
    259 
    260     # while printing, create a mapping hash from string table entry to index
    261     my %stringToID;
    262     print "/* to be sorted */\n";
    263     print "const $STRING_CLASS STRING_TABLE[] = {\n";
    264     for (my $i=0; $i<@strings; ++$i) {
    265         print "    $STRING_CLASS(\"$strings[$i]\", $i),\n";
    266         $stringToID{$strings[$i]} = $i;
    267     }
    268     print "};\n\n";
    269 
    270     # placeholder for the remapping index.  this is used to map
    271     # indices that we compute here to indices of the sorted
    272     # STRING_TABLE.  STRING_TABLE will be sorted by the C++ program
    273     # using the uprv_comparePropertyNames() function.  this will
    274     # reshuffle the order.  we then use the indices (passed to the
    275     # String constructor) to create a REMAP[] array.
    276     print "/* to be filled in */\n";
    277     print "int32_t REMAP[", scalar @strings, "];\n\n";
    278     
    279     #------------------------------------------------------------
    280     # Emit the name group table
    281     # [A table of name groups.  A name group is one or more names
    282     # for a property or property value.  The Unicode data files specify
    283     # that there may be more than 2, although as of Unicode 3.2 there
    284     # are at most 2.  The name group table looks like this:
    285     #
    286     #  114, -115, 116, -117, 0, -118, 65, -64, ...
    287     #  [0]        [2]        [4]      [6]
    288     #
    289     # The entry at [0] consists of 2 strings, 114 and 115.
    290     # The entry at [2] consists of 116 and 117.  The entry at
    291     # [4] is one string, 118.  There is always at least one
    292     # string; typically there are two.  If there are two, the first
    293     # is the SHORT name and the second is the LONG.  If there is
    294     # one, then the missing entry (always the short name, in 3.2)
    295     # is zero, which is by definition the index of "".  The
    296     # 'preferred' name will generally be the LONG name, if there are
    297     # more than 2 entries.  The last entry is negative.
    298 
    299     # Build name group list and replace string refs with nameGroup indices
    300     my @nameGroups;
    301     
    302     # Check for duplicate name groups, and reuse them if possible
    303     my %groupToInt; # Map group strings to ints
    304     for my $prop (sort keys %$h) {
    305         my $hh = $h->{$prop};
    306         for my $enum (sort keys %$hh) {
    307             my $groupString = $hh->{$enum};
    308             my $i;
    309             if (exists $groupToInt{$groupString}) {
    310                 $i = $groupToInt{$groupString};
    311             } else {
    312                 my @names = split(/\|/, $groupString);
    313                 die "Error: Wrong number of names in " . $groupString if (@names < 1);
    314                 $i = @nameGroups; # index of group we are making 
    315                 $groupToInt{$groupString} = $i; # Cache for reuse
    316                 push @nameGroups, map { $stringToID{$_} } @names;
    317                 $nameGroups[$#nameGroups] = -$nameGroups[$#nameGroups]; # mark end
    318             }
    319             # now, replace string list with ref to name group
    320             $hh->{$enum} = $i;
    321         }
    322     }
    323 
    324     print "const int32_t NAME_GROUP_COUNT = ",
    325           scalar @nameGroups, ";\n\n";
    326 
    327     print "int32_t NAME_GROUP[] = {\n";
    328     # emit one group per line, with annotations
    329     my $max_names = 0;
    330     for (my $i=0; $i<@nameGroups; ) {
    331         my @a;
    332         my $line;
    333         my $start = $i;
    334         for (;;) {
    335             my $j = $nameGroups[$i++];
    336             $line .= "$j, ";
    337             push @a, abs($j);
    338             last if ($j < 0);
    339         }
    340         print "    ",
    341               $line,
    342               ' 'x(20-length($line)),
    343               "/* ", sprintf("%3d", $start),
    344               ": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n";
    345         $max_names = @a if(@a > $max_names);
    346           
    347     }
    348     print "};\n\n";
    349     
    350     # This is fixed for 3.2 at "2" but should be calculated dynamically
    351     # when more than 2 names appear in Property[Value]Aliases.txt.
    352     print "#define MAX_NAMES_PER_GROUP $max_names\n\n";
    353 
    354     #------------------------------------------------------------
    355     # Emit enumerated property values
    356     for my $prop (sort keys %$h) {
    357         next if ($prop =~ /^_/);
    358         my $vh = $h->{$prop};
    359         my $count = scalar keys %$vh;
    360 
    361         print "const int32_t VALUES_${prop}_COUNT = ",
    362               $count, ";\n\n";
    363         
    364         print "const $ALIAS_CLASS VALUES_${prop}\[] = {\n";
    365         for my $enum (sort keys %$vh) {
    366             #my @names = split(/\|/, $vh->{$enum});
    367             #die "Error: Wrong number of names for $prop:$enum in [" . join(",", @names) . "]"
    368             #    if (@names != 2);
    369             print "    $ALIAS_CLASS((int32_t) $enum, ", $vh->{$enum}, "),\n";
    370                   #$stringToID{$names[0]}, ", ",
    371                   #$stringToID{$names[1]}, "),\n";
    372             #      "\"", $names[0], "\", ",
    373             #      "\"", $names[1], "\"),\n";
    374         }
    375         print "};\n\n";
    376     }
    377 
    378     #------------------------------------------------------------
    379     # Emit top-level properties (binary, enumerated, etc.)
    380     for my $topName (sort keys %TOP_PROPS) {
    381         my $a = $TOP_PROPS{$topName};
    382         my $count = 0;
    383         for my $type (@$a) { # "_bp", "_ep", etc.
    384             $count += scalar keys %{$h->{$type}};
    385         }
    386 
    387         print "const int32_t ${topName}PROPERTY_COUNT = $count;\n\n";
    388         
    389         print "const $PROPERTY_CLASS ${topName}PROPERTY[] = {\n";
    390 
    391         for my $type (@$a) { # "_bp", "_ep", etc.
    392             my $p = $h->{$type};
    393 
    394             for my $enum (sort keys %$p) {
    395                 my $name = $strings[$nameGroups[$p->{$enum}]];
    396             
    397                 my $valueRef = "0, NULL";
    398                 if ($type eq '_bp') {
    399                     $valueRef = "VALUES_binprop_COUNT, VALUES_binprop";
    400                 }
    401                 elsif (exists $h->{$name}) {
    402                     $valueRef = "VALUES_${name}_COUNT, VALUES_$name";
    403                 }
    404                 
    405                 print "    $PROPERTY_CLASS((int32_t) $enum, ",
    406                       $p->{$enum}, ", $valueRef),\n";
    407             }
    408         }
    409         print "};\n\n";
    410     }
    411 
    412     print "/*eof*/\n";
    413 }
    414 
    415 #----------------------------------------------------------------------
    416 # Read in the files uchar.h, uscript.h, Blocks.txt,
    417 # PropertyAliases.txt, and PropertyValueAliases.txt,
    418 # and combine them into one hash.
    419 #
    420 # @param directory containing headers
    421 # @param directory containin Unicode data files
    422 #
    423 # @return hash ref, Unicode version
    424 sub readAndMerge {
    425 
    426     my ($headerDir, $unidataDir) = @_;
    427 
    428     my $h = read_uchar("$headerDir/uchar.h");
    429     my $s = read_uscript("$headerDir/uscript.h");
    430     my $b = read_Blocks("$unidataDir/Blocks.txt");
    431     my $pa = {};
    432     read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt");
    433     read_PropertyAliases($pa, "SyntheticPropertyAliases.txt");
    434     my $va = {};
    435     read_PropertyValueAliases($va, "$unidataDir/PropertyValueAliases.txt");
    436     read_PropertyValueAliases($va, "SyntheticPropertyValueAliases.txt");
    437     
    438     # Extract property family hash
    439     my $fam = $pa->{'_family'};
    440     delete $pa->{'_family'};
    441     
    442     # Note: uscript.h has no version string, so don't check it
    443     my $version = check_versions([ 'uchar.h', $h ],
    444                                  [ 'Blocks.txt', $b ],
    445                                  [ 'PropertyAliases.txt', $pa ],
    446                                  [ 'PropertyValueAliases.txt', $va ]);
    447     
    448     # Do this BEFORE merging; merging modifies the hashes
    449     check_PropertyValueAliases($pa, $va);
    450     
    451     # Dump out the $va hash for debugging
    452     if ($DEBUG) {
    453         print "Property values hash:\n";
    454         for my $key (sort keys %$va) {
    455             my $hh = $va->{$key};
    456             for my $subkey (sort keys %$hh) {
    457                 print "$key:$subkey:", $hh->{$subkey}, "\n";
    458             }
    459         }
    460     }
    461     
    462     # Dump out the $s hash for debugging
    463     if ($DEBUG) {
    464         print "Script hash:\n";
    465         for my $key (sort keys %$s) {
    466             print "$key:", $s->{$key}, "\n";
    467         }
    468     }
    469     
    470     # Link in the script data
    471     $h->{'sc'} = $s;
    472     
    473     merge_Blocks($h, $b);
    474     
    475     merge_PropertyAliases($h, $pa, $fam);
    476     
    477     merge_PropertyValueAliases($h, $va);
    478     
    479     ($h, $version);
    480 }
    481 
    482 #----------------------------------------------------------------------
    483 # Ensure that the version strings in the given hashes (under the key
    484 # '_version') are compatible.  Currently this means they must be
    485 # identical, with the exception that "X.Y" will match "X.Y.0".
    486 # All hashes must define the key '_version'.
    487 #
    488 # @param a list of pairs of (file name, hash reference)
    489 #
    490 # @return the version of all the hashes.  Upon return, the '_version'
    491 # will be removed from all hashes.
    492 sub check_versions {
    493     my $version = '';
    494     my $msg = '';
    495     foreach my $a (@_) {
    496         my $name = $a->[0];
    497         my $h    = $a->[1];
    498         die "Error: No version found" unless (exists $h->{'_version'});
    499         my $v = $h->{'_version'};
    500         delete $h->{'_version'};
    501 
    502         # append ".0" if necessary, to standardize to X.Y.Z
    503         $v .= '.0' unless ($v =~ /\.\d+\./);
    504         $v .= '.0' unless ($v =~ /\.\d+\./);
    505         $msg .= "$name = $v\n";
    506         if ($version) {
    507             die "Error: Mismatched Unicode versions\n$msg"
    508                 unless ($version eq $v);
    509         } else {
    510             $version = $v;
    511         }
    512     }
    513     $version;
    514 }
    515 
    516 #----------------------------------------------------------------------
    517 # Make sure the property names in PropertyValueAliases.txt match those
    518 # in PropertyAliases.txt.
    519 #
    520 # @param a hash ref from read_PropertyAliases.
    521 # @param a hash ref from read_PropertyValueAliases.
    522 sub check_PropertyValueAliases {
    523     my ($pa, $va) = @_;
    524 
    525     # make a reverse hash of short->long
    526     my %rev;
    527     for (keys %$pa) { $rev{$pa->{$_}} = $_; }
    528     
    529     for my $prop (keys %$va) {
    530         if (!exists $rev{$prop} && !isPseudoProperty($prop)) {
    531             print "Warning: Property $prop from PropertyValueAliases not listed in PropertyAliases\n";
    532         }
    533     }
    534 }
    535 
    536 #----------------------------------------------------------------------
    537 # Merge blocks data into uchar.h enum data.  In the 'blk' subhash all
    538 # code point values, as returned from read_uchar, are replaced by
    539 # block names, as read from Blocks.txt and returned by read_Blocks.
    540 # The match must be 1-to-1.  If there is any failure of 1-to-1
    541 # mapping, an error is signaled.  Upon return, the read_Blocks hash
    542 # is emptied of all contents, except for those that failed to match.
    543 #
    544 # The mapping in the 'blk' subhash, after this function returns, is
    545 # from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.h
    546 # pseudo-name, e.g. "Basic Latin".
    547 #
    548 # @param a hash ref from read_uchar.
    549 # @param a hash ref from read_Blocks.
    550 sub merge_Blocks {
    551     my ($h, $b) = @_;
    552 
    553     die "Error: No blocks data in uchar.h"
    554         unless (exists $h->{'blk'});
    555     my $blk = $h->{'blk'};
    556     for my $enum (keys %$blk) {
    557         my $cp = $blk->{$enum};
    558         if ($cp && !exists $b->{$cp}) {
    559             die "Error: No block found at $cp in Blocks.txt";
    560         }
    561         # Convert code point to pseudo-name:
    562         $blk->{$enum} = $b->{$cp};
    563         delete $b->{$cp};
    564     }
    565     my $err = '';
    566     for my $cp (keys %$b) {
    567         $err .= "Error: Block " . $b->{$cp} . " not listed in uchar.h\n";
    568     }
    569     die $err if ($err);
    570 }
    571 
    572 #----------------------------------------------------------------------
    573 # Merge property alias names into the uchar.h hash.  The subhashes
    574 # under the keys _* (b(inary, e(numerated, s(tring, d(ouble) are
    575 # examined and the values of those subhashes are assumed to be long
    576 # names in PropertyAliases.txt.  They are validated and replaced by
    577 # "<short>|<long>".  Upon return, the read_PropertyAliases hash is
    578 # emptied of all contents, except for those that failed to match.
    579 # Unmatched names in PropertyAliases are listed as a warning but do
    580 # NOT cause the script to die.
    581 #
    582 # @param a hash ref from read_uchar.
    583 # @param a hash ref from read_PropertyAliases.
    584 # @param a hash mapping long names to property family (e.g., 'binary')
    585 sub merge_PropertyAliases {
    586     my ($h, $pa, $fam) = @_;
    587 
    588     for my $k (@TOP) {
    589         die "Error: No properties data for $k in uchar.h"
    590             unless (exists $h->{$k});
    591     }
    592 
    593     for my $subh (map { $h->{$_} } @TOP) {
    594         for my $enum (keys %$subh) {
    595             my $long_name = $subh->{$enum};
    596             if (!exists $pa->{$long_name}) {
    597                 die "Error: Property $long_name not found (or used more than once)";
    598             }
    599 
    600             my $value;
    601             if($pa->{$long_name} =~ m|^n/a\d*$|) {
    602                 # replace an "n/a" short name with an empty name (nothing before "|");
    603                 # don't remove it (don't remove the "|"): there must always be a long name,
    604                 # and if the short name is removed, then the long name becomes the
    605                 # short name and there is no long name left (unless there is another alias)
    606                 $value = "|" . $long_name;
    607             } else {
    608                 $value = $pa->{$long_name} . "|" . $long_name;
    609             }
    610             if (exists $additional_property_aliases{$long_name}) {
    611                 $value .= "|" . $additional_property_aliases{$long_name};
    612             }
    613             $subh->{$enum} = $value;
    614             delete $pa->{$long_name};
    615         }
    616     }
    617 
    618     my @err;
    619     for my $name (keys %$pa) {
    620         $MISSING_FROM_UCHAR{$pa->{$name}} = 1;
    621         if (exists $UNSUPPORTED{$name}) {
    622             push @err, "Info: No enum for " . $fam->{$name} . " property $name in uchar.h";
    623         } elsif (!isIgnoredProperty($name)) {
    624             push @err, "Warning: No enum for " . $fam->{$name} . " property $name in uchar.h";
    625         }
    626     }
    627     print join("\n", sort @err), "\n" if (@err);
    628 }
    629 
    630 #----------------------------------------------------------------------
    631 # Return 1 if two names match ignoring whitespace, '-', and '_'.
    632 # Used to match names in Blocks.txt with those in PropertyValueAliases.txt
    633 # as of Unicode 4.0.
    634 sub matchesLoosely {
    635     my ($a, $b) = @_;
    636     $a =~ s/[\s\-_]//g;
    637     $b =~ s/[\s\-_]//g;
    638     $a =~ /^$b$/i;
    639 }
    640 
    641 #----------------------------------------------------------------------
    642 # Merge PropertyValueAliases.txt data into the uchar.h hash.  All
    643 # properties other than blk, _bp, and _ep are analyzed and mapped to
    644 # the names listed in PropertyValueAliases.  They are then replaced
    645 # with a string of the form "<short>|<long>".  The short or long name
    646 # may be missing.
    647 #
    648 # @param a hash ref from read_uchar.
    649 # @param a hash ref from read_PropertyValueAliases.
    650 sub merge_PropertyValueAliases {
    651     my ($h, $va) = @_;
    652 
    653     my %gcCount;
    654     for my $prop (keys %$h) {
    655         # _bp, _ep handled in merge_PropertyAliases
    656         next if ($prop =~ /^_/);
    657 
    658         # Special case: gcm
    659         my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop;
    660 
    661         # find corresponding PropertyValueAliases data
    662         die "Error: Can't find $prop in PropertyValueAliases.txt"
    663             unless (exists $va->{$prop2});
    664         my $pva = $va->{$prop2};
    665 
    666         # match up data
    667         my $hh = $h->{$prop};
    668         for my $enum (keys %$hh) {
    669 
    670             my $name = $hh->{$enum};
    671 
    672             # look up both long and short & ignore case
    673             my $n;
    674             if (exists $pva->{$name}) {
    675                 $n = $name; 
    676             } else {
    677                 # iterate (slow)
    678                 for my $a (keys %$pva) {
    679                     # case-insensitive match
    680                     # & case-insensitive reverse match
    681                     if ($a =~ /^$name$/i ||
    682                         $pva->{$a} =~ /^$name$/i) {
    683                         $n = $a;
    684                         last;
    685                     }
    686                 }
    687             }
    688                 
    689             # For blocks, do a loose match from Blocks.txt pseudo-name
    690             # to PropertyValueAliases long name.
    691             if (!$n && $prop eq 'blk') {
    692                 for my $a (keys %$pva) {
    693                     # The block is only going to match the long name,
    694                     # but we check both for completeness.  As of Unicode
    695                     # 4.0, blocks do not have short names.
    696                     if (matchesLoosely($name, $pva->{$a}) ||
    697                         matchesLoosely($name, $a)) {
    698                         $n = $a;
    699                         last;
    700                     }
    701                 }
    702             }
    703             
    704             die "Error: Property value $prop:$name not found" unless ($n);
    705 
    706             my $l = $n;
    707             my $r = $pva->{$n};
    708             # convert |n/a\d*| to blank
    709             $l = '' if ($l =~ m|^n/a\d*$|);
    710             $r = '' if ($r =~ m|^n/a\d*$|);
    711 
    712             $hh->{$enum} = "$l|$r";
    713             # Don't delete the 'gc' properties because we need to share
    714             # them between 'gc' and 'gcm'.  Count each use instead.
    715             if ($prop2 eq 'gc') {
    716                 ++$gcCount{$n};
    717             } else {
    718                 delete $pva->{$n};
    719             }
    720         }
    721     }
    722 
    723     # Merge the combining class values in manually
    724     # Add the same values to the synthetic lccc and tccc properties
    725     die "Error: No ccc data"
    726         unless exists $va->{'ccc'};
    727     for my $ccc (keys %{$va->{'ccc'}}) {
    728         die "Error: Can't overwrite ccc $ccc"
    729             if (exists $h->{'ccc'}->{$ccc});
    730         $h->{'lccc'}->{$ccc} =
    731         $h->{'tccc'}->{$ccc} =
    732         $h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc};
    733     }
    734     delete $va->{'ccc'};
    735 
    736     # Merge synthetic binary property values in manually.
    737     # These are the "True" and "False" value aliases.
    738     die "Error: No True/False value aliases"
    739         unless exists $va->{'binprop'};
    740     for my $bp (keys %{$va->{'binprop'}}) {
    741         $h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp};
    742     }
    743     delete $va->{'binprop'};
    744 
    745     my $err = '';
    746     for my $prop (sort keys %$va) {
    747         my $hh = $va->{$prop};
    748         for my $subkey (sort keys %$hh) {
    749             # 'gc' props are shared with 'gcm'; make sure they were used
    750             # once or twice.
    751             if ($prop eq 'gc') {
    752                 my $n = $gcCount{$subkey};
    753                 next if ($n >= 1 && $n <= 2);
    754             }
    755             $err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n"
    756                 unless exists $MISSING_FROM_UCHAR{$prop};
    757         }
    758     }
    759     print $err if ($err);
    760 }
    761 
    762 #----------------------------------------------------------------------
    763 # Read the PropertyAliases.txt file.  Return a hash that maps the long
    764 # name to the short name.  The special key '_version' will map to the
    765 # Unicode version of the file.  The special key '_family' holds a
    766 # subhash that maps long names to a family string, for descriptive
    767 # purposes.
    768 #
    769 # @param a filename for PropertyAliases.txt
    770 # @param reference to hash to receive data.  Keys are long names.
    771 # Values are short names.
    772 sub read_PropertyAliases {
    773 
    774     my $hash = shift;         # result
    775 
    776     my $filename = shift; 
    777 
    778     my $fam = {};  # map long names to family string
    779     $fam = $hash->{'_family'} if (exists $hash->{'_family'});
    780 
    781     my $family; # binary, enumerated, etc.
    782 
    783     my $in = new FileHandle($filename, 'r');
    784     die "Error: Cannot open $filename" if (!defined $in);
    785 
    786     while (<$in>) {
    787 
    788         # Read version (embedded in a comment)
    789         if (/PropertyAliases-(\d+\.\d+\.\d+)/i) {
    790             die "Error: Multiple versions in $filename"
    791                 if (exists $hash->{'_version'});
    792             $hash->{'_version'} = $1;
    793         }
    794 
    795         # Read family heading
    796         if (/^\s*\#\s*(.+?)\s*Properties\s*$/) {
    797             $family = $1;
    798         }
    799 
    800         # Ignore comments and blank lines
    801         s/\#.*//;
    802         next unless (/\S/);
    803 
    804         if (/^\s*(.+?)\s*;/) {
    805             my $short = $1;
    806             my @fields = /;\s*([^\s;]+)/g;
    807             if (@fields < 1 || @fields > 2) {
    808                 my $number = @fields;
    809                 die "Error: Wrong number of fields ($number) in $filename at $_";
    810             }
    811 
    812             # Make "n/a" strings unique
    813             if ($short eq 'n/a') {
    814                 $short .= sprintf("%03d", $propNA++);
    815             }
    816             my $long = $fields[0];
    817             if ($long eq 'n/a') {
    818                 $long .= sprintf("%03d", $propNA++);
    819             }
    820 
    821             # Add long name->short name to the hash=pa hash table
    822             if (exists $hash->{$long}) {
    823                 die "Error: Duplicate property $long in $filename"
    824             }
    825             $hash->{$long} = $short;
    826             $fam->{$long} = $family;
    827 
    828             # Add the list of further aliases to the additional_property_aliases hash table,
    829             # using the long property name as the key.
    830             # For example:
    831             #   White_Space->space|outer_space
    832             if (@fields > 1) {
    833                 my $value = pop @fields;
    834                 while (@fields > 1) {
    835                     $value .= "|" . pop @fields;
    836                 }
    837                 $additional_property_aliases{$long} = $value;
    838             }
    839         } else {
    840             die "Error: Can't parse $_ in $filename";
    841         }
    842     }
    843 
    844     $in->close();
    845 
    846     $hash->{'_family'} = $fam;
    847 }
    848 
    849 #----------------------------------------------------------------------
    850 # Read the PropertyValueAliases.txt file.  Return a two level hash
    851 # that maps property_short_name:value_short_name:value_long_name.  In
    852 # the case of the 'ccc' property, the short name is the numeric class
    853 # and the long name is "<short>|<long>".  The special key '_version'
    854 # will map to the Unicode version of the file.
    855 #
    856 # @param a filename for PropertyValueAliases.txt
    857 #
    858 # @return a hash reference.
    859 sub read_PropertyValueAliases {
    860 
    861     my $hash = shift;         # result
    862 
    863     my $filename = shift; 
    864 
    865     my $in = new FileHandle($filename, 'r');
    866     die "Error: Cannot open $filename" if (!defined $in);
    867 
    868     while (<$in>) {
    869 
    870         # Read version (embedded in a comment)
    871         if (/PropertyValueAliases-(\d+\.\d+\.\d+)/i) {
    872             die "Error: Multiple versions in $filename"
    873                 if (exists $hash->{'_version'});
    874             $hash->{'_version'} = $1;
    875         }
    876 
    877         # Ignore comments and blank lines
    878         s/\#.*//;
    879         next unless (/\S/);
    880 
    881         if (/^\s*(.+?)\s*;/i) {
    882             my $prop = $1;
    883             my @fields = /;\s*([^\s;]+)/g;
    884             die "Error: Wrong number of fields in $filename"
    885                 if (@fields < 2 || @fields > 5);
    886             # Make "n/a" strings unique
    887             $fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a');
    888             # Squash extra fields together
    889             while (@fields > 2) {
    890                 my $f = pop @fields;
    891                 $fields[$#fields] .= '|' . $f;
    892             }
    893             addDatum($hash, $prop, @fields);
    894         }
    895 
    896         else {
    897             die "Error: Can't parse $_ in $filename";
    898         }
    899     }
    900 
    901     $in->close();
    902 
    903     # Script Copt=Qaac (Coptic) is a special case.
    904     # Before the Copt code was defined, the private-use code Qaac was used.
    905     # Starting with Unicode 4.1, PropertyValueAliases.txt contains
    906     # Copt as the short name as well as Qaac as an alias.
    907     # For use with older Unicode data files, we add here a Qaac->Coptic entry.
    908     # This should not do anything for 4.1-and-later Unicode data files.
    909     # See also UAX #24: Script Names http://www.unicode.org/unicode/reports/tr24/
    910     $hash->{'sc'}->{'Qaac'} = 'Coptic'
    911         unless (exists $hash->{'sc'}->{'Qaac'} || exists $hash->{'sc'}->{'Copt'});
    912 
    913     # Add N|No|T|True and Y|Yes|F|False -- these are values we recognize for
    914     # binary properties (until Unicode 5.0 NOT from PropertyValueAliases.txt).
    915     # These are of the same form as the 'ccc' value aliases.
    916     # Starting with Unicode 5.1, PropertyValueAliases.txt does have values
    917     # for binary properties.
    918     if (!exists $hash->{'binprop'}->{'0'}) {
    919         if (exists $hash->{'Alpha'}->{'N'}) {
    920             # Unicode 5.1 and later: Make the numeric value the key.
    921             $hash->{'binprop'}->{'0'} = 'N|' . $hash->{'Alpha'}->{'N'};
    922             $hash->{'binprop'}->{'1'} = 'Y|' . $hash->{'Alpha'}->{'Y'};
    923         } elsif (exists $hash->{'Alpha'}) {
    924             die "Error: Unrecognized short value name for binary property 'Alpha'\n";
    925         } else {
    926             # Unicode 5.0 and earlier: Add manually.
    927             $hash->{'binprop'}->{'0'} = 'N|No|F|False';
    928             $hash->{'binprop'}->{'1'} = 'Y|Yes|T|True';
    929         }
    930     }
    931 }
    932 
    933 #----------------------------------------------------------------------
    934 # Read the Blocks.txt file.  Return a hash that maps the code point
    935 # range start to the block name.  The special key '_version' will map
    936 # to the Unicode version of the file.
    937 #
    938 # As of Unicode 4.0, the names in the Blocks.txt are no longer the
    939 # proper names.  The proper names are now listed in PropertyValueAliases.
    940 # They are similar but not identical.  Furthermore, 4.0 introduces
    941 # a new block name, No_Block, which is listed only in PropertyValueAliases
    942 # and not in Blocks.txt.  As a result, we handle blocks as follows:
    943 #
    944 # 1. Read Blocks.txt to map code point range start to quasi-block name.
    945 # 2. Add to Blocks.txt a synthetic No Block code point & name:
    946 #    X -> No Block
    947 # 3. Map quasi-names from Blocks.txt (including No Block) to actual
    948 #    names from PropertyValueAliases.  This occurs in
    949 #    merge_PropertyValueAliases.
    950 #
    951 # @param a filename for Blocks.txt
    952 #
    953 # @return a ref to a hash.  Keys are code points, as text, e.g.,
    954 # "1720".  Values are pseudo-block names, e.g., "Hanunoo".
    955 sub read_Blocks {
    956 
    957     my $filename = shift; 
    958 
    959     my $hash = {};         # result
    960 
    961     my $in = new FileHandle($filename, 'r');
    962     die "Error: Cannot open $filename" if (!defined $in);
    963 
    964     while (<$in>) {
    965 
    966         # Read version (embedded in a comment)
    967         if (/Blocks-(\d+\.\d+\.\d+)/i) {
    968             die "Error: Multiple versions in $filename"
    969                 if (exists $hash->{'_version'});
    970             $hash->{'_version'} = $1;
    971         }
    972 
    973         # Ignore comments and blank lines
    974         s/\#.*//;
    975         next unless (/\S/);
    976 
    977         if (/^([0-9a-f]+)\.\.[0-9a-f]+\s*;\s*(.+?)\s*$/i) {
    978             die "Error: Duplicate range $1 in $filename"
    979                 if (exists $hash->{$1});
    980             $hash->{$1} = $2;
    981         }
    982 
    983         else {
    984             die "Error: Can't parse $_ in $filename";
    985         }
    986     }
    987 
    988     $in->close();
    989 
    990     # Add pseudo-name for No Block
    991     $hash->{'none'} = 'No Block';
    992 
    993     $hash;
    994 }
    995 
    996 #----------------------------------------------------------------------
    997 # Read the uscript.h file and compile a mapping of Unicode symbols to
    998 # icu4c enum values.
    999 #
   1000 # @param a filename for uscript.h
   1001 #
   1002 # @return a ref to a hash.  The keys of the hash are enum symbols from
   1003 # uscript.h, and the values are script names.
   1004 sub read_uscript {
   1005 
   1006     my $filename = shift; 
   1007 
   1008     my $mode = '';         # state machine mode and submode
   1009     my $submode = '';
   1010 
   1011     my $last = '';         # for line folding
   1012 
   1013     my $hash = {};         # result
   1014     my $key;               # first-level key
   1015 
   1016     my $in = new FileHandle($filename, 'r');
   1017     die "Error: Cannot open $filename" if (!defined $in);
   1018 
   1019     while (<$in>) {
   1020         # Fold continued lines together
   1021         if (/^(.*)\\$/) {
   1022             $last = $1;
   1023             next;
   1024         } elsif ($last) {
   1025             $_ = $last . $_;
   1026             $last = '';
   1027         }
   1028 
   1029         # Exit all modes here
   1030         if ($mode && $mode ne 'DEPRECATED') {
   1031             if (/^\s*\}/) {
   1032                 $mode = '';
   1033                 next;
   1034             }
   1035         }
   1036 
   1037         # Handle individual modes
   1038 
   1039         if ($mode eq 'UScriptCode') {
   1040             if (m|^\s*(USCRIPT_\w+).+?/\*\s*(\w+)|) {
   1041                 my ($enum, $code) = ($1, $2);
   1042                 die "Error: Duplicate script $enum"
   1043                     if (exists $hash->{$enum});
   1044                 $hash->{$enum} = $code;
   1045             }
   1046         }
   1047 
   1048         elsif ($mode eq 'DEPRECATED') {
   1049             if (/\s*\#ifdef/) {
   1050                 die "Error: Nested #ifdef";
   1051                 }
   1052             elsif (/\s*\#endif/) {
   1053                 $mode = '';
   1054             }
   1055         }
   1056 
   1057         elsif (!$mode) {
   1058             if (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
   1059                    /^\s*typedef\s+enum\s+(\w+)\s*$/) {
   1060                 $mode = $1;
   1061                 #print "Parsing $mode\n";
   1062             }
   1063 
   1064             elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
   1065                 $mode = 'DEPRECATED';
   1066             }
   1067         }
   1068     }
   1069 
   1070     $in->close();
   1071 
   1072     $hash;
   1073 }
   1074 
   1075 #----------------------------------------------------------------------
   1076 # Read the uchar.h file and compile a mapping of Unicode symbols to
   1077 # icu4c enum values.
   1078 #
   1079 # @param a filename for uchar.h
   1080 #
   1081 # @return a ref to a hash.  The keys of the hash are '_bp' for binary
   1082 # properties, '_ep' for enumerated properties, '_dp'/'_sp'/'_mp' for
   1083 # double/string/mask properties, and 'gc', 'gcm', 'bc', 'blk',
   1084 # 'ea', 'dt', 'jt', 'jg', 'lb', or 'nt' for corresponding property
   1085 # value aliases.  The values of the hash are subhashes.  The subhashes
   1086 # have a key of the uchar.h enum symbol, and a value of the alias
   1087 # string (as listed in PropertyValueAliases.txt).  NOTE: The alias
   1088 # string is whatever alias uchar.h lists.  This may be either short or
   1089 # long, depending on the specific enum.  NOTE: For blocks ('blk'), the
   1090 # value is a hex code point for the start of the associated block.
   1091 # NOTE: The special key _version will map to the Unicode version of
   1092 # the file.
   1093 sub read_uchar {
   1094 
   1095     my $filename = shift; 
   1096 
   1097     my $mode = '';         # state machine mode and submode
   1098     my $submode = '';
   1099 
   1100     my $last = '';         # for line folding
   1101 
   1102     my $hash = {};         # result
   1103     my $key;               # first-level key
   1104 
   1105     my $in = new FileHandle($filename, 'r');
   1106     die "Error: Cannot open $filename" if (!defined $in);
   1107 
   1108     while (<$in>) {
   1109         # Fold continued lines together
   1110         if (/^(.*)\\$/) {
   1111             $last .= $1;
   1112             next;
   1113         } elsif ($last) {
   1114             $_ = $last . $_;
   1115             $last = '';
   1116         }
   1117 
   1118         # Exit all modes here
   1119         if ($mode && $mode ne 'DEPRECATED') {
   1120             if (/^\s*\}/) {
   1121                 $mode = '';
   1122                 next;
   1123             }
   1124         }
   1125 
   1126         # Handle individual modes
   1127 
   1128         if ($mode eq 'UProperty') {
   1129             if (/^\s*(UCHAR_\w+)\s*[,=]/ || /^\s+(UCHAR_\w+)\s*$/) {
   1130                 if ($submode) {
   1131                     addDatum($hash, $key, $1, $submode);
   1132                     $submode = '';
   1133                 } else {
   1134                     #print "Warning: Ignoring $1\n";
   1135                 }
   1136             }
   1137 
   1138             elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) {
   1139                 die "Error: Unmatched tag $submode" if ($submode);
   1140                 die "Error: Unrecognized UProperty comment: $_"
   1141                     unless (exists $PROP_TYPE{$1});
   1142                 $key = $PROP_TYPE{$1};
   1143                 $submode = $2;
   1144             }
   1145         }
   1146 
   1147         elsif ($mode eq 'UCharCategory') {
   1148             if (/^\s*(U_\w+)\s*=/) {
   1149                 if ($submode) {
   1150                     addDatum($hash, 'gc', $1, $submode);
   1151                     $submode = '';
   1152                 } else {
   1153                     #print "Warning: Ignoring $1\n";
   1154                 }
   1155             }
   1156 
   1157             elsif (m|^\s*/\*\*\s*([A-Z][a-z])\s|) {
   1158                 die "Error: Unmatched tag $submode" if ($submode);
   1159                 $submode = $1;
   1160             }
   1161         }
   1162 
   1163         elsif ($mode eq 'UCharDirection') {
   1164             if (/^\s*(U_\w+)\s*[,=]/ || /^\s+(U_\w+)\s*$/) {
   1165                 if ($submode) {
   1166                     addDatum($hash, $key, $1, $submode);
   1167                     $submode = '';
   1168                 } else {
   1169                     #print "Warning: Ignoring $1\n";
   1170                 }
   1171             }
   1172 
   1173             elsif (m|/\*\*\s*([A-Z]+)\s|) {
   1174                 die "Error: Unmatched tag $submode" if ($submode);
   1175                 $key = 'bc';
   1176                 $submode = $1;
   1177             }
   1178         }
   1179 
   1180         elsif ($mode eq 'UBlockCode') {
   1181             if (m|^\s*(UBLOCK_\w+).+?/\*\[(.+?)\]\*/|) {
   1182                 addDatum($hash, 'blk', $1, $2);
   1183             }
   1184         }
   1185 
   1186         elsif ($mode eq 'UEastAsianWidth') {
   1187             if (m|^\s*(U_EA_\w+).+?/\*\[(.+?)\]\*/|) {
   1188                 addDatum($hash, 'ea', $1, $2);
   1189             }
   1190         }
   1191 
   1192         elsif ($mode eq 'UDecompositionType') {
   1193             if (m|^\s*(U_DT_\w+).+?/\*\[(.+?)\]\*/|) {
   1194                 addDatum($hash, 'dt', $1, $2);
   1195             }
   1196         }
   1197 
   1198         elsif ($mode eq 'UJoiningType') {
   1199             if (m|^\s*(U_JT_\w+).+?/\*\[(.+?)\]\*/|) {
   1200                 addDatum($hash, 'jt', $1, $2);
   1201             }
   1202         }
   1203 
   1204         elsif ($mode eq 'UJoiningGroup') {
   1205             if (/^\s*(U_JG_(\w+))/) {
   1206                 addDatum($hash, 'jg', $1, $2) unless ($2 eq 'COUNT');
   1207             }
   1208         }
   1209 
   1210         elsif ($mode eq 'UGraphemeClusterBreak') {
   1211             if (m|^\s*(U_GCB_\w+).+?/\*\[(.+?)\]\*/|) {
   1212                 addDatum($hash, 'GCB', $1, $2);
   1213             }
   1214         }
   1215 
   1216         elsif ($mode eq 'UWordBreakValues') {
   1217             if (m|^\s*(U_WB_\w+).+?/\*\[(.+?)\]\*/|) {
   1218                 addDatum($hash, 'WB', $1, $2);
   1219             }
   1220         }
   1221 
   1222         elsif ($mode eq 'USentenceBreak') {
   1223             if (m|^\s*(U_SB_\w+).+?/\*\[(.+?)\]\*/|) {
   1224                 addDatum($hash, 'SB', $1, $2);
   1225             }
   1226         }
   1227 
   1228         elsif ($mode eq 'ULineBreak') {
   1229             if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) {
   1230                 addDatum($hash, 'lb', $1, $2);
   1231             }
   1232         }
   1233 
   1234         elsif ($mode eq 'UNumericType') {
   1235             if (m|^\s*(U_NT_\w+).+?/\*\[(.+?)\]\*/|) {
   1236                 addDatum($hash, 'nt', $1, $2);
   1237             }
   1238         }
   1239 
   1240         elsif ($mode eq 'UHangulSyllableType') {
   1241             if (m|^\s*(U_HST_\w+).+?/\*\[(.+?)\]\*/|) {
   1242                 addDatum($hash, 'hst', $1, $2);
   1243             }
   1244         }
   1245 
   1246         elsif ($mode eq 'DEPRECATED') {
   1247             if (/\s*\#ifdef/) {
   1248                 die "Error: Nested #ifdef";
   1249                 }
   1250             elsif (/\s*\#endif/) {
   1251                 $mode = '';
   1252             }
   1253         }
   1254 
   1255         elsif (!$mode) {
   1256             if (/^\s*\#define\s+(\w+)\s+(.+)/) {
   1257                 # #define $left $right
   1258                 my ($left, $right) = ($1, $2);
   1259 
   1260                 if ($left eq 'U_UNICODE_VERSION') {
   1261                     my $version = $right;
   1262                     $version = $1 if ($version =~ /^\"(.*)\"/);
   1263                     # print "Unicode version: ", $version, "\n";
   1264                     die "Error: Multiple versions in $filename"
   1265                         if (defined $hash->{'_version'});
   1266                     $hash->{'_version'} = $version;
   1267                 }
   1268 
   1269                 elsif ($left =~ /U_GC_(\w+?)_MASK/) {
   1270                     addDatum($hash, 'gcm', $left, $1);
   1271                 }
   1272             }
   1273 
   1274             elsif (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
   1275                    /^\s*typedef\s+enum\s+(\w+)\s*$/) {
   1276                 $mode = $1;
   1277                 #print "Parsing $mode\n";
   1278             }
   1279 
   1280             elsif (/^\s*enum\s+(\w+)\s*\{/ ||
   1281                    /^\s*enum\s+(\w+)\s*$/) {
   1282                 $mode = $1;
   1283                 #print "Parsing $mode\n";
   1284             }
   1285 
   1286             elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
   1287                 $mode = 'DEPRECATED';
   1288             }
   1289         }
   1290     }
   1291 
   1292     $in->close();
   1293 
   1294     # hardcode known values for the normalization quick check properties
   1295     # see unorm.h for the UNormalizationCheckResult enum
   1296 
   1297     addDatum($hash, 'NFC_QC', 'UNORM_NO',    'N');
   1298     addDatum($hash, 'NFC_QC', 'UNORM_YES',   'Y');
   1299     addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M');
   1300 
   1301     addDatum($hash, 'NFKC_QC', 'UNORM_NO',    'N');
   1302     addDatum($hash, 'NFKC_QC', 'UNORM_YES',   'Y');
   1303     addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M');
   1304 
   1305     # no "maybe" values for NF[K]D
   1306 
   1307     addDatum($hash, 'NFD_QC', 'UNORM_NO',    'N');
   1308     addDatum($hash, 'NFD_QC', 'UNORM_YES',   'Y');
   1309 
   1310     addDatum($hash, 'NFKD_QC', 'UNORM_NO',    'N');
   1311     addDatum($hash, 'NFKD_QC', 'UNORM_YES',   'Y');
   1312 
   1313     $hash;
   1314 }
   1315 
   1316 #----------------------------------------------------------------------
   1317 # Add a new value to a two-level hash.  That is, given a ref to
   1318 # a hash, two keys, and a value, add $hash->{$key1}->{$key2} = $value.
   1319 sub addDatum {
   1320     my ($h, $k1, $k2, $v) = @_;
   1321     if (exists $h->{$k1}->{$k2}) {
   1322         die "Error: $k1:$k2 already set to " .
   1323             $h->{$k1}->{$k2} . ", cannot set to " . $v;
   1324     }
   1325     $h->{$k1}->{$k2} = $v;
   1326 }
   1327 
   1328 #eof
   1329