Home | History | Annotate | Download | only in tests
      1 #! /usr/bin/perl -w
      2 
      3 #    Copyright (C) 1998, 1999 Tom Tromey
      4 #    Copyright (C) 2001 Red Hat Software
      5 
      6 #    This program is free software; you can redistribute it and/or modify
      7 #    it under the terms of the GNU General Public License as published by
      8 #    the Free Software Foundation; either version 2, or (at your option)
      9 #    any later version.
     10 
     11 #    This program is distributed in the hope that it will be useful,
     12 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 #    GNU General Public License for more details.
     15 
     16 #    You should have received a copy of the GNU General Public License
     17 #    along with this program; if not, write to the Free Software
     18 #    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
     19 #    02111-1307, USA.
     20 
     21 # gen-casemap-test.pl - Generate test cases for case mapping from Unicode data.
     22 # See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
     23 # I consider the output of this program to be unrestricted.  Use it as
     24 # you will.
     25 
     26 require 5.006;
     27 use utf8;
     28 
     29 if (@ARGV != 3) {
     30     $0 =~ s@.*/@@;
     31     die "Usage: $0 UNICODE-VERSION UnicodeData.txt SpecialCasing.txt\n";
     32 }
     33  
     34 use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION);
     35 
     36 # Names of fields in Unicode data table.
     37 $CODE = 0;
     38 $NAME = 1;
     39 $CATEGORY = 2;
     40 $COMBINING_CLASSES = 3;
     41 $BIDI_CATEGORY = 4;
     42 $DECOMPOSITION = 5;
     43 $DECIMAL_VALUE = 6;
     44 $DIGIT_VALUE = 7;
     45 $NUMERIC_VALUE = 8;
     46 $MIRRORED = 9;
     47 $OLD_NAME = 10;
     48 $COMMENT = 11;
     49 $UPPER = 12;
     50 $LOWER = 13;
     51 $TITLE = 14;
     52 
     53 # Names of fields in the SpecialCasing table
     54 $CASE_CODE = 0;
     55 $CASE_LOWER = 1;
     56 $CASE_TITLE = 2;
     57 $CASE_UPPER = 3;
     58 $CASE_CONDITION = 4;
     59 
     60 my @upper;
     61 my @title;
     62 my @lower;
     63 
     64 binmode STDOUT, ":utf8";
     65 open (INPUT, "< $ARGV[1]") || exit 1;
     66 
     67 $last_code = -1;
     68 while (<INPUT>)
     69 {
     70     chop;
     71     @fields = split (';', $_, 30);
     72     if ($#fields != 14)
     73     {
     74 	printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields);
     75     }
     76 
     77     $code = hex ($fields[$CODE]);
     78 
     79     if ($code > $last_code + 1)
     80     {
     81 	# Found a gap.
     82 	if ($fields[$NAME] =~ /Last>/)
     83 	{
     84 	    # Fill the gap with the last character read,
     85             # since this was a range specified in the char database
     86 	    @gfields = @fields;
     87 	}
     88 	else
     89 	{
     90 	    # The gap represents undefined characters.  Only the type
     91 	    # matters.
     92 	    @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
     93 			'', '', '', '');
     94 	}
     95 	for (++$last_code; $last_code < $code; ++$last_code)
     96 	{
     97 	    $gfields{$CODE} = sprintf ("%04x", $last_code);
     98 	    &process_one ($last_code, @gfields);
     99 	}
    100     }
    101     &process_one ($code, @fields);
    102     $last_code = $code;
    103 }
    104 
    105 close INPUT;
    106 
    107 open (INPUT, "< $ARGV[2]") || exit 1;
    108 
    109 while (<INPUT>)
    110 {
    111     my $code;
    112     
    113     chop;
    114 
    115     next if /^#/;
    116     next if /^\s*$/;
    117 
    118     s/\s*#.*//;
    119 
    120     @fields = split ('\s*;\s*', $_, 30);
    121 
    122     $raw_code = $fields[$CASE_CODE];
    123     $code = hex ($raw_code);
    124 
    125     if ($#fields != 4 && $#fields != 5)
    126     {
    127 	printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
    128 	next;
    129     }
    130 
    131     if (defined $fields[5]) {
    132 	# Ignore conditional special cases - we'll handle them manually
    133 	next;
    134     }
    135 
    136     $upper[$code] = &make_hex ($fields[$CASE_UPPER]);
    137     $lower[$code] = &make_hex ($fields[$CASE_LOWER]);
    138     $title[$code] = &make_hex ($fields[$CASE_TITLE]);
    139 }
    140 
    141 close INPUT;
    142 
    143 print <<EOT;
    144 # Test cases generated from Unicode $ARGV[0] data
    145 # by gen-case-tests.pl. Do not edit.
    146 #
    147 # Some special hand crafted tests
    148 #
    149 tr_TR\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
    150 tr_TR\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
    151 tr_TR\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I
    152 tr_TR.UTF-8\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
    153 tr_TR.UTF-8\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
    154 tr_TR.UTF-8\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I
    155 # Test reordering of YPOGEGRAMMENI across other accents
    156 \t\x{03b1}\x{0345}\x{0314}\t\x{03b1}\x{0345}\x{314}\t\x{0391}\x{0345}\x{0314}\t\x{0391}\x{0314}\x{0399}\t
    157 \t\x{03b1}\x{0314}\x{0345}\t\x{03b1}\x{314}\x{0345}\t\x{0391}\x{0314}\x{0345}\t\x{0391}\x{0314}\x{0399}\t
    158 # Handling of final and nonfinal sigma
    159 	 	 	 	 	
    160 					
    161 					
    162 # Lithuanian rule of i followed by letter with dot. Not at all sure
    163 # about the titlecase part here
    164 lt_LT\ti\x{117}\ti\x{117}\tIe\tIE\t
    165 lt_LT\tie\x{307}\tie\x{307}\tIe\tIE\t
    166 lt_LT\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE
    167 lt_LT\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE
    168 lt_LT\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE
    169 lt_LT\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent)
    170 lt_LT\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent)
    171 lt_LT\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above)
    172 lt_LT\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
    173 lt_LT\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent)
    174 lt_LT\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
    175 lt_LT.UTF-8\ti\x{117}\ti\x{117}\tIe\tIE\t
    176 lt_LT.UTF-8\tie\x{307}\tie\x{307}\tIe\tIE\t
    177 lt_LT.UTF-8\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE
    178 lt_LT.UTF-8\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE
    179 lt_LT.UTF-8\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE
    180 lt_LT.UTF-8\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent)
    181 lt_LT.UTF-8\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent)
    182 lt_LT.UTF-8\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above)
    183 lt_LT.UTF-8\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
    184 lt_LT.UTF-8\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent)
    185 lt_LT.UTF-8\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
    186 # Special case not at initial position
    187 \ta\x{fb04}\ta\x{fb04}\tAffl\tAFFL\t# FB04
    188 #
    189 # Now the automatic tests
    190 #
    191 EOT
    192 &print_tests;
    193 
    194 exit 0;
    195 
    196 # Process a single character.
    197 sub process_one
    198 {
    199     my ($code, @fields) = @_;
    200 
    201     my $type =  $fields[$CATEGORY];
    202     if ($type eq 'Ll')
    203     {
    204 	$upper[$code] = make_hex ($fields[$UPPER]);
    205 	$lower[$code] = pack ("U", $code);
    206 	$title[$code] = make_hex ($fields[$TITLE]);
    207     }
    208     elsif ($type eq 'Lu')
    209     {
    210 	$lower[$code] = make_hex ($fields[$LOWER]);
    211 	$upper[$code] = pack ("U", $code);
    212 	$title[$code] = make_hex ($fields[$TITLE]);
    213     }
    214 
    215     if ($type eq 'Lt')
    216     {
    217 	$upper[$code] = make_hex ($fields[$UPPER]);
    218 	$lower[$code] = pack ("U", hex ($fields[$LOWER]));
    219 	$title[$code] = make_hex ($fields[$LOWER]);
    220     }
    221 }
    222 
    223 sub print_tests
    224 {
    225     for ($i = 0; $i < 0x10ffff; $i++) {
    226 	if ($i == 0x3A3) {
    227 	    # Greek sigma needs special tests
    228 	    next;
    229 	}
    230 	
    231 	my $lower = $lower[$i];
    232 	my $title = $title[$i];
    233 	my $upper = $upper[$i];
    234 
    235 	if (defined $upper || defined $lower || defined $title) {
    236 	    printf "\t%s\t%s\t%s\t%s\t# %4X\n",
    237 		    pack ("U", $i),
    238 		    (defined $lower ? $lower : ""),
    239 		    (defined $title ? $title : ""),
    240 		    (defined $upper ? $upper : ""),
    241                     $i;
    242 	}
    243     }
    244 }
    245 
    246 sub make_hex
    247 {
    248     my $codes = shift;
    249 
    250     $codes =~ s/^\s+//;
    251     $codes =~ s/\s+$//;
    252 
    253     if ($codes eq "0" || $codes eq "") {
    254 	return "";
    255     } else {
    256 	return pack ("U*", map { hex ($_) } split /\s+/, $codes);
    257     }
    258 }
    259