1 #! /usr/bin/perl -w 2 3 # Copyright (C) 1998, 1999 Tom Tromey 4 # Copyright (C) 2001 Red Hat Software 5 6 # This program is free software; you can redistribute it and/or modify 7 # it under the terms of the GNU General Public License as published by 8 # the Free Software Foundation; either version 2, or (at your option) 9 # any later version. 10 11 # This program is distributed in the hope that it will be useful, 12 # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 # GNU General Public License for more details. 15 16 # You should have received a copy of the GNU General Public License 17 # along with this program; if not, write to the Free Software 18 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 19 # 02111-1307, USA. 20 21 # gen-casemap-test.pl - Generate test cases for case mapping from Unicode data. 22 # See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html 23 # I consider the output of this program to be unrestricted. Use it as 24 # you will. 25 26 require 5.006; 27 use utf8; 28 29 if (@ARGV != 3) { 30 $0 =~ s@.*/@@; 31 die "Usage: $0 UNICODE-VERSION UnicodeData.txt SpecialCasing.txt\n"; 32 } 33 34 use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION); 35 36 # Names of fields in Unicode data table. 37 $CODE = 0; 38 $NAME = 1; 39 $CATEGORY = 2; 40 $COMBINING_CLASSES = 3; 41 $BIDI_CATEGORY = 4; 42 $DECOMPOSITION = 5; 43 $DECIMAL_VALUE = 6; 44 $DIGIT_VALUE = 7; 45 $NUMERIC_VALUE = 8; 46 $MIRRORED = 9; 47 $OLD_NAME = 10; 48 $COMMENT = 11; 49 $UPPER = 12; 50 $LOWER = 13; 51 $TITLE = 14; 52 53 # Names of fields in the SpecialCasing table 54 $CASE_CODE = 0; 55 $CASE_LOWER = 1; 56 $CASE_TITLE = 2; 57 $CASE_UPPER = 3; 58 $CASE_CONDITION = 4; 59 60 my @upper; 61 my @title; 62 my @lower; 63 64 binmode STDOUT, ":utf8"; 65 open (INPUT, "< $ARGV[1]") || exit 1; 66 67 $last_code = -1; 68 while (<INPUT>) 69 { 70 chop; 71 @fields = split (';', $_, 30); 72 if ($#fields != 14) 73 { 74 printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields); 75 } 76 77 $code = hex ($fields[$CODE]); 78 79 if ($code > $last_code + 1) 80 { 81 # Found a gap. 82 if ($fields[$NAME] =~ /Last>/) 83 { 84 # Fill the gap with the last character read, 85 # since this was a range specified in the char database 86 @gfields = @fields; 87 } 88 else 89 { 90 # The gap represents undefined characters. Only the type 91 # matters. 92 @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '', 93 '', '', '', ''); 94 } 95 for (++$last_code; $last_code < $code; ++$last_code) 96 { 97 $gfields{$CODE} = sprintf ("%04x", $last_code); 98 &process_one ($last_code, @gfields); 99 } 100 } 101 &process_one ($code, @fields); 102 $last_code = $code; 103 } 104 105 close INPUT; 106 107 open (INPUT, "< $ARGV[2]") || exit 1; 108 109 while (<INPUT>) 110 { 111 my $code; 112 113 chop; 114 115 next if /^#/; 116 next if /^\s*$/; 117 118 s/\s*#.*//; 119 120 @fields = split ('\s*;\s*', $_, 30); 121 122 $raw_code = $fields[$CASE_CODE]; 123 $code = hex ($raw_code); 124 125 if ($#fields != 4 && $#fields != 5) 126 { 127 printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields); 128 next; 129 } 130 131 if (defined $fields[5]) { 132 # Ignore conditional special cases - we'll handle them manually 133 next; 134 } 135 136 $upper[$code] = &make_hex ($fields[$CASE_UPPER]); 137 $lower[$code] = &make_hex ($fields[$CASE_LOWER]); 138 $title[$code] = &make_hex ($fields[$CASE_TITLE]); 139 } 140 141 close INPUT; 142 143 print <<EOT; 144 # Test cases generated from Unicode $ARGV[0] data 145 # by gen-case-tests.pl. Do not edit. 146 # 147 # Some special hand crafted tests 148 # 149 tr_TR\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE 150 tr_TR\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I 151 tr_TR\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I 152 tr_TR.UTF-8\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE 153 tr_TR.UTF-8\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I 154 tr_TR.UTF-8\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I => LATIN SMALL LETTER DOTLESS I 155 # Test reordering of YPOGEGRAMMENI across other accents 156 \t\x{03b1}\x{0345}\x{0314}\t\x{03b1}\x{0345}\x{314}\t\x{0391}\x{0345}\x{0314}\t\x{0391}\x{0314}\x{0399}\t 157 \t\x{03b1}\x{0314}\x{0345}\t\x{03b1}\x{314}\x{0345}\t\x{0391}\x{0314}\x{0345}\t\x{0391}\x{0314}\x{0399}\t 158 # Handling of final and nonfinal sigma 159 160 161 162 # Lithuanian rule of i followed by letter with dot. Not at all sure 163 # about the titlecase part here 164 lt_LT\ti\x{117}\ti\x{117}\tIe\tIE\t 165 lt_LT\tie\x{307}\tie\x{307}\tIe\tIE\t 166 lt_LT\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE 167 lt_LT\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE 168 lt_LT\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE 169 lt_LT\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent) 170 lt_LT\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent) 171 lt_LT\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above) 172 lt_LT\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent) 173 lt_LT\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent) 174 lt_LT\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent) 175 lt_LT.UTF-8\ti\x{117}\ti\x{117}\tIe\tIE\t 176 lt_LT.UTF-8\tie\x{307}\tie\x{307}\tIe\tIE\t 177 lt_LT.UTF-8\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE 178 lt_LT.UTF-8\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE 179 lt_LT.UTF-8\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE 180 lt_LT.UTF-8\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent) 181 lt_LT.UTF-8\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent) 182 lt_LT.UTF-8\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above) 183 lt_LT.UTF-8\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent) 184 lt_LT.UTF-8\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent) 185 lt_LT.UTF-8\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent) 186 # Special case not at initial position 187 \ta\x{fb04}\ta\x{fb04}\tAffl\tAFFL\t# FB04 188 # 189 # Now the automatic tests 190 # 191 EOT 192 &print_tests; 193 194 exit 0; 195 196 # Process a single character. 197 sub process_one 198 { 199 my ($code, @fields) = @_; 200 201 my $type = $fields[$CATEGORY]; 202 if ($type eq 'Ll') 203 { 204 $upper[$code] = make_hex ($fields[$UPPER]); 205 $lower[$code] = pack ("U", $code); 206 $title[$code] = make_hex ($fields[$TITLE]); 207 } 208 elsif ($type eq 'Lu') 209 { 210 $lower[$code] = make_hex ($fields[$LOWER]); 211 $upper[$code] = pack ("U", $code); 212 $title[$code] = make_hex ($fields[$TITLE]); 213 } 214 215 if ($type eq 'Lt') 216 { 217 $upper[$code] = make_hex ($fields[$UPPER]); 218 $lower[$code] = pack ("U", hex ($fields[$LOWER])); 219 $title[$code] = make_hex ($fields[$LOWER]); 220 } 221 } 222 223 sub print_tests 224 { 225 for ($i = 0; $i < 0x10ffff; $i++) { 226 if ($i == 0x3A3) { 227 # Greek sigma needs special tests 228 next; 229 } 230 231 my $lower = $lower[$i]; 232 my $title = $title[$i]; 233 my $upper = $upper[$i]; 234 235 if (defined $upper || defined $lower || defined $title) { 236 printf "\t%s\t%s\t%s\t%s\t# %4X\n", 237 pack ("U", $i), 238 (defined $lower ? $lower : ""), 239 (defined $title ? $title : ""), 240 (defined $upper ? $upper : ""), 241 $i; 242 } 243 } 244 } 245 246 sub make_hex 247 { 248 my $codes = shift; 249 250 $codes =~ s/^\s+//; 251 $codes =~ s/\s+$//; 252 253 if ($codes eq "0" || $codes eq "") { 254 return ""; 255 } else { 256 return pack ("U*", map { hex ($_) } split /\s+/, $codes); 257 } 258 } 259