1 #!/usr/bin/perl -w 2 3 # Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc. All rights reserved. 4 # 5 # Redistribution and use in source and binary forms, with or without 6 # modification, are permitted provided that the following conditions 7 # are met: 8 # 9 # 1. Redistributions of source code must retain the above copyright 10 # notice, this list of conditions and the following disclaimer. 11 # 2. Redistributions in binary form must reproduce the above copyright 12 # notice, this list of conditions and the following disclaimer in the 13 # documentation and/or other materials provided with the distribution. 14 # 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of 15 # its contributors may be used to endorse or promote products derived 16 # from this software without specific prior written permission. 17 # 18 # THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY 19 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 # DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY 22 # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 25 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 27 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 29 use strict; 30 31 my %aliasesFromCharsetsFile; 32 my %namesWritten; 33 34 my $output = ""; 35 36 my $error = 0; 37 38 sub error ($) 39 { 40 print STDERR @_, "\n"; 41 $error = 1; 42 } 43 44 sub emit_line 45 { 46 my ($name, $prefix, $encoding, $flags) = @_; 47 48 error "$name shows up twice in output" if $namesWritten{$name}; 49 $namesWritten{$name} = 1; 50 51 $output .= " { \"$name\", $prefix$encoding },\n"; 52 } 53 54 sub process_platform_encodings 55 { 56 my ($filename, $PlatformPrefix) = @_; 57 my $baseFilename = $filename; 58 $baseFilename =~ s|.*/||; 59 60 my %seenPlatformNames; 61 my %seenIANANames; 62 63 open PLATFORM_ENCODINGS, $filename or die; 64 65 while (<PLATFORM_ENCODINGS>) { 66 chomp; 67 s/\#.*$//; 68 s/\s+$//; 69 if (my ($PlatformName, undef, $flags, $IANANames) = /^(.+?)(, (.+))?: (.+)$/) { 70 my %aliases; 71 72 my $PlatformNameWithFlags = $PlatformName; 73 if ($flags) { 74 $PlatformNameWithFlags .= ", " . $flags; 75 } else { 76 $flags = "NoEncodingFlags"; 77 } 78 error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformNameWithFlags}; 79 $seenPlatformNames{$PlatformNameWithFlags} = 1; 80 81 # Build the aliases list. 82 # Also check that no two names are part of the same entry in the charsets file. 83 my @IANANames = split ", ", $IANANames; 84 my $firstName = ""; 85 my $canonicalFirstName = ""; 86 my $prevName = ""; 87 for my $name (@IANANames) { 88 if ($firstName eq "") { 89 if ($name !~ /^[-A-Za-z0-9_]+$/) { 90 error "$name, in $baseFilename, has illegal characters in it"; 91 next; 92 } 93 $firstName = $name; 94 } else { 95 if ($name !~ /^[a-z0-9]+$/) { 96 error "$name, in $baseFilename, has illegal characters in it (must be all lowercase alphanumeric)"; 97 next; 98 } 99 if ($name le $prevName) { 100 error "$name comes after $prevName in $baseFilename, but everything must be in alphabetical order"; 101 } 102 $prevName = $name; 103 } 104 105 my $canonicalName = lc $name; 106 $canonicalName =~ tr/-_//d; 107 108 $canonicalFirstName = $canonicalName if $canonicalFirstName eq ""; 109 110 error "$name is mentioned twice in $baseFilename" if $seenIANANames{$canonicalName}; 111 $seenIANANames{$canonicalName} = 1; 112 113 $aliases{$canonicalName} = 1; 114 next if !$aliasesFromCharsetsFile{$canonicalName}; 115 for my $alias (@{$aliasesFromCharsetsFile{$canonicalName}}) { 116 $aliases{$alias} = 1; 117 } 118 for my $otherName (@IANANames) { 119 next if $canonicalName eq $otherName; 120 if ($aliasesFromCharsetsFile{$otherName} 121 && $aliasesFromCharsetsFile{$canonicalName} eq $aliasesFromCharsetsFile{$otherName} 122 && $canonicalName le $otherName) { 123 error "$baseFilename lists both $name and $otherName under $PlatformName, but that aliasing is already specified in character-sets.txt"; 124 } 125 } 126 } 127 128 # write out 129 emit_line($firstName, $PlatformPrefix, $PlatformName, $flags); 130 for my $alias (sort keys %aliases) { 131 emit_line($alias, $PlatformPrefix, $PlatformName, $flags) if $alias ne $canonicalFirstName; 132 } 133 } elsif (/^([a-zA-Z0-9_]+)(, (.+))?$/) { 134 my $PlatformName = $1; 135 136 error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformName}; 137 $seenPlatformNames{$PlatformName} = 1; 138 } elsif (/./) { 139 error "syntax error in $baseFilename, line $."; 140 } 141 } 142 143 close PLATFORM_ENCODINGS; 144 } 145 146 sub process_iana_charset 147 { 148 my ($canonical_name, @aliases) = @_; 149 150 return if !$canonical_name; 151 152 my @names = sort $canonical_name, @aliases; 153 154 for my $name (@names) { 155 $aliasesFromCharsetsFile{$name} = \@names; 156 } 157 } 158 159 sub process_iana_charsets 160 { 161 my ($filename) = @_; 162 163 open CHARSETS, $filename or die; 164 165 my %seen; 166 167 my $canonical_name; 168 my @aliases; 169 170 my %exceptions = ( isoir91 => 1, isoir92 => 1 ); 171 172 while (<CHARSETS>) { 173 chomp; 174 if ((my $new_canonical_name) = /Name: ([^ \t]*).*/) { 175 $new_canonical_name = lc $new_canonical_name; 176 $new_canonical_name =~ tr/a-z0-9//cd; 177 178 error "saw $new_canonical_name twice in character-sets.txt", if $seen{$new_canonical_name}; 179 $seen{$new_canonical_name} = $new_canonical_name; 180 181 process_iana_charset $canonical_name, @aliases; 182 183 $canonical_name = $new_canonical_name; 184 @aliases = (); 185 } elsif ((my $new_alias) = /Alias: ([^ \t]*).*/) { 186 $new_alias = lc $new_alias; 187 $new_alias =~ tr/a-z0-9//cd; 188 189 # do this after normalizing the alias, sometimes character-sets.txt 190 # has weird escape characters, e.g. \b after None 191 next if $new_alias eq "none"; 192 193 error "saw $new_alias twice in character-sets.txt $seen{$new_alias}, $canonical_name", if $seen{$new_alias} && $seen{$new_alias} ne $canonical_name && !$exceptions{$new_alias}; 194 push @aliases, $new_alias if !$seen{$new_alias}; 195 $seen{$new_alias} = $canonical_name; 196 } 197 } 198 199 process_iana_charset $canonical_name, @aliases; 200 201 close CHARSETS; 202 } 203 204 # Program body 205 206 process_iana_charsets($ARGV[0]); 207 process_platform_encodings($ARGV[1], $ARGV[2]); 208 209 exit 1 if $error; 210 211 print <<EOF 212 // File generated by make-charset-table.pl. Do not edit! 213 214 #include "config.h" 215 #include "CharsetData.h" 216 217 namespace WebCore { 218 219 const CharsetEntry CharsetTable[] = { 220 $output 221 { 0, 0 } 222 }; 223 224 } 225 EOF 226