Home | History | Annotate | Download | only in codepage
      1 #!/usr/bin/perl
      2 #
      3 # Generate a subset of the UnicodeData.txt file, available from
      4 # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
      5 #
      6 # Usage:
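#   gensubset.pl [subset files] < UnicodeData.txt > MiniUCD.txt
#
# UnicodeData.txt must be redirected from a regular file rather than piped
# in: the script reads standard input twice and rewinds it in between.
#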
      9 
     10 %need_these = ();
     11 
     12 # Mark as needed all the characters mentioned in the relevant files
     13 foreach $file (@ARGV) {
     14     open(F, '<', $file) or die;
     15     while (defined($line = <F>)) {
     16 	$line =~ s/\s*(\#.*|)$//; # Remove comments and final blanks
     17 	@f = split(/\s+/, $line);
     18 	next if (scalar @f != 2);
     19 	$need_these{hex $f[1]}++;
     20     }
     21     close(F);
     22 }
     23 
     24 # Also mark as needed any case variants of those
     25 # (Note: this doesn't necessarily provide the full transitive closure,
     26 # but we shouldn't need it.)
     27 while (defined($line = <STDIN>)) {
     28     @f = split(/;/, $line);
     29     if ($f[0] =~ /^([0-9a-f]+)$/i) {
     30 	$r = hex $f[0];
     31 	if ($need_these{$r}) {
     32 	    $need_these{hex $f[12]}++ if ($f[12] ne '');
     33 	    $need_these{hex $f[13]}++ if ($f[13] ne '');
     34 	    $need_these{hex $f[14]}++ if ($f[14] ne '');
     35 	}
     36     }
     37 }
     38 
     39 # Finally, write out the subset
     40 seek(STDIN, 0, 0);
     41 while (defined($line = <STDIN>)) {
     42     ($v, $l) = split(/;/, $line, 2);
     43     if ($v =~ /^([0-9a-f]+)\-([0-9a-f]+)$/i) {
     44 	# This isn't actually the format... fix that if it ever matters
     45 	$r1 = hex $1;
     46 	$r2 = hex $2;
     47     } elsif ($v =~ /^([0-9a-f]+)$/i) {
     48 	$r1 = $r2 = hex $1;
     49     } else {
     50 	next;
     51     }
     52     for ($r = $r1; $r <= $r2; $r++) {
     53 	printf "%04X;%s", $r, $l if ($need_these{$r});
     54     }
     55 }
     56 
     57 	
     58