Home | History | Annotate | Download | only in hyphenation
      1 #!/usr/bin/perl
      2 # convert TeX (Patgen) hyphenation patterns to Libhnj format
      3 # (A utility for finding substring embeddings in patterns)
# usage: substrings.pl infile outfile [encoding [lefthyphenmin [righthyphenmin]]]
      5 
      6 if (!defined $ARGV[1]) {
      7     print "" .
      8 "substrings.pl - convert TeX (Patgen) hyphenation patterns to Libhnj format\n" . 
      9 "(A utility for finding substring embeddings in patterns)\n" .
     10 "usage: substrings.pl infile outfile [encoding [lefthyphenmin [righthyphenmin]]]\n";
     11     exit 1;
     12 }
     13 $fn = $ARGV[0];
     14 if (!-e $fn) { $fn = "hyphen.us"; }
     15 open HYPH, $fn;
     16 open OUT, ">$ARGV[1]";
     17 $encoding = $ARGV[2];
     18 $lhmin = $ARGV[3];
     19 $rhmin = $ARGV[4];
     20 if (defined $encoding) { print OUT "$encoding\n"; }
     21 if (defined $lhmin) { print OUT "LEFTHYPHENMIN $lhmin\n"; }
     22 if (defined $rhmin) { print OUT "RIGHTHYPHENMIN $rhmin\n"; }
     23 
     24 while (<HYPH>)
     25 {
     26     $pat =~ s/%.*$//g;
     27     if (/^\%/) {
     28 	#comment, ignore
     29     } elsif (/^(.+)\/([^,]+),([0-9]+),([0-9]+)$/) {
     30         $origpat = $1;
     31 	$pat = $1;
     32         $repl = $2;
     33         $beg = $3;
     34         $len = $4;
     35 	$pat =~ s/\d//g;
     36         if ($origpat eq $pat) {
     37             print "error - missing hyphenation point: $_";
     38             exit 1;
     39         }
     40 	push @patlist, $pat;
     41 	$pattab{$pat} = $origpat;
     42         $repltab{$pat} = $repl;
     43         $replbeg{$pat} = $beg - 1;
     44         $repllen{$pat} = $len;
     45     } elsif (/^(.+)\/(.+)$/) {
     46         $origpat = $1;
     47 	$pat = $1;
     48         $repl = $2;
     49 	$pat =~ s/\d//g;
     50         if ($origpat eq $pat) {
     51             print "error - missing hyphenation point: $_";
     52             exit 1;
     53         }
     54 	push @patlist, $pat;
     55 	$pattab{$pat} = $origpat;
     56         $repltab{$pat} = $repl;
     57         $replbeg{$pat} = 0;
     58         $repllen{$pat} = enclen($pat);
     59     } elsif (/^(.+)$/) {
     60 	$origpat = $1;
     61 	$pat = $1;
     62 	$pat =~ s/\d//g;
     63 	push @patlist, $pat;
     64 	$pattab{$pat} = $origpat;
     65     }
     66 }
     67 
     68 foreach $pat (@patlist) {
     69     $patsize = length $pat;
     70     for $i (0..$patsize - 1) {
     71 	for $j (1..$patsize - $i) {
     72 	    $subpat = substr ($pat, $i, $j);
     73 	    if (defined $pattab{$subpat}) {
     74 		print "$pattab{$subpat} is embedded in $pattab{$pat}\n";
     75 		$newpat = substr $pat, 0, $i + $j;
     76 		if (!defined $newpattab{$newpat}) {
     77 		    $newpattab{$newpat} =
     78 			substr ($pat, 0, $i).$pattab{$subpat};
     79 		    $ss = substr $pat, 0, $i;
     80 		    print "$ss+$pattab{$subpat}\n";
     81 		    push @newpatlist, $newpat;
     82 		    if (defined $repltab{$subpat}) {
     83                         $begcorr = (($pat =~ /^[.]/) && !($subpat =~ /^[.]/)) ? 1 : 0;
     84                         $newrepltab{$newpat} = $repltab{$subpat};
     85                         $newreplbeg{$newpat} = $replbeg{$subpat} + enclen($ss) - $begcorr;
     86                         $newrepllen{$newpat} = $repllen{$subpat};
     87                     }
     88 		} else {
     89 		    $tmp =  $newpattab{$newpat};
     90 		    $newpattab{$newpat} =
     91 			combine ($newpattab{$newpat}, $pattab{$subpat});
     92 		    print "$tmp + $pattab{$subpat} -> $newpattab{$newpat}\n";
     93 		}
     94 	    }
     95 	}
     96     }
     97 }
     98 
     99 foreach $pat (@newpatlist) {
    100     if (defined $newrepltab{$pat}) {
    101         print OUT $newpattab{$pat}."/".$newrepltab{$pat}.",".($newreplbeg{$pat}+1).",".$newrepllen{$pat}."\n";
    102     } else {
    103         print OUT $newpattab{$pat}."\n";
    104     }
    105 }
    106 
    107 #convert 'n1im' to 0n1i0m0 expresed as a list
    108 sub expand {
    109     my ($pat) = @_;
    110     my $last = '.';
    111     my @exp = ();
    112 
    113     foreach $c (split (//, $pat)) {
    114 	if ($last =~ /[\D]/ && $c =~ /[\D]/) {
    115 	    push @exp, 0;
    116 	}
    117 	push @exp, $c;
    118 	$last = $c;
    119     }
    120     if ($last =~ /[\D]/) {
    121 	push @exp, 0;
    122     }
    123     return @exp;
    124 }
    125 
    126 # Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der
    127 # The second pattern needs to be a substring of the first (modulo digits)
    128 sub combine {
    129     my @exp = expand shift;
    130     my @subexp = expand shift;
    131     my $pat1, $pat2;
    132     my $i;
    133 
    134     $pat1 = join ('', map { $_ =~ /\d/ ? () : $_ } @exp);
    135     $pat2 = join ('', map { $_ =~ /\d/ ? () : $_ } @subexp);
    136 
    137     $begcorr = ($pat1 =~ /^[.]/) ? 1 : 0;
    138 
    139     for $i (0..length ($pat1) - length ($pat2)) {
    140 	if (substr ($pat1, $i, length $pat2) eq $subpat) {
    141 	    for ($j = 0; $j < @subexp; $j += 2) {
    142 		if ($subexp[$j] > $exp[2 * $i + $j]) {
    143 		    $exp[2 * $i + $j] = $subexp[$j];
    144                     if (defined $newrepltab{$pat2} && !defined $newrepltab{$pat1}) {
    145                         $ss = substr ($pat1, 0, $i);
    146                         $newrepltab{$pat1} = $newrepltab{$pat2};
    147                         $newreplbeg{$pat1} = $newreplbeg{$pat2} + enclen($ss) - $begcorr;
    148                         $newrepllen{$pat1} = $newrepllen{$pat2};
    149                     }
    150 		}
    151 	    }
    152 	    print ("$pat1 includes $pat2 at pos $i\n");
    153 	}
    154     }
    155     return join ('', map { $_ eq '0' ? () : $_ } @exp);
    156 }
    157 
    158 # 8 bit or UTF-8 character length (calculating right start position for discretionary hyphenation)
    159 sub enclen {
    160     my $nonchar = 0;
    161     my $len = length($_[0]);
    162     if ($encoding eq "UTF-8") {
    163         # length of an UTF-8 string equals to the count of the characters not started with '10' bits
    164         for ($i = 0; $i < $len; $i++) {
    165             if ((ord(substr($_[0], $i, 1)) >> 6) == 2) { $nonchar++; }
    166         }
    167     }
    168     return $len - $nonchar;   
    169 }
    170