#!/usr/bin/perl
# convert TeX (Patgen) hyphenation patterns to Libhnj format
# (A utility for finding substring embeddings in patterns)
# usage: substrings.pl infile outfile [encoding [lefthyphenmin [righthyphenmin]]]
#
# The input is processed as raw bytes (no decoding layer); enclen() below
# is what accounts for multi-byte UTF-8 characters when the encoding
# argument is "UTF-8".
#
# Progress/diagnostic messages go to STDOUT; the converted patterns go to
# the output file.

use strict;
use warnings;

if (!defined $ARGV[1]) {
    print "" .
"substrings.pl - convert TeX (Patgen) hyphenation patterns to Libhnj format\n" .
"(A utility for finding substring embeddings in patterns)\n" .
"usage: substrings.pl infile outfile [encoding [lefthyphenmin [righthyphenmin]]]\n";
    exit 1;
}

my $fn = $ARGV[0];
# Fall back to the default pattern file when the named input does not exist.
if (!-e $fn) { $fn = "hyphen.us"; }

# Three-argument open with lexical handles, and fail loudly: previously an
# unreadable input or unwritable output produced an empty result and exit 0.
open my $hyph, '<', $fn
    or die "substrings.pl: cannot open input file '$fn': $!\n";
open my $out, '>', $ARGV[1]
    or die "substrings.pl: cannot open output file '$ARGV[1]': $!\n";

my ($encoding, $lhmin, $rhmin) = @ARGV[2, 3, 4];
if (defined $encoding) { print {$out} "$encoding\n"; }
if (defined $lhmin)    { print {$out} "LEFTHYPHENMIN $lhmin\n"; }
if (defined $rhmin)    { print {$out} "RIGHTHYPHENMIN $rhmin\n"; }

# Tables filled while reading the input, all keyed by the pattern with its
# digits removed:
#   %pattab  - original pattern text (digits included)
#   %repltab - replacement string of a "pattern/replacement[,beg,len]" line
#   %replbeg - zero-based start of the replaced span
#   %repllen - length of the replaced span
my @patlist;
my (%pattab, %repltab, %replbeg, %repllen);

while (my $line = <$hyph>) {
    # Strip comments from the line that was just read. (The original applied
    # this substitution to the previous pattern, so it never had any effect.)
    # Whole-line comments become empty and match none of the forms below.
    $line =~ s/\s*%.*$//;

    my ($origpat, $repl, $beg, $len);
    if ($line =~ /^(.+)\/([^,]+),([0-9]+),([0-9]+)$/) {
        # pattern/replacement,begin,length - begin is 1-based in the file
        ($origpat, $repl, $beg, $len) = ($1, $2, $3 - 1, $4);
    }
    elsif ($line =~ /^(.+)\/(.+)$/) {
        # pattern/replacement - the replacement covers the whole pattern
        ($origpat, $repl, $beg) = ($1, $2, 0);
    }
    elsif ($line =~ /^(.+)$/) {
        # plain pattern
        $origpat = $1;
    }
    else {
        next;    # empty line
    }

    (my $pat = $origpat) =~ s/\d//g;

    if (defined $repl) {
        # A replacement only makes sense at a hyphenation point, i.e. the
        # pattern must contain at least one digit.
        if ($origpat eq $pat) {
            print "error - missing hyphenation point: $line";
            exit 1;
        }
        $len = enclen($pat) unless defined $len;
        $repltab{$pat} = $repl;
        $replbeg{$pat} = $beg;
        $repllen{$pat} = $len;
    }

    push @patlist, $pat;
    $pattab{$pat} = $origpat;
}
close $hyph;

# Output tables, keyed by the digit-free prefix of a pattern that ends in an
# embedded known pattern. combine() also reads and writes the replacement
# tables.
my @newpatlist;
my (%newpattab, %newrepltab, %newreplbeg, %newrepllen);

# For every pattern, look at every substring; whenever the substring is
# itself a known pattern, record (or merge into) an entry for the prefix of
# the pattern that ends with that substring.
foreach my $pat (@patlist) {
    my $patsize = length $pat;
    for my $i (0 .. $patsize - 1) {
        for my $j (1 .. $patsize - $i) {
            my $subpat = substr($pat, $i, $j);
            next unless defined $pattab{$subpat};

            print "$pattab{$subpat} is embedded in $pattab{$pat}\n";
            my $newpat = substr($pat, 0, $i + $j);

            if (!defined $newpattab{$newpat}) {
                # First time this prefix is seen: digit-free lead-in plus
                # the embedded pattern with its digits.
                my $ss = substr($pat, 0, $i);
                $newpattab{$newpat} = $ss . $pattab{$subpat};
                print "$ss+$pattab{$subpat}\n";
                push @newpatlist, $newpat;

                if (defined $repltab{$subpat}) {
                    # A leading '.' of the enclosing pattern is not counted
                    # in the replacement start position.
                    my $begcorr =
                        (($pat =~ /^[.]/) && !($subpat =~ /^[.]/)) ? 1 : 0;
                    $newrepltab{$newpat} = $repltab{$subpat};
                    $newreplbeg{$newpat} =
                        $replbeg{$subpat} + enclen($ss) - $begcorr;
                    $newrepllen{$newpat} = $repllen{$subpat};
                }
            }
            else {
                # Prefix already present: merge the hyphenation values.
                my $tmp = $newpattab{$newpat};
                $newpattab{$newpat} =
                    combine($newpattab{$newpat}, $pattab{$subpat});
                print "$tmp + $pattab{$subpat} -> $newpattab{$newpat}\n";
            }
        }
    }
}

foreach my $pat (@newpatlist) {
    if (defined $newrepltab{$pat}) {
        # Start positions are written 1-based again.
        print {$out} $newpattab{$pat} . "/" . $newrepltab{$pat} . ","
            . ($newreplbeg{$pat} + 1) . "," . $newrepllen{$pat} . "\n";
    }
    else {
        print {$out} $newpattab{$pat} . "\n";
    }
}
# Buffered write errors only surface at close time.
close $out
    or die "substrings.pl: error writing output file '$ARGV[1]': $!\n";

# Convert 'n1im' to 0n1i0m0 expressed as a list: an explicit 0 is inserted
# wherever two non-digits are adjacent, and at both ends when the pattern
# does not start/end with a digit.
#   $pat    - pattern text, possibly containing hyphenation digits
#   returns - list of single characters and inserted 0 values
sub expand {
    my ($pat) = @_;
    my $last = '.';    # any non-digit: forces a leading 0 before a letter
    my @exp;

    for my $c (split //, $pat) {
        push @exp, 0 if $last =~ /\D/ && $c =~ /\D/;
        push @exp, $c;
        $last = $c;
    }
    push @exp, 0 if $last =~ /\D/;
    return @exp;
}

# Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der
# The second pattern needs to be a substring of the first (modulo digits).
# At every position where the second pattern occurs in the first, each
# hyphenation value of the first is raised to the corresponding value of
# the second. When a value is raised and the second pattern has an entry in
# %newrepltab while the first has none, the replacement is copied to the
# first pattern with its start position shifted accordingly.
#   arguments - first pattern, second pattern (both with digits)
#   returns   - the merged pattern as a string, zeros omitted
sub combine {
    my ($pattern, $subpattern) = @_;
    my @exp    = expand($pattern);
    my @subexp = expand($subpattern);

    # Digit-free forms of both patterns.
    my $pat1 = join '', grep { !/\d/ } @exp;
    my $pat2 = join '', grep { !/\d/ } @subexp;

    my $begcorr = ($pat1 =~ /^[.]/) ? 1 : 0;

    for my $i (0 .. length($pat1) - length($pat2)) {
        # Compare against our own $pat2; the original read the caller's
        # global $subpat here.
        next unless substr($pat1, $i, length $pat2) eq $pat2;

        # Even indices of an expanded pattern hold the hyphenation values.
        for (my $j = 0; $j < @subexp; $j += 2) {
            next unless $subexp[$j] > $exp[2 * $i + $j];
            $exp[2 * $i + $j] = $subexp[$j];
            if (defined $newrepltab{$pat2} && !defined $newrepltab{$pat1}) {
                my $ss = substr($pat1, 0, $i);
                $newrepltab{$pat1} = $newrepltab{$pat2};
                $newreplbeg{$pat1} =
                    $newreplbeg{$pat2} + enclen($ss) - $begcorr;
                $newrepllen{$pat1} = $newrepllen{$pat2};
            }
        }
        print("$pat1 includes $pat2 at pos $i\n");
    }
    return join '', grep { $_ ne '0' } @exp;
}

# 8 bit or UTF-8 character length (calculating right start position for
# discretionary hyphenation).
#   argument - a byte string
#   returns  - its length in characters: the byte count, minus the UTF-8
#              continuation bytes when the encoding argument is "UTF-8"
sub enclen {
    my ($str) = @_;
    my $len = length $str;
    return $len unless defined $encoding && $encoding eq "UTF-8";

    # The length of a UTF-8 string equals the number of bytes that do not
    # start with the bits '10' (0x80-0xBF). tr/// counts them without
    # touching any loop variable of the caller.
    my $nonchar = ($str =~ tr/\x80-\xBF//);
    return $len - $nonchar;
}