#############################################################################
# Perl script symshift.pl --- shift symbols of different tables into proper
#                             "plane" and create combined symbol table
# Copyright (C) 2009 SVOX AG. All Rights Reserved.
#
# type perl symshift.pl -help to get help
#
#############################################################################
# This script creates a symbol table which must be used when
# compiling a source FST into its binary format.
#
# Explanation:
# When creating SVOX pico lingware, different sets of symbols (phonemes,
# Part-Of-Speech symbols, accents, boundaries) are expressed with
# names (strings) in the lingware source, but in the compiled lingware
# resources (bin-files) only ids (numbers) are used.
# For each set, symbols are mapped into one-byte ids [0..255].
# Finite-State-Transducers are used to transform one sequence of symbols into
# another, where input and output symbols may be mixed from different sets.
# In order to keep the id ranges for each set disjoint, ids are shifted
# into a corresponding plane when forming such input sequences:
#
#     id_combined = id_original + 256 * plane
#
# Note: shifting/unshifting in the running system uses hard-coded
#       constants (e.g. the plane for each set). Also, some hard-coded
#       "universal" symbols are added that are not related to any particular
#       lingware but are inserted by the running system.
#       Therefore there is a hard dependency between this script and the
#       engine code!
#
#
#############################################################################

# Must stay the first statement of the file: when the file is handed to a
# shell instead of perl, this line re-executes it under perl (see perlrun).
# Perl itself never runs it because of the "if 0".
eval "exec perl -S \$0 \${1+\"\$@\"}"
    if 0;

use strict;
use warnings;

###################################################################
##
##  Imports
##
###################################################################
#use File::DosGlob 'glob';
#use File::Copy;
#use File::Path;
#use File::Basename;
#use Filehandle;
#use Time::Local;
use Getopt::Long;

###################################################################
##
##  Default values
##
###################################################################
# NOTE: these three values are not referenced anywhere else in this script.
our $VALUE = 1;
our $NAME  = "name";
our $DEST  = ".";

###################################################################
##
##  Options
##
###################################################################
# Input symbol table file per table name (undef when the option is absent).
# A hash replaces the former symbolic-reference lookup of option variables.
my %table_file;
my $alphabet_file;
my $want_help;

my $options_ok = GetOptions(
    "phones=s"       => \$table_file{PHONES},          # string
    "POS=s"          => \$table_file{POS},             # string
    "accents=s"      => \$table_file{ACCENTS},         # string
    "pb_strengths=s" => \$table_file{PB_STRENGTHS},    # string
    "alphabet=s"     => \$alphabet_file,               # string
    "help"           => \$want_help,
);

###################################################################
##
##  Help
##
###################################################################
my $help = <<"EOHELP";
    $0 --

    Usage:
    $0 -help

    print this help

    $0 [-phones <phonestab>] [-POS <postab>] [-accents <acctab>]
       [-pb_strengths <pbstab>] [-alphabet <alphaout>]

    reads in a combination of symbol tables with ids in range [0..2^8-1]
    and converts into one symbol table with ids in range [0..2^16-1] which
    is written to STDOUT.

    (Read perl source for more explanations)

    Options:
    -phones <infile>,
    -POS <infile>,
    -accents <infile>,
    -pb_strengths <infile>  read symbol tables from <file> and shift them into
                            the appropriate plane

                            A hard-coded universal set of accents and
                            pb_strengths is automatically included so that
                            usually only -phones ans -POS are used.

    -alphabet <outfile>     writes the combined set of symbols to <outfile>.
                            (Not used yet)

EOHELP

die $help if $want_help;

# Getopt::Long has already reported the offending option on STDERR; stop
# here rather than silently producing a table with an input missing.
die "$0: invalid options; try '$0 -help'\n" unless $options_ok;

###################################################################
##
##  Initialization
##
###################################################################

my @alltables = ("PHONES", "ACCENTS", "POS", "PB_STRENGTHS", "INTERN");

# plane each table is shifted into (must match the engine code)
my %plane_of = (
    PHONES       => 0,
    ACCENTS      => 4,
    POS          => 5,
    PB_STRENGTHS => 6,
    INTERN       => 7,
);

# sometimes we want the inverse: plane number => table name
my %table_of_plane = map { $plane_of{$_} => $_ } @alltables;

# translation between symbol names used in decision trees
# and corresponding names used in FSTs
my %translation = (
    # boundaries
    "PB_STRENGTHS" => {
        "0"         => "{WB}",
        "_SHORTBR_" => "{P2}",
        "_SECBND_"  => "{P3}",
    },
    # accents
    "ACCENTS" => {
        "0" => "{A0}",
        "1" => "{A1}",
        "2" => "{A2}",
        "3" => "{A3}",
        "4" => "{A4}",
    },
);

# not all symbols are predicted by trees, some universals are inserted
# programmatically. We add these hardcoded symbols/ids and check that they
# don't collide with predicted ones.
my %notpredicted = (
    # boundaries
    "PB_STRENGTHS" => {
        "{WB}" => 48,
        "{P1}" => 49,
        "{P2}" => 50,
        "{P3}" => 51,
        "{P0}" => 115,    # "s"
    },
    # accents
    "ACCENTS" => {
        "{A0}" => 48,
        "{A1}" => 49,
        "{A2}" => 50,
        "{A3}" => 51,
        "{A4}" => 52,
    },
    # intern
    "INTERN" => {
        "&"     => 38,
        "#"     => 35,
        "|"     => 50,
        "+"     => 51,
        "*"     => 52,
        "{DEL}" => 127,
    },
);

my %id_of_sym;       # symbol name => shifted (combined) id
my %sym_of_id;       # shifted id  => symbol name
my %ids_in_table;    # table name  => { shifted id => number of times seen }

###################################################################
##
##  Read the symbol tables given on the command line
##
###################################################################
foreach my $table (@alltables) {
    my $file = $table_file{$table};
    next unless defined $file && length $file;

    my $plane = $plane_of{$table};

    # three-argument open: the file name is never interpreted as a mode
    open my $in, '<', $file
        or die "can't open $table table $file: $!";

    while (my $line = <$in>) {
        # ignore empty lines
        next if $line =~ /^\s*$/;
        # ignore comment lines
        next if $line =~ /^\s*[\!]/;

        if ($line =~ /^\s*:SYM\s+\"([^\"]+)\"(.*)$/) {
            my ($sym, $rest) = ($1, $2);

            # we have the symbol (which potentially contains an exclamation
            # mark); remove comments from the remainder only
            $rest =~ s/[\!].*//;
            next if $rest =~ /iscombined/;    # filter out combined POS

            if ($rest =~ /.*:PROP.*mapval\s*=\s*(\d+)/) {
                my $id = $1 + 0;

                # ids are expected to be one byte; anything larger lands in
                # a different plane, so report it (processing continues)
                if ($id > 255) {
                    print STDERR "mapval $id out of range [0..255] in $file: $line";
                }

                my $shifted = $id + $plane * 256;
                $sym = translate($table, $sym, $id);

                # exists() rather than truthiness: shifted id 0 is valid
                if (exists $id_of_sym{$sym}) {
                    my $otherplane = int($id_of_sym{$sym} / 256);
                    my $othertable =
                        exists $table_of_plane{$otherplane}
                        ? $table_of_plane{$otherplane}
                        : "";
                    print STDERR "symbol \"$sym\" was allready assigned to plane of \"$othertable\" ($otherplane); overwriting\n";
                }
                $id_of_sym{$sym}     = $shifted;
                $sym_of_id{$shifted} = $sym;
                $ids_in_table{$table}{$shifted}++;
            }
            else {
                print STDERR "strange line (no mapval) in $file: $line";
            }
        }
        else {
            print STDERR "strange line (no SYM) in $file: $line";
        }
    }
    close $in;
}

# insert not predicted symbols (entries read from files take precedence)
foreach my $table (sort keys %notpredicted) {
    my $plane = $plane_of{$table};
    foreach my $sym (sort keys %{ $notpredicted{$table} }) {
        my $shifted = $notpredicted{$table}{$sym} + $plane * 256;

        # exists() rather than truthiness: id 0 and a symbol named "0" are
        # legitimate existing entries and must not be overwritten
        $id_of_sym{$sym}     = $shifted unless exists $id_of_sym{$sym};
        $sym_of_id{$shifted} = $sym     unless exists $sym_of_id{$shifted};
        $ids_in_table{$table}{$shifted}++;
    }
}

###################################################################
##
##  Output
##
###################################################################

# create combined table on STDOUT, one section per plane in ascending order
foreach my $plane (sort numerically keys %table_of_plane) {
    my $table = $table_of_plane{$plane};
    print "\n! $table\n";
    foreach my $shifted (sort numerically keys %{ $ids_in_table{$table} || {} }) {
        printf ":SYM %-20s :PROP mapval = %5d\n", "\"$sym_of_id{$shifted}\"", $shifted;
    }
}

# create corresponding alphabet if demanded
if (defined $alphabet_file && length $alphabet_file) {
    open my $out, '>', $alphabet_file
        or die "cant open $alphabet_file for writing: $!";

    foreach my $plane (sort numerically keys %table_of_plane) {
        my $table = $table_of_plane{$plane};
        print {$out} "\n! $table\n ";

        # countdown used to break the symbol list into rows
        my $count = 10;
        foreach my $shifted (sort numerically keys %{ $ids_in_table{$table} || {} }) {
            if (!$count--) {
                $count = 10;
                print {$out} "\n ";
            }
            # print the escaped symbol (previously the escaping was computed
            # and then discarded)
            printf {$out} " %s", _quote_symbol($sym_of_id{$shifted});
        }
    }

    # buffered write errors only surface at close
    close $out or die "error writing $alphabet_file: $!";
}

###################################################################
##
##  Subroutines
##
###################################################################

# Numeric ascending comparator for sort().
sub numerically { $a <=> $b }

# _quote_symbol($sym)
#   Returns $sym wrapped in single quotes for the alphabet file, with every
#   embedded single quote doubled.
sub _quote_symbol {
    my ($sym) = @_;
    $sym =~ s/'/''/g;
    return "'$sym'";
}

# translate($table, $sym, $id)
#   Maps a symbol name as found in the table file of $table to the name used
#   in the combined table:
#     - POS symbols are wrapped as "{P:<sym>}";
#     - other symbols are looked up in %translation and kept unchanged when
#       no translation is listed.
#   For non-POS tables, dies when the resulting name is one of the
#   hard-coded %notpredicted symbols of $table but $id differs from the
#   hard-coded id.
#   Returns the translated name.
sub translate {
    my ($table, $sym, $id) = @_;

    return "{P:$sym}" if $table eq "POS";

    # exists() guards keep the lookups from autovivifying empty sub-hashes
    my $mapped =
        exists $translation{$table} ? $translation{$table}{$sym} : undef;
    my $translated = $mapped ? $mapped : $sym;

    my $expected =
        exists $notpredicted{$table} ? $notpredicted{$table}{$translated} : undef;
    if ($expected && $expected != $id) {
        die "inconsistent table $table: sym \"$sym\" has id=$id, but i expected $expected";
    }

    return $translated;
}