Home | History | Annotate | Download | only in tools
      1 #############################################################################
      2 # Perl script symshift.pl --- shift symbols of different tables into proper 
      3 #                             "plane" and create combined symbol table
      4 # Copyright (C) 2009 SVOX AG. All Rights Reserved.
      5 #
      6 # type perl symshift.pl -help to get help
      7 #
      8 #############################################################################
      9 # This script creates a symbol table which must be used when
     10 # compiling a source FST into its binary format.
     11 # 
     12 # Explanation:
     13 # When creating SVOX pico lingware, different sets of symbols (phonemes,
     14 # Part-Of-Speech symbols, accents,boundaries) are expressed with
     15 # names (strings) in the lingware source, but in the compiled lingware
     16 # resources (bin-files) only ids (numbers) are used. 
     17 # For each set, symbols are mapped into one-byte ids [0..255].
     18 # Finite-State-Transducers are used to transform one sequence of symbols into
     19 # another, where input and output symbols may be mixed from different sets.
     20 # In order to keep the id ranges for each set disjoint, ids are shifted
     21 # into a corresponding plane when forming such input sequences:
     22 #
     23 #    id_combined = id_original + 256 * plane
     24 #
     25 # Note: shifting/unshifting in the running system uses hard-coded
     26 #       constants (e.g. the plane for each set). Also, some hard-coded
     27 #       "universal" symbols are added that are not related to any particular
     28 #       lingware but are inserted by the running system.
     29 #       Therefore there is a hard dependency between this script and the
     30 #       engine code!
     31 #    
     32 #
     33 #############################################################################
     34 
# Launcher idiom (see perlrun): if this file is handed to a Bourne-style
# shell instead of perl, the shell executes the first line and re-runs the
# script under perl with the original arguments.  Perl itself parses both
# lines as one statement guarded by "if 0", so the eval is never executed.
# The line break between the two lines is therefore significant.
eval "exec perl -S \$0 \${1+\"\$@\"}"
   if 0;
     37 ###################################################################
     38 ##
     39 ##  Imports
     40 ##
     41 ###################################################################
     42 #use File::DosGlob 'glob';
     43 #use File::Copy;
     44 #use File::Path;
     45 #use File::Basename;
     46 #use Filehandle;
     47 #use Time::Local;
     48 use Getopt::Long;
     49 ###################################################################
     50 ##
     51 ##  Default values
     52 ##
     53 ###################################################################
     54 $VALUE = 1;
     55 $NAME = "name";
     56 $DEST = ".";
     57 ###################################################################
     58 ##
     59 ##  Options
     60 ##
     61 ###################################################################
     62 GetOptions(
     63           "phones=s" => \$PHONES,    # string
     64           "POS=s" => \$POS,    # string
     65           "accents=s" => \$ACCENTS,    # string
     66           "pb_strengths=s" => \$PB_STRENGTHS,    # string
     67           "alphabet=s" => \$ALPHABET,    # string
     68           "help"    => \$HELP
     69           );
     70 ###################################################################
     71 ##
     72 ##  Help
     73 ##
     74 ###################################################################
     75 $help = <<EOHELP
     76    $0 -- 
     77 
     78 Usage:
     79    $0 -help
     80 
     81       print this help
     82 
     83     $0  [-phones <phonestab>] [-POS <postab>] [-accents <acctab>]
     84          [-pb_strengths <pbstab>] [-alphabet <alphaout>]
     85 
     86     reads in a combination of symbol tables with ids in range [0..2^8-1]
     87     and converts into one symbol table with ids in range [0..2^16-1] which
     88     is written to STDOUT.
     89 
     90   (Read perl source for more explanations)
     91 
     92  Options:
     93     -phones <infile>,
     94     -POS <infile>,
     95     -accents <infile>,
     96     -pb_strengths <infile> read symbol tables from <file> and shift them into
     97                            the appropriate plane
     98 
     99                            A hard-coded universal set of accents and
    100                            pb_strengths is automatically included so that
    101                            usually only -phones ans -POS are used.
    102 
    103     -alphabet <outfile>    writes the combined set of symbols to <outfile>.
    104                            (Not used yet)
    105 
    106 EOHELP
    107    ;
    108 die $help if $HELP;
    109 
    110 ###################################################################
    111 ##
    112 ##  Initialization
    113 ##
    114 ###################################################################
    115 
    116 @alltables = ("PHONES", "ACCENTS", "POS", "PB_STRENGTHS", "INTERN");
    117 
    118 %plane = (
    119    PHONES => 0,
    120    ACCENTS => 4,
    121    POS => 5,
    122    PB_STRENGTHS => 6,
    123    INTERN => 7,
    124 );
    125 
    126 #sometimes we want the inverse
    127 foreach $table (@alltables) {
    128 	$table{$plane{$table}} = $table;
    129 }
    130 
    131 
    132 #translation between symbol names used in decision trees
    133 #and corresponding names used in FSTs
    134 %translation = (
    135 #boundaries
    136     "PB_STRENGTHS" => {
    137 	"0"         => "{WB}",
    138 	"_SHORTBR_" => "{P2}",
    139 	"_SECBND_"  => "{P3}",
    140     },
    141 #accents
    142     "ACCENTS" => {
    143 	"0" => "{A0}",
    144 	"1" => "{A1}",
    145 	"2" => "{A2}",
    146 	"3" => "{A3}",
    147 	"4" => "{A4}",
    148     },
    149 );
    150 
    151 # not all symbols are predicted by trees, some universals are inserted
    152 # programatically. we add these hardcoded symbols/ids and check that they$
    153 # don't collide with predicted ones
    154 %notpredicted = (    
    155 #boundaries
    156     "PB_STRENGTHS" => {
    157 	"{WB}" => 48,
    158 	"{P1}" => 49, 
    159 	"{P2}" => 50, 
    160 	"{P3}" => 51,  
    161 	"{P0}" => 115,  # "s" 
    162     },
    163 #accents
    164     "ACCENTS" => {
    165 	"{A0}" => 48,
    166 	"{A1}" => 49,
    167 	"{A2}" => 50,
    168 	"{A3}" => 51,
    169 	"{A4}" => 52,
    170     },
    171 #intern
    172     "INTERN" => {
    173 	"&" => 38,
    174 	"#" => 35,
    175 	"|" => 50,
    176 	"+" => 51,
    177 	"*" => 52,
    178 	"{DEL}" => 127,
    179     },
    180 );
    181 
    182 
    183 
    184 foreach $table (@alltables) {
    185     #printf STDERR "doing table $table (plane %d)\n", $plane{$table};
    186     $file = ${$table};
    187     if ($file) {
    188 	$plane = $plane{$table};
    189 	open TABLE, $file or die "can't open $table table $file";
    190 	while (<TABLE>) {
    191 	    #ignore empty lines
    192 	    next if /^\s*$/;
    193 	    #ignore comment lines
    194 	    next if /^\s*[\!]/;
    195 	    if (/^\s*:SYM\s+\"([^\"]+)\"(.*)$/) {
    196 		($sym,$rest) = ($1,$2);
    197 		#we have the symbol (which potentially contains an exclamation mark)
    198 		#remove comments now
    199 		$rest =~ s/[\!].*//;
    200 		next if $rest =~ /iscombined/; #filter out combined POS
    201 		if ($rest =~ /.*:PROP.*mapval\s*=\s*(\d+)/) {
    202 		    $id = $1 + 0;
    203 		    $shifted = $id + $plane * 256;
    204 		    $sym = translate($table,$sym,$id);
    205 		    if ($shifted{$sym}) {
    206 			$otherplane = int($shifted{$sym} / 256);
    207 			print STDERR "symbol \"$sym\" was allready assigned to plane of \"$table{$otherplane}\" ($otherplane); overwriting\n";
    208 		    }
    209 		    $shifted{$sym} = $shifted;
    210 		    $sym{$shifted} = $sym;
    211 		    $intable{$table}{$shifted}++; 
    212 		} else {
    213 		    print STDERR "strange line (no mapval) in $file: $_";
    214 		}
    215 	    } else {
    216 		print STDERR "strange line (no SYM) in $file: $_";
    217 	    }
    218 	}
    219     }
    220 }
    221 
    222 #insert not predicted symbols
    223 foreach $table (keys %notpredicted) {
    224     $plane = $plane{$table};
    225     foreach $sym (keys %{$notpredicted{$table}}) {
    226 	$id = $notpredicted{$table}{$sym};
    227 	$shifted = $id + $plane * 256;
    228 	$shifted{$sym} = $shifted unless $shifted{$sym};
    229 	$sym{$shifted} = $sym unless $sym{$shifted};
    230 	$intable{$table}{$shifted}++; 
    231     }
    232 }
    233 	
    234 #create combined table
    235 foreach $plane (sort numerically keys %table) {
    236     $table = $table{$plane};
    237     print "\n! $table\n";
    238     foreach $shifted (sort numerically keys %{$intable{$table}}) {
    239 	printf ":SYM %-20s   :PROP mapval = %5d\n", "\"$sym{$shifted}\"", $shifted;
    240     }
    241 }
    242 
    243 #create corresponding alphabet if demanded
    244 if ($ALPHABET) {
    245     open OUT, ">$ALPHABET" or die "cant open $ALPHABET for writing";
    246     foreach $plane (sort numerically keys %table) {
    247 	$table = $table{$plane};
    248 	print OUT "\n! $table\n   ";
    249 	$count=10;
    250 	foreach $shifted (sort numerically keys %{$intable{$table}}) {
    251 	    $sym = $sym{$shifted};
    252 	    $sym =~ s/'/''/g;
    253 	    if (!$count--) {
    254 		$count = 10;
    255 		print OUT "\n   ";
    256 	    }
    257 	    printf OUT " %s", "\'$sym{$shifted}\'";
    258 	}
    259     }
    260     close OUT;
    261 }
    262 
    263 sub numerically {$a <=> $b}
    264 	
    265 
    266 sub translate($$$) {
    267     my ($table,$sym,$id) = @_;
    268     my $translated;
    269     my $otherid;
    270     if ($table eq "POS") {
    271 	$translated = "{P:$sym}";
    272     } else {
    273 	$translated = $translation{$table}{$sym};
    274 	$translated = $sym unless $translated;
    275 	if (($other = $notpredicted{$table}{$translated}) && ($other != $id)) {
    276 	    die "inconsistent table $table: sym \"$sym\" has id=$id, but i expected $other";
    277 	} 
    278     }
    279     return $translated;
    280 }    
    281