Home | History | Annotate | Download | only in tools
      1 #############################################################################
      2 # Perl script genlingware.pl --- composes a lingware resource from
      3 #                                pico knowledge base binary files (pkb)
      4 #                                according to given configuration
      5 #
      6 # Copyright (C) 2009 SVOX AG. All Rights Reserved.
      7 #
      8 # type perl genlingware.pl -help to get help
      9 #
     10 #############################################################################
     11 eval "exec perl -S \$0 \${1+\"\$@\"}"
     12     if 0;
     13 
     14 $resource_structure = <<EOSTRUCT
     15 
     16 
     17 the resource file structure is as follows:
     18 ------------------------------
     19 1. optional foreign header (4-byte aligned), externally added
     20 ------------------------------
     21 A) Pico header (4-byte aligned)
     22  2. SVOX PICO header (signature)
     23  3. length of header (2 byte, excluding length itself)
     24  4. number of fields  (1 byte)
     25  5. header fields (space separated) (key/value pairs)
     26  6. filler (0-3)
     27 ------------------------------
     28 B) length of content
     29 7. length of the remaining content in 4-byte, excluding length itself
     30 ------------------------------
     31 C) Index
     32  8. summary: number of kbs
     33  9. id names of kb (strings of max 15 chars plus closing space) 
     34  10. directory (kb id (1 byte), offset (4 bytes), size (4 bytes))
     35  11. filler (0-3)
     36 ------------------------------
     37 D) knowledge bases
     38 12. sequence of knowledge bases (byte arrays), each 4-byte aligned
     39 
     40 all numbers are little endian
     41 EOSTRUCT
     42     ;
     43 
     44 
     45 
     46 
     47 
     48 ###################################################################
     49 ##
     50 ##  Imports
     51 ##
     52 ###################################################################
     53 #use File::DosGlob 'glob';
     54 #use File::Copy;
     55 #use File::Path;
     56 #use File::Basename;
     57 #use Filehandle;
     58 #use Time::Local;
     59 use Getopt::Long;
     60 ###################################################################
     61 ##
     62 ##  Default values
     63 ##
     64 ###################################################################
     65 $VALUE = 1;
     66 $NAME = "name";
     67 $DEST = ".";
     68 ###################################################################
     69 ##
     70 ##  Options
     71 ##
     72 ###################################################################
     73 GetOptions(
     74     "value=i" => \$VALUE,  # numeric
     75     "name=s" => \$NAME,    # string
     76     "help"    => \$HELP
     77     );
     78 ###################################################################
     79 ##
     80 ##  Help
     81 ##
     82 ###################################################################
     83 $help = <<EOHELP
     84     $0 -- composes a lingware resource from pico knowledge base
     85           binary files (pkb) according to given configuration
     86     
     87   Usage:
     88     $0 -help
     89     
     90     print this help
     91     
     92     $0  <config> <resource> 
     93     
     94     reads in configuration file <config> and creates resource <resource>
     95     
     96   Arguments:
     97     <config>   :  configuration file (input)
     98     <resource> :  platform-independent resource file (output)
     99     
    100 
    101   (For more details see the source of this script)
    102 EOHELP
    103     ;
    104 die $help if $HELP;
    105 
    106 $config_example = <<EOCONFIGEXAMPLE
    107 
    108 Example of a config:
    109 
    110 -------------------------------------------------------------------
    111 \# collection of de-DE textana knowledge bases
    112 
    113 \# header fields:
    114 
    115 NAME                    de-DE_ta_1.0.0.0-0-2
    116 VERSION                 1.0.0.0-0-2
    117 DATE                    2009-01-15
    118 TIME                    17:00:00.000
    119 CONTENT_TYPE            TEXTANA
    120 
    121 
    122 \# pico knowledge bases:
    123 
    124 TPP_MAIN                \"../pkb/de-DE/de-DE_kpr.pkb\"
    125 TAB_GRAPHS              \"../pkb/de-DE/de-DE_ktab_graphs.pkb\"
    126 ...
    127 
    128 -------------------------------------------------------------------
    129 
    130 for all recognized pkb tags, see %picoknow_kb_id below
    131 
    132 EOCONFIGEXAMPLE
    133 ;    
    134 
    135 
    136 ###################################################################
    137 ##
    138 ##  Initialization
    139 ##
    140 ###################################################################
    141 $svoxheader = " (C) SVOX AG ";
    142 
    143 %header_field = (
    144     "NAME"    => 1,
    145     "VERSION" => 2,
    146     "DATE"    => 3,
    147     "TIME"    => 4,
    148     "CONTENT_TYPE" => 5,
    149 );
    150 
    151 %picoknow_kb_id = (
    152     NULL         => 0,
    153 #base
    154     TAB_GRAPHS   => 2,
    155     TAB_PHONES   => 3,
    156     TAB_POS      => 4,
    157   # FIXED_IDS     = 7,
    158 #dbg
    159     DBG          => 8,
    160 
    161 #textana
    162     TPP_MAIN     => 1,
    163     LEX_MAIN     => 9,
    164     DT_POSP      => 10,
    165     DT_POSD      => 11,
    166     DT_G2P       => 12,
    167     FST_WPHO_1   => 13,
    168     FST_WPHO_2   => 14,
    169     FST_WPHO_3   => 15,
    170     FST_WPHO_4   => 16,
    171     FST_WPHO_5   => 17,
    172     DT_PHR       => 18,
    173     DT_ACC       => 19,
    174     FST_SPHO_1   => 20,
    175     FST_SPHO_2   => 21,
    176     FST_SPHO_3   => 22,
    177     FST_SPHO_4   => 23,
    178     FST_SPHO_5   => 24,
    179     FST_XSPA_PARSE   => 25,
    180     FST_SVPA_PARSE   => 26,
    181     FST_XS2SVPA   => 27,
    182     
    183     FST_SPHO_6   => 28,
    184     FST_SPHO_7   => 29,
    185     FST_SPHO_8   => 30,
    186     FST_SPHO_9   => 31,
    187     FST_SPHO_10   => 32,
    188     
    189 #siggen
    190     DT_DUR       => 34,
    191     DT_LFZ1      => 35,
    192     DT_LFZ2      => 36,
    193     DT_LFZ3      => 37,
    194     DT_LFZ4      => 38,
    195     DT_LFZ5      => 39,
    196     DT_MGC1      => 40,
    197     DT_MGC2      => 41,
    198     DT_MGC3      => 42,
    199     DT_MGC4      => 43,
    200     DT_MGC5      => 44,
    201     PDF_DUR      => 45,
    202     PDF_LFZ      => 46,
    203     PDF_MGC      => 47,
    204     PDF_PHS      => 48,
    205 
    206 #user tpp
    207     TPP_USER1    => 49,
    208     TPP_USER2    => 50,
    209 #user lex
    210     LEX_USER1    => 57,
    211     LEX_USER2    => 58,
    212 
    213     DUMMY => 127
    214     );
    215 
    216 
    217 
    218 
    219 ###################################################################
    220 ##
    221 ##  Get Parameters
    222 ##
    223 ###################################################################
    224 ($in,$out) = (shift,shift);
    225 
    226 unless ($in && $out) {
    227     print "*** error: incorrect number of parameters\n"; 
    228     die $help;
    229 }
    230 
    231 
    232 ###################################################################
    233 ##
    234 ##  Work
    235 ##
    236 ###################################################################
    237 
    238 #get description of kbs
    239 
    240 unless (open IN, $in) {
    241     print "*** error: can't open $in\n";
    242     die "can't open $in";
    243 }
    244 while (<IN>) {
    245     # skip comments
    246     next if /^\s*#/;    
    247     ($key, $value) = split;    
    248     next unless $key;
    249     $value =~ s/^\s*\"(.*)\"\s*$/\1/;
    250     if ($field = $header_field{$key}) {
    251         $fields[$field] = {key => "$key", value => "$value"}; 
    252         next;
    253     }      
    254     #print "$key -> $value\n";
    255     unless ($id = $picoknow_kb_id{$key}) {
    256 	print "*** error: not a valid knowledge base name $key\n";
    257 	die "not a valid knowledge base name $key" ;
    258     }
    259     push @kb, {name => $key, file => $value, id => $id};
    260 } 
    261 close IN;
    262 
    263 
    264 
    265 #open output lingware file and write header
    266 
    267 unless (open OUT, ">$out") {
    268     print "*** error: can't open $out for writing\n";
    269     die "can't open $out for writing";
    270 }
    271 binmode(OUT);
    272 
    273 $offs = 0;
    274 
    275 ###################################################################
    276 ##
    277 ##  A) PICO HEADER
    278 ##
    279 ###################################################################
    280 # 1. SVOX HEADER
    281 foreach $ch (split //, $svoxheader) {
    282     push @svoxheader, chr(ord($ch)-ord(' '));
    283 }
    284 $offs += print_offs(@svoxheader);
    285 
    286 print "offset after svoxheader: $offs\n";
    287 
    288 # fields header
    289 $fieldheader = "";
    290 foreach $field (@fields) {
    291     $fieldheader .= $field->{key} .  " " . $field->{value} . " ";
    292 }
    293 
    294 #print size of fields header
    295 
    296 # length of the fields plus preceding number of fields (1) 
    297 $len = length($fieldheader)+1;
    298 
    299 #fill should make the whole header 4-byte aligned, i.e. the current offs
    300 # (svoxheader) plus the length of the header (2) plus the fields
    301 $fill = ($offs + 2 + $len) % 4;
    302 $fill = $fill ? 4-$fill : 0;
    303 $len += $fill;
    304 
    305 print "filled length of header : $len\n";
    306 
    307 # 2. length of header
    308 $offs += &print_uint16($len); #write little-endian 16-bit cardinal
    309 print "offset after length of header: $offs\n";
    310 
    311 # 3. print number of fields
    312 $offs += &print_uint8(@fields+0);
    313 print "offset after number of fields: $offs\n";
    314 
    315 # 4. print fields
    316 $offs += print_offs($fieldheader);
    317 print "offset after fields: $offs\n";
    318 
    319 # 5. print magic number (not yet)
    320 
    321 # 6. print filler
    322 $offs += &opt_fill($fill);
    323 print "offset after fill: $offs\n";
    324 
    325 
    326 ###################################################################
    327 ##
    328 ##  CONTENT (that is actually saved in RAM)
    329 ##
    330 ###################################################################
    331 
    332 
    333 
    334 # open kb files to get sizes and calculate fillers 
    335 
    336 foreach $kb (@kb) {
    337     if ($kb->{file}) {
    338 	unless (open IN, $kb->{file}) {
    339 	    print "*** error: can't open " . $kb->{file} ."\n";
    340 	    die("can't open PKB " . $kb->{file});
    341 	}
    342 	binmode(IN);
    343 	#slurp in the whole file
    344 	@content = <IN>;
    345 	close IN;
    346 	$kb->{size} = length(join '',@content);
    347 	$fill = ($kb->{size} % 4);
    348 	$kb->{fill} = $fill ? 4-$fill : 0;
    349 	print "fill of ", $kb->{name} , " is ", $kb->{fill}+0, "\n";
    350 	$totalcont += $kb->{size} + $kb->{fill};
    351     } else {
    352 	$kb->{size} = 0;
    353     }
    354     print $kb->{name}, " -> ", $kb->{file}, " -> ", $kb->{size}, "\n";
    355 }
    356 
    357 
    358 # calculate total content size (B):
    359 
    360 $totalcont = 0;
    361 
    362 # size of number of kbs (7.)
    363 $totalcont += 1;  
    364 
    365 #size of names of kbs (8.)
    366 foreach $kb (@kb) {
    367     $totalcont += &size_offs($kb->{name}, " ");
    368 }
    369 
    370 
    371 # size of directory (9.)
    372 
    373 $totalcont += 9 * @kb;
    374 
    375 # size of filler (10.)
    376 $xfill = $totalcont % 4;
    377 $xfill = $xfill ? 4 - $xfill : 0;
    378 $totalcont += $xfill;
    379 
    380 #set root of first kb
    381 $offs1 = $totalcont;
    382 
    383 print "totalcont before kbs: $totalcont\n";
    384 # size of actual kbs
    385 foreach $kb (@kb) {
    386     $totalcont += $kb->{size} + $kb->{fill};
    387 }
    388 print "totalcont after kbs: $totalcont\n";
    389 
    390 # B) print the total size
    391 
    392 $offs += &print_uint32($totalcont);
    393 
    394 print "offset after size of content (to be loaded): $offs\n";
    395 
    396 # here starts the part that is stored in "permament memory"
    397 
    398 $offs = 0;
    399 
    400 print "offset after reset: $offs\n";
    401 
    402 
    403 
    404 # 7. print number of kbs:
    405 $offs += &print_uint8(@kb+0);
    406 print "offset after number of kbs: $offs\n";
    407 
    408 
    409 # 8. print names of kbs
    410 foreach $kb (@kb) {
    411     $offs += &print_offs($kb->{name}, " ");
    412 }
    413 print "offset after descriptive kb names: $offs\n";
    414 
    415 # 9. print directory (ids and offsets of kbs)
    416 
    417 
    418 print "first kb should start at $offs1\n";
    419 foreach $kb (@kb) {
    420     $offs += &print_uint8($kb->{id});
    421     print "kb id: $kb->{id}, offs=$offs\n"; 
    422     if ($kb->{size}) {
    423     print "real kb (size $kb->{size})\n"; 
    424 	$offs += &print_uint32($offs1);
    425     print "kb offs: $offs1, offs=$offs\n"; 
    426 	$offs += &print_uint32($kb->{size});
    427     print "kb size: $kb->{size}, offs=$offs\n"; 
    428 	$offs1 += $kb->{size} + $kb->{fill};
    429     } else {
    430     print "dummy kb (size 0)\n"; 
    431 	$offs += &print_uint32(0);
    432     print "kb offs: 0, offs=$offs\n"; 
    433 	$offs += &print_uint32(0);
    434     print "kb size: $kb->{size}, offs=$offs\n"; 
    435     }
    436     print "offset after directory entry for kb $kb->{name} (id $kb->{id}): $offs\n";
    437 }
    438 
    439 # 10. print filler
    440 $offs += &opt_fill($xfill);
    441 print "offset after fill: $offs\n";
    442 
    443 
    444 # print kbs themselves
    445 
    446 foreach $kb (@kb) {
    447     if ($kb->{file}) {
    448 	unless (open IN, $kb->{file}) {
    449 	    print "*** error: can't open " . $kb->{file} ."\n";
    450 	    die "can't open " . $kb->{file};
    451 	}
    452 	binmode(IN);
    453 	#slurp in the whole file
    454 	@content = <IN>;
    455 	close IN;
    456 	print OUT join '', @content;
    457 	$offs +=  $kb->{size};
    458 	$offs += &opt_fill($kb->{fill});
    459 	print "offset after kb $kb->{file}: $offs\n";
    460     } else {
    461     }
    462     #print $kb->{name}, " -> ", $kb->{file}, " -> ", $kb->{size}, "\n";
    463 }
    464 
    465 
    466 close OUT;
    467 
    468 
    469 
    470 # optional filler
    471 # use for alignment if filler doesn't need to be parsed 
    472 sub opt_fill() {
    473     my ($fill) = @_;
    474     my $size = 0;
    475     if ($fill) {
    476 	$size += &print_uint8($fill);
    477 	for (my $i = 1; $i < $fill; $i++) {
    478 	    $size += &print_uint8(0);
    479 	}
    480     }
    481     return $size;
    482 }
    483 
    484 # mandatory filler
    485 # use for alignment if filler needs to be parsed (at least one byte) 
    486 sub mand_fill() {
    487     my ($fill) = @_;
    488     my $size = 0;
    489     #force fill to be 4 if there is no fill
    490     $fill = 4 unless $fill;
    491     $size += &print_uint8($fill);
    492     for (my $i = 1; $i < $fill; $i++) {
    493 	$size += &print_uint8(0);
    494     }
    495     return $size;
    496 }
    497 
    498 sub print_offs() {
    499     my (@cont) = @_;
    500     my $size = 0;
    501     foreach my $cont (@cont) {
    502 	$size += length($cont);
    503 	print OUT $cont;
    504     }
    505     return $size;
    506 }
    507 
    508 sub size_offs() {
    509     my (@cont) = @_;
    510     my $size = 0;
    511     foreach my $cont (@cont) {
    512 	$size += length($cont);
    513     }
    514     return $size;
    515 }
    516 
    517 sub print_uint_n() {
    518     #little-endian n-byte cardinal (no check, though!)
    519     my ($num, $n) = @_;
    520     my (@out) = ();
    521     for (my $i=0; $i < $n;$i++) {
    522 	$out[$i] = $num % 256;
    523 	$num = int($num / 256);
    524     }
    525     print OUT pack("c*",@out);
    526     return $n;
    527 }
    528 
    529 sub print_uint32() {
    530     #little-endian 4-byte cardinal (no check, though!)
    531     my ($num) = @_;
    532     return &print_uint_n($num,4);
    533 }
    534 
    535 sub print_uint16() {
    536     #little-endian 2-byte cardinal (no check, though!)
    537     my ($num) = @_;
    538     return &print_uint_n($num,2);
    539 }
    540 sub print_uint8() {
    541     my ($num) = @_;
    542     return &print_uint_n($num,1);
    543 }
    544 
    545 sub numerically {$x < $y}
    546     
    547 # @x = map {chr(48+$_) } (0..9);
    548 # foreach $x (@x) {
    549 #     print $x, "\n";
    550 # }
    551 # @x = map +(chr(65+$_)), (0..9);
    552 # foreach $x (@x) {
    553 #     print $x, "\n";
    554 # }
    555