Home | History | Annotate | Download | only in cmd
      1 #!/usr/localbin/perl
      2 
      3 use Getopt::Long;
      4 use File::Basename;
      5 use lib dirname($0);
      6 
      7 $assume_invocab = 0; # _when_semantics_missing 
      8 $rc = GetOptions("add=s" => \@additional_fields,
      9 		 "invocab" => \$assume_invocab, 
     10 		 "quiet" => \$quiet,
     11 		 "semantic" => \$try_semantic_validation,
     12 		 "altsem=s" => \$altsemfile,
     13                  );
     14 
     15 my @fields = ("file", "correct", "invocab", "gdiff", "sd", "sd13", "spf", "abs", "gdiffpf", "rejrslt", "rankc", "match", "ortho", "choice1", "choice2", "score1", "conf", "gender");
     16 
     17 if($try_semantic_validation) {
     18     push(@additional_fields,"parsed_ortho");
     19 }
     20 push(@fields, @additional_fields);
     21 foreach $additional_field (@additional_fields) {
     22     $additional_fieldh{$additional_field}++;
     23 }
     24 
     25 load_altsemfile($altsemfile) if($altsemfile);
     26 
     27 $| = 1;
     28 
     29 if(@ARGV[0] =~ /^@/) {
     30     $flist = substr($ARGV[0],1);
     31     @resfiles = `cat $flist`;
     32     grep { s/\s+$// } @resfiles;
     33 } else {
     34     @resfiles = @ARGV;
     35 }
     36 
     37 foreach $resfile (@resfiles) {
     38     ($base = $resfile) =~ s/\.[a-z]+$//i;
     39     $utdfile = "$base.utd";
     40 
     41     # print "processing $resfile to $utdfile\n" unless($quiet);
     42     open(RES, "<$resfile") || die "error opening $resfile\n";
     43     open(UTD, ">$utdfile") || die "error opening $utdfile\n";
     44     $hUTD = \*UTD;
     45     undef %results;
     46     while(<RES>) {
     47 	s/\s+$//;
     48         s/^\s+//;
     49 	if(/^D:\s+(\S+)\s*$/) { # same as CREC
     50 	    $file = $1;
     51 	    if(defined %token) {
     52 		process(\%token, \%results);
     53 		dump_record($hUTD, \%token);
     54 	    } else {
     55 		dump_header($hUTD);
     56 	    }
     57 	    undef %token;
     58 	    $token{file} = $file;
     59 	    $file =~ /ENU-(\d\d\d)-/;
     60 	    $token{gender} = $gender{$1};
     61 	    $token{"snr"} = get_snr($file) if($additional_fieldh{"snr"});
     62 	    $token{"snrr"} = sprintf("%.2d",int(get_snr($file)/5+0.5)*5) if($additional_fieldh{"snrr"});
     63 	} elsif(/^C:\s+(.*)$/) { # same as CREC
     64 	    $token{ortho} = normalize($1);
     65 	} elsif(/^\s*(\S+) = (.*)$/) {
     66 	    ($augkey,$augval) = ($1,$2);
     67 	    if($augkey eq "feedback") {
     68 		$token{parsed_ortho} = $augval;
     69 		$token{invocab}++;
     70 	    }
     71 	} elsif(/^R:\s+(.*)$/) { # same as CREC
     72 	    if(/<rejected/i || /<FAILED/i) {
     73 		$token{rejrslt} = "f";
     74 	    } else {
     75 		# $token{topchoice} = $1;
     76 		$token{rejrslt} = "a";
     77 	    }
     78 	} elsif(/^Sem[^:]+:  invocab=(\d)/) { # same as CREC
     79 	    $token{invocab} = 1;
     80 	} elsif(/^CSem:\s+([a-z]+.*)\s*$/i) { 
     81 	    $token{parsed_ortho} = $1;
     82 	} elsif(/^Sem:(\s+)(\S+)/) { # same as CREC
     83 	    $token{invocab} = 0;
     84 	} elsif(/^LITERAL\[\s*0\]\s*:\s*\'(.*)\'/) {
     85 	    $choice = $1;
     86 	    $token{choices}[0] = $choice;
     87 	} elsif(/^LITERAL\[\s*(\d+)\]\s+:\s+\'(.*)\'/) {
     88 	    $i = $1;
     89 	    $choice = $2;
     90 	    /.*\: \'(.*)\'/;
     91 	    $choice = $1;
     92 	    $token{choices}[$i] = $choice;
     93 	} elsif(/^MEANING\[\s*(\d+)\]\s+:\s+\'(.*)\'/) {
     94 	    $i = $1;
     95 	    $choice = $2;
     96 	    /.*\: \'(.*)\'/;
     97 	    $choice = $1;
     98 	    $choice =~ s/\s+$//;
     99 	    $token{meanings}[$i] = $choice;
    100 	} elsif(/^LITERAL\[(\d+)\]\[(\d+)\]\s+:\s+\'(.*)\'/) {
    101 	    $i = $1;
    102 	    $score = $2;
    103 	    $token{scores}[$i] = $score;
    104 	} elsif(/^RAW SCORE\s+:\s+\'(.*)\'/) {
    105 	    $token{topscore} = $1;
    106 	} elsif(/^gdiff\s+(.*)$/){
    107             $token{gdiff} = $1;
    108         } elsif(/^sd13\s+(.*)$/){
    109             $token{sd13} = $1; 
    110         } elsif(/^spf\s+(.*)$/){
    111             $token{spf} = $1;
    112         } elsif(/^abs\s+(.*)$/){
    113             $token{abs} = $1;
    114         } elsif(/^gdiffpf\s+(.*)$/){
    115             $token{gdiffpf} = $1;
    116         } elsif(/^sd\s+(.*)$/){ 
    117             $token{sd} = $1; 
    118         } elsif(/^CONFIDENCE SCORE\s+:\s+\'(.*)\'/) {
    119             $token{conf} = $1;
    120         }
    121     }
    122     process(\%token, \%results) if(defined %token);
    123     dump_record($hUTD, \%token) if(defined %token);
    124     close(UTD);
    125     close(RES);
    126     undef %token;
    127     $results{total} ||= 1;
    128     $rr = $results{correct}/$results{total} * 100;
    129     $rr = int($rr*10 + 0.5)/10;
    130     print sprintf("%-45s RR %4.1f %d/%d (%d oovs)\n", $base, $rr, $results{correct}, $results{total}, $results{numoovs});
    131 }
    132 
    133 
    134 sub process
    135 {
    136     my $token = shift(@_);
    137     my $results = shift(@_);
    138     $token->{invocab} = 1 if($assume_invocab);
    139     if(defined $token{topchoice}) {
    140 	$token->{choices}[0] = $token{topchoice};
    141     }
    142     if(defined $token{topscore}) {
    143 	$token->{scores}[0] = $token{topscore};
    144     }
    145     my $ortho = lc($token->{ortho});
    146     my $topch = lc($token->{choices}[0]);
    147 
    148     $ortho =~ s/_/ /g;
    149     $topch =~ s/_/ /g;
    150     $topch =~ s/\s\s+/ /g;
    151     $ortho =~ s/\s\s+/ /g;
    152     if($token->{invocab} == 0) {
    153 	$token->{correct} = "0"; 
    154 	$results->{numoovs}++;
    155     } elsif($topch eq $ortho) {
    156 	$results->{total}++;
    157 	$results->{correct}++;
    158 	$token->{correct} = "1";
    159     } else {
    160 	$results->{total}++;
    161 	# print "$token->{file} MEANINGCMP: ==$token->{meanings}[0]== ==$token->{parsed_ortho}==\n";
    162 	if($altsemfile) {
    163 	    if($token->{parsed_ortho} ne $csemtags{$token->{file}}) {
    164 		# print "changing $token{parsed_ortho} ne $csemtags{$token->{file}}\n";
    165 		$token->{parsed_ortho} = $csemtags{$token->{file}};
    166 	    }
    167 	}
    168 
    169 	if(not $try_semantic_validation) {
    170 	    $token->{correct} = "0";
    171 	} else {
    172 	    if($token->{meanings}[0] eq $token->{parsed_ortho} && length($token->{parsed_ortho})>0) {
    173 		$token->{correct} = "1";
    174 		$results->{correct}++ ;
    175 	    } else {
    176 		$token->{correct} = "0";
    177 	    }
    178 	}
    179     }
    180     $token->{rankc} = 0;
    181     my $nchoices = scalar(@{$token->{choices}});
    182     for($i=0; $i<$nchoices; $i++) {
    183 	my $choice = lc $token->{choices}[$i];
    184 	$choice =~ s/_/ /g;
    185 	if($choice eq $ortho) {
    186 	    $token->{rankc} = $i+1;
    187 	    last;
    188 	}
    189     }
    190     $token->{gender} = "?";
    191 }
    192 
    193 sub dump_record
    194 {
    195     my $HH = shift(@_);
    196     my $token = shift(@_);
    197     foreach $field (@fields) {
    198           if ($field =~ /^sd13$/){
    199           print UTD "$token->{$field}" , ":";
    200 	} elsif($field =~ /^(\S+)(\d+)$/) {
    201 	  $name = "${1}s"; 
    202 	  $num = $2 - 1;
    203 	  print UTD "$token->{$name}[$num]", ":";
    204 	} else{
    205           print UTD "$token->{$field}" , ":";
    206 	} 
    207     }
    208     print UTD "\n";
    209 }
    210 
    211 sub dump_header
    212 {
    213     my $HH = shift(@_);
    214     foreach $field (@fields) {
    215 	print UTD "$field" , ":";
    216     }
    217     print UTD "\n";
    218 }
    219 
    220 sub normalize
    221 {
    222     my $k = shift(@_);
    223     $k =~ s/\s\s+/ /g;
    224     $k =~ s/\:/\;/g;
    225     $k =~ s/\[[^\]]+\]//g;
    226     $k =~ s/^\s+//g;
    227     $k =~ s/\s+$//g;
    228     return $k;
    229 }
    230 
    231 sub load_altsemfile
    232 {
    233     my $semfile = shift(@_);
    234     open(SM,"<$semfile") || die "error: opening $semfile\n";
    235     while(<SM>) {
    236 	if(/D: (\S+)$/) {
    237 	    $file = $1;
    238 	    $file =~ s/\s+$//;
    239 	} elsif(/^CSem:\s+([a-z]+.*)\s*$/i) { 
    240 	    $csemtags{$file} = $1;
    241 	    $csemtags{$file} =~ s/\s+$//;
    242 	} elsif(/^Sem[^:]+:  invocab=(\d)/) { # same as CREC
    243 	    $semtags{$file} = 1;
    244 	} elsif(/^Sem:(\s+)(\S+)/) { # same as CREC
    245 	    $semtags{$file} = 0;
    246 	}
    247     }
    248     close(SM);
    249 }
    250 
    251