#!/usr/localbin/perl
# NOTE(review): the interpreter path above is kept exactly as found; it looks
# like it may have been meant to be /usr/local/bin/perl -- confirm.
#
# Convert recognizer result logs into colon-separated ".utd" tables.
#
# Usage:
#   script [options] RESULT_FILE...
#   script [options] @LIST_FILE      (LIST_FILE holds one result file per line)
#
# Options:
#   --add FIELD     append FIELD to the output columns (may be repeated)
#   --invocab       treat every token as in-vocabulary
#   --quiet         accepted; the only message it guarded is disabled
#   --semantic      fall back to comparing the top meaning with the parsed
#                   orthography when the top literal does not match
#   --altsem FILE   load replacement "CSem:" tags, keyed by "D:" file name
#
# For every RESULT_FILE "name.ext" a table "name.utd" is written and a
# one-line recognition-rate summary is printed on standard output.

use strict;
use warnings;

use Getopt::Long;
use File::Basename;
use lib dirname($0);

# ---- settings filled in from the command line by main() ---------------------
my @additional_fields;
my $assume_invocab = 0;    # _when_semantics_missing
my $quiet;
my $try_semantic_validation;
my $altsemfile;

# Output columns, in order.  A name ending in digits (other than "sd13") is
# read from the array stored under "<name>s", e.g. choice1 -> choices[0].
my @fields = ("file", "correct", "invocab", "gdiff", "sd", "sd13", "spf",
              "abs", "gdiffpf", "rejrslt", "rankc", "match", "ortho",
              "choice1", "choice2", "score1", "conf", "gender");

my %is_additional_field;   # names given with --add (plus parsed_ortho)
my %csemtags;              # D: file name -> CSem tag, from --altsem
my %semtags;               # D: file name -> 0/1, from --altsem; not read in this file

# Never populated in this file.  Kept as a package variable so that code
# outside this file could still fill %main::gender.
# NOTE(review): confirm whether anything does.
our %gender;

main();


# Parse the command line, then convert every requested result file.
sub main
{
    # A failed parse is reported by Getopt::Long itself; as before, the
    # script carries on regardless.
    GetOptions("add=s"    => \@additional_fields,
               "invocab"  => \$assume_invocab,
               "quiet"    => \$quiet,
               "semantic" => \$try_semantic_validation,
               "altsem=s" => \$altsemfile,
              );

    push(@additional_fields, "parsed_ortho") if $try_semantic_validation;
    push(@fields, @additional_fields);
    $is_additional_field{$_}++ for @additional_fields;

    load_altsemfile($altsemfile) if $altsemfile;

    $| = 1;

    # "@list" as the first argument names a file of result files; any
    # further arguments are ignored in that case.
    my @resfiles;
    if (@ARGV && $ARGV[0] =~ /^@/) {
        @resfiles = read_file_list(substr($ARGV[0], 1));
    } else {
        @resfiles = @ARGV;
    }

    convert_result_file($_) for @resfiles;
}

# Return the non-empty lines of $list_path with trailing whitespace removed.
# Dies if the list cannot be opened.
sub read_file_list
{
    my ($list_path) = @_;
    my @names;

    open(my $list_fh, '<', $list_path) || die "error opening $list_path\n";
    while (my $line = <$list_fh>) {
        $line =~ s/\s+$//;
        next if $line eq '';
        push(@names, $line);
    }
    close($list_fh);
    return @names;
}

# Convert one result file into its .utd table and print its summary line.
#   $resfile - path of the result log; the table goes to the same path with
#              its final ".letters" extension replaced by ".utd"
# Dies when a file cannot be opened, when the table would overwrite the
# input, or when the table cannot be written completely.
sub convert_result_file
{
    my ($resfile) = @_;

    (my $base = $resfile) =~ s/\.[a-z]+$//i;
    my $utdfile = "$base.utd";

    # print "processing $resfile to $utdfile\n" unless($quiet);

    # Opening the output truncates it, so an input that is itself a .utd
    # file would be emptied before a single line had been read.
    die "error: $resfile would be overwritten by its own output\n"
        if $utdfile eq $resfile;

    open(my $res_fh, '<', $resfile) || die "error opening $resfile\n";
    open(my $utd_fh, '>', $utdfile) || die "error opening $utdfile\n";

    my %token;      # fields of the record being collected
    my %results;    # per-file counters: total, correct, numoovs

    while (my $line = <$res_fh>) {
        $line =~ s/\s+$//;
        $line =~ s/^\s+//;

        if ($line =~ /^D:\s+(\S+)\s*$/) { # same as CREC
            my $file = $1;

            # A "D:" line starts a new record: flush the pending one, or
            # write the header if nothing has been collected yet.  (If
            # fields turn up before the first "D:" line they are flushed
            # as a record and no header is written -- unchanged.)
            if (%token) {
                process(\%token, \%results);
                dump_record($utd_fh, \%token);
            } else {
                dump_header($utd_fh);
            }

            %token = (file => $file);

            # process() later overwrites gender with "?", so this lookup
            # only matters to code that reads the token before then.
            my ($speaker) = $file =~ /ENU-(\d\d\d)-/;
            $token{gender} = $gender{$speaker} if defined $speaker;

            # get_snr() is not defined in this file.
            # NOTE(review): presumably meant to come from a helper next to
            # the script (see "use lib"); as it stands "--add snr" and
            # "--add snrr" die here at run time.
            $token{"snr"} = get_snr($file) if $is_additional_field{"snr"};
            $token{"snrr"} = sprintf("%.2d", int(get_snr($file) / 5 + 0.5) * 5)
                if $is_additional_field{"snrr"};
        } else {
            parse_token_line(\%token, $line);
        }
    }

    if (%token) {
        process(\%token, \%results);
        dump_record($utd_fh, \%token);
    }

    # Buffered write errors only show up when the handle is closed.
    close($utd_fh) || die "error closing $utdfile\n";
    close($res_fh);

    # An empty denominator is reported as 1 to keep the rate defined.
    my $total   = $results{total}   || 1;
    my $correct = $results{correct} || 0;
    my $numoovs = $results{numoovs} || 0;
    my $rr = $correct / $total * 100;
    $rr = int($rr * 10 + 0.5) / 10;
    printf("%-45s RR %4.1f %d/%d (%d oovs)\n",
           $base, $rr, $correct, $total, $numoovs);
}

# Fold one trimmed, non-"D:" log line into the record being collected.
#   $token - hash ref of the current record (modified in place)
#   $line  - the line, without leading or trailing whitespace
# Lines that match none of the patterns are ignored.  The order of the
# tests is significant and follows the original script.
sub parse_token_line
{
    my ($token, $line) = @_;

    if ($line =~ /^C:\s+(.*)$/) { # same as CREC
        $token->{ortho} = normalize($1);
    } elsif ($line =~ /^\s*(\S+) = (.*)$/) {
        my ($augkey, $augval) = ($1, $2);
        if ($augkey eq "feedback") {
            $token->{parsed_ortho} = $augval;
            $token->{invocab}++;
        }
    } elsif ($line =~ /^R:\s+(.*)$/) { # same as CREC
        # $token->{topchoice} = $1;
        $token->{rejrslt} =
            ($line =~ /<rejected/i || $line =~ /<FAILED/i) ? "f" : "a";
    } elsif ($line =~ /^Sem[^:]+: invocab=(\d)/) { # same as CREC
        # NOTE(review): the captured digit is ignored, so "invocab=0" is
        # recorded as in-vocabulary too -- confirm this is intended.
        $token->{invocab} = 1;
    } elsif ($line =~ /^CSem:\s+([a-z]+.*)\s*$/i) {
        $token->{parsed_ortho} = $1;
    } elsif ($line =~ /^Sem:(\s+)(\S+)/) { # same as CREC
        $token->{invocab} = 0;
    } elsif ($line =~ /^LITERAL\[\s*0\]\s*:\s*\'(.*)\'/) {
        $token->{choices}[0] = $1;
    } elsif ($line =~ /^LITERAL\[\s*(\d+)\]\s+:\s+\'(.*)\'/) {
        # Use the text captured by this very match.  Re-matching the line
        # with a stricter pattern left the index in $1 whenever the second
        # match failed.
        $token->{choices}[$1] = $2;
    } elsif ($line =~ /^MEANING\[\s*(\d+)\]\s+:\s+\'(.*)\'/) {
        my ($index, $meaning) = ($1, $2);
        $meaning =~ s/\s+$//;
        $token->{meanings}[$index] = $meaning;
    } elsif ($line =~ /^LITERAL\[(\d+)\]\[(\d+)\]\s+:\s+\'(.*)\'/) {
        # NOTE(review): the second bracketed number, not the quoted text,
        # is stored as the score -- confirm against the log format.
        $token->{scores}[$1] = $2;
    } elsif ($line =~ /^RAW SCORE\s+:\s+\'(.*)\'/) {
        $token->{topscore} = $1;
    } elsif ($line =~ /^(gdiff|sd13|spf|abs|gdiffpf|sd)\s+(.*)$/) {
        # "<name> <value>" lines are stored under their own name.
        $token->{$1} = $2;
    } elsif ($line =~ /^CONFIDENCE SCORE\s+:\s+\'(.*)\'/) {
        $token->{conf} = $1;
    }
}

# Return the argument, or the empty string when it is undefined.
sub _or_empty
{
    my ($value) = @_;
    return defined $value ? $value : '';
}

# Score one finished record and update the per-file counters.
#   $token   - hash ref of the record; "correct", "rankc" and "gender" are
#              set, and "invocab", "choices", "scores" and "parsed_ortho"
#              may be updated
#   $results - hash ref of counters; "total", "correct" and "numoovs" are
#              incremented as appropriate
# Literals are compared case-insensitively, with "_" treated as a space.
sub process
{
    my $token = shift(@_);
    my $results = shift(@_);

    $token->{invocab} = 1 if($assume_invocab);

    # Read these from the record that was passed in, not from a global.
    if (defined $token->{topchoice}) {
        $token->{choices}[0] = $token->{topchoice};
    }
    if (defined $token->{topscore}) {
        $token->{scores}[0] = $token->{topscore};
    }
    $token->{choices} ||= [];

    my $ortho = lc(_or_empty($token->{ortho}));
    my $topch = lc(_or_empty($token->{choices}[0]));
    for my $text ($ortho, $topch) {
        $text =~ s/_/ /g;
        $text =~ s/\s\s+/ /g;
    }

    if (!$token->{invocab}) {
        # Out-of-vocabulary tokens are counted separately and never
        # enter the total.
        $token->{correct} = "0";
        $results->{numoovs}++;
    } elsif ($topch eq $ortho) {
        $results->{total}++;
        $results->{correct}++;
        $token->{correct} = "1";
    } else {
        $results->{total}++;
        # print "$token->{file} MEANINGCMP: ==$token->{meanings}[0]== ==$token->{parsed_ortho}==\n";
        if ($altsemfile) {
            # The tag from --altsem replaces whatever the log supplied.
            my $alt_tag = $csemtags{ _or_empty($token->{file}) };
            if (_or_empty($token->{parsed_ortho}) ne _or_empty($alt_tag)) {
                $token->{parsed_ortho} = $alt_tag;
            }
        }

        $token->{correct} = "0";
        if ($try_semantic_validation) {
            my $parsed  = _or_empty($token->{parsed_ortho});
            my $meaning = _or_empty($token->{meanings}[0]);
            if (length($parsed) > 0 && $meaning eq $parsed) {
                $token->{correct} = "1";
                $results->{correct}++;
            }
        }
    }

    # rankc: 1-based position of the first choice equal to the reference,
    # or 0 when none matches.
    $token->{rankc} = 0;
    my $choices = $token->{choices};
    for my $index (0 .. $#{$choices}) {
        my $choice = lc(_or_empty($choices->[$index]));
        $choice =~ s/_/ /g;
        if ($choice eq $ortho) {
            $token->{rankc} = $index + 1;
            last;
        }
    }

    $token->{gender} = "?";
}

# Write one record as a line of "value:" cells, one per output column.
#   $HH    - output file handle
#   $token - hash ref of the record
# Missing values are written as empty cells.
sub dump_record
{
    my $HH = shift(@_);
    my $token = shift(@_);

    foreach my $field (@fields) {
        my $value;
        if ($field !~ /^sd13$/ && $field =~ /^(\S+?)(\d+)$/) {
            # "choice2" -> $token->{choices}[1].  The non-greedy prefix
            # keeps multi-digit suffixes such as "choice10" together.
            $value = $token->{"${1}s"}[$2 - 1];
        } else {
            $value = $token->{$field};
        }
        print {$HH} _or_empty($value), ":";
    }
    print {$HH} "\n";
}

# Write the header line: every output column name followed by ":".
#   $HH - output file handle
sub dump_header
{
    my $HH = shift(@_);

    print {$HH} "$_:" for @fields;
    print {$HH} "\n";
}

# Return a cleaned copy of a reference transcription: whitespace runs are
# collapsed, ":" becomes ";" (":" is the column separator), bracketed
# "[...]" markers are removed and both ends are trimmed.
# Markers are removed after the whitespace collapse, so removing one from
# the middle of the text leaves two spaces behind; this is unchanged
# because process() collapses whitespace again before comparing.
sub normalize
{
    my $k = shift(@_);
    $k =~ s/\s\s+/ /g;
    $k =~ s/\:/\;/g;
    $k =~ s/\[[^\]]+\]//g;
    $k =~ s/^\s+//g;
    $k =~ s/\s+$//g;
    return $k;
}

# Load alternative semantic tags from $semfile into %csemtags and
# %semtags, keyed by the name on the most recent "D:" line.
# Dies if the file cannot be opened.
sub load_altsemfile
{
    my $semfile = shift(@_);
    my $file;    # name from the most recent "D:" line

    open(my $sem_fh, '<', $semfile) || die "error: opening $semfile\n";
    while (my $line = <$sem_fh>) {
        if ($line =~ /D: (\S+)$/) {
            $file = $1;
        } elsif ($line =~ /^CSem:\s+([a-z]+.*)\s*$/i) {
            my $tag = $1;
            $tag =~ s/\s+$//;
            $csemtags{ _or_empty($file) } = $tag;
        } elsif ($line =~ /^Sem[^:]+: invocab=(\d)/) { # same as CREC
            $semtags{ _or_empty($file) } = 1;
        } elsif ($line =~ /^Sem:(\s+)(\S+)/) { # same as CREC
            $semtags{ _or_empty($file) } = 0;
        }
    }
    close($sem_fh);
}