#!/usr/localbin/perl use Getopt::Long; use File::Basename; use lib dirname($0); $assume_invocab = 0; # _when_semantics_missing $rc = GetOptions("add=s" => \@additional_fields, "invocab" => \$assume_invocab, "quiet" => \$quiet, "semantic" => \$try_semantic_validation, "altsem=s" => \$altsemfile, ); my @fields = ("file", "correct", "invocab", "gdiff", "sd", "sd13", "spf", "abs", "gdiffpf", "rejrslt", "rankc", "match", "ortho", "choice1", "choice2", "score1", "conf", "gender"); if($try_semantic_validation) { push(@additional_fields,"parsed_ortho"); } push(@fields, @additional_fields); foreach $additional_field (@additional_fields) { $additional_fieldh{$additional_field}++; } load_altsemfile($altsemfile) if($altsemfile); $| = 1; if(@ARGV[0] =~ /^@/) { $flist = substr($ARGV[0],1); @resfiles = `cat $flist`; grep { s/\s+$// } @resfiles; } else { @resfiles = @ARGV; } foreach $resfile (@resfiles) { ($base = $resfile) =~ s/\.[a-z]+$//i; $utdfile = "$base.utd"; # print "processing $resfile to $utdfile\n" unless($quiet); open(RES, "<$resfile") || die "error opening $resfile\n"; open(UTD, ">$utdfile") || die "error opening $utdfile\n"; $hUTD = \*UTD; undef %results; while(<RES>) { s/\s+$//; s/^\s+//; if(/^D:\s+(\S+)\s*$/) { # same as CREC $file = $1; if(defined %token) { process(\%token, \%results); dump_record($hUTD, \%token); } else { dump_header($hUTD); } undef %token; $token{file} = $file; $file =~ /ENU-(\d\d\d)-/; $token{gender} = $gender{$1}; $token{"snr"} = get_snr($file) if($additional_fieldh{"snr"}); $token{"snrr"} = sprintf("%.2d",int(get_snr($file)/5+0.5)*5) if($additional_fieldh{"snrr"}); } elsif(/^C:\s+(.*)$/) { # same as CREC $token{ortho} = normalize($1); } elsif(/^\s*(\S+) = (.*)$/) { ($augkey,$augval) = ($1,$2); if($augkey eq "feedback") { $token{parsed_ortho} = $augval; $token{invocab}++; } } elsif(/^R:\s+(.*)$/) { # same as CREC if(/<rejected/i || /<FAILED/i) { $token{rejrslt} = "f"; } else { # $token{topchoice} = $1; $token{rejrslt} = "a"; } } elsif(/^Sem[^:]+: invocab=(\d)/) { # same as CREC $token{invocab} = 1; } elsif(/^CSem:\s+([a-z]+.*)\s*$/i) { $token{parsed_ortho} = $1; } elsif(/^Sem:(\s+)(\S+)/) { # same as CREC $token{invocab} = 0; } elsif(/^LITERAL\[\s*0\]\s*:\s*\'(.*)\'/) { $choice = $1; $token{choices}[0] = $choice; } elsif(/^LITERAL\[\s*(\d+)\]\s+:\s+\'(.*)\'/) { $i = $1; $choice = $2; /.*\: \'(.*)\'/; $choice = $1; $token{choices}[$i] = $choice; } elsif(/^MEANING\[\s*(\d+)\]\s+:\s+\'(.*)\'/) { $i = $1; $choice = $2; /.*\: \'(.*)\'/; $choice = $1; $choice =~ s/\s+$//; $token{meanings}[$i] = $choice; } elsif(/^LITERAL\[(\d+)\]\[(\d+)\]\s+:\s+\'(.*)\'/) { $i = $1; $score = $2; $token{scores}[$i] = $score; } elsif(/^RAW SCORE\s+:\s+\'(.*)\'/) { $token{topscore} = $1; } elsif(/^gdiff\s+(.*)$/){ $token{gdiff} = $1; } elsif(/^sd13\s+(.*)$/){ $token{sd13} = $1; } elsif(/^spf\s+(.*)$/){ $token{spf} = $1; } elsif(/^abs\s+(.*)$/){ $token{abs} = $1; } elsif(/^gdiffpf\s+(.*)$/){ $token{gdiffpf} = $1; } elsif(/^sd\s+(.*)$/){ $token{sd} = $1; } elsif(/^CONFIDENCE SCORE\s+:\s+\'(.*)\'/) { $token{conf} = $1; } } process(\%token, \%results) if(defined %token); dump_record($hUTD, \%token) if(defined %token); close(UTD); close(RES); undef %token; $results{total} ||= 1; $rr = $results{correct}/$results{total} * 100; $rr = int($rr*10 + 0.5)/10; print sprintf("%-45s RR %4.1f %d/%d (%d oovs)\n", $base, $rr, $results{correct}, $results{total}, $results{numoovs}); } sub process { my $token = shift(@_); my $results = shift(@_); $token->{invocab} = 1 if($assume_invocab); if(defined $token{topchoice}) { $token->{choices}[0] = $token{topchoice}; } if(defined $token{topscore}) { $token->{scores}[0] = $token{topscore}; } my $ortho = lc($token->{ortho}); my $topch = lc($token->{choices}[0]); $ortho =~ s/_/ /g; $topch =~ s/_/ /g; $topch =~ s/\s\s+/ /g; $ortho =~ s/\s\s+/ /g; if($token->{invocab} == 0) { $token->{correct} = "0"; $results->{numoovs}++; } elsif($topch eq $ortho) { $results->{total}++; $results->{correct}++; $token->{correct} = "1"; } else { $results->{total}++; # print "$token->{file} MEANINGCMP: ==$token->{meanings}[0]== ==$token->{parsed_ortho}==\n"; if($altsemfile) { if($token->{parsed_ortho} ne $csemtags{$token->{file}}) { # print "changing $token{parsed_ortho} ne $csemtags{$token->{file}}\n"; $token->{parsed_ortho} = $csemtags{$token->{file}}; } } if(not $try_semantic_validation) { $token->{correct} = "0"; } else { if($token->{meanings}[0] eq $token->{parsed_ortho} && length($token->{parsed_ortho})>0) { $token->{correct} = "1"; $results->{correct}++ ; } else { $token->{correct} = "0"; } } } $token->{rankc} = 0; my $nchoices = scalar(@{$token->{choices}}); for($i=0; $i<$nchoices; $i++) { my $choice = lc $token->{choices}[$i]; $choice =~ s/_/ /g; if($choice eq $ortho) { $token->{rankc} = $i+1; last; } } $token->{gender} = "?"; } sub dump_record { my $HH = shift(@_); my $token = shift(@_); foreach $field (@fields) { if ($field =~ /^sd13$/){ print UTD "$token->{$field}" , ":"; } elsif($field =~ /^(\S+)(\d+)$/) { $name = "${1}s"; $num = $2 - 1; print UTD "$token->{$name}[$num]", ":"; } else{ print UTD "$token->{$field}" , ":"; } } print UTD "\n"; } sub dump_header { my $HH = shift(@_); foreach $field (@fields) { print UTD "$field" , ":"; } print UTD "\n"; } sub normalize { my $k = shift(@_); $k =~ s/\s\s+/ /g; $k =~ s/\:/\;/g; $k =~ s/\[[^\]]+\]//g; $k =~ s/^\s+//g; $k =~ s/\s+$//g; return $k; } sub load_altsemfile { my $semfile = shift(@_); open(SM,"<$semfile") || die "error: opening $semfile\n"; while(<SM>) { if(/D: (\S+)$/) { $file = $1; $file =~ s/\s+$//; } elsif(/^CSem:\s+([a-z]+.*)\s*$/i) { $csemtags{$file} = $1; $csemtags{$file} =~ s/\s+$//; } elsif(/^Sem[^:]+: invocab=(\d)/) { # same as CREC $semtags{$file} = 1; } elsif(/^Sem:(\s+)(\S+)/) { # same as CREC $semtags{$file} = 0; } } close(SM); }