package cky; use strict; my $newlhs = 'a0'; ## parse # parse implements the CKY parsing algorithm. Unlike standard CKY, it # also accepts unary rules of the type A->B. The function goes through # each span ($i, $j) in the input string and for every $i < $k < $j it # attempts to find a cfg rule that can combine them to get a new chart # entry at $i, $j. # The chart is implemented as a 2D array: $chart[$i][$j] stores # information about span $i, $j. Since at each span we store a set of # the non-terminals that can derive that span, we use a hash table for # a simple set implementation (no duplicates allowed). Each chart # entry is a reference to a hash table, where the key is the # non-terminal and the value is a list of backpointers. If a # non-terminal "A" spans $i, $j then the chart entry will be stored # as: $chart[$i][$j]->{"A"}. # The list of backpointer values are stored in $chart[$i][$j]->{"A"} # where each element in this list is a string "$k B C" which implies # that "A" was derived from the two spans $i, $k with non-terminal "B" # and $k, $j with non-terminal "C". This information is used by the # function extract to print out the parse trees from the chart. sub parse { my ($grammar, $start, $input) = @_; my $N = (scalar @$input); my @chart = (); my @memo = (); for (my $i=0; $i<$N; $i++) { my $j = $i+1; # lexical rule: nt -> input[i] for my $nt (keys %{$grammar->{$input->[$i]}}) { push( @{ $chart[$i][$j]->{$nt} } , $input->[$i]); print STDERR "$i, $j: $nt -> $input->[$i]\n"; } # unary rule: nt -> nt1 while (my ($nt, $nt1) = each (%{handle_unary(\@chart, $grammar, $i, $j)})) { push( @{ $chart[$i][$j]->{$nt} } , "$j $nt1 " ); print STDERR "$i, $j: $nt -> $nt1\n"; } } for (my $j = 2; $j <= $N; $j++) { for (my $i = $j-2; $i >= 0; $i--) { # binary rule: nt -> nt1 nt2 for (my $k = $i+1; $k < $j; $k++) { for my $nt1 (keys %{$chart[$i][$k]}) { for my $nt2 (keys %{$chart[$k][$j]}) { for my $nt (keys %{$grammar->{"$nt1 $nt2"}}) { push( @{ $chart[$i][$j]->{$nt} } , "$k $nt1 $nt2" ); print STDERR "$i, $j: $nt -> $nt1 $nt2\n"; } } } } # unary rule: nt -> nt1 while (my ($nt, $nt1) = each (%{handle_unary(\@chart, $grammar, $i, $j)})) { push( @{ $chart[$i][$j]->{$nt} } , "$j $nt1 " ); print STDERR "$i, $j: $nt -> $nt1\n"; } } } return extract(\@chart, 0, $N, $start); } ## handle_unary # Deals with unary rules of the type A->B by repeatedly looking up the # grammar and entering A at chart entry $i,$j if there is a B at # $i,$j. It deals with chains of unary rules correctly (finds them # all, without the possibility of an infinite loop by using an occur # check). It returns a reference to a hash table containing valid # unary rules that are applicable for chart entry at $i, $j. sub handle_unary { my ($chart, $grammar, $i, $j) = @_; my @unary = keys %{$chart->[$i][$j]}; my (%unaryht, $nt1); while (defined ($nt1 = pop(@unary))) { # nt -> nt1 for my $nt (keys %{$grammar->{"$nt1"}}) { # occur check next if (defined $chart->[$i][$j]->{$nt}); $unaryht{$nt} = "$nt1"; push(@unary, $nt); } } return \%unaryht; } ## extract # Starts from $chart[0][$N] and recursively follows the backpointer # value stored in the chart entry. Each backpointer value has the # value of k and the rhs of the cfg rule used. For example, if "A" is # stored in chart entry 0,3 then a possible backpointer value could be # "2 B C" which implies that B(0,2) and C(2,3) was combined by the cky # parser to provide the chart entry of A(0,3) sub extract { my ($chart, $i, $j, $nt) = @_; my @rx = (); return '' if ($nt eq ''); for my $back (@{ $chart->[$i][$j]->{$nt} }) { if ($back !~ /\s+/) { @rx = ("($nt $back)"); } else { my ($k, $nt1, $nt2) = split ' ', $back; my @r1 = extract($chart, $i, $k, $nt1); my @r2 = extract($chart, $k, $j, $nt2); for my $r1 (@r1) { for my $r2 (@r2) { push @rx, "($nt $r1 $r2)"; } } } } return @rx; } ## readcfg # Reads in a file and returns the grammar and start symbol. It uses # make_cnf to convert the grammar into CNF. The grammar is stored into # a hash table called %grammar with the key as the right-hand side of # the cfg rule. The value stored in %grammar is a reference to a hash # table containing the left-hand side. For example, the rule A->B C # would be stored as: $grammar{"B C"}->{"A"}. The start symbol is the # lhs of the first cfg rule in the file. sub readcfg { my ($file) = @_; my %grammar; my $start; my ($fh, $line); open($fh, $file) or die "could not find $file\n"; while ($line = <$fh>) { chomp($line); my @rules = make_cnf(split(/\s+/, $line)); my $sz = scalar @rules; die "$file contains CFG that has an empty rule\n" if ($sz < 2); if ($sz == 2) { print STDERR "$rules[0] $rules[1]\n"; if ($rules[0] eq $rules[1]) { print STDERR "skipping chain rule: $rules[0] $rules[1]\n"; next; } $grammar{$rules[1]}->{$rules[0]} = 1; } else { for (my $i=0; $i<$sz; $i+=3) { print STDERR "$rules[$i] $rules[$i+1] $rules[$i+2]\n"; $grammar{$rules[$i+1] . ' ' . $rules[$i+2]}->{$rules[$i]} = 1; } } if (!defined $start) { $start = $rules[0]; } } close($fh); print STDERR "start=$start\n"; return (\%grammar, $start); } ## make_cnf # Converts the right-hand side of a cfg rule into a binary format by # inserting new non-terminals. It uses the variable $newlhs to create # new non-terminals by using the Perl increment operator which is # guaranteed to create a new name every time. The method is simple, # the cfg rule A->B C D is passed as a list (A, B, C, D) to make_cnf, # and then a new non-terminal is inserted into this list to create (A, # B, a0, a0, C, D) which is interpreted by readcfg as two cfg rules: # A->B a0 and a0->C D sub make_cnf { my (@rules) = @_; return @rules if (2 == (scalar @rules)); my $lhs = shift(@rules); my @tmp = reverse(@rules); my @newrules = (shift(@tmp), shift(@tmp)); for my $i (@tmp) { push(@newrules, ($newlhs, $newlhs, $i)); ++$newlhs; } push(@newrules, $lhs); return reverse(@newrules); } 1;