#!/usr/bin/perl -w # # Copyright (c) 2000 Anoop Sarkar # my $rcsid = '$Id: corpus2forest.pl,v 1.2 2001-05-09 01:28:57 anoop Exp $'; #' ## default values for options my $grammar_file = 'grammar.pl'; my $verbose = 0; my $progname = $0; $progname =~ s|.*/||; my $usage = <" : ""; } else { $nodetype = ""; } if (is_auxtree($treeindex)) { $nodelbl = '<' . $treeroot . '-' . $label[$treeroot] . '/' . $treefoot . '-' . $label[$treefoot] . '>'; } else { $nodelbl = '<' . $treeroot . '-' . $label[$treeroot] . '/>'; } if (!defined $sentence[$windex]) { die "Error: invalid index: $windex for node: $root\n"; } my $header = "%d: $treelabel" . "[" . $sentence[$windex] . "]<" . $root . '-' . $label[$root] . "/%s><->%s" . $nodelbl . "[%s] %s\n"; my $modifchild = -1; my $adjoin = 0; my $modiftype = $ERROR; if (defined $derivation{$root}{$windex}) { my ($newroot, $newindex) = split(' ', $derivation{$root}{$windex}); $counter = dump_tree(int($newroot), int($newindex)); $modifchild = $counter; $adjoin = 1; my $adjtree = $tree[int($newroot)]; if (is_auxtree($adjtree)) { $modiftype = auxtree_type($adjtree, int($newroot)); } else { $modiftype = $SUBST; } } else { $adjoin = 0; } my $lchild = $lchild[$root]; my $rchild = $rchild[$root]; if (!defined $lchild) { if ($root == $treefoot) { printf $header, ++$counter, 'bot', '', '<>', ''; } else { printf $header, ++$counter, 'bot', '', '<>', ''; } } elsif (!defined $rchild) { my $lc = dump_tree($lchild, $windex); printf $header, ++$counter, 'bot', '', '', "[$lc,(nil)]"; } else { my $lc = dump_tree($lchild, $windex); my $rc = dump_tree($rchild, $windex); printf $header, ++$counter, 'bot', '', '', "[$lc,$rc]"; } my $child = $counter; if ($adjoin) { if ($modiftype == $LEFTAUX) { printf $header, ++$counter, 'top', $nodetype, '', "[$modifchild,$child]"; } elsif ($modiftype == $RIGHTAUX) { printf $header, ++$counter, 'top', $nodetype, '', "[$child,$modifchild]"; } else { printf $header, ++$counter, 'top', $nodetype, '', "[$modifchild,$child]"; } } else { printf $header, ++$counter, 'top', $nodetype, '', "[$child,(nil)]"; } return($counter); } sub dump_derivation { if ($#sentence < 0) { return; } print "begin sent=\"" . join(' ', @sentence) . "\"\n"; my ($root, $windex) = split(' ', $derivation{$TOP}{-1}); $counter = 0; $counter = dump_tree(int($root), int($windex)); print "start: $counter []\n"; print "end\n"; } ($user1,$system1,$cuser1,$csystem1) = times; my $ln = 0; open (T, $filename) or die "could not open $filename\n"; while () { chomp; $ln++; next if (/^\s*$/); my @line = split; my $windex = $line[0]; my $depindex = $line[7]; if ($windex eq '0') { dump_derivation(); %derivation = (); @sentence = (); } # word, treename adjoins/substitutes into depword, deptree at # gornaddr (in deptreename) my $word = $line[1]; my $POS = $line[2]; my $treename = $line[3]; push(@sentence, $word); if ($treename eq $PUNCT) { $ispunct{$word} = 1; next; } my $anchor = $line[4]; my $gornaddr = $line[6]; my $depword = $line[8]; my $depPOS = $line[9]; my $deptreename = $line[10]; # pick only the tree_number as the name of the tree (see etrees.y) my $t = $treename; $t =~ s/^.*=(\d+)_.*$/$1/; my $tree = $treename{$t}; my $d = $deptreename; $d =~ s/^.*=(\d+)_.*$/$1/; my $deptree = $treename{$d}; my $root = $rootnode[$tree]; $wordtbl{"$word $headp[$tree]"} = 1; $treetbl{$tree} = 1; # root of the derivation if ($depword eq $TOP) { $start{$root} = 1; if (defined $derivation{$TOP}{-1}) { die "Error: duplicate TOP node at line $ln\n"; } $derivation{$TOP}{-1} = "$root $windex"; next; } my $treenode = find_treenode($deptree, $gornaddr); if (defined $derivation{$treenode}{$depindex}) { die "Error: duplicate node: $treenode and index: $windex at line $ln\n"; } $derivation{$treenode}{$depindex} = "$root $windex"; print STDERR "$word, $POS, $tree, $treenode, $depword, $depPOS, $deptree\n" if ($verbose); } dump_derivation(); close(T); ($user2,$system2,$cuser2,$csystem2) = times; print STDERR "time taken to process corpus:", " user=", $user2-$user1, " system=", $system2-$system1, "\n";