2004-06-08 10:13:17 +02:00
|
|
|
#!/usr/bin/perl
|
|
|
|
#
|
|
|
|
|
|
|
|
# Given a .pdb or .xwd file, print all the words in the DAWG.
|
|
|
|
# Optionally write the values and faces to files whose names are
|
|
|
|
# provided.
|
|
|
|
|
|
|
|
use strict;
|
|
|
|
use Fcntl;
|
|
|
|
|
|
|
|
my $gValueFileName;
|
|
|
|
my $gASCIIFacesFileName;
|
|
|
|
my $gUTFFacesFileName;
|
|
|
|
my $gInFile;
|
|
|
|
my $gFileType;
|
|
|
|
my $gNodeSize;
|
|
|
|
|
|
|
|
sub usage() {
|
|
|
|
print STDERR "USAGE: $0 "
|
|
|
|
. "[-vf valuesFileName] "
|
|
|
|
. "[-fa asciiFacesFileName] "
|
|
|
|
. "[-fu unicodeFacesFileName] "
|
|
|
|
. "xwdORpdb"
|
|
|
|
. "\n";
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
sub parseARGV {
|
|
|
|
|
|
|
|
my $arg;
|
|
|
|
while ( my $arg = shift(@ARGV) ) {
|
|
|
|
|
|
|
|
SWITCH: {
|
|
|
|
if ($arg =~ /-vf/) {$gValueFileName = shift(@ARGV), last SWITCH;}
|
|
|
|
if ($arg =~ /-fa/) {$gASCIIFacesFileName = shift(@ARGV);
|
|
|
|
last SWITCH;}
|
|
|
|
if ($arg =~ /-fu/) {$gUTFFacesFileName = shift(@ARGV);
|
|
|
|
last SWITCH;}
|
|
|
|
|
|
|
|
# Get here it must be the final arg, the input file name.
|
|
|
|
$gInFile = $arg;
|
|
|
|
if ( 0 != @ARGV ) {
|
|
|
|
usage();
|
|
|
|
exit 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( $gInFile =~ m|.xwd$| ) {
|
|
|
|
$gFileType = "xwd";
|
|
|
|
} elsif ( $gInFile =~ m|.pdb$| ) {
|
|
|
|
$gFileType = "pdb";
|
|
|
|
} else {
|
|
|
|
usage();
|
|
|
|
exit 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
} # parseARGV
|
|
|
|
|
|
|
|
sub countSpecials($) {
|
|
|
|
my ( $lref ) = @_;
|
|
|
|
my $count = 0;
|
|
|
|
foreach my $val (@$lref) {
|
|
|
|
if ( ord($val) < 32 ) {
|
|
|
|
++$count;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return $count;
|
|
|
|
} # countSpecials
|
|
|
|
|
|
|
|
sub readFaces($$$) {
|
|
|
|
my ( $fh, $facRef, $nSpecials ) = @_;
|
|
|
|
|
|
|
|
my $buf;
|
|
|
|
my $nRead = sysread( $fh, $buf, 1 );
|
|
|
|
my $nChars = unpack( 'c', $buf );
|
|
|
|
|
|
|
|
my @faces;
|
|
|
|
for ( my $i = 0; $i < $nChars; ++$i ) {
|
|
|
|
my $nRead = sysread( $fh, $buf, 2 );
|
|
|
|
push( @faces, chr(unpack( "n", $buf ) ) );
|
|
|
|
}
|
|
|
|
|
|
|
|
${$nSpecials} = countSpecials( \@faces );
|
|
|
|
@{$facRef} = @faces;
|
|
|
|
return $nChars;
|
|
|
|
} # readFaces
|
|
|
|
|
2004-06-09 06:09:19 +02:00
|
|
|
sub skipBitmap($) {
|
2004-06-08 10:13:17 +02:00
|
|
|
my ( $fh ) = @_;
|
|
|
|
my $buf;
|
|
|
|
sysread( $fh, $buf, 1 );
|
|
|
|
my $nCols = unpack( 'C', $buf );
|
2004-06-09 06:09:19 +02:00
|
|
|
if ( $nCols > 0 ) {
|
|
|
|
sysread( $fh, $buf, 1 );
|
|
|
|
my $nRows = unpack( 'C', $buf );
|
|
|
|
my $nBytes = (($nRows * $nCols) + 7) / 8;
|
|
|
|
|
|
|
|
sysread( $fh, $buf, $nBytes );
|
|
|
|
}
|
|
|
|
} # skipBitmap
|
2004-06-08 10:13:17 +02:00
|
|
|
|
|
|
|
sub getSpecials($$$) {
|
|
|
|
my ( $fh, $nSpecials, $specRef ) = @_;
|
|
|
|
|
|
|
|
my @specials;
|
|
|
|
for ( my $i = 0; $i < $nSpecials; ++$i ) {
|
|
|
|
my $buf;
|
|
|
|
sysread( $fh, $buf, 1 );
|
|
|
|
my $len = unpack( 'C', $buf );
|
|
|
|
sysread( $fh, $buf, $len );
|
|
|
|
push( @specials, $buf );
|
2004-06-09 06:09:19 +02:00
|
|
|
skipBitmap( $fh );
|
|
|
|
skipBitmap( $fh );
|
2004-06-08 10:13:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
@{$specRef} = @specials;
|
|
|
|
} # getSpecials
|
|
|
|
|
2004-06-09 05:55:45 +02:00
|
|
|
sub readNodesToEnd($) {
|
|
|
|
my ( $fh ) = @_;
|
|
|
|
my @nodes;
|
|
|
|
my $count = 0;
|
|
|
|
my $offset = 4 - $gNodeSize;
|
|
|
|
my ( $buf, $nRead );
|
2004-06-08 10:13:17 +02:00
|
|
|
|
2004-06-09 05:55:45 +02:00
|
|
|
do {
|
|
|
|
$nRead = sysread( $fh, $buf, $gNodeSize, $offset );
|
|
|
|
$count += $nRead;
|
|
|
|
my $node = unpack( 'N', $buf );
|
|
|
|
push( @nodes, $node );
|
|
|
|
} while ( $nRead == $gNodeSize );
|
|
|
|
die "out of sync? nRead=$nRead, count=$count" if $nRead != 0;
|
|
|
|
|
|
|
|
return @nodes;
|
|
|
|
} # readNodesToEnd
|
2004-06-08 10:13:17 +02:00
|
|
|
|
2004-06-09 05:55:45 +02:00
|
|
|
sub nodeSizeFromFlags($) {
|
|
|
|
my ( $flags ) = @_;
|
2004-06-08 10:13:17 +02:00
|
|
|
if ( $flags == 2 ) {
|
2004-06-09 05:55:45 +02:00
|
|
|
return 3;
|
2004-06-08 10:13:17 +02:00
|
|
|
} elsif ( $flags == 3 ) {
|
2004-06-09 05:55:45 +02:00
|
|
|
return 4;
|
2004-06-08 10:13:17 +02:00
|
|
|
} else {
|
|
|
|
die "invalid dict flags";
|
|
|
|
}
|
2004-06-09 05:55:45 +02:00
|
|
|
} # nodeSizeFromFlags
|
|
|
|
|
2004-06-09 06:09:19 +02:00
|
|
|
sub mergeSpecials($$) {
|
|
|
|
my ( $facesRef, $specialsRef ) = @_;
|
|
|
|
for ( my $i = 0; $i < @$facesRef; ++$i ) {
|
|
|
|
my $ref = ord($$facesRef[$i]);
|
|
|
|
if ( $ref < 32 ) {
|
|
|
|
$$facesRef[$i] = $$specialsRef[$ref];
|
|
|
|
print STDERR "set $ref to $$specialsRef[$ref]\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2004-06-09 05:55:45 +02:00
|
|
|
sub prepXWD($$$$) {
|
|
|
|
my ( $path, $facRef, $nodesRef, $startRef ) = @_;
|
|
|
|
|
|
|
|
sysopen(INFILE, $path, O_RDONLY) or die "couldn't open $path: $!\n";;
|
|
|
|
binmode INFILE;
|
|
|
|
|
|
|
|
my $buf;
|
|
|
|
my $nRead = sysread( INFILE, $buf, 2 );
|
|
|
|
my $flags = unpack( "n", $buf );
|
|
|
|
|
|
|
|
$gNodeSize = nodeSizeFromFlags( $flags );
|
2004-06-08 10:13:17 +02:00
|
|
|
|
|
|
|
my $nSpecials;
|
2004-06-09 05:55:45 +02:00
|
|
|
my $faceCount = readFaces( *INFILE, $facRef, \$nSpecials );
|
2004-06-08 10:13:17 +02:00
|
|
|
|
|
|
|
# skip xloc header
|
2004-06-09 05:55:45 +02:00
|
|
|
$nRead = sysread( INFILE, $buf, 2 );
|
2004-06-08 10:13:17 +02:00
|
|
|
|
|
|
|
# skip values info.
|
2004-06-09 05:55:45 +02:00
|
|
|
sysread( INFILE, $buf, $faceCount * 2 );
|
2004-06-08 10:13:17 +02:00
|
|
|
|
|
|
|
my @specials;
|
2004-06-09 05:55:45 +02:00
|
|
|
getSpecials( *INFILE, $nSpecials, \@specials );
|
2004-06-09 06:09:19 +02:00
|
|
|
mergeSpecials( $facRef, \@specials );
|
2004-06-08 10:13:17 +02:00
|
|
|
|
2004-06-09 05:55:45 +02:00
|
|
|
sysread( INFILE, $buf, 4 );
|
2004-06-08 10:13:17 +02:00
|
|
|
$$startRef = unpack( 'N', $buf );
|
|
|
|
|
2004-06-09 05:55:45 +02:00
|
|
|
my @nodes = readNodesToEnd( *INFILE );
|
|
|
|
|
|
|
|
close INFILE;
|
2004-06-08 10:13:17 +02:00
|
|
|
|
|
|
|
@$nodesRef = @nodes;
|
|
|
|
} # prepXWD
|
|
|
|
|
2004-06-09 05:55:45 +02:00
|
|
|
sub prepPDB($$$$) {
|
|
|
|
my ( $path, $facRef, $nodesRef, $startRef ) = @_;
|
|
|
|
|
|
|
|
$$startRef = 0; # always for palm?
|
|
|
|
|
|
|
|
sysopen(INFILE, $path, O_RDONLY) or die "couldn't open $path: $!\n";;
|
|
|
|
binmode INFILE;
|
|
|
|
|
|
|
|
my $buf;
|
|
|
|
# skip header info
|
|
|
|
my $nRead = sysread( INFILE, $buf, 76 );
|
|
|
|
$nRead += sysread( INFILE, $buf, 2 );
|
|
|
|
my $nRecs = unpack( 'n', $buf );
|
|
|
|
|
|
|
|
my @offsets;
|
|
|
|
for ( my $i = 0; $i < $nRecs; ++$i ) {
|
|
|
|
$nRead += sysread( INFILE, $buf, 4 );
|
|
|
|
push( @offsets, unpack( 'N', $buf ) );
|
|
|
|
$nRead += sysread( INFILE, $buf, 4 ); # skip
|
|
|
|
}
|
|
|
|
|
|
|
|
die "too far" if $nRead > $offsets[0];
|
|
|
|
while ( $nRead < $offsets[0] ) {
|
2004-06-09 06:09:19 +02:00
|
|
|
$nRead += sysread( INFILE, $buf, 1 );
|
2004-06-09 05:55:45 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
my $facesOffset = $offsets[1];
|
|
|
|
my $nChars = ($offsets[2] - $facesOffset) / 2;
|
|
|
|
$nRead += sysread( INFILE, $buf, $facesOffset - $nRead );
|
|
|
|
my @tmp = unpack( 'Nccccccn', $buf );
|
|
|
|
$gNodeSize = nodeSizeFromFlags( $tmp[7] );
|
|
|
|
|
|
|
|
my @faces;
|
|
|
|
for ( my $i = 0; $i < $nChars; ++$i ) {
|
|
|
|
$nRead += sysread( INFILE, $buf, 2 );
|
|
|
|
push( @faces, chr(unpack( "n", $buf ) ) );
|
|
|
|
}
|
|
|
|
@{$facRef} = @faces;
|
|
|
|
|
|
|
|
die "out of sync: $nRead != $offsets[2]" if $nRead != $offsets[2];
|
|
|
|
|
|
|
|
# now skip count and values. We'll want to get the "specials"
|
|
|
|
# shortly.
|
|
|
|
$nRead += sysread( INFILE, $buf, $offsets[3] - $nRead );
|
|
|
|
|
|
|
|
die "out of sync" if $nRead != $offsets[3];
|
|
|
|
my @nodes = readNodesToEnd( *INFILE );
|
|
|
|
close INFILE;
|
|
|
|
|
|
|
|
@$nodesRef = @nodes;
|
|
|
|
} # prepPDB
|
|
|
|
|
2004-06-08 10:13:17 +02:00
|
|
|
sub parseNode($$$$$) {
|
|
|
|
my ( $node, $chrIndex, $nextEdge, $accepting, $last ) = @_;
|
|
|
|
|
|
|
|
if ( $gNodeSize == 4 ) {
|
|
|
|
$$accepting = ($node & 0x00008000) != 0;
|
|
|
|
$$last = ($node & 0x00004000) != 0;
|
|
|
|
$$chrIndex = ($node & 0x00003f00) >> 8;
|
|
|
|
$$nextEdge = ($node >> 16) + (($node & 0x000000FF) << 16);
|
|
|
|
} elsif( $gNodeSize == 3 ) {
|
|
|
|
$$accepting = ($node & 0x00000080) != 0;
|
|
|
|
$$last = ($node & 0x00000040) != 0;
|
|
|
|
$$chrIndex = $node & 0x0000001f;
|
|
|
|
$$nextEdge = ($node >> 8) + (($node & 0x00000020) << 11);
|
|
|
|
}
|
|
|
|
|
|
|
|
# printf "%x: acpt=$$accepting; last=$$last; "
|
|
|
|
# . "next=$$nextEdge; ci=$$chrIndex\n", $node;
|
|
|
|
} # parseNode
|
|
|
|
|
2004-06-09 06:09:19 +02:00
|
|
|
sub printStr($$) {
|
|
|
|
my ( $strRef, $facesRef ) = @_;
|
2004-06-08 10:13:17 +02:00
|
|
|
|
2004-06-09 06:09:19 +02:00
|
|
|
print join( "", map {$$facesRef[$_]} @$strRef), "\n";
|
2004-06-08 10:13:17 +02:00
|
|
|
} # printStr
|
|
|
|
|
|
|
|
|
|
|
|
# Given an array of 4-byte nodes, a start index. and another array of
|
|
|
|
# two-byte faces, print out all of the words in the nodes array.
|
|
|
|
|
|
|
|
sub printDAWGInternal($$$$) {
|
|
|
|
my ( $str, $arrRef, $start, $facesRef ) = @_;
|
|
|
|
|
|
|
|
for ( ; ; ) {
|
|
|
|
my $node = $$arrRef[$start++];
|
|
|
|
my $nextEdge;
|
|
|
|
my $chrIndex;
|
|
|
|
my $accepting;
|
|
|
|
my $lastEdge;
|
|
|
|
|
|
|
|
parseNode( $node, \$chrIndex, \$nextEdge, \$accepting, \$lastEdge );
|
|
|
|
|
2004-06-09 06:09:19 +02:00
|
|
|
push( @$str, $chrIndex );
|
2004-06-09 05:55:45 +02:00
|
|
|
if ( $accepting ) {
|
2004-06-09 06:09:19 +02:00
|
|
|
printStr( $str, $facesRef );
|
2004-06-09 05:55:45 +02:00
|
|
|
}
|
2004-06-08 10:13:17 +02:00
|
|
|
|
|
|
|
if ( $nextEdge != 0 ) {
|
|
|
|
printDAWGInternal( $str, $arrRef, $nextEdge, $facesRef );
|
|
|
|
}
|
|
|
|
|
|
|
|
pop( @$str );
|
|
|
|
|
|
|
|
if ( $lastEdge ) {
|
|
|
|
last;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} # printDAWGInternal
|
|
|
|
|
|
|
|
sub printDAWG($$$) {
|
|
|
|
my ( $arrRef, $start, $facesRef ) = @_;
|
|
|
|
|
2004-06-09 05:55:45 +02:00
|
|
|
die "no nodes!!!" if 0 == @$arrRef;
|
|
|
|
|
2004-06-08 10:13:17 +02:00
|
|
|
my @str;
|
|
|
|
printDAWGInternal( \@str, $arrRef, $start, $facesRef );
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#################################################################
|
|
|
|
# main
|
|
|
|
#################################################################
|
|
|
|
|
|
|
|
|
|
|
|
if ( !parseARGV() ) {
|
|
|
|
usage();
|
|
|
|
exit 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
my @faces;
|
|
|
|
my @nodes;
|
|
|
|
my $startIndex;
|
|
|
|
|
|
|
|
if ( $gFileType eq "xwd" ){
|
2004-06-09 05:55:45 +02:00
|
|
|
prepXWD( $gInFile, \@faces, \@nodes, \$startIndex );
|
2004-06-08 10:13:17 +02:00
|
|
|
} elsif ( $gFileType eq "pdb" ) {
|
2004-06-09 05:55:45 +02:00
|
|
|
prepPDB( $gInFile, \@faces, \@nodes, \$startIndex );
|
2004-06-09 06:09:19 +02:00
|
|
|
print STDERR join( ",", @faces), "\n";
|
2004-06-08 10:13:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
printDAWG( \@nodes, $startIndex, \@faces );
|
2004-06-09 06:09:19 +02:00
|
|
|
|
|
|
|
if ( $gASCIIFacesFileName ) {
|
|
|
|
open FACES, "> $gASCIIFacesFileName";
|
|
|
|
foreach my $face (@faces) {
|
|
|
|
print FACES pack('cc', 0, $face );
|
|
|
|
}
|
|
|
|
close FACES;
|
|
|
|
}
|