xwords/xwords4/dawg/dict2dawg.pl

#!/usr/bin/perl

##############################################################################
# adapted from C++ code Copyright (C) 2000 Falk Hueffner
# This version Copyright (C) 2002 Eric House (fixin@peak.org)
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
# USA
##############################################################################

# inputs: 0. Name of file mapping letters to 0..31 values.  In English
# case just contains A..Z.  This will be used to translate the tries
# on output.
#         1. Max number of bytes per binary output file.
#
#         2. Basename of binary files for output.

#         3. Name of file to which to write the number of the
# startNode, since I'm not rewriting a bunch of code to expect Falk's
# '*' node at the start.
#

#         In STDIN, the text file to be compressed.  It absolutely
# must be sorted.  The sort doesn't have to follow the order in the
# map file, however.

# This is meant eventually to be runnable as part of a cgi system for
# letting users generating Crosswords dicts online.


use strict;
use POSIX;

my $gFirstDiff;
my @gCurrentWord;
my $gCurrentWord;               # save so can check for sortedness
my $gDone = 0;
my @gInputStrings;
my $gNeedsSort = 1;             # read from cmd line eventually
my @gNodes;                       # final array of nodes
my $gNBytesPerOutfile = 0xFFFFFFFF;
my $gTableFile;
my $gOutFileBase;
my $gStartNodeOut;
my $gInFileName;
my $gKillIfMissing = 1;
my $gTermChar = '/n';
my $gDumpText = 0;              # dump the dict as text after?
my $gCountFile;
my $gBytesPerNodeFile;          # where to write whether node size 3 or 4
my $gWordCount = 0;
my %gTableHash;
my @gRevMap;
my $debug = 0;
my %gSubsHash;
my $gForceFour = 0;             # use four bytes regardless of need?
my $gNBytesPerNode;
my $gUseUnicode;

main();

exit;

##############################################################################

sub main() { 

    if ( !parseARGV() ) {
        usage();
        exit();
    }

    makeTableHash();

    my $infile;

    if ( $gInFileName ) {
        open $infile, "<$gInFileName";
    } else {
        $infile = \*STDIN;
    }

    @gInputStrings = parseAndSort( $infile );
    if ( $gInFileName ) {
        close $infile;
    }

    # Do I need this stupid thing?  Better to move the first row to
    # the front of the array and patch everything else.  Or fix the
    # non-palm dictionary format to include the offset of the first
    # node.

    my $dummyNode = 0xFFFFFFFF;
    @gNodes = ( $dummyNode );
    
    readNextWord();

    my $firstRootChildOffset = buildNode(0);

    moveTopToFront( \$firstRootChildOffset );

    if ( $gStartNodeOut ) {
        writeOutStartNode( $gStartNodeOut, $firstRootChildOffset );
    }

    print STDERR "\n... dumping table ...\n" if $debug;
    printNodes( \@gNodes, "done with main" ) if $debug;

    # write out the number of nodes if requested
    if ( $gCountFile ) {
        open OFILE, "> $gCountFile";
        print OFILE pack( "N", $gWordCount );
        close OFILE;
        print STDERR "wrote out: got $gWordCount words\n";
    }

    if ( $gOutFileBase ) {
        emitNodes( $gNBytesPerOutfile, $gOutFileBase );
    }

    if ( $gDumpText && @gNodes > 0 ) {
        printOneLevel( $firstRootChildOffset, "" );
    }

    if ( $gBytesPerNodeFile ) {
        open OFILE, "> $gBytesPerNodeFile";
        print OFILE $gNBytesPerNode;
        close OFILE;
    }
    print STDERR "Used $gNBytesPerNode per node.\n";
} # main

# We now have an array of nodes with the last subarray being the
# logical top of the tree.  Move them to the start, fixing all fco
# refs, so that legacy code like Palm can assume top==0.
#
# Note: It'd probably be a bit faster to integrate this with emitNodes
# -- unless I need to have an in-memory list that can be used for
# lookups.  But that's best for debugging, so keep it this way for now.
#
# Also Note: the first node is a dummy that can and should be tossed
# now.

sub moveTopToFront($) {
    my ( $firstRef ) = @_;

    my $firstChild = ${$firstRef};
    ${$firstRef} = 0;
    my @lastSub;

    if ( $firstChild > 0 ) {
        # remove the last (the root) subarray
        @lastSub = splice( @gNodes, $firstChild );
    } else {
        die "there should be no words!!" if $gWordCount != 0;
    }
    # remove the first (garbage) node
    shift @gNodes;

    my $diff;
    if ( $firstChild > 0 ) {
        # -1 because all move down by 1; see prev line
        $diff = @lastSub - 1;
        die "something wrong with len\n" if $diff < 0;
    } else {
        $diff = 0;
    }

    # stick it on the front
    splice( @gNodes, 0, 0, @lastSub);

    # We add $diff to everything. There's no subtracting because
    # nobody had any refs to the top list.

    for ( my $i = 0; $i < @gNodes; ++$i ) {
        my $fco = TrieNodeGetFirstChildOffset( $gNodes[$i] );
        if ( $fco != 0 ) {      # 0 means NONE, not 0th!!
            TrieNodeSetFirstChildOffset( \$gNodes[$i], $fco+$diff );
        }
    }
} # moveTopToFront


sub buildNode {
    my ( $depth ) = @_;

    if ( @gCurrentWord == $depth ) {
        # End of word reached. If the next word isn't a continuation
        # of the current one, then we've reached the bottom of the
        # recursion tree.
        readNextWord();
        if ($gFirstDiff < $depth || $gDone) {
            return 0;
        }
    }

    my @newedges;

    do {
        my $letter = $gCurrentWord[$depth];
        my $isTerminal = @gCurrentWord - 1 == $depth ? 1:0;

        my $nodeOffset = buildNode($depth+1);
        my $newNode = MakeTrieNode($letter, $isTerminal, $nodeOffset);
        push( @newedges, $newNode );

    } while ( ($gFirstDiff == $depth) && !$gDone);

    TrieNodeSetIsLastSibling( \@newedges[@newedges-1], 1 );

    return addNodes( \@newedges );
} # buildNode

sub addNodes {
    my ( $newedgesR ) = @_;

    my $found = findSubArray( $newedgesR );

    if ( $found >= 0 ) {
        die "0 is an invalid match!!!" if $found == 0;
        return $found;
    } else {

        my $firstFreeIndex = @gNodes;

        print STDERR "adding...\n" if $debug;
        printNodes( $newedgesR ) if $debug;

        push @gNodes, (@{$newedgesR});

        registerSubArray( $newedgesR, $firstFreeIndex );
        return $firstFreeIndex;
    }
} # addNodes

sub printNode {
    my ( $index, $node ) = @_;

    print STDERR "[$index] ";

    printf( STDERR
            "letter=%d; isTerminal=%d; isLastSib=%d; fco=%d;\n",
            TrieNodeGetLetter($node),
            TrieNodeGetIsTerminal($node),
            TrieNodeGetIsLastSibling($node),
            TrieNodeGetFirstChildOffset($node));
} # printNode

sub printNodes {
    my ( $nodesR, $name ) = @_;

    my $len = @{$nodesR};
    # print "printNodes($name): len = $len\n";

    for ( my $i = 0; $i < $len; ++$i ) {
        my $node = ${$nodesR}[$i];
        printNode( $i, $node );
    }

}


# Hashing.  We'll keep a hash of offsets into the existing nodes
# array, and as the key use a string that represents the entire sub
# array.  Since the key is what we're matching for, there should never
# be more than one value per hash and so we don't need buckets.
# Return -1 if there's no match.

sub findSubArray {
   my ( $newedgesR ) = @_;

	my $key = join('', @{$newedgesR});

    if ( exists( $gSubsHash{$key} ) ) {
        return $gSubsHash{$key};
    } else {
        return -1;
    }
} # findSubArray

# add to the hash
sub registerSubArray {
    my ( $edgesR, $nodeLoc ) = @_;

    my $key = join( '', @{$edgesR} );

    if ( exists $gSubsHash{$key} ) {
        die "entry for key shouldn't exist!!";
    } else {
        $gSubsHash{$key} = $nodeLoc;
    }

} # registerSubArray

sub readNextWord() {
    my @word;

    if ( !$gDone ) {
        $gDone = @gInputStrings == 0;
        if ( !$gDone ) {
            @word = @{shift @gInputStrings};
        } else {
            print STDERR "gDone set to true\n" if $debug;
        }

        print STDERR "got word: ", join(',',@word), "\n" if $debug;
    }
    my $numCommonLetters = 0;
    my $len = @word;
    if ( @gCurrentWord < $len ) {
        $len = @gCurrentWord;
    }

    while ( @gCurrentWord[$numCommonLetters] eq @word[$numCommonLetters]
            && $numCommonLetters < $len) {
        ++$numCommonLetters;
    }

    $gFirstDiff = $numCommonLetters;
    die "words ", join(",",@gCurrentWord), " and ", join(",", @word), " out of order" if #$debug && 
        @gCurrentWord > 0 && @word > 0
        && !firstBeforeSecond( \@gCurrentWord, \@word );
    @gCurrentWord = @word;
} # readNextWord

sub firstBeforeSecond {
    my ( $firstR, $secondR ) = @_;
  
    for ( my $i = 0; ; ++$i ) {

        # if we reach the end of the first word/list, we're done.
        if ( $i == @{$firstR} ) {  
            die "duplicate!!!" if $i == @{$secondR};
            return 1;
            # but if we reach the second end first, we've failed
        } elsif ( $i == @{$secondR} ) {
            return 0;
        }

        my $diff = ${$firstR}[$i] <=> ${$secondR}[$i];

        if ( $diff == 0 ) {
            next;
        } else {
            return $diff < 0;
        }
    }
} # firstBeforeSecond

# passed to sort.  Should remain unprototyped for effeciency's sake

sub cmpWords {

    my $lenA = @{$a};
    my $lenB = @{$b};
    my $min = $lenA > $lenB? $lenB: $lenA;

    for ( my $i = 0; $i < $min; ++$i ) {
        my $ac = ${$a}[$i];
        my $bc = ${$b}[$i];

        my $res = $ac <=> $bc;

        if ( $res != 0 ) {
            return $res;        # we're done
        }
    }

    # If we got here, they match up to their common length.  Longer is
    # greater.
    my $res = @{$a} <=> @{$b};
    return $res; # which is longer?
} # cmpWords

sub parseAndSort() {
    my ( $infile ) = @_;

    my @wordlist;
    my @word;

  WORDLOOP:
    for ( ; ; ) {

        my $dropWord = 0;
        splice @word;           # empty it

        # for each byte
        for ( ; ; ) {
            my $byt = getc($infile);

            if ( $byt eq undef ) {
                last WORDLOOP;
            } elsif ( $byt eq $gTermChar ) {
                if ( !$dropWord ) {
                    push @wordlist, [ @word ];
                    ++$gWordCount;
                }
                next WORDLOOP;
            } elsif ( exists( $gTableHash{$byt} ) ) {
                if ( !$dropWord ) {
                    push @word, $gTableHash{$byt};
                    die "word too long" if @word > 15;
                }
            } elsif ($gKillIfMissing) {
                die "$0: chr $byt (", $byt+0, ") not in map file $gTableFile\n";
            } else {
                $dropWord = 1;
                splice @word;     # lose anything we already have
            }
        }
    }

    if ( $gNeedsSort && ($gWordCount > 0)  ) {
        @wordlist = sort cmpWords @wordlist;
    }

    print STDERR "length of list is ", @wordlist + 0, ".\n" if $debug;

    return @wordlist;
} # parseAndSort

# Print binary representation of trie array.  This isn't used yet, but
# eventually it'll want to dump to multiple files appropriate for Palm
# that can be catenated together on other platforms.  There'll need to
# be a file giving the offset of the first node too.  Also, might want
# to move to 4-byte representation when the input can't otherwise be
# handled.

sub dumpNodes {

    for ( my $i = 0; $i < @gNodes; ++$i ) {
        my $node = $gNodes[$i];
        my $bstr = pack( "I", $node );
        print STDOUT $bstr;
    }
}

##############################################################################
# Little node-field setters and getters to hide what bits represent
# what.
##############################################################################

sub TrieNodeSetIsTerminal {
    my ( $nodeR, $isTerminal ) = @_;

    if ( $isTerminal ) {
        ${$nodeR} |= 1 << 31;
    } else {
        ${$nodeR} &= ~(1 << 31);
    }
}

sub TrieNodeGetIsTerminal {
    my ( $node ) = @_;
    return ($node & 1 << 31) != 0;
}

sub TrieNodeSetIsLastSibling {
    my ( $nodeR, $isLastSibling ) = @_;
    if ( $isLastSibling ) {
        ${$nodeR} |= 1 << 30;
    } else {
        ${$nodeR} &= ~(1 << 30);
    }
}

sub TrieNodeGetIsLastSibling {
    my ( $node ) = @_;
    return ($node & 1 << 30) != 0;
}

sub TrieNodeSetLetter {
    my ( $nodeR, $letter ) = @_;

    die "letter ", $letter, " too big" if $letter >= 32;

    my $mask = ~(0x1F << 25);
    ${$nodeR} &= $mask;                         # clear all the bits
    ${$nodeR} |= ($letter << 25);          # set new ones
}

sub TrieNodeGetLetter {
    my ( $node ) = @_;
    $node >>= 25;
    $node &= 0x1F;
    return $node;
}

sub TrieNodeSetFirstChildOffset {
    my ( $nodeR, $fco ) = @_;

    die "$fco larger than 25 bits" if ($fco & 0xFE000000) != 0;

    my $mask = ~0x01FFFFFF;
    ${$nodeR} &= $mask;                   # clear all the bits
    ${$nodeR} |= $fco;                    # set new ones
}

sub TrieNodeGetFirstChildOffset {
    my ( $node ) = @_;
    $node &= 0x01FFFFFF;                  # 24 bits
    return $node;
}


sub MakeTrieNode {
    my ( $letter, $isTerminal, $firstChildOffset, $isLastSibling ) = @_;
    my $result = 0;

    TrieNodeSetIsTerminal( \$result, $isTerminal );
    TrieNodeSetIsLastSibling( \$result, $isLastSibling );
    TrieNodeSetLetter( \$result, $letter );
    TrieNodeSetFirstChildOffset( \$result, $firstChildOffset );

    return $result;
} # MakeTrieNode

# Caller may need to know the offset of the first top-level node.
# Write it here.
sub writeOutStartNode {
    my ( $startNodeOut, $firstRootChildOffset ) = @_;

    open NODEOUT, ">$startNodeOut";
    print NODEOUT pack( "N", $firstRootChildOffset );
    close NODEOUT;
} # writeOutStartNode

# build the hash for translating.  I'm using a hash assuming it'll be
# fast.  Key is the letter; value is the 0..31 value to be output.
sub makeTableHash {
    my $i;
    open TABLEFILE, "< $gTableFile";

    splice @gRevMap;            # empty it

    for ( $i = 0; ; ++$i ) {
        my $ch = getc(TABLEFILE);
        if ( $ch eq undef ) {
            last;
        }

        if ( $gUseUnicode ) {   # skip the first byte each time: tmp HACK!!!
            $ch = getc(TABLEFILE);
        }
        if ( $ch eq undef ) {
            last;
        }

        push @gRevMap, $ch;

        if ( ord($ch) == 0 ) {	# blank
            next;       # we want to increment i when blank seen since 
                        # it is a tile value
        }

        die "$gTableFile too large\n" if $i > 32;
        die "only blank (0) can be 32nd char\n" if ($i == 32 && $ch != 0);

        $gTableHash{$ch} = $i;
    }

    close TABLEFILE;
} # makeTableHash

# emitNodes. "input" is $gNodes.  From it we write up to
# $nBytesPerOutfile to files named $outFileBase0..n, mapping the
# letter field down to 5 bits with a hash built from $tableFile.  If
# at any point we encounter a letter not in the hash we fail with an
# error.

sub emitNodes($$) {
    my ( $gNBytesPerOutfile, $outFileBase ) = @_;

    # now do the emit.

    # is 17 bits enough?
    printf STDOUT ("There are %d (0x%x) nodes in this DAWG.\n",
                   0 + @gNodes, 0 + @gNodes );
    if ( @gNodes > 0x1FFFF || $gForceFour ) {
        $gNBytesPerNode = 4;
    } else {
        $gNBytesPerNode = 3;
    }

    my $nextIndex = 0;
    my $nextFileNum = 0;

    for ( $nextFileNum = 0; ; ++$nextFileNum ) {

        if ( $nextIndex >= @gNodes ) {
            last;	# we're done
        }
        
        die "Too many outfiles; infinite loop?" if $nextFileNum > 99;

        my $outName = sprintf("${outFileBase}_%03d.bin", $nextFileNum);
        open OUTFILE, "> $outName";
        my $curSize = 0;

        while ( $nextIndex < @gNodes ) {

            # scan to find the next terminal
            my $i;
            for ( $i = $nextIndex; 
                  !TrieNodeGetIsLastSibling($gNodes[$i]);
                  ++$i ) {

                # do nothing but a sanity check
                if ( $i >= @gNodes) {
                    die "bad trie format: last node not last sibling" ;
                }

            }
            ++$i;	# move beyond the terminal
            my $nextSize = ($i - $nextIndex) * $gNBytesPerNode;
            if ($curSize + $nextSize > $gNBytesPerOutfile) {
                last;
            } else {
                # emit the subarray
                while ( $nextIndex < $i ) {
                    outputNode( $gNodes[$nextIndex], $gNBytesPerNode,
                                \*OUTFILE );
                    ++$nextIndex;
                }
                $curSize += $nextSize;
            }
        }

        close OUTFILE;
    }

} # emitNodes

sub printWord {
    my ( $str ) = @_;

    print STDERR "$str\n";
}

# print out the entire dictionary, as text, to STDERR.

sub printOneLevel {

    my ( $index, $str ) = @_;

    for ( ; ; ) {

        my $newStr = $str;
        my $node = $gNodes[$index++];

        my $lindx = $gRevMap[TrieNodeGetLetter($node)];

        if ( ord($lindx) >= 0x20 ) {
            $newStr .= "$lindx";
        } else {
            print STDERR "sub space" if $debug;
            $newStr .= "\\" . chr('0'+$lindx);
        }

        if ( TrieNodeGetIsTerminal($node) ) {
            printWord( $newStr );
        } 

        my $fco = TrieNodeGetFirstChildOffset( $node );
        if ( $fco != 0 ) {
            printOneLevel( $fco, $newStr );
        }

        if ( TrieNodeGetIsLastSibling($node) ) {
            last;
        }
    }
}

sub outputNode {
    my ( $node, $nBytes, $outfile ) = @_;

    my $fco = TrieNodeGetFirstChildOffset($node);
    my $fourthByte;

    if ( $nBytes == 4 ) {
        $fourthByte = $fco >> 17;
        die "fco too big" if $fourthByte > 0xFF;
        $fco &= 0x1FFFF;
    }

    # format according to dawg.h:
    # typedef struct array_edge {
    #     unsigned char highByte;
    #     unsigned char lowByte;
    #     unsigned char bits;
#ifdef FOUR_BYTE
    #     unsigned char moreBits;
#endif
    # } array_edge;

    # define LETTERMASK 0x1f
    # define ACCEPTINGMASK 0x20
    # define LASTEDGEMASK 0x40
    # define LASTBITMASK 0x80

    # write the fco (less that one bit).  We want two bytes worth
    # in three-byte mode, and three in four-byte mode (which is
    # untested)
    for ( my $i = 1; $i >= 0; --$i ) {
        my $tmp = ($fco >> ($i * 8)) & 0xFF;
        print $outfile pack( "C", $tmp );
    }
    $fco >>= 16;                # it should now be 1 or 0
    die "fco not 1 or 0" if $fco > 1;

    my $chIn5 = TrieNodeGetLetter($node);
    my $bits = $chIn5;

    if ( TrieNodeGetIsLastSibling($node) ) {
        $bits |= 0x40;
    }
    if ( TrieNodeGetIsTerminal($node) ) {
        $bits |= 0x20;
    }
    if ( $fco != 0 ) {
        $bits |= 0x80;
    }
    print $outfile pack( "C", $bits );

    # the final byte, if in use
    if ( $nBytes == 4 ) {
        print $outfile pack( "C", $fourthByte );
    }
} # outputNode

sub usage {
    print STDERR "usage: $0 \n"
        . "\t[-b    bytesPerFile] (default = 0xFFFFFFFF)\n"
        . "\t-m     mapFile\n"
        . "\t-ob    outFileBase\n"
        . "\t-sn    start node out file\n"
        . "\t[-if   input file name]  -- default = stdin\n"
        . "\t[-term ch] (word terminator -- default = '\\0'\n"
        . "\t[-nosort] (input already sorted in accord with -m; " .
        " default=sort'\n"
        . "\t[-dump]  (write dictionary as text to STDERR for testing)\n"
        . "\t[-force4](use 4 bytes per node regardless of need)\n"
        . "\t[-r]     (reject words with letters not in mapfile)\n"
        . "\t[-k]     (kill if any letters no in mapfile -- default)\n"
        ;

} # usage

sub parseARGV {

    my $arg;
    while ( my $arg = shift(@ARGV) ) {

      SWITCH: {
          if ($arg =~ /-b/) {$gNBytesPerOutfile = shift(@ARGV), last SWITCH;}
          if ($arg =~ /-mn/) {$gTableFile = shift(@ARGV);
                              $gUseUnicode = 1;
                              last SWITCH;}
          if ($arg =~ /-m/) {$gTableFile = shift(@ARGV); last SWITCH;}
          if ($arg =~ /-ob/) {$gOutFileBase = shift(@ARGV), last SWITCH;}
          if ($arg =~ /-sn/) {$gStartNodeOut = shift(@ARGV), last SWITCH;}
          if ($arg =~ /-if/) {$gInFileName = shift(@ARGV), last SWITCH;}
          if ($arg =~ /-r/) {$gKillIfMissing = 0; last SWITCH;}
          if ($arg =~ /-k/) {$gKillIfMissing = 1; last SWITCH;}
          if ($arg =~ /-term/) {$gTermChar = chr(shift(@ARGV)); last SWITCH;}
          if ($arg =~ /-dump/) {$gDumpText = 1; last SWITCH;}
          if ($arg =~ /-nosort/) {$gNeedsSort = 0; last SWITCH;}
          if ($arg =~ /-wc/) {$gCountFile = shift(@ARGV); last SWITCH;}
          if ($arg =~ /-ns/) {$gBytesPerNodeFile = shift(@ARGV); last SWITCH;}
          if ($arg =~ /-force4/) {$gForceFour = 1; last SWITCH;}
          die "unexpected arg $arg\n";
      }
    }


    print STDERR "gNBytesPerOutfile=$gNBytesPerOutfile\n" if $debug;
    print STDERR "gTableFile=$gTableFile\n" if $debug;
    print STDERR "gOutFileBase=$gOutFileBase\n" if $debug;
    print STDERR "gStartNodeOut=$gStartNodeOut\n" if $debug;
    printf STDERR "gTermChar=%s(%d)\n", $gTermChar, ord($gTermChar) if $debug;

    return $gTableFile;
    
} # parseARGV
first checkin from personal project 2003-11-01 07:19:20 +01:00			`#!/usr/bin/perl`

			`##############################################################################`
			`# adapted from C++ code Copyright (C) 2000 Falk Hueffner`
			`# This version Copyright (C) 2002 Eric House (fixin@peak.org)`
			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation; either version 2 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful, but`
			`# WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`# General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program; if not, write to the Free Software`
			`# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307`
			`# USA`
			`##############################################################################`

			`# inputs: 0. Name of file mapping letters to 0..31 values. In English`
			`# case just contains A..Z. This will be used to translate the tries`
			`# on output.`
			`# 1. Max number of bytes per binary output file.`
			`#`
			`# 2. Basename of binary files for output.`

			`# 3. Name of file to which to write the number of the`
			`# startNode, since I'm not rewriting a bunch of code to expect Falk's`
			`# '*' node at the start.`
			`#`

			`# In STDIN, the text file to be compressed. It absolutely`
			`# must be sorted. The sort doesn't have to follow the order in the`
			`# map file, however.`

			`# This is meant eventually to be runnable as part of a cgi system for`
			`# letting users generating Crosswords dicts online.`



			`use strict;`
			`use POSIX;`

			`my $gFirstDiff;`
			`my @gCurrentWord;`
			`my $gCurrentWord; # save so can check for sortedness`
			`my $gDone = 0;`
			`my @gInputStrings;`
			`my $gNeedsSort = 1; # read from cmd line eventually`
			`my @gNodes; # final array of nodes`
			`my $gNBytesPerOutfile = 0xFFFFFFFF;`
			`my $gTableFile;`
			`my $gOutFileBase;`
			`my $gStartNodeOut;`
			`my $gInFileName;`
			`my $gKillIfMissing = 1;`
			`my $gTermChar = '/n';`
			`my $gDumpText = 0; # dump the dict as text after?`
			`my $gCountFile;`
			`my $gBytesPerNodeFile; # where to write whether node size 3 or 4`
			`my $gWordCount = 0;`
			`my %gTableHash;`
			`my @gRevMap;`
			`my $debug = 0;`
			`my %gSubsHash;`
			`my $gForceFour = 0; # use four bytes regardless of need?`
			`my $gNBytesPerNode;`
			`my $gUseUnicode;`

			`main();`

			`exit;`

			`##############################################################################`

			`sub main() {`

			`if ( !parseARGV() ) {`
			`usage();`
			`exit();`
			`}`

			`makeTableHash();`

			`my $infile;`

			`if ( $gInFileName ) {`
			`open $infile, "<$gInFileName";`
			`} else {`
			`$infile = \*STDIN;`
			`}`

			`@gInputStrings = parseAndSort( $infile );`
			`if ( $gInFileName ) {`
			`close $infile;`
			`}`

			`# Do I need this stupid thing? Better to move the first row to`
			`# the front of the array and patch everything else. Or fix the`
			`# non-palm dictionary format to include the offset of the first`
			`# node.`

			`my $dummyNode = 0xFFFFFFFF;`
			`@gNodes = ( $dummyNode );`

			`readNextWord();`

			`my $firstRootChildOffset = buildNode(0);`

			`moveTopToFront( \$firstRootChildOffset );`

			`if ( $gStartNodeOut ) {`
			`writeOutStartNode( $gStartNodeOut, $firstRootChildOffset );`
			`}`

			`print STDERR "\n... dumping table ...\n" if $debug;`
			`printNodes( \@gNodes, "done with main" ) if $debug;`

			`# write out the number of nodes if requested`
			`if ( $gCountFile ) {`
			`open OFILE, "> $gCountFile";`
			`print OFILE pack( "N", $gWordCount );`
			`close OFILE;`
			`print STDERR "wrote out: got $gWordCount words\n";`
			`}`

			`if ( $gOutFileBase ) {`
			`emitNodes( $gNBytesPerOutfile, $gOutFileBase );`
			`}`

			`if ( $gDumpText && @gNodes > 0 ) {`
			`printOneLevel( $firstRootChildOffset, "" );`
			`}`

			`if ( $gBytesPerNodeFile ) {`
			`open OFILE, "> $gBytesPerNodeFile";`
			`print OFILE $gNBytesPerNode;`
			`close OFILE;`
			`}`
			`print STDERR "Used $gNBytesPerNode per node.\n";`
			`} # main`

			`# We now have an array of nodes with the last subarray being the`
			`# logical top of the tree. Move them to the start, fixing all fco`
			`# refs, so that legacy code like Palm can assume top==0.`
			`#`
			`# Note: It'd probably be a bit faster to integrate this with emitNodes`
			`# -- unless I need to have an in-memory list that can be used for`
			`# lookups. But that's best for debugging, so keep it this way for now.`
			`#`
			`# Also Note: the first node is a dummy that can and should be tossed`
			`# now.`

			`sub moveTopToFront($) {`
			`my ( $firstRef ) = @_;`

			`my $firstChild = ${$firstRef};`
			`${$firstRef} = 0;`
			`my @lastSub;`

			`if ( $firstChild > 0 ) {`
			`# remove the last (the root) subarray`
			`@lastSub = splice( @gNodes, $firstChild );`
			`} else {`
			`die "there should be no words!!" if $gWordCount != 0;`
			`}`
			`# remove the first (garbage) node`
			`shift @gNodes;`

			`my $diff;`
			`if ( $firstChild > 0 ) {`
			`# -1 because all move down by 1; see prev line`
			`$diff = @lastSub - 1;`
			`die "something wrong with len\n" if $diff < 0;`
			`} else {`
			`$diff = 0;`
			`}`

			`# stick it on the front`
			`splice( @gNodes, 0, 0, @lastSub);`

			`# We add $diff to everything. There's no subtracting because`
			`# nobody had any refs to the top list.`

			`for ( my $i = 0; $i < @gNodes; ++$i ) {`
			`my $fco = TrieNodeGetFirstChildOffset( $gNodes[$i] );`
			`if ( $fco != 0 ) { # 0 means NONE, not 0th!!`
			`TrieNodeSetFirstChildOffset( \$gNodes[$i], $fco+$diff );`
			`}`
			`}`
			`} # moveTopToFront`


			`sub buildNode {`
			`my ( $depth ) = @_;`

			`if ( @gCurrentWord == $depth ) {`
			`# End of word reached. If the next word isn't a continuation`
			`# of the current one, then we've reached the bottom of the`
			`# recursion tree.`
			`readNextWord();`
			`if ($gFirstDiff < $depth \|\| $gDone) {`
			`return 0;`
			`}`
			`}`

			`my @newedges;`

			`do {`
			`my $letter = $gCurrentWord[$depth];`
			`my $isTerminal = @gCurrentWord - 1 == $depth ? 1:0;`

			`my $nodeOffset = buildNode($depth+1);`
			`my $newNode = MakeTrieNode($letter, $isTerminal, $nodeOffset);`
			`push( @newedges, $newNode );`

			`} while ( ($gFirstDiff == $depth) && !$gDone);`

			`TrieNodeSetIsLastSibling( \@newedges[@newedges-1], 1 );`

			`return addNodes( \@newedges );`
			`} # buildNode`

			`sub addNodes {`
			`my ( $newedgesR ) = @_;`

			`my $found = findSubArray( $newedgesR );`

			`if ( $found >= 0 ) {`
			`die "0 is an invalid match!!!" if $found == 0;`
			`return $found;`
			`} else {`

			`my $firstFreeIndex = @gNodes;`

			`print STDERR "adding...\n" if $debug;`
			`printNodes( $newedgesR ) if $debug;`

			`push @gNodes, (@{$newedgesR});`

			`registerSubArray( $newedgesR, $firstFreeIndex );`
			`return $firstFreeIndex;`
			`}`
			`} # addNodes`

			`sub printNode {`
			`my ( $index, $node ) = @_;`

			`print STDERR "[$index] ";`

			`printf( STDERR`
			`"letter=%d; isTerminal=%d; isLastSib=%d; fco=%d;\n",`
			`TrieNodeGetLetter($node),`
			`TrieNodeGetIsTerminal($node),`
			`TrieNodeGetIsLastSibling($node),`
			`TrieNodeGetFirstChildOffset($node));`
			`} # printNode`

			`sub printNodes {`
			`my ( $nodesR, $name ) = @_;`

			`my $len = @{$nodesR};`
			`# print "printNodes($name): len = $len\n";`

			`for ( my $i = 0; $i < $len; ++$i ) {`
			`my $node = ${$nodesR}[$i];`
			`printNode( $i, $node );`
			`}`

			`}`


			`# Hashing. We'll keep a hash of offsets into the existing nodes`
			`# array, and as the key use a string that represents the entire sub`
			`# array. Since the key is what we're matching for, there should never`
			`# be more than one value per hash and so we don't need buckets.`
			`# Return -1 if there's no match.`

			`sub findSubArray {`
			`my ( $newedgesR ) = @_;`

			`my $key = join('', @{$newedgesR});`

			`if ( exists( $gSubsHash{$key} ) ) {`
			`return $gSubsHash{$key};`
			`} else {`
			`return -1;`
			`}`
			`} # findSubArray`

			`# add to the hash`
			`sub registerSubArray {`
			`my ( $edgesR, $nodeLoc ) = @_;`

			`my $key = join( '', @{$edgesR} );`

			`if ( exists $gSubsHash{$key} ) {`
			`die "entry for key shouldn't exist!!";`
			`} else {`
			`$gSubsHash{$key} = $nodeLoc;`
			`}`

			`} # registerSubArray`

			`sub readNextWord() {`
			`my @word;`

			`if ( !$gDone ) {`
			`$gDone = @gInputStrings == 0;`
			`if ( !$gDone ) {`
			`@word = @{shift @gInputStrings};`
			`} else {`
			`print STDERR "gDone set to true\n" if $debug;`
			`}`

			`print STDERR "got word: ", join(',',@word), "\n" if $debug;`
			`}`
			`my $numCommonLetters = 0;`
			`my $len = @word;`
			`if ( @gCurrentWord < $len ) {`
			`$len = @gCurrentWord;`
			`}`

			`while ( @gCurrentWord[$numCommonLetters] eq @word[$numCommonLetters]`
			`&& $numCommonLetters < $len) {`
			`++$numCommonLetters;`
			`}`

			`$gFirstDiff = $numCommonLetters;`
			`die "words ", join(",",@gCurrentWord), " and ", join(",", @word), " out of order" if #$debug &&`
			`@gCurrentWord > 0 && @word > 0`
			`&& !firstBeforeSecond( \@gCurrentWord, \@word );`
			`@gCurrentWord = @word;`
			`} # readNextWord`

			`sub firstBeforeSecond {`
			`my ( $firstR, $secondR ) = @_;`

			`for ( my $i = 0; ; ++$i ) {`

			`# if we reach the end of the first word/list, we're done.`
			`if ( $i == @{$firstR} ) {`
			`die "duplicate!!!" if $i == @{$secondR};`
			`return 1;`
			`# but if we reach the second end first, we've failed`
			`} elsif ( $i == @{$secondR} ) {`
			`return 0;`
			`}`

			`my $diff = ${$firstR}[$i] <=> ${$secondR}[$i];`

			`if ( $diff == 0 ) {`
			`next;`
			`} else {`
			`return $diff < 0;`
			`}`
			`}`
			`} # firstBeforeSecond`

			`# passed to sort. Should remain unprototyped for effeciency's sake`

			`sub cmpWords {`

			`my $lenA = @{$a};`
			`my $lenB = @{$b};`
			`my $min = $lenA > $lenB? $lenB: $lenA;`

			`for ( my $i = 0; $i < $min; ++$i ) {`
			`my $ac = ${$a}[$i];`
			`my $bc = ${$b}[$i];`

			`my $res = $ac <=> $bc;`

			`if ( $res != 0 ) {`
			`return $res; # we're done`
			`}`
			`}`

			`# If we got here, they match up to their common length. Longer is`
			`# greater.`
			`my $res = @{$a} <=> @{$b};`
			`return $res; # which is longer?`
			`} # cmpWords`

			`sub parseAndSort() {`
			`my ( $infile ) = @_;`

			`my @wordlist;`
			`my @word;`

			`WORDLOOP:`
			`for ( ; ; ) {`

			`my $dropWord = 0;`
			`splice @word; # empty it`

			`# for each byte`
			`for ( ; ; ) {`
			`my $byt = getc($infile);`

			`if ( $byt eq undef ) {`
			`last WORDLOOP;`
			`} elsif ( $byt eq $gTermChar ) {`
			`if ( !$dropWord ) {`
			`push @wordlist, [ @word ];`
			`++$gWordCount;`
			`}`
			`next WORDLOOP;`
			`} elsif ( exists( $gTableHash{$byt} ) ) {`
			`if ( !$dropWord ) {`
			`push @word, $gTableHash{$byt};`
			`die "word too long" if @word > 15;`
			`}`
			`} elsif ($gKillIfMissing) {`
better error message 2003-12-10 07:00:56 +01:00			`die "$0: chr $byt (", $byt+0, ") not in map file $gTableFile\n";`
first checkin from personal project 2003-11-01 07:19:20 +01:00			`} else {`
			`$dropWord = 1;`
			`splice @word; # lose anything we already have`
			`}`
			`}`
			`}`

			`if ( $gNeedsSort && ($gWordCount > 0) ) {`
			`@wordlist = sort cmpWords @wordlist;`
			`}`

			`print STDERR "length of list is ", @wordlist + 0, ".\n" if $debug;`

			`return @wordlist;`
			`} # parseAndSort`

			`# Print binary representation of trie array. This isn't used yet, but`
			`# eventually it'll want to dump to multiple files appropriate for Palm`
			`# that can be catenated together on other platforms. There'll need to`
			`# be a file giving the offset of the first node too. Also, might want`
			`# to move to 4-byte representation when the input can't otherwise be`
			`# handled.`

			`sub dumpNodes {`

			`for ( my $i = 0; $i < @gNodes; ++$i ) {`
			`my $node = $gNodes[$i];`
			`my $bstr = pack( "I", $node );`
			`print STDOUT $bstr;`
			`}`
			`}`

			`##############################################################################`
			`# Little node-field setters and getters to hide what bits represent`
			`# what.`
			`##############################################################################`

			`sub TrieNodeSetIsTerminal {`
			`my ( $nodeR, $isTerminal ) = @_;`

			`if ( $isTerminal ) {`
			`${$nodeR} \|= 1 << 31;`
			`} else {`
			`${$nodeR} &= ~(1 << 31);`
			`}`
			`}`

			`sub TrieNodeGetIsTerminal {`
			`my ( $node ) = @_;`
			`return ($node & 1 << 31) != 0;`
			`}`

			`sub TrieNodeSetIsLastSibling {`
			`my ( $nodeR, $isLastSibling ) = @_;`
			`if ( $isLastSibling ) {`
			`${$nodeR} \|= 1 << 30;`
			`} else {`
			`${$nodeR} &= ~(1 << 30);`
			`}`
			`}`

			`sub TrieNodeGetIsLastSibling {`
			`my ( $node ) = @_;`
			`return ($node & 1 << 30) != 0;`
			`}`

			`sub TrieNodeSetLetter {`
			`my ( $nodeR, $letter ) = @_;`

			`die "letter ", $letter, " too big" if $letter >= 32;`

			`my $mask = ~(0x1F << 25);`
			`${$nodeR} &= $mask; # clear all the bits`
			`${$nodeR} \|= ($letter << 25); # set new ones`
			`}`

			`sub TrieNodeGetLetter {`
			`my ( $node ) = @_;`
			`$node >>= 25;`
			`$node &= 0x1F;`
			`return $node;`
			`}`

			`sub TrieNodeSetFirstChildOffset {`
			`my ( $nodeR, $fco ) = @_;`

			`die "$fco larger than 25 bits" if ($fco & 0xFE000000) != 0;`

			`my $mask = ~0x01FFFFFF;`
			`${$nodeR} &= $mask; # clear all the bits`
			`${$nodeR} \|= $fco; # set new ones`
			`}`

			`sub TrieNodeGetFirstChildOffset {`
			`my ( $node ) = @_;`
			`$node &= 0x01FFFFFF; # 24 bits`
			`return $node;`
			`}`


			`sub MakeTrieNode {`
			`my ( $letter, $isTerminal, $firstChildOffset, $isLastSibling ) = @_;`
			`my $result = 0;`

			`TrieNodeSetIsTerminal( \$result, $isTerminal );`
			`TrieNodeSetIsLastSibling( \$result, $isLastSibling );`
			`TrieNodeSetLetter( \$result, $letter );`
			`TrieNodeSetFirstChildOffset( \$result, $firstChildOffset );`

			`return $result;`
			`} # MakeTrieNode`

			`# Caller may need to know the offset of the first top-level node.`
			`# Write it here.`
			`sub writeOutStartNode {`
			`my ( $startNodeOut, $firstRootChildOffset ) = @_;`

			`open NODEOUT, ">$startNodeOut";`
			`print NODEOUT pack( "N", $firstRootChildOffset );`
			`close NODEOUT;`
			`} # writeOutStartNode`

			`# build the hash for translating. I'm using a hash assuming it'll be`
			`# fast. Key is the letter; value is the 0..31 value to be output.`
			`sub makeTableHash {`
			`my $i;`
			`open TABLEFILE, "< $gTableFile";`

			`splice @gRevMap; # empty it`

			`for ( $i = 0; ; ++$i ) {`
			`my $ch = getc(TABLEFILE);`
			`if ( $ch eq undef ) {`
			`last;`
			`}`

			`if ( $gUseUnicode ) { # skip the first byte each time: tmp HACK!!!`
			`$ch = getc(TABLEFILE);`
			`}`
			`if ( $ch eq undef ) {`
			`last;`
			`}`

			`push @gRevMap, $ch;`

			`if ( ord($ch) == 0 ) { # blank`
			`next; # we want to increment i when blank seen since`
			`# it is a tile value`
			`}`

			`die "$gTableFile too large\n" if $i > 32;`
			`die "only blank (0) can be 32nd char\n" if ($i == 32 && $ch != 0);`

			`$gTableHash{$ch} = $i;`
			`}`

			`close TABLEFILE;`
			`} # makeTableHash`

			`# emitNodes. "input" is $gNodes. From it we write up to`
			`# $nBytesPerOutfile to files named $outFileBase0..n, mapping the`
			`# letter field down to 5 bits with a hash built from $tableFile. If`
			`# at any point we encounter a letter not in the hash we fail with an`
			`# error.`

			`sub emitNodes($$) {`
			`my ( $gNBytesPerOutfile, $outFileBase ) = @_;`

			`# now do the emit.`

			`# is 17 bits enough?`
			`printf STDOUT ("There are %d (0x%x) nodes in this DAWG.\n",`
			`0 + @gNodes, 0 + @gNodes );`
			`if ( @gNodes > 0x1FFFF \|\| $gForceFour ) {`
			`$gNBytesPerNode = 4;`
			`} else {`
			`$gNBytesPerNode = 3;`
			`}`

			`my $nextIndex = 0;`
			`my $nextFileNum = 0;`

			`for ( $nextFileNum = 0; ; ++$nextFileNum ) {`

			`if ( $nextIndex >= @gNodes ) {`
			`last; # we're done`
			`}`

			`die "Too many outfiles; infinite loop?" if $nextFileNum > 99;`

			`my $outName = sprintf("${outFileBase}_%03d.bin", $nextFileNum);`
			`open OUTFILE, "> $outName";`
			`my $curSize = 0;`

			`while ( $nextIndex < @gNodes ) {`

			`# scan to find the next terminal`
			`my $i;`
			`for ( $i = $nextIndex;`
			`!TrieNodeGetIsLastSibling($gNodes[$i]);`
			`++$i ) {`

			`# do nothing but a sanity check`
			`if ( $i >= @gNodes) {`
			`die "bad trie format: last node not last sibling" ;`
			`}`

			`}`
			`++$i; # move beyond the terminal`
			`my $nextSize = ($i - $nextIndex) * $gNBytesPerNode;`
			`if ($curSize + $nextSize > $gNBytesPerOutfile) {`
			`last;`
			`} else {`
			`# emit the subarray`
			`while ( $nextIndex < $i ) {`
			`outputNode( $gNodes[$nextIndex], $gNBytesPerNode,`
			`\*OUTFILE );`
			`++$nextIndex;`
			`}`
			`$curSize += $nextSize;`
			`}`
			`}`

			`close OUTFILE;`
			`}`

			`} # emitNodes`

			`sub printWord {`
			`my ( $str ) = @_;`

			`print STDERR "$str\n";`
			`}`

			`# print out the entire dictionary, as text, to STDERR.`

			`sub printOneLevel {`

			`my ( $index, $str ) = @_;`

			`for ( ; ; ) {`

			`my $newStr = $str;`
			`my $node = $gNodes[$index++];`

			`my $lindx = $gRevMap[TrieNodeGetLetter($node)];`

			`if ( ord($lindx) >= 0x20 ) {`
			`$newStr .= "$lindx";`
			`} else {`
			`print STDERR "sub space" if $debug;`
			`$newStr .= "\\" . chr('0'+$lindx);`
			`}`

			`if ( TrieNodeGetIsTerminal($node) ) {`
			`printWord( $newStr );`
			`}`

			`my $fco = TrieNodeGetFirstChildOffset( $node );`
			`if ( $fco != 0 ) {`
			`printOneLevel( $fco, $newStr );`
			`}`

			`if ( TrieNodeGetIsLastSibling($node) ) {`
			`last;`
			`}`
			`}`
			`}`

			`sub outputNode {`
			`my ( $node, $nBytes, $outfile ) = @_;`

			`my $fco = TrieNodeGetFirstChildOffset($node);`
			`my $fourthByte;`

			`if ( $nBytes == 4 ) {`
			`$fourthByte = $fco >> 17;`
			`die "fco too big" if $fourthByte > 0xFF;`
			`$fco &= 0x1FFFF;`
			`}`

			`# format according to dawg.h:`
			`# typedef struct array_edge {`
			`# unsigned char highByte;`
			`# unsigned char lowByte;`
			`# unsigned char bits;`
			`#ifdef FOUR_BYTE`
			`# unsigned char moreBits;`
			`#endif`
			`# } array_edge;`

			`# define LETTERMASK 0x1f`
			`# define ACCEPTINGMASK 0x20`
			`# define LASTEDGEMASK 0x40`
			`# define LASTBITMASK 0x80`

			`# write the fco (less that one bit). We want two bytes worth`
			`# in three-byte mode, and three in four-byte mode (which is`
			`# untested)`
			`for ( my $i = 1; $i >= 0; --$i ) {`
			`my $tmp = ($fco >> ($i * 8)) & 0xFF;`
			`print $outfile pack( "C", $tmp );`
			`}`
			`$fco >>= 16; # it should now be 1 or 0`
			`die "fco not 1 or 0" if $fco > 1;`

			`my $chIn5 = TrieNodeGetLetter($node);`
			`my $bits = $chIn5;`

			`if ( TrieNodeGetIsLastSibling($node) ) {`
			`$bits \|= 0x40;`
			`}`
			`if ( TrieNodeGetIsTerminal($node) ) {`
			`$bits \|= 0x20;`
			`}`
			`if ( $fco != 0 ) {`
			`$bits \|= 0x80;`
			`}`
			`print $outfile pack( "C", $bits );`

			`# the final byte, if in use`
			`if ( $nBytes == 4 ) {`
			`print $outfile pack( "C", $fourthByte );`
			`}`
			`} # outputNode`

			`sub usage {`
			`print STDERR "usage: $0 \n"`
			`. "\t[-b bytesPerFile] (default = 0xFFFFFFFF)\n"`
			`. "\t-m mapFile\n"`
			`. "\t-ob outFileBase\n"`
			`. "\t-sn start node out file\n"`
			`. "\t[-if input file name] -- default = stdin\n"`
			`. "\t[-term ch] (word terminator -- default = '\\0'\n"`
			`. "\t[-nosort] (input already sorted in accord with -m; " .`
			`" default=sort'\n"`
			`. "\t[-dump] (write dictionary as text to STDERR for testing)\n"`
			`. "\t[-force4](use 4 bytes per node regardless of need)\n"`
			`. "\t[-r] (reject words with letters not in mapfile)\n"`
			`. "\t[-k] (kill if any letters no in mapfile -- default)\n"`
			`;`

			`} # usage`

			`sub parseARGV {`

			`my $arg;`
			`while ( my $arg = shift(@ARGV) ) {`

			`SWITCH: {`
			`if ($arg =~ /-b/) {$gNBytesPerOutfile = shift(@ARGV), last SWITCH;}`
			`if ($arg =~ /-mn/) {$gTableFile = shift(@ARGV);`
			`$gUseUnicode = 1;`
			`last SWITCH;}`
			`if ($arg =~ /-m/) {$gTableFile = shift(@ARGV); last SWITCH;}`
			`if ($arg =~ /-ob/) {$gOutFileBase = shift(@ARGV), last SWITCH;}`
			`if ($arg =~ /-sn/) {$gStartNodeOut = shift(@ARGV), last SWITCH;}`
			`if ($arg =~ /-if/) {$gInFileName = shift(@ARGV), last SWITCH;}`
			`if ($arg =~ /-r/) {$gKillIfMissing = 0; last SWITCH;}`
			`if ($arg =~ /-k/) {$gKillIfMissing = 1; last SWITCH;}`
			`if ($arg =~ /-term/) {$gTermChar = chr(shift(@ARGV)); last SWITCH;}`
			`if ($arg =~ /-dump/) {$gDumpText = 1; last SWITCH;}`
			`if ($arg =~ /-nosort/) {$gNeedsSort = 0; last SWITCH;}`
			`if ($arg =~ /-wc/) {$gCountFile = shift(@ARGV); last SWITCH;}`
			`if ($arg =~ /-ns/) {$gBytesPerNodeFile = shift(@ARGV); last SWITCH;}`
			`if ($arg =~ /-force4/) {$gForceFour = 1; last SWITCH;}`
			`die "unexpected arg $arg\n";`
			`}`
			`}`


			`print STDERR "gNBytesPerOutfile=$gNBytesPerOutfile\n" if $debug;`
			`print STDERR "gTableFile=$gTableFile\n" if $debug;`
			`print STDERR "gOutFileBase=$gOutFileBase\n" if $debug;`
			`print STDERR "gStartNodeOut=$gStartNodeOut\n" if $debug;`
			`printf STDERR "gTermChar=%s(%d)\n", $gTermChar, ord($gTermChar) if $debug;`

			`return $gTableFile;`

			`} # parseARGV`