diff --git a/xwords4/dawg/dict2dawg.cpp b/xwords4/dawg/dict2dawg.cpp index 590b06c5b..ee8d73251 100644 --- a/xwords4/dawg/dict2dawg.cpp +++ b/xwords4/dawg/dict2dawg.cpp @@ -42,19 +42,26 @@ #include #include +#include #include #include #include #include +typedef unsigned int Node; +typedef std::vector NodeList; +typedef std::vector WordList; + int gFirstDiff; char* gCurrentWord = ""; +int gCurrentWordLen; char* gCurWord = NULL; // save so can check for sortedness bool gDone = false; -std::list* gInputStrings; +WordList* gInputStrings; +static int gNextWordIndex; bool gNeedsSort = true; -std::vector gNodes; // final array of nodes +NodeList gNodes; // final array of nodes unsigned int gNBytesPerOutfile = 0xFFFFFFFF; char* gTableFile = NULL; char* gOutFileBase = NULL; @@ -70,26 +77,48 @@ std::map gTableHash; int gBlankIndex; std::vector gRevMap; bool gDebug = false; -std::map gSubsHash; +std::map gSubsHash; bool gForceFour = false; // use four bytes regardless of need? int gNBytesPerNode; bool gUseUnicode; -typedef unsigned int Node; - #define MAX_POOL_SIZE 1000000 +#define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ ); static char* parseARGV( int argc, char** argv ); static void usage( const char* name ); -static void error_exit( const char* fmt, ... ); +static void error_exit( int line, const char* fmt, ... ); static char parsechar( const char* in ); static void makeTableHash( void ); -static std::list* parseAndSort( FILE* file ); -static void printWords( std::list* strings ); +static WordList* parseAndSort( FILE* file ); +static void printWords( WordList* strings ); static void readNextWord( void ); static bool firstBeforeSecond( const char* lhs, const char* rhs ); static char* tileToAscii( char* out, const char* in ); +static int buildNode( int depth ); +static void TrieNodeSetIsLastSibling( Node* nodeR, bool isLastSibling ); +static int addNodes( NodeList& newedgesR ); +static void TrieNodeSetIsTerminal( Node* nodeR, bool isTerminal ); +static bool TrieNodeGetIsTerminal( Node node ); +static void TrieNodeSetIsLastSibling( Node* nodeR, bool isLastSibling ); +static bool TrieNodeGetIsLastSibling( Node node ); +static void TrieNodeSetLetter( Node* nodeR, int letter ); +static int TrieNodeGetLetter( Node node ); +static void TrieNodeSetFirstChildOffset( Node* nodeR, int fco ); +static int TrieNodeGetFirstChildOffset( Node node ); +static int findSubArray( NodeList& newedgesR ); +static void registerSubArray( NodeList& edgesR, int nodeLoc ); +static Node MakeTrieNode( int letter, bool isTerminal, int firstChildOffset, + bool isLastSibling ); +static void printNodes( NodeList& nodesR ); +static void printNode( int index, Node node ); +static void moveTopToFront( int* firstRef ); +static void writeOutStartNode( const char* startNodeOut, + int firstRootChildOffset ); +static void emitNodes( unsigned int nBytesPerOutfile, const char* outFileBase ); +static void outputNode( Node node, int nBytes, FILE* outfile ); +static void printOneLevel( int index, char* str, int curlen ); int main( int argc, char** argv ) @@ -109,11 +138,14 @@ main( int argc, char** argv ) } gInputStrings = parseAndSort( infile ); + gNextWordIndex = 0; if ( gInFileName ) { fclose( infile ); } - printWords( gInputStrings ); + if ( gDebug ) { + printWords( gInputStrings ); + } // Do I need this stupid thing? Better to move the first row to // the front of the array and patch everything else. Or fix the @@ -126,45 +158,45 @@ main( int argc, char** argv ) readNextWord(); -#if 0 int firstRootChildOffset = buildNode(0); + moveTopToFront( &firstRootChildOffset ); - moveTopToFront( \$firstRootChildOffset ); - - if ( $gStartNodeOut ) { - writeOutStartNode( $gStartNodeOut, $firstRootChildOffset ); + if ( gStartNodeOut ) { + writeOutStartNode( gStartNodeOut, firstRootChildOffset ); } - print STDERR "\n... dumping table ...\n" if $debug; - printNodes( \@gNodes, "done with main" ) if $debug; + if ( gDebug ) { + fprintf( stderr, "\n... dumping table ...\n" ); + printNodes( gNodes ); + } // write out the number of nodes if requested - if ( $gCountFile ) { - open OFILE, "> $gCountFile"; - print OFILE pack( "N", $gWordCount ); - close OFILE; - print STDERR "wrote out: got $gWordCount words\n"; + if ( gCountFile ) { + FILE* OFILE; + OFILE = fopen( gCountFile, "w" ); + unsigned long be = htonl( gWordCount ); + fwrite( &be, sizeof(be), 1, OFILE ); + fclose( OFILE ); + fprintf( stderr, "wrote out: got %d words\n", gWordCount ); } - if ( $gOutFileBase ) { - emitNodes( $gNBytesPerOutfile, $gOutFileBase ); + if ( gOutFileBase ) { + emitNodes( gNBytesPerOutfile, gOutFileBase ); } - if ( $gDumpText && @gNodes > 0 ) { - printOneLevel( $firstRootChildOffset, "" ); + if ( gDumpText && gNodes.size() > 0 ) { + char buf[31]; + printOneLevel( firstRootChildOffset, buf, 0 ); } - if ( $gBytesPerNodeFile ) { - open OFILE, "> $gBytesPerNodeFile"; - print OFILE $gNBytesPerNode; - close OFILE; + if ( gBytesPerNodeFile ) { + FILE* OFILE = fopen( gBytesPerNodeFile, "w" ); + fprintf( OFILE, "%d", gNBytesPerNode ); + fclose( OFILE ); } - print STDERR "Used $gNBytesPerNode per node.\n"; -#endif + fprintf( stderr, "Used %d per node.\n", gNBytesPerNode ); } /* main */ -#if 0 - // We now have an array of nodes with the last subarray being the // logical top of the tree. Move them to the start, fixing all fco // refs, so that legacy code like Palm can assume top==0. @@ -176,157 +208,161 @@ main( int argc, char** argv ) // Also Note: the first node is a dummy that can and should be tossed // now. -sub moveTopToFront($) { - my ( $firstRef ) = @_; +static void +moveTopToFront( int* firstRef ) +{ + int firstChild = *firstRef; + *firstRef = 0; - my $firstChild = ${$firstRef}; - ${$firstRef} = 0; - my @lastSub; + NodeList lastSub; - if ( $firstChild > 0 ) { - // remove the last (the root) subarray - @lastSub = splice( @gNodes, $firstChild ); - } else { - die "there should be no words!!" if $gWordCount != 0; + if ( firstChild > 0 ) { + lastSub.assign( gNodes.begin() + firstChild, gNodes.end() ); + gNodes.erase( gNodes.begin() + firstChild, gNodes.end() ); + } else if ( gWordCount != 0 ) { + error_exit( "there should be no words!!" ); } - // remove the first (garbage) node - shift @gNodes; - my $diff; - if ( $firstChild > 0 ) { + // remove the first (garbage) node + gNodes.erase( gNodes.begin() ); + + int diff; + if ( firstChild > 0 ) { // -1 because all move down by 1; see prev line - $diff = @lastSub - 1; - die "something wrong with len\n" if $diff < 0; + diff = lastSub.size() - 1; + if ( diff < 0 ) { + error_exit( "something wrong with lastSub.size()" ); + } } else { - $diff = 0; + diff = 0; } // stick it on the front - splice( @gNodes, 0, 0, @lastSub); + gNodes.insert( gNodes.begin(), lastSub.begin(), lastSub.end() ); - // We add $diff to everything. There's no subtracting because + // We add diff to everything. There's no subtracting because // nobody had any refs to the top list. - for ( my $i = 0; $i < @gNodes; ++$i ) { - my $fco = TrieNodeGetFirstChildOffset( $gNodes[$i] ); - if ( $fco != 0 ) { // 0 means NONE, not 0th!! - TrieNodeSetFirstChildOffset( \$gNodes[$i], $fco+$diff ); + for ( int i = 0; i < gNodes.size(); ++i ) { + int fco = TrieNodeGetFirstChildOffset( gNodes[i] ); + if ( fco != 0 ) { // 0 means NONE, not 0th!! + TrieNodeSetFirstChildOffset( &gNodes[i], fco + diff ); } } } // moveTopToFront - -sub buildNode { - my ( $depth ) = @_; - - if ( @gCurrentWord == $depth ) { +static int +buildNode( int depth ) +{ + if ( gCurrentWordLen == depth ) { // End of word reached. If the next word isn't a continuation // of the current one, then we've reached the bottom of the // recursion tree. readNextWord(); - if ($gFirstDiff < $depth || $gDone) { + if (gFirstDiff < depth || gDone) { return 0; } } - my @newedges; + NodeList newedges; + bool wordEnd; do { - my $letter = $gCurrentWord[$depth]; - my $isTerminal = @gCurrentWord - 1 == $depth ? 1:0; + char letter = gCurrentWord[depth]; + bool isTerminal = (gCurrentWordLen - 1) == depth; - my $nodeOffset = buildNode($depth+1); - my $newNode = MakeTrieNode($letter, $isTerminal, $nodeOffset); - push( @newedges, $newNode ); + int nodeOffset = buildNode( depth + 1 ); + Node newNode = MakeTrieNode( letter, isTerminal, nodeOffset, false ); - } while ( ($gFirstDiff == $depth) && !$gDone); + wordEnd = (gFirstDiff != depth) || gDone; + if ( wordEnd ) { + TrieNodeSetIsLastSibling( &newNode, true ); + } - TrieNodeSetIsLastSibling( \@newedges[@newedges-1], 1 ); + newedges.push_back( newNode ); + } while ( !wordEnd ); - return addNodes( \@newedges ); + return addNodes( newedges ); } // buildNode -sub addNodes { - my ( $newedgesR ) = @_; +static int +addNodes( NodeList& newedgesR ) +{ + int found = findSubArray( newedgesR ); - my $found = findSubArray( $newedgesR ); - - if ( $found >= 0 ) { - die "0 is an invalid match!!!" if $found == 0; - return $found; - } else { - - my $firstFreeIndex = @gNodes; - - print STDERR "adding...\n" if $debug; - printNodes( $newedgesR ) if $debug; - - push @gNodes, (@{$newedgesR}); - - registerSubArray( $newedgesR, $firstFreeIndex ); - return $firstFreeIndex; + if ( found == 0 ) { + error_exit( "0 is an invalid match!!!" ); } + + if ( found < 0 ) { + found = gNodes.size(); + + if ( gDebug ) { + fprintf( stderr, "adding...\n" ); + printNodes( newedgesR ); + } + gNodes.insert( gNodes.end(), newedgesR.begin(), newedgesR.end() ); + + registerSubArray( newedgesR, found ); + } + if ( gDebug ) { + fprintf( stderr, "%s => %d\n", __FUNCTION__, found ); + } + return found; } // addNodes -sub printNode { - my ( $index, $node ) = @_; - - print STDERR "[$index] "; - - printf( STDERR - "letter=%d; isTerminal=%d; isLastSib=%d; fco=%d;\n", - TrieNodeGetLetter($node), - TrieNodeGetIsTerminal($node), - TrieNodeGetIsLastSibling($node), - TrieNodeGetFirstChildOffset($node)); +static void +printNode( int index, Node node ) +{ + int letter = TrieNodeGetLetter(node); + fprintf( stderr, + "[%d] letter=%d(%c); isTerminal=%s; isLastSib=%s; fco=%d;\n", + index, letter, gRevMap[letter], + TrieNodeGetIsTerminal(node)?"true":"false", + TrieNodeGetIsLastSibling(node)?"true":"false", + TrieNodeGetFirstChildOffset(node)); } // printNode -sub printNodes { - my ( $nodesR, $name ) = @_; - - my $len = @{$nodesR}; - // print "printNodes($name): len = $len\n"; - - for ( my $i = 0; $i < $len; ++$i ) { - my $node = ${$nodesR}[$i]; - printNode( $i, $node ); +static void +printNodes( NodeList& nodesR ) +{ + for ( int i = 0; i < nodesR.size(); ++i ) { + Node node = nodesR[i]; + printNode( i, node ); } - } - // Hashing. We'll keep a hash of offsets into the existing nodes // array, and as the key use a string that represents the entire sub // array. Since the key is what we're matching for, there should never // be more than one value per hash and so we don't need buckets. // Return -1 if there's no match. -sub findSubArray { - my ( $newedgesR ) = @_; - - my $key = join('', @{$newedgesR}); - - if ( exists( $gSubsHash{$key} ) ) { - return $gSubsHash{$key}; +static int +findSubArray( NodeList& newedgesR ) +{ + std::map::iterator iter = gSubsHash.find( newedgesR ); + if ( iter != gSubsHash.end() ) { + return iter->second; } else { return -1; } } // findSubArray // add to the hash -sub registerSubArray { - my ( $edgesR, $nodeLoc ) = @_; - - my $key = join( '', @{$edgesR} ); - - if ( exists $gSubsHash{$key} ) { - die "entry for key shouldn't exist!!"; - } else { - $gSubsHash{$key} = $nodeLoc; +static void +registerSubArray( NodeList& edgesR, int nodeLoc ) +{ +#ifdef DEBUG + std::map::iterator iter = gSubsHash.find( edgesR ); + if ( iter != gSubsHash.end() ) { + error_exit( "entry for key shouldn't exist!!" ); } - +#endif + gSubsHash[edgesR] = nodeLoc; } // registerSubArray +#if 0 sub toWord($) { my ( $tileARef ) = @_; my $word = ""; @@ -347,13 +383,12 @@ sub toWord($) { static void readNextWord( void ) { - char* word; + char* word = ""; if ( !gDone ) { - gDone = gInputStrings->size() == 0; + gDone = gNextWordIndex == gInputStrings->size(); if ( !gDone ) { - word = gInputStrings->front(); - gInputStrings->pop_front(); + word = gInputStrings->at(gNextWordIndex++); } else if ( gDebug ) { fprintf( stderr, "gDone set to true\n" ); } @@ -363,9 +398,8 @@ readNextWord( void ) } int numCommonLetters = 0; int len = strlen( word ); - int curWordLen = strlen(gCurrentWord); - if ( curWordLen < len ) { - len = curWordLen; + if ( gCurrentWordLen < len ) { + len = gCurrentWordLen; } while ( gCurrentWord[numCommonLetters] == word[numCommonLetters] @@ -374,7 +408,7 @@ readNextWord( void ) } gFirstDiff = numCommonLetters; - if ( (curWordLen > 0) && (strlen(word) > 0) + if ( (gCurrentWordLen > 0) && (strlen(word) > 0) && !firstBeforeSecond( gCurrentWord, word ) ) { char buf1[16]; char buf2[16]; @@ -384,9 +418,12 @@ readNextWord( void ) buf1, buf2 ); } gCurrentWord = word; + gCurrentWordLen = strlen(word); - char buf[16]; - fprintf( stderr, "gCurrentWord now %s\n", tileToAscii(buf, gCurrentWord) ); + if ( gDebug ) { + char buf[16]; + fprintf( stderr, "gCurrentWord now %s\n", tileToAscii(buf, gCurrentWord) ); + } } // readNextWord static bool @@ -399,8 +436,8 @@ firstBeforeSecond( const char* lhs, const char* rhs ) tileToAscii( sr, rhs ); bool gt = 0 > strcmp( lhs, rhs ); - fprintf( stderr, "comparing %s, %s; returning %s\n", - sl, sr, gt?"true":"false" ); +// fprintf( stderr, "comparing %s, %s; returning %s\n", +// sl, sr, gt?"true":"false" ); return gt; } @@ -447,10 +484,10 @@ tileToAscii( char* out, const char* in ) return orig; } -static std::list* +static WordList* parseAndSort( FILE* infile ) { - std::list* wordlist = new std::list; + WordList* wordlist = new WordList; // allocate storage for the actual chars. wordlist's char* // elements will point into this. It'll leak. So what. @@ -487,8 +524,10 @@ parseAndSort( FILE* infile ) break; } else if ( gTableHash.find(byt) != gTableHash.end() ) { if ( !dropWord ) { - fprintf( stderr, "adding %d for %c\n", - gTableHash[byt], (char)byt ); + if ( gDebug ) { + fprintf( stderr, "adding %d for %c\n", + gTableHash[byt], (char)byt ); + } word += (char)gTableHash[byt]; assert( word.size() <= 15 ); if ( gKillIfMissing ) { @@ -510,7 +549,7 @@ parseAndSort( FILE* infile ) if ( gDebug ) { fprintf( stderr, "starting sort...\n" ); } -// std::sort( wordlist->begin(), wordlist->end(), firstBeforeSecond ); + std::sort( wordlist->begin(), wordlist->end(), firstBeforeSecond ); if ( gDebug ) { fprintf( stderr, "sort finished\n" ); } @@ -523,9 +562,9 @@ parseAndSort( FILE* infile ) } // parseAndSort static void -printWords( std::list* strings ) +printWords( std::vector* strings ) { - std::list::iterator iter = strings->begin(); + std::vector::iterator iter = strings->begin(); while ( iter != strings->end() ) { char buf[16]; tileToAscii( buf, *iter ); @@ -553,6 +592,7 @@ sub dumpNodes { print STDOUT $bstr; } } +#endif /***************************************************************************** * Little node-field setters and getters to hide what bits represent @@ -564,91 +604,102 @@ sub dumpNodes { * final 24 bits (23-0) are the index of the first child (fco) ******************************************************************************/ -sub TrieNodeSetIsTerminal { - my ( $nodeR, $isTerminal ) = @_; - - if ( $isTerminal ) { - ${$nodeR} |= (1 << 31); +static void +TrieNodeSetIsTerminal( Node* nodeR, bool isTerminal ) +{ + if ( isTerminal ) { + *nodeR |= (1 << 31); } else { - ${$nodeR} &= ~(1 << 31); + *nodeR &= ~(1 << 31); } } -sub TrieNodeGetIsTerminal { - my ( $node ) = @_; - return ($node & (1 << 31)) != 0; +static bool +TrieNodeGetIsTerminal( Node node ) +{ + return (node & (1 << 31)) != 0; } -sub TrieNodeSetIsLastSibling { - my ( $nodeR, $isLastSibling ) = @_; - if ( $isLastSibling ) { - ${$nodeR} |= (1 << 30); +static void +TrieNodeSetIsLastSibling( Node* nodeR, bool isLastSibling ) +{ + if ( isLastSibling ) { + *nodeR |= (1 << 30); } else { - ${$nodeR} &= ~(1 << 30); + *nodeR &= ~(1 << 30); } } -sub TrieNodeGetIsLastSibling { - my ( $node ) = @_; - return ($node & (1 << 30)) != 0; +static bool +TrieNodeGetIsLastSibling( Node node ) +{ + return (node & (1 << 30)) != 0; } -sub TrieNodeSetLetter { - my ( $nodeR, $letter ) = @_; +static void +TrieNodeSetLetter( Node* nodeR, int letter ) +{ + if( letter >= 64 ) { + error_exit( "letter %d too big", letter ); + } - die "$0: letter ", $letter, " too big" if $letter >= 64; - - my $mask = ~(0x3F << 24); - ${$nodeR} &= $mask; // clear all the bits - ${$nodeR} |= ($letter << 24); // set new ones + int mask = ~(0x3F << 24); + *nodeR &= mask; // clear all the bits + *nodeR |= (letter << 24); // set new ones } -sub TrieNodeGetLetter { - my ( $node ) = @_; - $node >>= 24; - $node &= 0x3F; // is 3f ok for 3-byte case??? - return $node; +static int +TrieNodeGetLetter( Node node ) +{ + node >>= 24; + node &= 0x3F; // is 3f ok for 3-byte case??? + return node; } -sub TrieNodeSetFirstChildOffset { - my ( $nodeR, $fco ) = @_; +static void +TrieNodeSetFirstChildOffset( Node* nodeR, int fco ) +{ + if ( (fco & 0xFF000000) != 0 ) { + error_exit( "%x larger than 24 bits", fco ); + } - die "$0: $fco larger than 24 bits" if ($fco & 0xFF000000) != 0; - - my $mask = ~0x00FFFFFF; - ${$nodeR} &= $mask; // clear all the bits - ${$nodeR} |= $fco; // set new ones + int mask = ~0x00FFFFFF; + *nodeR &= mask; // clear all the bits + *nodeR |= fco; // set new ones } -sub TrieNodeGetFirstChildOffset { - my ( $node ) = @_; - $node &= 0x00FFFFFF; // 24 bits - return $node; +static int +TrieNodeGetFirstChildOffset( Node node ) +{ + node &= 0x00FFFFFF; // 24 bits + return node; } +static Node +MakeTrieNode( int letter, bool isTerminal, int firstChildOffset, + bool isLastSibling ) +{ + Node result = 0; -sub MakeTrieNode { - my ( $letter, $isTerminal, $firstChildOffset, $isLastSibling ) = @_; - my $result = 0; + TrieNodeSetIsTerminal( &result, isTerminal ); + TrieNodeSetIsLastSibling( &result, isLastSibling ); + TrieNodeSetLetter( &result, letter ); + TrieNodeSetFirstChildOffset( &result, firstChildOffset ); - TrieNodeSetIsTerminal( \$result, $isTerminal ); - TrieNodeSetIsLastSibling( \$result, $isLastSibling ); - TrieNodeSetLetter( \$result, $letter ); - TrieNodeSetFirstChildOffset( \$result, $firstChildOffset ); - - return $result; + return result; } // MakeTrieNode // Caller may need to know the offset of the first top-level node. // Write it here. -sub writeOutStartNode { - my ( $startNodeOut, $firstRootChildOffset ) = @_; - - open NODEOUT, ">$startNodeOut"; - print NODEOUT pack( "N", $firstRootChildOffset ); - close NODEOUT; +static void +writeOutStartNode( const char* startNodeOut, int firstRootChildOffset ) +{ + FILE* NODEOUT; + NODEOUT = fopen( startNodeOut, "w" ); + unsigned long be = htonl( firstRootChildOffset ); + (void)fwrite( &be, sizeof(be), 1, NODEOUT ); + fclose( NODEOUT ); } // writeOutStartNode -#endif // build the hash for translating. I'm using a hash assuming it'll be // fast. Key is the letter; value is the 0..31 value to be output. @@ -694,136 +745,138 @@ makeTableHash( void ) fclose( TABLEFILE ); } // makeTableHash -#if 0 // emitNodes. "input" is $gNodes. From it we write up to // $nBytesPerOutfile to files named $outFileBase0..n, mapping the // letter field down to 5 bits with a hash built from $tableFile. If // at any point we encounter a letter not in the hash we fail with an // error. -sub emitNodes($$) { - my ( $nBytesPerOutfile, $outFileBase ) = @_; - +static void +emitNodes( unsigned int nBytesPerOutfile, const char* outFileBase ) +{ // now do the emit. // is 17 bits enough? - printf STDERR ("There are %d (0x%x) nodes in this DAWG.\n", - 0 + @gNodes, 0 + @gNodes ); - my $nTiles = 0 + keys(%gTableHash); // blank is not included in this count! - if ( @gNodes > 0x1FFFF || $gForceFour || $nTiles > 32 ) { - $gNBytesPerNode = 4; - } elsif ( $nTiles < 32 ) { - $gNBytesPerNode = 3; + fprintf( stderr, "There are %d (0x%x) nodes in this DAWG.\n", + gNodes.size(), gNodes.size() ); + int nTiles = gTableHash.size(); // blank is not included in this count! + if ( gNodes.size() > 0x1FFFF || gForceFour || nTiles > 32 ) { + gNBytesPerNode = 4; + } else if ( nTiles < 32 ) { + gNBytesPerNode = 3; } else { - if ( $gBlankIndex == 32 ) { // blank - print STDERR "blank's at 32; 3-byte-nodes still ok\n"; - $gNBytesPerNode = 3; + if ( gBlankIndex == 32 ) { // blank + fprintf( stderr, "blank's at 32; 3-byte-nodes still ok\n" ); + gNBytesPerNode = 3; } else { - die "$0: move blank to last position in info.txt for smaller DAWG"; + error_exit( "move blank to last position in info.txt " + "for smaller DAWG." ); } } - my $nextIndex = 0; - my $nextFileNum = 0; + int nextIndex = 0; + int nextFileNum; - for ( $nextFileNum = 0; ; ++$nextFileNum ) { + for ( nextFileNum = 0; ; ++nextFileNum ) { - if ( $nextIndex >= @gNodes ) { - last; // we're done + if ( nextIndex >= gNodes.size() ) { + break; // we're done } - - die "Too many outfiles; infinite loop?" if $nextFileNum > 99; - my $outName = sprintf("${outFileBase}_%03d.bin", $nextFileNum); - open OUTFILE, "> $outName"; - binmode( OUTFILE ); - my $curSize = 0; + if ( nextFileNum > 99 ) { + error_exit( "Too many outfiles; infinite loop?" ); + } - while ( $nextIndex < @gNodes ) { + char outName[256]; + snprintf( outName, sizeof(outName), "%s_%03d.bin", + outFileBase, nextFileNum); + FILE* OUTFILE = fopen( outName, "w" ); + assert( OUTFILE ); + int curSize = 0; + while ( nextIndex < gNodes.size() ) { // scan to find the next terminal - my $i; - for ( $i = $nextIndex; - !TrieNodeGetIsLastSibling($gNodes[$i]); - ++$i ) { + int i; + for ( i = nextIndex; !TrieNodeGetIsLastSibling(gNodes[i]); ++i ) { // do nothing but a sanity check - if ( $i >= @gNodes) { - die "bad trie format: last node not last sibling" ; + if ( i >= gNodes.size() ) { + error_exit( "bad trie format: last node not last sibling" ); } } - ++$i; // move beyond the terminal - my $nextSize = ($i - $nextIndex) * $gNBytesPerNode; - if ($curSize + $nextSize > $nBytesPerOutfile) { - last; + ++i; // move beyond the terminal + int nextSize = (i - nextIndex) * gNBytesPerNode; + if (curSize + nextSize > nBytesPerOutfile ) { + break; } else { // emit the subarray - while ( $nextIndex < $i ) { - outputNode( $gNodes[$nextIndex], $gNBytesPerNode, - \*OUTFILE ); - ++$nextIndex; + while ( nextIndex < i ) { + outputNode( gNodes[nextIndex], gNBytesPerNode, OUTFILE ); + ++nextIndex; } - $curSize += $nextSize; + curSize += nextSize; } } - close OUTFILE; + fclose( OUTFILE ); } } // emitNodes -sub printWord { - my ( $str ) = @_; - - print STDERR "$str\n"; -} - // print out the entire dictionary, as text, to STDERR. - -sub printOneLevel { - - my ( $index, $str ) = @_; - +static void +printOneLevel( int index, char* str, int curlen ) +{ + int inlen = curlen; for ( ; ; ) { +// char* newStr = str; + Node node = gNodes[index++]; - my $newStr = $str; - my $node = $gNodes[$index++]; + char lindx = gRevMap[TrieNodeGetLetter(node)]; - my $lindx = $gRevMap[TrieNodeGetLetter($node)]; - - if ( ord($lindx) >= 0x20 ) { - $newStr .= "$lindx"; + if ( (int)lindx >= 0x20 ) { +// newStr .= "$lindx"; + str[curlen++] = lindx; } else { - print STDERR "sub space" if $debug; - $newStr .= "\\" . chr('0'+$lindx); + if ( gDebug ) { + fprintf( stderr, "sub space\n" ); + } +// $newStr .= "\\" . chr('0'+$lindx); + str[curlen++] = '\\'; + str[curlen++] = '0' + lindx; } + str[curlen] = '\0'; - if ( TrieNodeGetIsTerminal($node) ) { - printWord( $newStr ); + if ( TrieNodeGetIsTerminal(node) ) { + fprintf( stderr, "%s\n", str ); } - my $fco = TrieNodeGetFirstChildOffset( $node ); - if ( $fco != 0 ) { - printOneLevel( $fco, $newStr ); + int fco = TrieNodeGetFirstChildOffset( node ); + if ( fco != 0 ) { + printOneLevel( fco, str, curlen ); } - if ( TrieNodeGetIsLastSibling($node) ) { - last; + if ( TrieNodeGetIsLastSibling(node) ) { + break; } + curlen = inlen; } + str[inlen] = '\0'; } -sub outputNode ($$$) { - my ( $node, $nBytes, $outfile ) = @_; +static void +outputNode( Node node, int nBytes, FILE* outfile ) +{ + unsigned int fco = TrieNodeGetFirstChildOffset(node); + unsigned int fourthByte; - my $fco = TrieNodeGetFirstChildOffset($node); - my $fourthByte; - - if ( $nBytes == 4 ) { - $fourthByte = $fco >> 16; - die "$0: fco too big" if $fourthByte > 0xFF; - $fco &= 0xFFFF; + if ( nBytes == 4 ) { + fourthByte = fco >> 16; + if ( fourthByte > 0xFF ) { + error_exit( "fco too big" ); + } + fco &= 0xFFFF; } // Formats are different depending on whether it's to have 3- or @@ -848,37 +901,41 @@ sub outputNode ($$$) { // in three-byte mode, and three in four-byte mode // first two bytes are low-word of fco, regardless of format - for ( my $i = 1; $i >= 0; --$i ) { - my $tmp = ($fco >> ($i * 8)) & 0xFF; - print $outfile pack( "C", $tmp ); + for ( int i = 1; i >= 0; --i ) { + unsigned char tmp = (fco >> (i * 8)) & 0xFF; + fwrite( &tmp, 1, 1, outfile ); } - $fco >>= 16; // it should now be 1 or 0 - die "fco not 1 or 0" if $fco > 1; - - my $chIn5 = TrieNodeGetLetter($node); - my $bits = $chIn5; - die "$0: char $bits too big" if $bits > 0x1F && $nBytes == 3; - - if ( TrieNodeGetIsLastSibling($node) ) { - $bits |= 0x40; + fco >>= 16; // it should now be 1 or 0 + if ( fco > 1 ) { + error_exit( "fco not 1 or 0" ); } - if ( TrieNodeGetIsTerminal($node) ) { - $bits |= 0x80; + + unsigned char chIn5 = TrieNodeGetLetter(node); + unsigned char bits = chIn5; + if ( bits > 0x1F && nBytes == 3 ) { + error_exit( "char %d too big", bits ); + } + + if ( TrieNodeGetIsLastSibling(node) ) { + bits |= 0x40; + } + if ( TrieNodeGetIsTerminal(node) ) { + bits |= 0x80; } // We set the 17th next-node bit only in 3-byte case (where char is // 5 bits) - if ( $nBytes == 3 && $fco != 0 ) { - $bits |= 0x20; + if ( nBytes == 3 && fco != 0 ) { + bits |= 0x20; } - print $outfile pack( "C", $bits ); + fwrite( &bits, 1, 1, outfile ); // the final byte, if in use - if ( $nBytes == 4 ) { - print $outfile pack( "C", $fourthByte ); + if ( nBytes == 4 ) { + unsigned char tmp = (unsigned char)fourthByte; + fwrite( &tmp, 1, 1, outfile ); } } // outputNode -#endif static void usage( const char* name ) @@ -903,36 +960,17 @@ usage( const char* name ) } // usage static void -error_exit( const char* fmt, ... ) +error_exit( int line, const char* fmt, ... ) { + fprintf( stderr, "Line %d: ", line ); va_list ap; va_start( ap, fmt ); vfprintf( stderr, fmt, ap ); va_end( ap ); + fprintf( stderr, "\n" ); exit( 1 ); } -static char -parsechar( const char* in ) -{ - char result = *in++; - if ( '\\' == result ) { - switch ( *in ) { - case 'n': - result = '\n'; - break; - case '0': - result = '\0'; - break; - default: - assert(0); - break; - } - } - - return result; -} - static char* parseARGV( int argc, char** argv ) { @@ -959,7 +997,7 @@ parseARGV( int argc, char** argv ) } else if ( 0 == strcmp( arg, "-k" ) ) { gKillIfMissing = true; } else if ( 0 == strcmp( arg, "-term" ) ) { - gTermChar = parsechar(argv[index++]); + gTermChar = (char)atoi(argv[index++]); } else if ( 0 == strcmp( arg, "-dump" ) ) { gDumpText = true; } else if ( 0 == strcmp( arg, "-nosort" ) ) {