/* -*- compile-command: "g++ -g -o dict2dawg dict2dawg.cpp"; -*- */
/*************************************************************************
 * adapted from perl code that was itself adapted from C++ code
 * Copyright (C) 2000 Falk Hueffner
 * This version Copyright (C) 2002,2006 Eric House (xwords@eehouse.org)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 **************************************************************************
 * inputs: 0. Name of file mapping letters to 0..31 values.  In English
 *            case just contains A..Z.  This will be used to translate the
 *            tries on output.
 *         1. Max number of bytes per binary output file.
 *
 *         2. Basename of binary files for output.
 *         3. Name of file to which to write the number of the
 *            startNode, since I'm not rewriting a bunch of code to expect
 *            Falk's '*' node at the start.
 *
 *         In STDIN, the text file to be compressed.  It absolutely
 *         must be sorted.  The sort doesn't have to follow the order in
 *         the map file, however.
 *
 * This is meant eventually to be runnable as part of a cgi system for
 * letting users generate Crosswords dicts online.
 **************************************************************************/

// NOTE(review): the header names and template arguments below were lost
// from the copy of this file being reviewed and have been reconstructed
// from how the names are used; confirm against the pristine source.
#include <assert.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <netinet/in.h>         // htonl

#include <algorithm>
#include <map>
#include <string>
#include <vector>

typedef unsigned int Node;
typedef std::vector<Node> NodeList;
typedef std::vector<char*> WordList;

#define MAX_WORD_LEN 15

int gFirstDiff;
// const: this starts out pointing at a string literal and is only ever
// read through.
const char* gCurrentWord = "";
int gCurrentWordLen;
char* gCurWord = NULL;          // save so can check for sortedness
bool gDone = false;
WordList* gInputStrings;
static int gNextWordIndex;

bool gNeedsSort = true;

NodeList gNodes;                // final array of nodes
unsigned int gNBytesPerOutfile = 0xFFFFFFFF;
char* gTableFile = NULL;
char* gOutFileBase = NULL;
char* gStartNodeOut = NULL;
char* gInFileName = NULL;
bool gKillIfMissing = true;
char gTermChar = '\n';
bool gDumpText = false;         // dump the dict as text after?
char* gCountFile = NULL;
char* gBytesPerNodeFile = NULL; // where to write whether node size 3 or 4
int gWordCount = 0;
std::map<int, int> gTableHash;  // input byte (as returned by getc) => tile
int gBlankIndex;
std::vector<char> gRevMap;      // tile => character read from the map file
#ifdef DEBUG
bool gDebug = false;
#endif
std::map<NodeList, int> gSubsHash; // sibling array => its offset in gNodes
bool gForceFour = false;        // use four bytes regardless of need?
int gNBytesPerNode;
bool gUseUnicode;

#define MAX_POOL_SIZE 1000000

#define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ );

static char* parseARGV( int argc, char** argv );
static void usage( const char* name );
static void error_exit( int line, const char* fmt, ... );
static char parsechar( const char* in );
static void makeTableHash( void );
static WordList* parseAndSort( FILE* file );
static void printWords( WordList* strings );
static void readNextWord( void );
static bool firstBeforeSecond( const char* lhs, const char* rhs );
static char* tileToAscii( char* out, int outSize, const char* in );
static int buildNode( int depth );
static int addNodes( NodeList& newedgesR );
static void TrieNodeSetIsTerminal( Node* nodeR, bool isTerminal );
static bool TrieNodeGetIsTerminal( Node node );
static void TrieNodeSetIsLastSibling( Node* nodeR, bool isLastSibling );
static bool TrieNodeGetIsLastSibling( Node node );
static void TrieNodeSetLetter( Node* nodeR, int letter );
static int TrieNodeGetLetter( Node node );
static void TrieNodeSetFirstChildOffset( Node* nodeR, int fco );
static int TrieNodeGetFirstChildOffset( Node node );
static int findSubArray( NodeList& newedgesR );
static void registerSubArray( NodeList& edgesR, int nodeLoc );
static Node MakeTrieNode( int letter, bool isTerminal,
                          int firstChildOffset, bool isLastSibling );
static void printNodes( NodeList& nodesR );
static void printNode( int index, Node node );
static void moveTopToFront( int* firstRef );
static void writeOutStartNode( const char* startNodeOut,
                               int firstRootChildOffset );
static void emitNodes( unsigned int nBytesPerOutfile,
                       const char* outFileBase );
static void outputNode( Node node, int nBytes, FILE* outfile );
static void printOneLevel( int index, char* str, int curlen );

// Program entry point.  Parses the command line, loads the letter map,
// reads (and optionally sorts) the word list, builds the node array, and
// then writes whichever outputs were requested: the start-node file, the
// word-count file, the binary node files, a text dump on stderr and the
// bytes-per-node file.
int
main( int argc, char** argv )
{
    if ( NULL == parseARGV( argc, argv ) ) {
        usage(argv[0]);
        exit(1);
    }

    makeTableHash();

    FILE* infile = stdin;
    if ( gInFileName ) {
        infile = fopen( gInFileName, "r" );
        if ( NULL == infile ) {
            ERROR_EXIT( "unable to open input file %s", gInFileName );
        }
    }
    gInputStrings = parseAndSort( infile );
    gNextWordIndex = 0;
    if ( gInFileName ) {
        fclose( infile );
    }

#ifdef DEBUG
    if ( gDebug ) {
        printWords( gInputStrings );
    }
#endif

    // Do I need this stupid thing?  Better to move the first row to
    // the front of the array and patch everything else.  Or fix the
    // non-palm dictionary format to include the offset of the first
    // node.
    Node dummyNode = (Node)0xFFFFFFFF;
    assert( sizeof(Node) == 4 );
    gNodes.push_back(dummyNode);

    readNextWord();

    int firstRootChildOffset = buildNode(0);
    moveTopToFront( &firstRootChildOffset );

    if ( gStartNodeOut ) {
        writeOutStartNode( gStartNodeOut, firstRootChildOffset );
    }

#ifdef DEBUG
    if ( gDebug ) {
        fprintf( stderr, "\n... dumping table ...\n" );
        printNodes( gNodes );
    }
#endif

    // write out the number of words if requested
    if ( gCountFile ) {
        FILE* countFile = fopen( gCountFile, "w" );
        if ( NULL == countFile ) {
            ERROR_EXIT( "unable to open count file %s", gCountFile );
        }
        // htonl() yields a 32-bit value; holding it in an unsigned long
        // would emit 8 bytes where long is 64 bits wide.
        uint32_t be = htonl( gWordCount );
        fwrite( &be, sizeof(be), 1, countFile );
        fclose( countFile );
        fprintf( stderr, "wrote out: got %d words\n", gWordCount );
    }

    if ( gOutFileBase ) {
        emitNodes( gNBytesPerOutfile, gOutFileBase );
    }

    if ( gDumpText && gNodes.size() > 0 ) {
        // printOneLevel may emit two characters per tile
        char buf[(MAX_WORD_LEN*2)+1];
        printOneLevel( firstRootChildOffset, buf, 0 );
    }

    if ( gBytesPerNodeFile ) {
        FILE* sizeFile = fopen( gBytesPerNodeFile, "w" );
        if ( NULL == sizeFile ) {
            ERROR_EXIT( "unable to open bytes-per-node file %s",
                        gBytesPerNodeFile );
        }
        fprintf( sizeFile, "%d", gNBytesPerNode );
        fclose( sizeFile );
    }
    fprintf( stderr, "Used %d per node.\n", gNBytesPerNode );
    return 0;
} /* main */

// We now have an array of nodes with the last subarray being the
// logical top of the tree.  Move them to the start, fixing all fco
// refs, so that legacy code like Palm can assume top==0.
//
// Note: It'd probably be a bit faster to integrate this with emitNodes
// -- unless I need to have an in-memory list that can be used for
// lookups.  But that's best for debugging, so keep it this way for now.
//
// Also Note: the first node is a dummy that can and should be tossed
// now.
static void moveTopToFront( int* firstRef ) { int firstChild = *firstRef; *firstRef = 0; NodeList lastSub; if ( firstChild > 0 ) { lastSub.assign( gNodes.begin() + firstChild, gNodes.end() ); gNodes.erase( gNodes.begin() + firstChild, gNodes.end() ); } else if ( gWordCount != 0 ) { ERROR_EXIT( "there should be no words!!" ); } // remove the first (garbage) node gNodes.erase( gNodes.begin() ); int diff; if ( firstChild > 0 ) { // -1 because all move down by 1; see prev line diff = lastSub.size() - 1; if ( diff < 0 ) { ERROR_EXIT( "something wrong with lastSub.size()" ); } } else { diff = 0; } // stick it on the front gNodes.insert( gNodes.begin(), lastSub.begin(), lastSub.end() ); // We add diff to everything. There's no subtracting because // nobody had any refs to the top list. for ( int i = 0; i < gNodes.size(); ++i ) { int fco = TrieNodeGetFirstChildOffset( gNodes[i] ); if ( fco != 0 ) { // 0 means NONE, not 0th!! TrieNodeSetFirstChildOffset( &gNodes[i], fco + diff ); } } } // moveTopToFront static int buildNode( int depth ) { if ( gCurrentWordLen == depth ) { // End of word reached. If the next word isn't a continuation // of the current one, then we've reached the bottom of the // recursion tree. readNextWord(); if (gFirstDiff < depth || gDone) { return 0; } } NodeList newedges; bool wordEnd; do { char letter = gCurrentWord[depth]; bool isTerminal = (gCurrentWordLen - 1) == depth; int nodeOffset = buildNode( depth + 1 ); Node newNode = MakeTrieNode( letter, isTerminal, nodeOffset, false ); wordEnd = (gFirstDiff != depth) || gDone; if ( wordEnd ) { TrieNodeSetIsLastSibling( &newNode, true ); } newedges.push_back( newNode ); } while ( !wordEnd ); return addNodes( newedges ); } // buildNode static int addNodes( NodeList& newedgesR ) { int found = findSubArray( newedgesR ); if ( found == 0 ) { ERROR_EXIT( "0 is an invalid match!!!" 
); } if ( found < 0 ) { found = gNodes.size(); #ifdef DEBUG if ( gDebug ) { fprintf( stderr, "adding...\n" ); printNodes( newedgesR ); } #endif gNodes.insert( gNodes.end(), newedgesR.begin(), newedgesR.end() ); registerSubArray( newedgesR, found ); } #ifdef DEBUG if ( gDebug ) { fprintf( stderr, "%s => %d\n", __FUNCTION__, found ); } #endif return found; } // addNodes static void printNode( int index, Node node ) { int letter = TrieNodeGetLetter(node); assert( letter < gRevMap.size() ); fprintf( stderr, "[%d] letter=%d(%c); isTerminal=%s; isLastSib=%s; fco=%d;\n", index, letter, gRevMap[letter], TrieNodeGetIsTerminal(node)?"true":"false", TrieNodeGetIsLastSibling(node)?"true":"false", TrieNodeGetFirstChildOffset(node)); } // printNode static void printNodes( NodeList& nodesR ) { for ( int i = 0; i < nodesR.size(); ++i ) { Node node = nodesR[i]; printNode( i, node ); } } // Hashing. We'll keep a hash of offsets into the existing nodes // array, and as the key use a string that represents the entire sub // array. Since the key is what we're matching for, there should never // be more than one value per hash and so we don't need buckets. // Return -1 if there's no match. static int findSubArray( NodeList& newedgesR ) { std::map::iterator iter = gSubsHash.find( newedgesR ); if ( iter != gSubsHash.end() ) { return iter->second; } else { return -1; } } // findSubArray // add to the hash static void registerSubArray( NodeList& edgesR, int nodeLoc ) { #ifdef DEBUG std::map::iterator iter = gSubsHash.find( edgesR ); if ( iter != gSubsHash.end() ) { ERROR_EXIT( "entry for key shouldn't exist!!" 
); } #endif gSubsHash[edgesR] = nodeLoc; } // registerSubArray #if 0 sub toWord($) { my ( $tileARef ) = @_; my $word = ""; foreach my $tile (@$tileARef) { foreach my $letter (keys (%gTableHash) ) { if ( $tile == $gTableHash{$letter} ) { $word .= $letter; last; } } } return $word; } #endif static void readNextWord( void ) { char* word = ""; if ( !gDone ) { gDone = gNextWordIndex == gInputStrings->size(); if ( !gDone ) { word = gInputStrings->at(gNextWordIndex++); #ifdef DEBUG } else if ( gDebug ) { fprintf( stderr, "gDone set to true\n" ); #endif } #ifdef DEBUG if ( gDebug ) { fprintf( stderr, "got word: %s\n", word ); } #endif } int numCommonLetters = 0; int len = strlen( word ); if ( gCurrentWordLen < len ) { len = gCurrentWordLen; } while ( gCurrentWord[numCommonLetters] == word[numCommonLetters] && numCommonLetters < len ) { ++numCommonLetters; } gFirstDiff = numCommonLetters; if ( (gCurrentWordLen > 0) && (strlen(word) > 0) && !firstBeforeSecond( gCurrentWord, word ) ) { char buf1[MAX_WORD_LEN+1]; char buf2[MAX_WORD_LEN+1]; tileToAscii( buf1, sizeof(buf1), gCurrentWord ); tileToAscii( buf2, sizeof(buf2), word ); ERROR_EXIT( "words %s and %s are out of order\n", buf1, buf2 ); } gCurrentWord = word; gCurrentWordLen = strlen(word); #ifdef DEBUG if ( gDebug ) { char buf[MAX_WORD_LEN+1]; fprintf( stderr, "gCurrentWord now %s\n", tileToAscii( buf, sizeof(buf), gCurrentWord) ); } #endif } // readNextWord static bool firstBeforeSecond( const char* lhs, const char* rhs ) { char sl[16]; char sr[16]; // tileToAscii( sl, lhs ); // tileToAscii( sr, rhs ); bool gt = 0 > strcmp( lhs, rhs ); // fprintf( stderr, "comparing %s, %s; returning %s\n", // sl, sr, gt?"true":"false" ); return gt; } #if 0 // passed to sort. Should remain unprototyped for effeciency's sake sub cmpWords { my $lenA = @{$a}; my $lenB = @{$b}; my $min = $lenA > $lenB? 
$lenB: $lenA; for ( my $i = 0; $i < $min; ++$i ) { my $ac = ${$a}[$i]; my $bc = ${$b}[$i]; my $res = $ac <=> $bc; if ( $res != 0 ) { return $res; // we're done } } // If we got here, they match up to their common length. Longer is // greater. my $res = @{$a} <=> @{$b}; return $res; // which is longer? } // cmpWords #endif static char* tileToAscii( char* out, int outSize, const char* in ) { char* orig = out; for ( ; ; ) { char ch = *in++; if ( '\0' == ch ) { *out = '\0'; break; } assert( ch < gRevMap.size() ); *out++ = gRevMap[ch]; assert( (out - orig) < outSize ); } return orig; } static WordList* parseAndSort( FILE* infile ) { WordList* wordlist = new WordList; // allocate storage for the actual chars. wordlist's char* // elements will point into this. It'll leak. So what. // void* pool = malloc( MAX_POOL_SIZE ); // assert( NULL != pool ); // memset( pool, 0, MAX_POOL_SIZE ); std::string word; std::string asciiWord; for ( ; ; ) { bool dropWord = false; word.clear(); // for each byte for ( ; ; ) { int byt = getc( infile ); if ( byt == EOF ) { goto done; } else if ( byt == gTermChar ) { if ( !dropWord ) { int len = word.length() + 1; char* str = (char*)malloc( len ); assert( str ); memcpy( str, word.c_str(), len); wordlist->push_back( str ); ++gWordCount; #ifdef DEBUG if ( gDebug ) { char buf[MAX_WORD_LEN+1]; fprintf( stderr, "loaded %s\n", asciiWord.c_str() ); fprintf( stderr, "from tiles: %s\n", tileToAscii( buf, sizeof(buf), str ) ); } #endif } asciiWord = ""; break; } else if ( gTableHash.find(byt) != gTableHash.end() ) { if ( !dropWord ) { #ifdef DEBUG if ( gDebug ) { fprintf( stderr, "adding %d for %c\n", gTableHash[byt], (char)byt ); } #endif word += (char)gTableHash[byt]; assert( word.size() <= MAX_WORD_LEN ); if ( gKillIfMissing ) { asciiWord += byt; } } } else if ( gKillIfMissing ) { ERROR_EXIT( "chr %c (%d) not in map file %s\n" "last word was %s\n", byt, (int)byt, gTableFile, asciiWord.c_str() ); } else { dropWord = true; word = ""; // lose anything we 
already have } } } done: if ( gNeedsSort && (gWordCount > 1) ) { #ifdef DEBUG if ( gDebug ) { fprintf( stderr, "starting sort...\n" ); } #endif std::sort( wordlist->begin(), wordlist->end(), firstBeforeSecond ); #ifdef DEBUG if ( gDebug ) { fprintf( stderr, "sort finished\n" ); } #endif } return wordlist; } // parseAndSort static void printWords( std::vector* strings ) { std::vector::iterator iter = strings->begin(); while ( iter != strings->end() ) { char buf[MAX_WORD_LEN+1]; tileToAscii( buf, sizeof(buf), *iter ); fprintf( stderr, "%s\n", buf ); ++iter; } // for ( int i = 0; i < strings->size(); ++i ) { // char* str = strings[i]; // } } #if 0 // Print binary representation of trie array. This isn't used yet, but // eventually it'll want to dump to multiple files appropriate for Palm // that can be catenated together on other platforms. There'll need to // be a file giving the offset of the first node too. Also, might want // to move to 4-byte representation when the input can't otherwise be // handled. sub dumpNodes { for ( my $i = 0; $i < @gNodes; ++$i ) { my $node = $gNodes[$i]; my $bstr = pack( "I", $node ); print STDOUT $bstr; } } #endif /***************************************************************************** * Little node-field setters and getters to hide what bits represent * what. 
* high bit (31) is ACCEPTING bit * next bit (30) is LAST_SIBLING bit * next 6 bits (29-24) are tile bit (allowing alphabets of 64 letters) * final 24 bits (23-0) are the index of the first child (fco) ******************************************************************************/ static void TrieNodeSetIsTerminal( Node* nodeR, bool isTerminal ) { if ( isTerminal ) { *nodeR |= (1 << 31); } else { *nodeR &= ~(1 << 31); } } static bool TrieNodeGetIsTerminal( Node node ) { return (node & (1 << 31)) != 0; } static void TrieNodeSetIsLastSibling( Node* nodeR, bool isLastSibling ) { if ( isLastSibling ) { *nodeR |= (1 << 30); } else { *nodeR &= ~(1 << 30); } } static bool TrieNodeGetIsLastSibling( Node node ) { return (node & (1 << 30)) != 0; } static void TrieNodeSetLetter( Node* nodeR, int letter ) { if( letter >= 64 ) { ERROR_EXIT( "letter %d too big", letter ); } int mask = ~(0x3F << 24); *nodeR &= mask; // clear all the bits *nodeR |= (letter << 24); // set new ones } static int TrieNodeGetLetter( Node node ) { node >>= 24; node &= 0x3F; // is 3f ok for 3-byte case??? return node; } static void TrieNodeSetFirstChildOffset( Node* nodeR, int fco ) { if ( (fco & 0xFF000000) != 0 ) { ERROR_EXIT( "%x larger than 24 bits", fco ); } int mask = ~0x00FFFFFF; *nodeR &= mask; // clear all the bits *nodeR |= fco; // set new ones } static int TrieNodeGetFirstChildOffset( Node node ) { node &= 0x00FFFFFF; // 24 bits return node; } static Node MakeTrieNode( int letter, bool isTerminal, int firstChildOffset, bool isLastSibling ) { Node result = 0; TrieNodeSetIsTerminal( &result, isTerminal ); TrieNodeSetIsLastSibling( &result, isLastSibling ); TrieNodeSetLetter( &result, letter ); TrieNodeSetFirstChildOffset( &result, firstChildOffset ); return result; } // MakeTrieNode // Caller may need to know the offset of the first top-level node. // Write it here. 
static void writeOutStartNode( const char* startNodeOut, int firstRootChildOffset ) { FILE* NODEOUT; NODEOUT = fopen( startNodeOut, "w" ); unsigned long be = htonl( firstRootChildOffset ); (void)fwrite( &be, sizeof(be), 1, NODEOUT ); fclose( NODEOUT ); } // writeOutStartNode // build the hash for translating. I'm using a hash assuming it'll be // fast. Key is the letter; value is the 0..31 value to be output. static void makeTableHash( void ) { int i; FILE* TABLEFILE = fopen( gTableFile, "r" ); // open TABLEFILE, "< $gTableFile"; //splice @gRevMap; // empty it for ( i = 0; ; ++i ) { int ch = getc(TABLEFILE); if ( ch == EOF ) { break; } if ( gUseUnicode ) { // skip the first byte each time: tmp HACK!!! ch = getc(TABLEFILE); } if ( ch == EOF ) { break; } // push @gRevMap, $ch; gRevMap.push_back(ch); if ( ch == 0 ) { // blank gBlankIndex = i; // we want to increment i when blank seen since it is a // tile value continue; } // die "$0: $gTableFile too large\n" assert( i < 64 ); // die "$0: only blank (0) can be 64th char\n" ; assert( i < 64 || ch == 0 ); gTableHash[ch] = i; } fclose( TABLEFILE ); } // makeTableHash // emitNodes. "input" is $gNodes. From it we write up to // $nBytesPerOutfile to files named $outFileBase0..n, mapping the // letter field down to 5 bits with a hash built from $tableFile. If // at any point we encounter a letter not in the hash we fail with an // error. static void emitNodes( unsigned int nBytesPerOutfile, const char* outFileBase ) { // now do the emit. // is 17 bits enough? fprintf( stderr, "There are %d (0x%x) nodes in this DAWG.\n", gNodes.size(), gNodes.size() ); int nTiles = gTableHash.size(); // blank is not included in this count! 
if ( gNodes.size() > 0x1FFFF || gForceFour || nTiles > 32 ) { gNBytesPerNode = 4; } else if ( nTiles < 32 ) { gNBytesPerNode = 3; } else { if ( gBlankIndex == 32 ) { // blank fprintf( stderr, "blank's at 32; 3-byte-nodes still ok\n" ); gNBytesPerNode = 3; } else { ERROR_EXIT( "move blank to last position in info.txt " "for smaller DAWG." ); } } int nextIndex = 0; int nextFileNum; for ( nextFileNum = 0; ; ++nextFileNum ) { if ( nextIndex >= gNodes.size() ) { break; // we're done } if ( nextFileNum > 99 ) { ERROR_EXIT( "Too many outfiles; infinite loop?" ); } char outName[256]; snprintf( outName, sizeof(outName), "%s_%03d.bin", outFileBase, nextFileNum); FILE* OUTFILE = fopen( outName, "w" ); assert( OUTFILE ); int curSize = 0; while ( nextIndex < gNodes.size() ) { // scan to find the next terminal int i; for ( i = nextIndex; !TrieNodeGetIsLastSibling(gNodes[i]); ++i ) { // do nothing but a sanity check if ( i >= gNodes.size() ) { ERROR_EXIT( "bad trie format: last node not last sibling" ); } } ++i; // move beyond the terminal int nextSize = (i - nextIndex) * gNBytesPerNode; if (curSize + nextSize > nBytesPerOutfile ) { break; } else { // emit the subarray while ( nextIndex < i ) { outputNode( gNodes[nextIndex], gNBytesPerNode, OUTFILE ); ++nextIndex; } curSize += nextSize; } } fclose( OUTFILE ); } } // emitNodes // print out the entire dictionary, as text, to STDERR. static void printOneLevel( int index, char* str, int curlen ) { int inlen = curlen; for ( ; ; ) { // char* newStr = str; Node node = gNodes[index++]; assert( TrieNodeGetLetter(node) < gRevMap.size() ); char lindx = gRevMap[TrieNodeGetLetter(node)]; if ( (int)lindx >= 0x20 ) { // newStr .= "$lindx"; str[curlen++] = lindx; } else { #ifdef DEBUG if ( gDebug ) { fprintf( stderr, "sub space\n" ); } #endif // $newStr .= "\\" . 
chr('0'+$lindx); str[curlen++] = '\\'; str[curlen++] = '0' + lindx; } str[curlen] = '\0'; if ( TrieNodeGetIsTerminal(node) ) { fprintf( stderr, "%s\n", str ); } int fco = TrieNodeGetFirstChildOffset( node ); if ( fco != 0 ) { printOneLevel( fco, str, curlen ); } if ( TrieNodeGetIsLastSibling(node) ) { break; } curlen = inlen; } str[inlen] = '\0'; } static void outputNode( Node node, int nBytes, FILE* outfile ) { unsigned int fco = TrieNodeGetFirstChildOffset(node); unsigned int fourthByte; if ( nBytes == 4 ) { fourthByte = fco >> 16; if ( fourthByte > 0xFF ) { ERROR_EXIT( "fco too big" ); } fco &= 0xFFFF; } // Formats are different depending on whether it's to have 3- or // 4-byte nodes. // Here's what the three-byte node looks like. 16 bits plus one // burried in the last byte for the next node address, five for a // character/tile and one each for accepting and last-edge. // 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 // |-------- 16 bits of next node address -------| | | | |-tile indx-| // | | | // accepting bit ---+ | | // last edge bit ------+ | // ---- last bit (17th on next node addr)---------+ // The four-byte format adds a byte at the right end for // addressing, but removes the extra bit (5) in order to let the // chars field be six bits. Bits 7 and 6 remain the same. // write the fco (less that one bit). 
We want two bytes worth // in three-byte mode, and three in four-byte mode // first two bytes are low-word of fco, regardless of format for ( int i = 1; i >= 0; --i ) { unsigned char tmp = (fco >> (i * 8)) & 0xFF; fwrite( &tmp, 1, 1, outfile ); } fco >>= 16; // it should now be 1 or 0 if ( fco > 1 ) { ERROR_EXIT( "fco not 1 or 0" ); } unsigned char chIn5 = TrieNodeGetLetter(node); unsigned char bits = chIn5; if ( bits > 0x1F && nBytes == 3 ) { ERROR_EXIT( "char %d too big", bits ); } if ( TrieNodeGetIsLastSibling(node) ) { bits |= 0x40; } if ( TrieNodeGetIsTerminal(node) ) { bits |= 0x80; } // We set the 17th next-node bit only in 3-byte case (where char is // 5 bits) if ( nBytes == 3 && fco != 0 ) { bits |= 0x20; } fwrite( &bits, 1, 1, outfile ); // the final byte, if in use if ( nBytes == 4 ) { unsigned char tmp = (unsigned char)fourthByte; fwrite( &tmp, 1, 1, outfile ); } } // outputNode static void usage( const char* name ) { fprintf( stderr, "usage: %s \n" "\t[-b bytesPerFile] (default = 0xFFFFFFFF)\n" "\t-m mapFile\n" "\t-mn mapFile (unicode)\n" "\t-ob outFileBase\n" "\t-sn start node out file\n" "\t[-if input file name] -- default = stdin\n" "\t[-term ch] (word terminator -- default = '\\0'\n" "\t[-nosort] (input already sorted in accord with -m; " " default=sort'\n" "\t[-dump] (write dictionary as text to STDERR for testing)\n" #ifdef DEBUG "\t[-debug] (turn on verbose output)\n" #endif "\t[-force4](use 4 bytes per node regardless of need)\n" "\t[-r] (reject words with letters not in mapfile)\n" "\t[-k] (kill if any letters not in mapfile -- default)\n", name ); } // usage static void error_exit( int line, const char* fmt, ... 
) { fprintf( stderr, "Error on line %d: ", line ); va_list ap; va_start( ap, fmt ); vfprintf( stderr, fmt, ap ); va_end( ap ); fprintf( stderr, "\n" ); exit( 1 ); } static char* parseARGV( int argc, char** argv ) { int index = 1; while ( index < argc ) { char* arg = argv[index++]; if ( 0 == strcmp( arg, "-b" ) ) { gNBytesPerOutfile = atol( argv[index++] ); } else if ( 0 == strcmp( arg, "-mn" ) ) { gTableFile = argv[index++]; gUseUnicode = true; } else if ( 0 == strcmp( arg, "-m" ) ) { gTableFile = argv[index++]; } else if ( 0 == strcmp( arg, "-ob" ) ) { gOutFileBase = argv[index++]; } else if ( 0 == strcmp( arg, "-sn" ) ) { gStartNodeOut = argv[index++]; } else if ( 0 == strcmp( arg, "-if" ) ) { gInFileName = argv[index++]; } else if ( 0 == strcmp( arg, "-r" ) ) { gKillIfMissing = false; } else if ( 0 == strcmp( arg, "-k" ) ) { gKillIfMissing = true; } else if ( 0 == strcmp( arg, "-term" ) ) { gTermChar = (char)atoi(argv[index++]); } else if ( 0 == strcmp( arg, "-dump" ) ) { gDumpText = true; } else if ( 0 == strcmp( arg, "-nosort" ) ) { gNeedsSort = false; } else if ( 0 == strcmp( arg, "-wc" ) ) { gCountFile = argv[index++]; } else if ( 0 == strcmp( arg, "-ns" ) ) { gBytesPerNodeFile = argv[index++]; } else if ( 0 == strcmp( arg, "-force4" ) ) { gForceFour = true; #ifdef DEBUG } else if ( 0 == strcmp( arg, "-debug" ) ) { gDebug = true; #endif } else { ERROR_EXIT( "unexpected arg %s", arg ); } } #ifdef DEBUG if ( gDebug ) { fprintf( stderr, "gNBytesPerOutfile=$gNBytesPerOutfile\n" ); fprintf( stderr, "gTableFile=$gTableFile\n" ); fprintf( stderr, "gOutFileBase=$gOutFileBase\n" ); fprintf( stderr, "gStartNodeOut=$gStartNodeOut\n" ); fprintf( stderr, "gTermChar=%c(%d)\n", gTermChar, (int)gTermChar ); } #endif return gTableFile; } // parseARGV