From 3df1e461e4d95b02a19b5e50765ef7e7ab9bddb4 Mon Sep 17 00:00:00 2001 From: ehouse Date: Fri, 14 Apr 2006 05:23:30 +0000 Subject: [PATCH] For already-sorted case, read words from file on as-needed basis rather than build a vector to hold them. --- xwords4/dawg/dict2dawg.cpp | 277 ++++++++++++++++++++++++------------- 1 file changed, 181 insertions(+), 96 deletions(-) diff --git a/xwords4/dawg/dict2dawg.cpp b/xwords4/dawg/dict2dawg.cpp index 525b2e121..fe80c3e3f 100644 --- a/xwords4/dawg/dict2dawg.cpp +++ b/xwords4/dawg/dict2dawg.cpp @@ -56,19 +56,22 @@ typedef std::vector WordList; #define MAX_WORD_LEN 15 int gFirstDiff; -char* gCurrentWord = ""; -int gCurrentWordLen; + +static char gCurrentWordBuf[MAX_WORD_LEN+1] = { '\0' }; + // this will never change for non-sort case +static char* gCurrentWord = gCurrentWordBuf; +static int gCurrentWordLen; + char* gCurWord = NULL; // save so can check for sortedness bool gDone = false; -WordList* gInputStrings; static int gNextWordIndex; -bool gNeedsSort = true; +static void (*gReadWordProc)(void) = NULL; NodeList gNodes; // final array of nodes unsigned int gNBytesPerOutfile = 0xFFFFFFFF; char* gTableFile = NULL; char* gOutFileBase = NULL; char* gStartNodeOut = NULL; -char* gInFileName = NULL; +static FILE* gInFile = NULL; bool gKillIfMissing = true; char gTermChar = '\n'; bool gDumpText = false; // dump the dict as text after? @@ -91,14 +94,13 @@ bool gUseUnicode; #define MAX_POOL_SIZE 3000000 #define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ ); -static char* parseARGV( int argc, char** argv ); +static char* parseARGV( int argc, char** argv, const char** inFileName ); static void usage( const char* name ); static void error_exit( int line, const char* fmt, ... ); static char parsechar( const char* in ); static void makeTableHash( void ); static WordList* parseAndSort( FILE* file ); static void printWords( WordList* strings ); -static void readNextWord( void ); static bool firstBeforeSecond( const char* lhs, const char* rhs ); static char* tileToAscii( char* out, int outSize, const char* in ); static int buildNode( int depth ); @@ -124,34 +126,21 @@ static void writeOutStartNode( const char* startNodeOut, static void emitNodes( unsigned int nBytesPerOutfile, const char* outFileBase ); static void outputNode( Node node, int nBytes, FILE* outfile ); static void printOneLevel( int index, char* str, int curlen ); +static void readFromSortedArray( void ); int main( int argc, char** argv ) { - if ( NULL == parseARGV( argc, argv ) ) { + gReadWordProc = readFromSortedArray; + + const char* inFileName; + if ( NULL == parseARGV( argc, argv, &inFileName ) ) { usage(argv[0]); exit(1); } makeTableHash(); - FILE* infile; - if ( gInFileName ) { - infile = fopen( gInFileName, "r" ); - } else { - infile = stdin; - } - - gInputStrings = parseAndSort( infile ); - gNextWordIndex = 0; - if ( gInFileName ) { - fclose( infile ); - } -#ifdef DEBUG - if ( gDebug ) { - printWords( gInputStrings ); - } -#endif // Do I need this stupid thing? Better to move the first row to // the front of the array and patch everything else. Or fix the // non-palm dictionary format to include the offset of the first @@ -160,8 +149,14 @@ main( int argc, char** argv ) Node dummyNode = (Node)0xFFFFFFFF; assert( sizeof(Node) == 4 ); gNodes.push_back(dummyNode); + + if ( NULL == inFileName ) { + gInFile = stdin; + } else { + gInFile = fopen( inFileName, "r" ); + } - readNextWord(); + (*gReadWordProc)(); int firstRootChildOffset = buildNode(0); moveTopToFront( &firstRootChildOffset ); @@ -201,6 +196,11 @@ main( int argc, char** argv ) fclose( OFILE ); } fprintf( stderr, "Used %d per node.\n", gNBytesPerNode ); + + if ( NULL != inFileName ) { + fclose( gInFile ); + } + } /* main */ // We now have an array of nodes with the last subarray being the @@ -264,7 +264,7 @@ buildNode( int depth ) // End of word reached. If the next word isn't a continuation // of the current one, then we've reached the bottom of the // recursion tree. - readNextWord(); + (*gReadWordProc)(); if (gFirstDiff < depth || gDone) { return 0; } @@ -373,14 +373,28 @@ registerSubArray( NodeList& edgesR, int nodeLoc ) } // registerSubArray static void -readNextWord( void ) +readFromSortedArray( void ) { + // The first time we need a new word, we read 'em all in. + static WordList* sInputStrings = NULL; // we'll just let this leak + + if ( sInputStrings == NULL ) { + sInputStrings = parseAndSort( gInFile ); + gNextWordIndex = 0; + +#ifdef DEBUG + if ( gDebug ) { + printWords( sInputStrings ); + } +#endif + } + char* word = ""; if ( !gDone ) { - gDone = gNextWordIndex == gInputStrings->size(); + gDone = gNextWordIndex == sInputStrings->size(); if ( !gDone ) { - word = gInputStrings->at(gNextWordIndex++); + word = sInputStrings->at(gNextWordIndex++); #ifdef DEBUG } else if ( gDebug ) { fprintf( stderr, "gDone set to true\n" ); @@ -422,7 +436,115 @@ readNextWord( void ) tileToAscii( buf, sizeof(buf), gCurrentWord) ); } #endif -} // readNextWord +} // readFromSortedArray + +static char* +readOneWord( char* wordBuf, int bufLen, int* lenp, bool* gotEOF ) +{ + char* result = NULL; + int count = 0; + bool dropWord = false; + bool done = false; + + // for each byte + for ( ; ; ) { + int byt = getc( gInFile ); + + // EOF is special: we don't try for another word even if + // dropWord is true; we must leave now. + if ( byt == EOF || byt == gTermChar ) { + *gotEOF = byt == EOF; + + if ( !dropWord || *gotEOF ) { + if ( count != 0 ) { + wordBuf[count] = '\0'; + result = wordBuf; + *lenp = count; + ++gWordCount; + } + break; // we've finished a word + } else if ( *gotEOF ) { + break; + } + + // Don't call into the hashtable twice here!! + } else if ( gTableHash.find(byt) != gTableHash.end() ) { + if ( !dropWord ) { + wordBuf[count++] = (char)gTableHash[byt]; + if ( count >= bufLen ) { + char buf[MAX_WORD_LEN+1]; + ERROR_EXIT( "word starting \"%s\" too long", + tileToAscii( buf, sizeof(buf), wordBuf )); + } + } + } else if ( gKillIfMissing ) { + char buf[MAX_WORD_LEN+1]; + ERROR_EXIT( "chr %c (%d) not in map file %s\n" + "last word was %s\n", + byt, (int)byt, gTableFile, + tileToAscii( buf, sizeof(buf), wordBuf ) ); + } else { + dropWord = true; + count = 0; // lose anything we already have + } + } + +// if ( NULL != result ) { +// char buf[MAX_WORD_LEN+1]; +// fprintf( stderr, "%s returning %s\n", __FUNCTION__, +// tileToAscii( buf, sizeof(buf), result ) ); +// } + return result; +} // readOneWord + +static void +readFromFile( void ) +{ + char wordBuf[MAX_WORD_LEN+1]; + static bool s_eof = false;; + char* word; + int len; + + gDone = s_eof; + if ( !gDone ) { + word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof ); + gDone = NULL == word; + } + if ( gDone ) { + word = ""; + len = 0; + } + + int numCommonLetters = 0; + if ( gCurrentWordLen < len ) { + len = gCurrentWordLen; + } + + while ( gCurrentWord[numCommonLetters] == word[numCommonLetters] + && numCommonLetters < len ) { + ++numCommonLetters; + } + + gFirstDiff = numCommonLetters; + if ( (gCurrentWordLen > 0) && (strlen(word) > 0) + && !firstBeforeSecond( gCurrentWord, word ) ) { + char buf1[MAX_WORD_LEN+1]; + char buf2[MAX_WORD_LEN+1]; + ERROR_EXIT( "words %s and %s are out of order\n", + tileToAscii( buf1, sizeof(buf1), gCurrentWord ), + tileToAscii( buf2, sizeof(buf2), word ) ); + } + gCurrentWordLen = strlen(word); + strncpy( gCurrentWordBuf, word, sizeof(gCurrentWordBuf) ); + +#ifdef DEBUG + if ( gDebug ) { + char buf[MAX_WORD_LEN+1]; + fprintf( stderr, "gCurrentWord now %s\n", + tileToAscii( buf, sizeof(buf), gCurrentWord) ); + } +#endif +} // readFromFile static bool firstBeforeSecond( const char* lhs, const char* rhs ) @@ -456,75 +578,37 @@ parseAndSort( FILE* infile ) // allocate storage for the actual chars. wordlist's char* // elements will point into this. It'll leak. So what. - char* str = (char*)malloc( MAX_POOL_SIZE ); - assert( NULL != str ); - - std::string word; -#ifdef DEBUG - std::string asciiWord; -#endif + int memleft = MAX_POOL_SIZE; + char* str = (char*)malloc( memleft ); + if ( NULL == str ) { + ERROR_EXIT( "can't allocate main string storage" ); + } + bool eof = false; for ( ; ; ) { + int len; + char buf[MAX_WORD_LEN+1]; + char* word = readOneWord( str, memleft, &len, &eof ); - bool dropWord = false; - word.clear(); + if ( NULL == word ) { + break; + } - // for each byte - for ( ; ; ) { - int byt = getc( infile ); + wordlist->push_back( str ); + ++len; // include null byte + str += len; + memleft -= len; + ++gWordCount; - if ( byt == EOF ) { - goto done; - } else if ( byt == gTermChar ) { - if ( !dropWord ) { - int len = word.length() + 1; - memcpy( str, word.c_str(), len); - wordlist->push_back( str ); - str += len; - ++gWordCount; -#ifdef DEBUG - if ( gDebug ) { - char buf[MAX_WORD_LEN+1]; - fprintf( stderr, "loaded %s\n", asciiWord.c_str() ); - } -#endif - } -#ifdef DEBUG - asciiWord.clear(); -#endif - break; - - // Don't call into the hashtable twice here!! - } else if ( gTableHash.find(byt) != gTableHash.end() ) { - if ( !dropWord ) { -#if defined DEBUG && defined SEVERE_DEBUG - if ( gDebug ) { - fprintf( stderr, "adding %d for %c\n", - gTableHash[byt], (char)byt ); - } -#endif - word += (char)gTableHash[byt]; - assert( word.size() <= MAX_WORD_LEN ); -#ifdef DEBUG - if ( gKillIfMissing ) { - asciiWord += byt; - } -#endif - } - } else if ( gKillIfMissing ) { - char buf[MAX_WORD_LEN+1]; - ERROR_EXIT( "chr %c (%d) not in map file %s\n" - "last word was %s\n", - byt, (int)byt, gTableFile, - tileToAscii( buf, sizeof(buf), word.c_str() ) ); - } else { - dropWord = true; - word.clear(); // lose anything we already have - } + if ( eof ) { + break; + } + if ( memleft <= 0 ) { + ERROR_EXIT( "no memory left\n" ); } } - done: - if ( gNeedsSort && (gWordCount > 1) ) { + + if ( gWordCount > 1 ) { #ifdef DEBUG if ( gDebug ) { fprintf( stderr, "starting sort...\n" ); @@ -928,8 +1012,9 @@ error_exit( int line, const char* fmt, ... ) } static char* -parseARGV( int argc, char** argv ) +parseARGV( int argc, char** argv, const char** inFileName ) { + *inFileName = NULL; int index = 1; while ( index < argc ) { @@ -947,7 +1032,7 @@ parseARGV( int argc, char** argv ) } else if ( 0 == strcmp( arg, "-sn" ) ) { gStartNodeOut = argv[index++]; } else if ( 0 == strcmp( arg, "-if" ) ) { - gInFileName = argv[index++]; + *inFileName = argv[index++]; } else if ( 0 == strcmp( arg, "-r" ) ) { gKillIfMissing = false; } else if ( 0 == strcmp( arg, "-k" ) ) { @@ -957,7 +1042,7 @@ parseARGV( int argc, char** argv ) } else if ( 0 == strcmp( arg, "-dump" ) ) { gDumpText = true; } else if ( 0 == strcmp( arg, "-nosort" ) ) { - gNeedsSort = false; + gReadWordProc = readFromFile; } else if ( 0 == strcmp( arg, "-wc" ) ) { gCountFile = argv[index++]; } else if ( 0 == strcmp( arg, "-ns" ) ) {