diff --git a/dawg/dict2dawg.cpp b/dawg/dict2dawg.cpp index 90e3ac7c5..c48f5655e 100644 --- a/dawg/dict2dawg.cpp +++ b/dawg/dict2dawg.cpp @@ -78,7 +78,7 @@ static void (*gReadWordProc)(void) = NULL; static NodeList gNodes; // final array of nodes static unsigned int gNBytesPerOutfile = 0xFFFFFFFF; static char* gTableFile = NULL; -static bool gIsMultibyte = false; +static bool gIsMultibyte = true; // always true static const char* gEncoding = NULL; static char* gOutFileBase = NULL; static char* gStartNodeOut = NULL; @@ -91,7 +91,7 @@ static const char* gLang = NULL; static char* gBytesPerNodeFile = NULL; // where to write whether node // size 3 or 4 int gWordCount = 0; -std::map gTableHash; +std::map gTableHash; int gBlankIndex; std::vector gRevMap; #ifdef DEBUG @@ -107,17 +107,19 @@ int gLimHigh = MAX_WORD_LEN; // OWL is 1.7M -#define MAX_POOL_SIZE (10 * 0x100000) +#define MAX_POOL_SIZE (10 * 0x100000 * sizeof(wchar_t)) #define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ ); +#define VSIZE(a) (sizeof(a)/sizeof(a[0])) static char* parseARGV( int argc, char** argv, const char** inFileName ); static void usage( const char* name ); static void error_exit( int line, const char* fmt, ... ); static void makeTableHash( void ); +static void printTableHash( void ); static WordList* parseAndSort( void ); static void printWords( WordList* strings ); static bool firstBeforeSecond( const Letter* lhs, const Letter* rhs ); -static wchar_t* tileToAscii( wchar_t* out, int outSize, const Letter* in ); +static wchar_t* tilesToText( wchar_t* out, int outLen, const Letter* in ); static int buildNode( int depth ); static void TrieNodeSetIsLastSibling( Node* nodeR, bool isLastSibling ); static int addNodes( NodeList& newedgesR ); @@ -178,6 +180,7 @@ main( int argc, char** argv ) } makeTableHash(); + printTableHash(); // Do I need this stupid thing? Better to move the first row to // the front of the array and patch everything else. Or fix the @@ -453,7 +456,7 @@ readFromSortedArray( void ) if ( gDebug ) { wchar_t buf[T2ABUFLEN(MAX_WORD_LEN)]; fprintf( stderr, "%s: got word: %ls\n", __func__, - tileToAscii( buf, sizeof(buf), word ) ); + tilesToText( buf, VSIZE(buf), word ) ); } #endif } @@ -478,8 +481,8 @@ readFromSortedArray( void ) fprintf( stderr, "%s: words %ls and %ls are the same or out of order\n", __func__, - tileToAscii( buf1, sizeof(buf1), gCurrentWord ), - tileToAscii( buf2, sizeof(buf2), word ) ); + tilesToText( buf1, VSIZE(buf1), gCurrentWord ), + tilesToText( buf2, VSIZE(buf2), word ) ); } #endif continue; @@ -494,7 +497,7 @@ readFromSortedArray( void ) if ( gDebug ) { wchar_t buf[T2ABUFLEN(MAX_WORD_LEN)]; fprintf( stderr, "gCurrentWord now %ls\n", - tileToAscii( buf, sizeof(buf), gCurrentWord) ); + tilesToText( buf, VSIZE(buf), gCurrentWord) ); } #endif } // readFromSortedArray @@ -536,7 +539,7 @@ getWideChar( FILE* file ) } // getWideChar static Letter* -readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF ) +readOneWord( Letter* wordBuf, const int bufLen, int* lenp, bool* gotEOF ) { Letter* result = NULL; int count = 0; @@ -548,7 +551,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF ) // return it. If no, start over ONLY IF the terminator was not // EOF. for ( ; ; ) { - wchar_t byt = gIsMultibyte? getWideChar( gInFile ) : getc( gInFile ); + wchar_t byt = getWideChar( gInFile ); // EOF is special: we don't try for another word even if // dropWord is true; we must leave now. @@ -560,6 +563,13 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF ) if ( !dropWord && (count >= gLimLow) && (count <= gLimHigh) ) { assert( count < bufLen ); wordBuf[count] = '\0'; +#ifdef DEBUG + if ( gDebug ) { + wchar_t buf[T2ABUFLEN(count)]; + fprintf( stderr, "%s: adding word: %ls\n", + __func__, tilesToText( buf, VSIZE(buf), wordBuf ) ); + } +#endif result = wordBuf; *lenp = count; ++gWordCount; @@ -572,9 +582,10 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF ) if ( gDebug ) { wchar_t buf[T2ABUFLEN(count)]; wordBuf[count] = '\0'; - fprintf( stderr, "%s: dropping word (len %d>=%d): %ls\n", - __func__, count, gLimHigh, - tileToAscii( buf, sizeof(buf), wordBuf ) ); + fprintf( stderr, "%s: dropping word (len %d >%d or <%d or " + "dropWord:%d): %ls\n", __func__, count, gLimHigh, + gLimLow, (int)dropWord, + tilesToText( buf, VSIZE(buf), wordBuf ) ); } #endif count = 0; // we'll start over @@ -582,43 +593,43 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF ) } else if ( count >= bufLen ) { // Just drop it... + assert(0); // Fix this -- but need to warn when out of + // memory!!! dropWord = true; // Don't call into the hashtable twice here!! - } else if ( gTableHash.find(byt) != gTableHash.end() ) { - assert( count < bufLen ); - wordBuf[count++] = gTableHash[byt]; - if ( count >= bufLen ) { - dropWord = true; - } - } else if ( gKillIfMissing || !dropWord ) { - wchar_t buf[T2ABUFLEN(count)]; - wordBuf[count] = '\0'; - - tileToAscii( buf, sizeof(buf), wordBuf ); - - if ( gKillIfMissing ) { - ERROR_EXIT( "chr %lc (%d/0x%x) not in map file %s\n" - "last word was %ls\n", - byt, (int)byt, (int)byt, gTableFile, buf ); - } else if ( !dropWord ) { -#ifdef DEBUG - if ( gDebug ) { - fprintf( stderr, "%s: chr %lc (%d) not in map file %s\n" - "dropping partial word %ls\n", __func__, - byt, (int)byt, gTableFile, buf ); + } else { + std::map::iterator iter = gTableHash.find(byt); + if ( iter != gTableHash.end() ) { + assert( count < bufLen ); + wordBuf[count++] = iter->second; + if ( count >= bufLen ) { + dropWord = true; } + } else if ( gKillIfMissing || !dropWord ) { + wchar_t buf[T2ABUFLEN(count)]; + wordBuf[count] = '\0'; + + tilesToText( buf, VSIZE(buf), wordBuf ); + + if ( gKillIfMissing ) { + ERROR_EXIT( "chr %lc (%d/0x%x) not in map file %s\n" + "last word was %ls\n", + byt, (int)byt, (int)byt, gTableFile, buf ); + } else if ( !dropWord ) { +#ifdef DEBUG + if ( gDebug ) { + fprintf( stderr, "%s: chr %lc (%d) not in map file %s\n" + "dropping partial word %ls\n", __func__, + byt, (int)byt, gTableFile, buf ); + } #endif - dropWord = true; + dropWord = true; + } } } - } + } // for -// if ( NULL != result ) { -// char buf[T2ABUFLEN(MAX_WORD_LEN)]; -// fprintf( stderr, "%s returning %s\n", __func__, -// tileToAscii( buf, sizeof(buf), result ) ); -// } return result; } // readOneWord @@ -638,7 +649,7 @@ readFromFile( void ) // during the sort. This seems easier. for ( ; ; ) { if ( !gDone ) { - word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof ); + word = readOneWord( wordBuf, VSIZE(wordBuf), &len, &s_eof ); gDone = NULL == word; } if ( gDone ) { @@ -666,8 +677,8 @@ readFromFile( void ) fprintf( stderr, "%s: words %ls and %ls are the smae or out of order\n", __func__, - tileToAscii( buf1, sizeof(buf1), gCurrentWord ), - tileToAscii( buf2, sizeof(buf2), word ) ); + tilesToText( buf1, VSIZE(buf1), gCurrentWord ), + tilesToText( buf2, VSIZE(buf2), word ) ); } #endif continue; @@ -681,7 +692,7 @@ readFromFile( void ) if ( gDebug ) { wchar_t buf[T2ABUFLEN(MAX_WORD_LEN)]; fprintf( stderr, "gCurrentWord now %ls\n", - tileToAscii( buf, sizeof(buf), gCurrentWord) ); + tilesToText( buf, VSIZE(buf), gCurrentWord) ); } #endif } // readFromFile @@ -694,15 +705,14 @@ firstBeforeSecond( const Letter* lhs, const Letter* rhs ) } static wchar_t* -tileToAscii( wchar_t* out, int outSize, const Letter* in ) +tilesToText( wchar_t* out, int outSize, const Letter* in ) { - // FIX THIS! Pass actual size from callsite - outSize /= sizeof(wchar_t)/sizeof(char); wchar_t tiles[outSize]; - int tilesLen = 1; - tiles[0] = L'['; - wchar_t* orig = out; + int tilesLen = 0; + + tiles[tilesLen++] = L'['; + for ( ; ; ) { Letter ch = *in++; if ( '\0' == ch ) { @@ -710,6 +720,7 @@ tileToAscii( wchar_t* out, int outSize, const Letter* in ) } assert( ch < gRevMap.size() ); *out++ = gRevMap[ch]; + tilesLen += swprintf( &tiles[tilesLen], outSize-tilesLen, L"%d,", ch ); assert( (out - orig) < outSize ); } @@ -783,7 +794,7 @@ printWords( WordList* strings ) std::vector::iterator iter = strings->begin(); while ( iter != strings->end() ) { wchar_t buf[T2ABUFLEN(MAX_WORD_LEN)]; - tileToAscii( buf, sizeof(buf), *iter ); + tilesToText( buf, VSIZE(buf), *iter ); fprintf( stderr, "%ls\n", buf ); ++iter; } @@ -916,7 +927,7 @@ makeTableHash( void ) break; } - fprintf( stderr, "adding %x\n", ch ); + fprintf( stderr, "adding %lc/%x\n", ch, ch ); gRevMap.push_back(ch); if ( ch == 0 ) { // blank @@ -939,6 +950,26 @@ makeTableHash( void ) fclose( TABLEFILE ); } // makeTableHash +static void +printTableHash( void ) +{ + if ( gDebug ) { + std::vector::iterator iter = gRevMap.begin(); + int count = 0; // 0th entry is 0 + while ( iter != gRevMap.end() ) { + wchar_t ch = *iter; + if ( 0 != ch ) { + fprintf( stderr, "%s: gRevMap[%d]: %lc\n", __func__, count, ch ); + fprintf( stderr, "%s: gTableHash[%lc]: %d\n", __func__, ch, + gTableHash[ch] ); + assert( gTableHash[ch] == count ); + } + ++iter; + ++count; + } + } +} + // emitNodes. "input" is $gNodes. From it we write up to // $nBytesPerOutfile to files named $outFileBase0..n, mapping the // letter field down to 5 bits with a hash built from $tableFile. If @@ -1064,6 +1095,9 @@ outputNode( Node node, int nBytes, FILE* outfile ) unsigned int fco = TrieNodeGetFirstChildOffset(node); unsigned int fourthByte = 0; + assert( ((3 == nBytes) && (fco < (1<<17))) + || ((4 == nBytes) && (fco < (1<<24))) ); + if ( nBytes == 4 ) { fourthByte = fco >> 16; if ( fourthByte > 0xFF ) { @@ -1084,7 +1118,7 @@ outputNode( Node node, int nBytes, FILE* outfile ) // | | | // accepting bit ---+ | | // last edge bit ------+ | - // ---- last bit (17th on next node addr)---------+ + // ---- last bit (17th of next node addr)---------+ // The four-byte format adds a byte at the right end for // addressing, but removes the extra bit (5) in order to let the @@ -1246,13 +1280,13 @@ parseARGV( int argc, char** argv, const char** inFileName ) if ( !!enc ) { if ( !strcasecmp( enc, "UTF-8" ) ) { - gIsMultibyte = true; +// gIsMultibyte = true; } else if ( !strcasecmp( enc, "iso-8859-1" ) ) { - gIsMultibyte = false; +// gIsMultibyte = false; } else if ( !strcasecmp( enc, "iso-latin-1" ) ) { - gIsMultibyte = false; +// gIsMultibyte = false; } else if ( !strcasecmp( enc, "ISO-8859-2" ) ) { - gIsMultibyte = false; +// gIsMultibyte = false; } else { ERROR_EXIT( "%s: unknown encoding %s", __func__, enc ); }