allocate 4x the memory to biggest Polish dictionary fits; fix

backwards map declaration (these should be moved to trunk). Always assume utf8 input; fix size calculations.
2024-12-27 09:58:45 +01:00 · 2009-03-22 23:47:12 +00:00 · 2009-03-22 23:47:12 +00:00 · b7f32f28be
commit b7f32f28be
parent 5b78c4183a
1 changed files with 93 additions and 59 deletions
--- a/dawg/dict2dawg.cpp
+++ b/dawg/dict2dawg.cpp
@ -78,7 +78,7 @@ static void (*gReadWordProc)(void) = NULL;
 static NodeList gNodes;       // final array of nodes
 static unsigned int gNBytesPerOutfile = 0xFFFFFFFF;
 static char* gTableFile = NULL;
-static bool gIsMultibyte = false;
+static bool gIsMultibyte = true; // always true
 static const char* gEncoding = NULL;
 static char* gOutFileBase = NULL;
 static char* gStartNodeOut = NULL;
@ -91,7 +91,7 @@ static const char* gLang = NULL;
 static char* gBytesPerNodeFile = NULL;        // where to write whether node
                                       // size 3 or 4
 int gWordCount = 0;
-std::map<Letter,wchar_t> gTableHash;
+std::map<wchar_t,Letter> gTableHash;
 int gBlankIndex;
 std::vector<wchar_t> gRevMap;
 #ifdef DEBUG
@ -107,17 +107,19 @@ int gLimHigh = MAX_WORD_LEN;


 // OWL is 1.7M
-#define MAX_POOL_SIZE (10 * 0x100000)
+#define MAX_POOL_SIZE (10 * 0x100000 * sizeof(wchar_t))
 #define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ );
+#define VSIZE(a) (sizeof(a)/sizeof(a[0]))

 static char* parseARGV( int argc, char** argv, const char** inFileName );
 static void usage( const char* name );
 static void error_exit( int line, const char* fmt, ... );
 static void makeTableHash( void );
+static void printTableHash( void );
 static WordList* parseAndSort( void );
 static void printWords( WordList* strings );
 static bool firstBeforeSecond( const Letter* lhs, const Letter* rhs );
-static wchar_t* tileToAscii( wchar_t* out, int outSize, const Letter* in );
+static wchar_t* tilesToText( wchar_t* out, int outLen, const Letter* in );
 static int buildNode( int depth );
 static void TrieNodeSetIsLastSibling( Node* nodeR, bool isLastSibling );
 static int addNodes( NodeList& newedgesR );
@ -178,6 +180,7 @@ main( int argc, char** argv )
    }
    
    makeTableHash();
+    printTableHash();

    // Do I need this stupid thing?  Better to move the first row to
    // the front of the array and patch everything else.  Or fix the
@ -453,7 +456,7 @@ readFromSortedArray( void )
            if ( gDebug ) {
                wchar_t buf[T2ABUFLEN(MAX_WORD_LEN)];
                fprintf( stderr, "%s: got word: %ls\n", __func__,
-                         tileToAscii( buf, sizeof(buf), word ) );
+                         tilesToText( buf, VSIZE(buf), word ) );
            }
 #endif
        }
@ -478,8 +481,8 @@ readFromSortedArray( void )
                fprintf( stderr,
                         "%s: words %ls and %ls are the same or out of order\n",
                         __func__, 
-                         tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
-                         tileToAscii( buf2, sizeof(buf2), word ) );
+                         tilesToText( buf1, VSIZE(buf1), gCurrentWord ),
+                         tilesToText( buf2, VSIZE(buf2), word ) );
            }
 #endif
            continue;
@ -494,7 +497,7 @@ readFromSortedArray( void )
    if ( gDebug ) {
        wchar_t buf[T2ABUFLEN(MAX_WORD_LEN)];
        fprintf( stderr, "gCurrentWord now %ls\n", 
-                 tileToAscii( buf, sizeof(buf), gCurrentWord) );
+                 tilesToText( buf, VSIZE(buf), gCurrentWord) );
    }
 #endif
 } // readFromSortedArray
@ -536,7 +539,7 @@ getWideChar( FILE* file )
 } // getWideChar

 static Letter*
-readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
+readOneWord( Letter* wordBuf, const int bufLen, int* lenp, bool* gotEOF )
 {
    Letter* result = NULL;
    int count = 0;
@ -548,7 +551,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
    // return it.  If no, start over ONLY IF the terminator was not
    // EOF.
    for ( ; ; ) {
-        wchar_t byt = gIsMultibyte? getWideChar( gInFile ) : getc( gInFile );
+        wchar_t byt = getWideChar( gInFile );

        // EOF is special: we don't try for another word even if
        // dropWord is true; we must leave now.
@ -560,6 +563,13 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
            if ( !dropWord && (count >= gLimLow) && (count <= gLimHigh) ) {
                assert( count < bufLen );
                wordBuf[count] = '\0';
+#ifdef DEBUG
+                if ( gDebug ) {
+                    wchar_t buf[T2ABUFLEN(count)];
+                    fprintf( stderr, "%s: adding word: %ls\n", 
+                             __func__, tilesToText( buf, VSIZE(buf), wordBuf ) );
+                }
+#endif
                result = wordBuf;
                *lenp = count;
                ++gWordCount;
@ -572,9 +582,10 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
            if ( gDebug ) {
                wchar_t buf[T2ABUFLEN(count)];
                wordBuf[count] = '\0';
-                fprintf( stderr, "%s: dropping word (len %d>=%d): %ls\n", 
-                         __func__, count, gLimHigh, 
-                         tileToAscii( buf, sizeof(buf), wordBuf ) );
+                fprintf( stderr, "%s: dropping word (len %d >%d or <%d or "
+                         "dropWord:%d): %ls\n", __func__, count, gLimHigh, 
+                         gLimLow, (int)dropWord,
+                         tilesToText( buf, VSIZE(buf), wordBuf ) );
            }
 #endif
            count = 0;  // we'll start over
@ -582,43 +593,43 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )

        } else if ( count >= bufLen ) {
            // Just drop it...
+            assert(0);          // Fix this -- but need to warn when out of
+                                // memory!!!
            dropWord = true;

            // Don't call into the hashtable twice here!!
-        } else if ( gTableHash.find(byt) != gTableHash.end() ) {
-            assert( count < bufLen );
-            wordBuf[count++] = gTableHash[byt];
-            if ( count >= bufLen ) {
-                dropWord = true;
-            }
-        } else if ( gKillIfMissing || !dropWord ) {
-            wchar_t buf[T2ABUFLEN(count)];
-            wordBuf[count] = '\0';
-
-            tileToAscii( buf, sizeof(buf), wordBuf );
-
-            if ( gKillIfMissing ) {
-                ERROR_EXIT( "chr %lc (%d/0x%x) not in map file %s\n"
-                            "last word was %ls\n",
-                            byt, (int)byt, (int)byt, gTableFile, buf );
-            } else if ( !dropWord ) {
-#ifdef DEBUG
-                if ( gDebug ) {
-                    fprintf( stderr, "%s: chr %lc (%d) not in map file %s\n"
-                             "dropping partial word %ls\n", __func__,
-                             byt, (int)byt, gTableFile, buf );
+        } else {
+            std::map<wchar_t,Letter>::iterator iter = gTableHash.find(byt);
+            if ( iter != gTableHash.end() ) {
+                assert( count < bufLen );
+                wordBuf[count++] = iter->second;
+                if ( count >= bufLen ) {
+                    dropWord = true;
                }
+            } else if ( gKillIfMissing || !dropWord ) {
+                wchar_t buf[T2ABUFLEN(count)];
+                wordBuf[count] = '\0';
+
+                tilesToText( buf, VSIZE(buf), wordBuf );
+
+                if ( gKillIfMissing ) {
+                    ERROR_EXIT( "chr %lc (%d/0x%x) not in map file %s\n"
+                                "last word was %ls\n",
+                                byt, (int)byt, (int)byt, gTableFile, buf );
+                } else if ( !dropWord ) {
+#ifdef DEBUG
+                    if ( gDebug ) {
+                        fprintf( stderr, "%s: chr %lc (%d) not in map file %s\n"
+                                 "dropping partial word %ls\n", __func__,
+                                 byt, (int)byt, gTableFile, buf );
+                    }
 #endif
-                dropWord = true;
+                    dropWord = true;
+                }
            }
        }
-    }
+    } // for

-//     if ( NULL != result ) {
-//         char buf[T2ABUFLEN(MAX_WORD_LEN)];
-//         fprintf( stderr, "%s returning %s\n", __func__,
-//                  tileToAscii( buf, sizeof(buf), result ) );
-//     }
    return result;
 } // readOneWord

@ -638,7 +649,7 @@ readFromFile( void )
    // during the sort.  This seems easier.
    for ( ; ; ) {
        if ( !gDone ) {
-            word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof );
+            word = readOneWord( wordBuf, VSIZE(wordBuf), &len, &s_eof );
            gDone = NULL == word;
        }
        if ( gDone ) {
@ -666,8 +677,8 @@ readFromFile( void )
                fprintf( stderr,
                         "%s: words %ls and %ls are the smae or out of order\n",
                         __func__, 
-                         tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
-                         tileToAscii( buf2, sizeof(buf2), word ) );
+                         tilesToText( buf1, VSIZE(buf1), gCurrentWord ),
+                         tilesToText( buf2, VSIZE(buf2), word ) );
            }
 #endif
            continue;
@ -681,7 +692,7 @@ readFromFile( void )
    if ( gDebug ) {
        wchar_t buf[T2ABUFLEN(MAX_WORD_LEN)];
        fprintf( stderr, "gCurrentWord now %ls\n", 
-                 tileToAscii( buf, sizeof(buf), gCurrentWord) );
+                 tilesToText( buf, VSIZE(buf), gCurrentWord) );
    }
 #endif
 } // readFromFile
@ -694,15 +705,14 @@ firstBeforeSecond( const Letter* lhs, const Letter* rhs )
 }

 static wchar_t*
-tileToAscii( wchar_t* out, int outSize, const Letter* in )
+tilesToText( wchar_t* out, int outSize, const Letter* in )
 {
-    // FIX THIS!  Pass actual size from callsite
-    outSize /= sizeof(wchar_t)/sizeof(char);
    wchar_t tiles[outSize];
-    int tilesLen = 1;
-    tiles[0] = L'[';
-
    wchar_t* orig = out;
+    int tilesLen = 0;
+
+    tiles[tilesLen++] = L'[';
+
    for ( ; ; ) {
        Letter ch = *in++;
        if ( '\0' == ch ) {
@ -710,6 +720,7 @@ tileToAscii( wchar_t* out, int outSize, const Letter* in )
        }
        assert( ch < gRevMap.size() );
        *out++ = gRevMap[ch];
+
        tilesLen += swprintf( &tiles[tilesLen], outSize-tilesLen, L"%d,", ch );
        assert( (out - orig) < outSize );
    }
@ -783,7 +794,7 @@ printWords( WordList* strings )
    std::vector<Letter*>::iterator iter = strings->begin();
    while ( iter != strings->end() ) {
        wchar_t buf[T2ABUFLEN(MAX_WORD_LEN)];
-        tileToAscii( buf, sizeof(buf), *iter );
+        tilesToText( buf, VSIZE(buf), *iter );
        fprintf( stderr, "%ls\n", buf );
        ++iter;
    }
@ -916,7 +927,7 @@ makeTableHash( void )
            break;
        }

-        fprintf( stderr, "adding %x\n", ch );
+        fprintf( stderr, "adding %lc/%x\n", ch, ch );
        gRevMap.push_back(ch);

        if ( ch == 0 ) {	// blank
@ -939,6 +950,26 @@ makeTableHash( void )
    fclose( TABLEFILE );
 } // makeTableHash

+static void
+printTableHash( void )
+{
+    if ( gDebug ) {
+        std::vector<wchar_t>::iterator iter = gRevMap.begin();
+        int count = 0;              // 0th entry is 0
+        while ( iter != gRevMap.end() ) {
+            wchar_t ch = *iter;
+            if ( 0 != ch ) {
+                fprintf( stderr, "%s: gRevMap[%d]: %lc\n", __func__, count, ch );
+                fprintf( stderr, "%s: gTableHash[%lc]: %d\n", __func__, ch, 
+                         gTableHash[ch] );
+                assert( gTableHash[ch] == count );
+            }
+            ++iter; 
+            ++count;
+        }
+    }
+}
+
 // emitNodes. "input" is $gNodes.  From it we write up to
 // $nBytesPerOutfile to files named $outFileBase0..n, mapping the
 // letter field down to 5 bits with a hash built from $tableFile.  If
@ -1064,6 +1095,9 @@ outputNode( Node node, int nBytes, FILE* outfile )
    unsigned int fco = TrieNodeGetFirstChildOffset(node);
    unsigned int fourthByte = 0;

+    assert( ((3 == nBytes) && (fco < (1<<17)))
+            || ((4 == nBytes) && (fco < (1<<24))) );
+
    if ( nBytes == 4 ) {
        fourthByte = fco >> 16;
        if ( fourthByte > 0xFF ) {
@ -1084,7 +1118,7 @@ outputNode( Node node, int nBytes, FILE* outfile )
    //                                                  |  |  |    
    //                                accepting bit  ---+  |  |
    //                                 last edge bit ------+  |
-    //         ---- last bit (17th on next node addr)---------+
+    //         ---- last bit (17th of next node addr)---------+

    // The four-byte format adds a byte at the right end for
    // addressing, but removes the extra bit (5) in order to let the
@ -1246,13 +1280,13 @@ parseARGV( int argc, char** argv, const char** inFileName )

    if ( !!enc ) {
        if ( !strcasecmp( enc, "UTF-8" ) ) {
-            gIsMultibyte = true;
+//             gIsMultibyte = true;
        } else if ( !strcasecmp( enc, "iso-8859-1" ) ) {
-            gIsMultibyte = false;
+//             gIsMultibyte = false;
        } else if ( !strcasecmp( enc, "iso-latin-1" ) ) {
-            gIsMultibyte = false;
+//             gIsMultibyte = false;
        } else if ( !strcasecmp( enc, "ISO-8859-2" ) ) {
-            gIsMultibyte = false;
+//             gIsMultibyte = false;
        } else {
            ERROR_EXIT( "%s: unknown encoding %s", __func__, enc );
        }