For the already-sorted case, read words from the file on an as-needed basis rather than building a vector to hold them.
This commit is contained in:
ehouse 2006-04-14 05:23:30 +00:00
parent 8f909cd3a7
commit 3df1e461e4

View file

@ -56,19 +56,22 @@ typedef std::vector<char*> WordList;
#define MAX_WORD_LEN 15 #define MAX_WORD_LEN 15
int gFirstDiff; int gFirstDiff;
char* gCurrentWord = "";
int gCurrentWordLen; static char gCurrentWordBuf[MAX_WORD_LEN+1] = { '\0' };
// this will never change for non-sort case
static char* gCurrentWord = gCurrentWordBuf;
static int gCurrentWordLen;
char* gCurWord = NULL; // save so can check for sortedness char* gCurWord = NULL; // save so can check for sortedness
bool gDone = false; bool gDone = false;
WordList* gInputStrings;
static int gNextWordIndex; static int gNextWordIndex;
bool gNeedsSort = true; static void (*gReadWordProc)(void) = NULL;
NodeList gNodes; // final array of nodes NodeList gNodes; // final array of nodes
unsigned int gNBytesPerOutfile = 0xFFFFFFFF; unsigned int gNBytesPerOutfile = 0xFFFFFFFF;
char* gTableFile = NULL; char* gTableFile = NULL;
char* gOutFileBase = NULL; char* gOutFileBase = NULL;
char* gStartNodeOut = NULL; char* gStartNodeOut = NULL;
char* gInFileName = NULL; static FILE* gInFile = NULL;
bool gKillIfMissing = true; bool gKillIfMissing = true;
char gTermChar = '\n'; char gTermChar = '\n';
bool gDumpText = false; // dump the dict as text after? bool gDumpText = false; // dump the dict as text after?
@ -91,14 +94,13 @@ bool gUseUnicode;
#define MAX_POOL_SIZE 3000000 #define MAX_POOL_SIZE 3000000
#define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ ); #define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ );
static char* parseARGV( int argc, char** argv ); static char* parseARGV( int argc, char** argv, const char** inFileName );
static void usage( const char* name ); static void usage( const char* name );
static void error_exit( int line, const char* fmt, ... ); static void error_exit( int line, const char* fmt, ... );
static char parsechar( const char* in ); static char parsechar( const char* in );
static void makeTableHash( void ); static void makeTableHash( void );
static WordList* parseAndSort( FILE* file ); static WordList* parseAndSort( FILE* file );
static void printWords( WordList* strings ); static void printWords( WordList* strings );
static void readNextWord( void );
static bool firstBeforeSecond( const char* lhs, const char* rhs ); static bool firstBeforeSecond( const char* lhs, const char* rhs );
static char* tileToAscii( char* out, int outSize, const char* in ); static char* tileToAscii( char* out, int outSize, const char* in );
static int buildNode( int depth ); static int buildNode( int depth );
@ -124,34 +126,21 @@ static void writeOutStartNode( const char* startNodeOut,
static void emitNodes( unsigned int nBytesPerOutfile, const char* outFileBase ); static void emitNodes( unsigned int nBytesPerOutfile, const char* outFileBase );
static void outputNode( Node node, int nBytes, FILE* outfile ); static void outputNode( Node node, int nBytes, FILE* outfile );
static void printOneLevel( int index, char* str, int curlen ); static void printOneLevel( int index, char* str, int curlen );
static void readFromSortedArray( void );
int int
main( int argc, char** argv ) main( int argc, char** argv )
{ {
if ( NULL == parseARGV( argc, argv ) ) { gReadWordProc = readFromSortedArray;
const char* inFileName;
if ( NULL == parseARGV( argc, argv, &inFileName ) ) {
usage(argv[0]); usage(argv[0]);
exit(1); exit(1);
} }
makeTableHash(); makeTableHash();
FILE* infile;
if ( gInFileName ) {
infile = fopen( gInFileName, "r" );
} else {
infile = stdin;
}
gInputStrings = parseAndSort( infile );
gNextWordIndex = 0;
if ( gInFileName ) {
fclose( infile );
}
#ifdef DEBUG
if ( gDebug ) {
printWords( gInputStrings );
}
#endif
// Do I need this stupid thing? Better to move the first row to // Do I need this stupid thing? Better to move the first row to
// the front of the array and patch everything else. Or fix the // the front of the array and patch everything else. Or fix the
// non-palm dictionary format to include the offset of the first // non-palm dictionary format to include the offset of the first
@ -160,8 +149,14 @@ main( int argc, char** argv )
Node dummyNode = (Node)0xFFFFFFFF; Node dummyNode = (Node)0xFFFFFFFF;
assert( sizeof(Node) == 4 ); assert( sizeof(Node) == 4 );
gNodes.push_back(dummyNode); gNodes.push_back(dummyNode);
if ( NULL == inFileName ) {
gInFile = stdin;
} else {
gInFile = fopen( inFileName, "r" );
}
readNextWord(); (*gReadWordProc)();
int firstRootChildOffset = buildNode(0); int firstRootChildOffset = buildNode(0);
moveTopToFront( &firstRootChildOffset ); moveTopToFront( &firstRootChildOffset );
@ -201,6 +196,11 @@ main( int argc, char** argv )
fclose( OFILE ); fclose( OFILE );
} }
fprintf( stderr, "Used %d per node.\n", gNBytesPerNode ); fprintf( stderr, "Used %d per node.\n", gNBytesPerNode );
if ( NULL != inFileName ) {
fclose( gInFile );
}
} /* main */ } /* main */
// We now have an array of nodes with the last subarray being the // We now have an array of nodes with the last subarray being the
@ -264,7 +264,7 @@ buildNode( int depth )
// End of word reached. If the next word isn't a continuation // End of word reached. If the next word isn't a continuation
// of the current one, then we've reached the bottom of the // of the current one, then we've reached the bottom of the
// recursion tree. // recursion tree.
readNextWord(); (*gReadWordProc)();
if (gFirstDiff < depth || gDone) { if (gFirstDiff < depth || gDone) {
return 0; return 0;
} }
@ -373,14 +373,28 @@ registerSubArray( NodeList& edgesR, int nodeLoc )
} // registerSubArray } // registerSubArray
static void static void
readNextWord( void ) readFromSortedArray( void )
{ {
// The first time we need a new word, we read 'em all in.
static WordList* sInputStrings = NULL; // we'll just let this leak
if ( sInputStrings == NULL ) {
sInputStrings = parseAndSort( gInFile );
gNextWordIndex = 0;
#ifdef DEBUG
if ( gDebug ) {
printWords( sInputStrings );
}
#endif
}
char* word = ""; char* word = "";
if ( !gDone ) { if ( !gDone ) {
gDone = gNextWordIndex == gInputStrings->size(); gDone = gNextWordIndex == sInputStrings->size();
if ( !gDone ) { if ( !gDone ) {
word = gInputStrings->at(gNextWordIndex++); word = sInputStrings->at(gNextWordIndex++);
#ifdef DEBUG #ifdef DEBUG
} else if ( gDebug ) { } else if ( gDebug ) {
fprintf( stderr, "gDone set to true\n" ); fprintf( stderr, "gDone set to true\n" );
@ -422,7 +436,115 @@ readNextWord( void )
tileToAscii( buf, sizeof(buf), gCurrentWord) ); tileToAscii( buf, sizeof(buf), gCurrentWord) );
} }
#endif #endif
} // readNextWord } // readFromSortedArray
/* Read the next word from gInFile, translating each input byte to its
 * tile value through gTableHash.
 *
 * wordBuf: caller-supplied storage for the translated, NUL-terminated word
 * bufLen:  size of wordBuf in bytes, including room for the terminator
 * lenp:    receives the word's length (terminator excluded); written only
 *          when a word is returned
 * gotEOF:  set to true when EOF ended the read, false when gTermChar did
 *
 * Returns wordBuf when a word was read, or NULL once the input holds no
 * further words.  A byte that is not in the table is fatal when
 * gKillIfMissing is set; otherwise the word containing it is dropped and
 * reading continues with the next word.  Blank words (two terminators in
 * a row) are skipped rather than being reported as end of input.  Every
 * word returned increments gWordCount.
 */
static char*
readOneWord( char* wordBuf, int bufLen, int* lenp, bool* gotEOF )
{
    char* result = NULL;
    int count = 0;
    bool dropWord = false;

    // for each byte
    for ( ; ; ) {
        int byt = getc( gInFile );

        if ( byt == EOF || byt == gTermChar ) {
            *gotEOF = byt == EOF;

            // Dropping a word zeroes count and blocks further stores, so
            // a non-zero count always means a complete, wanted word.
            if ( count != 0 ) {
                wordBuf[count] = '\0';
                result = wordBuf;
                *lenp = count;
                ++gWordCount;
                break;          // we've finished a word
            }

            // EOF is special: we don't try for another word even if
            // dropWord is true; we must leave now.
            if ( *gotEOF ) {
                break;
            }

            // Terminator after a blank or dropped word: start over on
            // the next word.  Without this reset a single bad word would
            // cause everything up to EOF to be discarded.
            dropWord = false;

            // TODO: Don't call into the hashtable twice here!!
        } else if ( gTableHash.find(byt) != gTableHash.end() ) {
            if ( !dropWord ) {
                // Need room for this tile plus the terminator.
                if ( count + 1 >= bufLen ) {
                    char buf[MAX_WORD_LEN+1];
                    wordBuf[count] = '\0'; // terminate before printing
                    ERROR_EXIT( "word starting \"%s\" too long",
                                tileToAscii( buf, sizeof(buf), wordBuf ));
                }
                wordBuf[count++] = (char)gTableHash[byt];
            }
        } else if ( gKillIfMissing ) {
            char buf[MAX_WORD_LEN+1];
            wordBuf[count] = '\0';  // terminate the partial word for the message
            ERROR_EXIT( "chr %c (%d) not in map file %s\n"
                        "last word was %s\n",
                        byt, (int)byt, gTableFile,
                        tileToAscii( buf, sizeof(buf), wordBuf ) );
        } else {
            dropWord = true;
            count = 0;          // lose anything we already have
        }
    }

    return result;
} // readOneWord
/* Word-reader used when the input is already sorted: pulls one word at a
 * time from the input file instead of loading the whole list.
 *
 * On return:
 *   gDone           is true once no further word is available
 *   gFirstDiff      is the number of leading tiles the new word shares
 *                   with the previous one
 *   gCurrentWordBuf holds the new word ("" when done), and
 *   gCurrentWordLen its length
 *
 * Exits via ERROR_EXIT if the new word does not sort after the previous
 * one, as judged by firstBeforeSecond().
 */
static void
readFromFile( void )
{
    char wordBuf[MAX_WORD_LEN+1];
    static bool s_eof = false;  // remembers EOF across calls
    const char* word = "";
    int len = 0;                // stays 0 unless readOneWord returns a word

    gDone = s_eof;

    if ( !gDone ) {
        const char* next = readOneWord( wordBuf, sizeof(wordBuf),
                                        &len, &s_eof );
        gDone = NULL == next;
        if ( !gDone ) {
            word = next;
        }
    }

    const int wordLen = (int)strlen( word );

    // Only the shorter of the two words can be common to both.
    if ( gCurrentWordLen < len ) {
        len = gCurrentWordLen;
    }

    // Test the bound before touching either string.
    int numCommonLetters = 0;
    while ( numCommonLetters < len
            && gCurrentWord[numCommonLetters] == word[numCommonLetters] ) {
        ++numCommonLetters;
    }

    gFirstDiff = numCommonLetters;
    if ( (gCurrentWordLen > 0) && (wordLen > 0)
         && !firstBeforeSecond( gCurrentWord, word ) ) {
        char buf1[MAX_WORD_LEN+1];
        char buf2[MAX_WORD_LEN+1];
        ERROR_EXIT( "words %s and %s are out of order\n",
                    tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
                    tileToAscii( buf2, sizeof(buf2), word ) );
    }

    gCurrentWordLen = wordLen;
    strncpy( gCurrentWordBuf, word, sizeof(gCurrentWordBuf) );
    // strncpy does not terminate when the source fills the buffer.
    gCurrentWordBuf[sizeof(gCurrentWordBuf) - 1] = '\0';

#ifdef DEBUG
    if ( gDebug ) {
        char buf[MAX_WORD_LEN+1];
        fprintf( stderr, "gCurrentWord now %s\n",
                 tileToAscii( buf, sizeof(buf), gCurrentWord) );
    }
#endif
} // readFromFile
static bool static bool
firstBeforeSecond( const char* lhs, const char* rhs ) firstBeforeSecond( const char* lhs, const char* rhs )
@ -456,75 +578,37 @@ parseAndSort( FILE* infile )
// allocate storage for the actual chars. wordlist's char* // allocate storage for the actual chars. wordlist's char*
// elements will point into this. It'll leak. So what. // elements will point into this. It'll leak. So what.
char* str = (char*)malloc( MAX_POOL_SIZE ); int memleft = MAX_POOL_SIZE;
assert( NULL != str ); char* str = (char*)malloc( memleft );
if ( NULL == str ) {
std::string word; ERROR_EXIT( "can't allocate main string storage" );
#ifdef DEBUG }
std::string asciiWord;
#endif
bool eof = false;
for ( ; ; ) { for ( ; ; ) {
int len;
char buf[MAX_WORD_LEN+1];
char* word = readOneWord( str, memleft, &len, &eof );
bool dropWord = false; if ( NULL == word ) {
word.clear(); break;
}
// for each byte wordlist->push_back( str );
for ( ; ; ) { ++len; // include null byte
int byt = getc( infile ); str += len;
memleft -= len;
++gWordCount;
if ( byt == EOF ) { if ( eof ) {
goto done; break;
} else if ( byt == gTermChar ) { }
if ( !dropWord ) { if ( memleft <= 0 ) {
int len = word.length() + 1; ERROR_EXIT( "no memory left\n" );
memcpy( str, word.c_str(), len);
wordlist->push_back( str );
str += len;
++gWordCount;
#ifdef DEBUG
if ( gDebug ) {
char buf[MAX_WORD_LEN+1];
fprintf( stderr, "loaded %s\n", asciiWord.c_str() );
}
#endif
}
#ifdef DEBUG
asciiWord.clear();
#endif
break;
// Don't call into the hashtable twice here!!
} else if ( gTableHash.find(byt) != gTableHash.end() ) {
if ( !dropWord ) {
#if defined DEBUG && defined SEVERE_DEBUG
if ( gDebug ) {
fprintf( stderr, "adding %d for %c\n",
gTableHash[byt], (char)byt );
}
#endif
word += (char)gTableHash[byt];
assert( word.size() <= MAX_WORD_LEN );
#ifdef DEBUG
if ( gKillIfMissing ) {
asciiWord += byt;
}
#endif
}
} else if ( gKillIfMissing ) {
char buf[MAX_WORD_LEN+1];
ERROR_EXIT( "chr %c (%d) not in map file %s\n"
"last word was %s\n",
byt, (int)byt, gTableFile,
tileToAscii( buf, sizeof(buf), word.c_str() ) );
} else {
dropWord = true;
word.clear(); // lose anything we already have
}
} }
} }
done:
if ( gNeedsSort && (gWordCount > 1) ) { if ( gWordCount > 1 ) {
#ifdef DEBUG #ifdef DEBUG
if ( gDebug ) { if ( gDebug ) {
fprintf( stderr, "starting sort...\n" ); fprintf( stderr, "starting sort...\n" );
@ -928,8 +1012,9 @@ error_exit( int line, const char* fmt, ... )
} }
static char* static char*
parseARGV( int argc, char** argv ) parseARGV( int argc, char** argv, const char** inFileName )
{ {
*inFileName = NULL;
int index = 1; int index = 1;
while ( index < argc ) { while ( index < argc ) {
@ -947,7 +1032,7 @@ parseARGV( int argc, char** argv )
} else if ( 0 == strcmp( arg, "-sn" ) ) { } else if ( 0 == strcmp( arg, "-sn" ) ) {
gStartNodeOut = argv[index++]; gStartNodeOut = argv[index++];
} else if ( 0 == strcmp( arg, "-if" ) ) { } else if ( 0 == strcmp( arg, "-if" ) ) {
gInFileName = argv[index++]; *inFileName = argv[index++];
} else if ( 0 == strcmp( arg, "-r" ) ) { } else if ( 0 == strcmp( arg, "-r" ) ) {
gKillIfMissing = false; gKillIfMissing = false;
} else if ( 0 == strcmp( arg, "-k" ) ) { } else if ( 0 == strcmp( arg, "-k" ) ) {
@ -957,7 +1042,7 @@ parseARGV( int argc, char** argv )
} else if ( 0 == strcmp( arg, "-dump" ) ) { } else if ( 0 == strcmp( arg, "-dump" ) ) {
gDumpText = true; gDumpText = true;
} else if ( 0 == strcmp( arg, "-nosort" ) ) { } else if ( 0 == strcmp( arg, "-nosort" ) ) {
gNeedsSort = false; gReadWordProc = readFromFile;
} else if ( 0 == strcmp( arg, "-wc" ) ) { } else if ( 0 == strcmp( arg, "-wc" ) ) {
gCountFile = argv[index++]; gCountFile = argv[index++];
} else if ( 0 == strcmp( arg, "-ns" ) ) { } else if ( 0 == strcmp( arg, "-ns" ) ) {