Add support for Russian. So that Russian text can be processed on systems without setting LANG=ru_RU.CP1251, dict2dawg is modified to skip duplicate words and words outside the specified length range. All info.txt files are updated for the new scheme (which also involves a change to byod.cgi, which is not kept on SourceForge).

This commit is contained in:
ehouse 2007-02-17 17:06:05 +00:00
parent 326ecb00f4
commit 3bb2fb018f
15 changed files with 366 additions and 181 deletions

View file

@ -18,23 +18,21 @@ LANGCODE:da_DK
# deal with DOS files # deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' | LANGFILTER: tr -d '\r'
# uppercase all # uppercase all
LANGFILTER_POSTCLIP: | tr [a-zĺćř] [A-ZĹĆŘ] LANGFILTER: | tr [a-zåæø] [A-ZÅÆØ]
# no words not containing a vowel # no words not containing a vowel
LANGFILTER_POSTCLIP: | grep '[AEIOUYĹĆŘ]' LANGFILTER: | grep '[AEIOUYÅÆØ]'
# none with illegal chars # none with illegal chars
LANGFILTER_POSTCLIP: | grep '^[A-PR-VX-ZĹĆŘ]\+$' LANGFILTER: | grep '^[A-PR-VX-ZÅÆØ]\+$'
# remove duplicates # remove duplicates
LANGFILTER_POSTCLIP: | sort -u LANGFILTER: | sort -u
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
# Until I can figure out how to force sort to use a locale's collation # Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must # rules we can't trust sort in the filtering rules above and so must
# leave the sorting work to dict2dawg.pl. # leave the sorting work to dict2dawg.pl.
NEEDSSORT:true D2DARGS: -r -term 10
LANGINFO: <p>Danish uses all English letters except Q and W. There LANGINFO: <p>Danish uses all English letters except Q and W. There
LANGINFO: are three non-English letters: 'Å', 'Æ' and 'Ø'. </p> LANGINFO: are three non-English letters: 'Å', 'Æ' and 'Ø'. </p>

View file

@ -18,22 +18,19 @@ LANGCODE:nl_NL
# deal with DOS files # deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' | LANGFILTER: tr -d '\r'
# uppercase all # uppercase all
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z] LANGFILTER: | tr [a-z] [A-Z]
# no words not containing a vowel # no words not containing a vowel
LANGFILTER_POSTCLIP: | grep '[AEIOU]' LANGFILTER: | grep '[AEIOU]'
# none with illegal chars # none with illegal chars
LANGFILTER_POSTCLIP: | grep '^[A-Z]\+$' LANGFILTER: | grep '^[A-Z]\+$'
LANGFILTER_POSTCLIP: | sort -u LANGFILTER: | sort -u
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
# Until I can figure out how to force sort to use a locale's collation # Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must # rules we can't trust sort in the filtering rules above and so must
# leave the sorting work to dict2dawg.pl. # leave the sorting work to dict2dawg.pl.
D2DARGS: -r -term 10
NEEDSSORT:false
LANGINFO: <p>Dutch has the same 26 letters as English, though of LANGINFO: <p>Dutch has the same 26 letters as English, though of
LANGINFO: course the counts and values are different. Filtering rules LANGINFO: course the counts and values are different. Filtering rules

View file

@ -1,4 +1,4 @@
# -*-mode: Makefile -*- # -*-mode: Makefile; compile-command: "make -f Makefile.BasEnglish"; -*-
# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. # Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
# #
# This program is free software; you can redistribute it and/or # This program is free software; you can redistribute it and/or
@ -17,7 +17,7 @@
XWLANG=BasEnglish XWLANG=BasEnglish
LANGCODE=en_US LANGCODE=en_US
#NEWDAWG=1 DICT2DAWGARGS = -r -nosort
TARGET_TYPE ?= PALM TARGET_TYPE ?= PALM

View file

@ -17,16 +17,16 @@
LANGCODE:en_US LANGCODE:en_US
# deal with DOS files # deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' | LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-z] [A-Z]
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z] LANGFILTER: | grep '^[A-Z]*$'
LANGFILTER_POSTCLIP: | grep '^[A-Z]*$' LANGFILTER: | sort -u
LANGFILTER_POSTCLIP: | sort -u
# We can trust sort (above) to do the right thing since there's no # We can trust sort (above) to do the right thing since there's no
# high ascii. dict2dawg.pl is much faster if I can trust that its # high ascii. dict2dawg.pl is much faster if I can trust that its
# input is in sorted order. # input is in sorted order.
NEEDSSORT:false D2DARGS: -nosort -term 10
LANGINFO: <p>English dictionaries can contain words with any of the 26 LANGINFO: <p>English dictionaries can contain words with any of the 26
LANGINFO: letters you think of as making up the alphabet: A-Z. At LANGINFO: letters you think of as making up the alphabet: A-Z. At

View file

@ -17,16 +17,14 @@
LANGCODE:fr_FR LANGCODE:fr_FR
# deal with DOS files # deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' | LANGFILTER: tr -d '\r'
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z] LANGFILTER: | tr [a-z] [A-Z]
LANGFILTER_POSTCLIP: | grep '^[A-Z]*$' LANGFILTER: | grep '^[A-Z]*$'
LANGFILTER_POSTCLIP: | tr '\n' '\000' LANGFILTER: | tr '\n' '\000'
LANGFILTER_POSTCLIP: | sort -u -z LANGFILTER: | sort -u -z
NEEDSSORT:false
D2DARGS: -r -nosort -term 0
LANGINFO: <p>At this point French is getting treated the same as LANGINFO: <p>At this point French is getting treated the same as
LANGINFO: English. But I think I should be transforming accented LANGINFO: English. But I think I should be transforming accented

View file

@ -16,26 +16,21 @@
LANGCODE:de_DE LANGCODE:de_DE
# deal with DOS files # deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' | LANGFILTER: tr -d '\r'
# substitute for sharfes-s # substitute for sharfes-s
LANGFILTER_PRECLIP: sed -e 's/ß/SS/g' | LANGFILTER: | sed -e 's/ß/SS/g'
# uppercase all # uppercase all
LANGFILTER_POSTCLIP: | tr [a-zäöü] [A-ZÄÖÜ] LANGFILTER: | tr [a-zäöü] [A-ZÄÖÜ]
# no words not containing a vowel # no words not containing a vowel
LANGFILTER_POSTCLIP: | grep '[AEIOUÄÖÜ]' LANGFILTER: | grep '[AEIOUÄÖÜ]'
# none with illegal chars # none with illegal chars
LANGFILTER_POSTCLIP: | grep '^[A-ZÄÖÜ]\+$' LANGFILTER: | grep '^[A-ZÄÖÜ]\+$'
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
# Until I can figure out how to force sort to use a locale's collation # Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must # rules we can't trust sort in the filtering rules above and so must
# leave the sorting work to dict2dawg.pl. # leave the sorting work to dict2dawg.pl.
D2DARGS: -r -term 10
NEEDSSORT:true
LANGINFO: <p>German has the 26 English letters plus the three umlaut LANGINFO: <p>German has the 26 English letters plus the three umlaut
LANGINFO: vowels. Scharfes-s is not a legal tile, but if present in LANGINFO: vowels. Scharfes-s is not a legal tile, but if present in

View file

@ -18,16 +18,12 @@
LANGCODE:it_IT LANGCODE:it_IT
# deal with DOS files # deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' | LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-z] [A-Z]
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z] LANGFILTER: | grep '^[A-IL-VZ]*$'
LANGFILTER_POSTCLIP: | grep '^[A-IL-VZ]*$' LANGFILTER: | sort -u
LANGFILTER_POSTCLIP: | tr '\n' '\000'
LANGFILTER_POSTCLIP: | sort -u -z
NEEDSSORT:false
D2DARGS: -r -term 10 -nosort
LANGINFO: <p>Italian is treated the same as English but for LANGINFO: <p>Italian is treated the same as English but for
LANGINFO: missing letters J, K, W, X and Y.</p> LANGINFO: missing letters J, K, W, X and Y.</p>

View file

@ -197,7 +197,7 @@ frankspecials.bin: ../frank_mkspecials.pl $(BMPFILES)
# a binary file (one byte) giving the number of tiles in the dict # a binary file (one byte) giving the number of tiles in the dict
charcount.bin: table.bin charcount.bin: table.bin
ifdef NEWDAWG ifdef NEWDAWG
siz=$$(wc $< | awk '{print $$3}'); \ siz=$$(ls -l $< | awk '{print $$5}'); \
perl -e "print pack(\"c\",$$siz/2)" > $@ perl -e "print pack(\"c\",$$siz/2)" > $@
else else
siz=$$(wc -c $< | sed -e 's/$<//'); \ siz=$$(wc -c $< | sed -e 's/$<//'); \
@ -240,11 +240,10 @@ dawg$(XWLANG)%.stamp: $(XWLANG)Main.dict.gz $(DICT2DAWG) table.bin ../Makefile.l
start=$$(echo $@ | sed -e 's/dawg$(XWLANG)\([0-9]*\)to[0-9]*.stamp/\1/'); \ start=$$(echo $@ | sed -e 's/dawg$(XWLANG)\([0-9]*\)to[0-9]*.stamp/\1/'); \
end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \ end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \
echo $${start} and $$end; \ echo $${start} and $$end; \
zcat $< | grep "^.\{$${start},$${end}\}$$" | \ zcat $< | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \
sort -u | $(DICT2DAWG) $(TABLE_ARG) table.bin -b 28000 \
-ob dawg$(XWLANG)$* \ -ob dawg$(XWLANG)$* \
-sn $(XWLANG)StartLoc.bin -k -term 10 -wc $(XWLANG)$*_wordcount.bin \ -sn $(XWLANG)StartLoc.bin -min $$start -max $$end \
$(FORCE_4) -ns $(XWLANG)$*_nodesize.bin -wc $(XWLANG)$*_wordcount.bin $(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
touch $@ touch $@
$(XWLANG)%_wordcount.bin: dawg$(XWLANG)%.stamp $(XWLANG)%_wordcount.bin: dawg$(XWLANG)%.stamp

View file

@ -17,22 +17,19 @@
LANGCODE:pt_PT LANGCODE:pt_PT
# deal with DOS files # deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' | LANGFILTER: tr -d '\r'
# uppercase all # uppercase all
LANGFILTER_POSTCLIP: | tr [a-zç] [A-ZÇ] LANGFILTER: | tr [a-zç] [A-ZÇ]
# no words not containing a vowel # no words not containing a vowel
LANGFILTER_POSTCLIP: | grep '[AEIOU]' LANGFILTER: | grep '[AEIOU]'
# none with illegal chars # none with illegal chars
LANGFILTER_POSTCLIP: | grep '^[A-JL-VXZÇ]\+$' LANGFILTER: | grep '^[A-JL-VXZÇ]\+$'
LANGFILTER_POSTCLIP: | sort -u
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
# Until I can figure out how to force sort to use a locale's collation # Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must # rules we can't trust sort in the filtering rules above and so must
# leave the sorting work to dict2dawg.pl. # leave the sorting work to dict2dawg.pl.
D2DARGS: -r -term 10
NEEDSSORT:true
LANGINFO: <p>Portugese uses the letter A-Z, excluding K, W and Y, and adds LANGINFO: <p>Portugese uses the letter A-Z, excluding K, W and Y, and adds
LANGINFO: Ç. Words containing any other letters are dropped. </p> LANGINFO: Ç. Words containing any other letters are dropped. </p>

41
dawg/Russian/Makefile Normal file
View file

@ -0,0 +1,41 @@
# -*- mode: makefile -*-
# Copyright 2002-2007 by Eric House (xwords@eehouse.org). All rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
XWLANG=Russian
LANGCODE=ru_RU
DICT2DAWGARGS = -r
TARGET_TYPE ?= WINCE
include ../Makefile.2to8
include ../Makefile.langcommon
SOURCEDICT ?= $(XWDICTPATH)/$(XWLANG)/RU5000.txt.gz
$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile
zcat $< | tr -d '\r' | \
tr [àáâãäåæçèéêëìíîïðñòóôõö÷øùÚûüýþÿ] [ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß] | \
gzip -c > $@
# Everything but creating of the Main.dict file is inherited from the
# "parent" Makefile.langcommon in the parent directory.
clean: clean_common
rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb

76
dawg/Russian/info.txt Normal file
View file

@ -0,0 +1,76 @@
# Copyright 2002,2007 by Eric House (xwords@eehouse.org). All rights
# reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
LANGCODE:ru_RU
CHARSET:windows-1251
# deal with DOS files
LANGFILTER: tr -d '\r'
# uppercase all
LANGFILTER: | tr [àáâãäåæçèéêëìíîïðñòóôõö÷øùÚûüýþÿ] [ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß]
# LANGFILTER: | tr -s '\n' '\000'
# note: don't turn off sorting! Can't do it with GNU 'sort' without
# setting LANG
D2DARGS: -r -term 10
LANGINFO: <p>Russian wordlists must be in the Windows-1251
LANGINFO: codepage. Lower-case letters are converted to upper case and
LANGINFO: any words that contain letters not listed below are
LANGINFO: removed.</p>
# High bit means "official". Next 7 bits are an enum where
# Russian==0x0F. Low byte is padding.
XLOC_HEADER:0x8F00
<BEGIN_TILES>
8 1 'À'
2 3 'Á'
4 1 'Â'
2 3 'Ã'
2 2 'Ä'
7 1 'Å'
1 4 'Æ'
1 3 'Ç'
7 1 'È'
1 2 'É'
4 2 'Ê'
4 2 'Ë'
2 3 'Ì'
4 1 'Í'
9 1 'Î'
4 2 'Ï'
5 1 'Ð'
5 1 'Ñ'
7 1 'Ò'
4 2 'Ó'
1 5 'Ô'
1 4 'Õ'
1 4 'Ö'
1 3 '×'
1 4 'Ø'
1 5 'Ù'
1 10 'Ú'
2 2 'Û'
4 1 'Ü'
1 8 'Ý'
1 5 'Þ'
2 2 'ß'
2 0 {"_"}
<END_TILES>
# should ignore all after the <END_TILES> above

View file

@ -21,24 +21,25 @@
NEEDSSORT:true NEEDSSORT:true
# MSDos LF chars go bye-bye # MSDos LF chars go bye-bye
LANGFILTER_PRECLIP: tr -d '\r' | LANGFILTER: tr -d '\r'
# convert accented vowels # convert accented vowels
LANGFILTER_POSTCLIP: | tr '\207\216\222\227\234\237\226' 'aeiouu\321' LANGFILTER: | tr '\207\216\222\227\234\237\226' 'aeiouu\321'
# uppercase # uppercase
LANGFILTER_POSTCLIP: | tr [a-zń] [A-ZŃ] LANGFILTER: | tr [a-zñ] [A-ZÑ]
# remove words with illegal letters # remove words with illegal letters
LANGFILTER_POSTCLIP: | grep '^[[A-JL-VX-ZŃ]*$' LANGFILTER: | grep '^[[A-JL-VX-ZÑ]*$'
# substitute pairs (can't figure out how to use octal values) # substitute pairs (can't figure out how to use octal values)
LANGFILTER_POSTCLIP: | sed 's/CH/1/g' LANGFILTER: | sed 's/CH/1/g'
LANGFILTER_POSTCLIP: | sed 's/LL/2/g' LANGFILTER: | sed 's/LL/2/g'
LANGFILTER_POSTCLIP: | sed 's/RR/3/g' LANGFILTER: | sed 's/RR/3/g'
# substitute in the octal control character values # substitute in the octal control character values
LANGFILTER_POSTCLIP: | tr '123' '\001\002\003' LANGFILTER: | tr '123' '\001\002\003'
# now add nulls as terminators # now add nulls as terminators
LANGFILTER_POSTCLIP: | tr -s '\n' '\000' LANGFILTER: | tr -s '\n' '\000'
LANGFILTER_POSTCLIP: | sort -u -z LANGFILTER: | sort -u -z
D2DARGS: -r -term 0
LANGINFO: <p>Spanish words include all letters in the English alphabet LANGINFO: <p>Spanish words include all letters in the English alphabet
LANGINFO: except "K" and "W", and with "Ñ" added. Since there are no LANGINFO: except "K" and "W", and with "Ñ" added. Since there are no

View file

@ -16,12 +16,11 @@
LANGCODE:sv_SE LANGCODE:sv_SE
LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ]
LANGFILTER: | grep '^[A-ZÄÅÆÖÜ]*$'
LANGFILTER_POSTCLIP: | tr [a-zäĺćöü] [A-ZÄĹĆÖÜ] D2DARGS: -r -term 10
LANGFILTER_POSTCLIP: | grep '^[A-ZÄĹĆÖÜ]*$'
LANGFILTER_POSTCLIP: | tr '\n' '\000'
NEEDSSORT:true
LANGINFO: <p>From an English-speaker's perspective, Swedish drops Q LANGINFO: <p>From an English-speaker's perspective, Swedish drops Q
LANGINFO: and W, and adds Ä, Å, Æ, Ö and Ü.</p> LANGINFO: and W, and adds Ä, Å, Æ, Ö and Ü.</p>

View file

@ -1,9 +1,9 @@
/* -*- compile-command: "g++ -O -o dict2dawg dict2dawg.cpp"; -*- */ /* -*- compile-command: "g++ -DDEBUG -O -o dict2dawg dict2dawg.cpp"; -*- */
/************************************************************************* /*************************************************************************
* adapted from perl code that was itself adapted from C++ code * adapted from perl code that was itself adapted from C++ code
* Copyright (C) 2000 Falk Hueffner * Copyright (C) 2000 Falk Hueffner
* This version Copyright (C) 2002,2006 Eric House (xwords@eehouse.org) * This version Copyright (C) 2002,2006-2007 Eric House (xwords@eehouse.org)
* *
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@ -54,9 +54,11 @@ typedef unsigned int Node;
typedef std::vector<Node> NodeList; typedef std::vector<Node> NodeList;
typedef std::vector<char*> WordList; typedef std::vector<char*> WordList;
#define MAX_WORD_LEN 15
#define VERSION_STR "$Rev$" #define VERSION_STR "$Rev$"
#define MAX_WORD_LEN 15
#define T2ABUFLEN(s) (((s)*4)+3)
int gFirstDiff; int gFirstDiff;
static char gCurrentWordBuf[MAX_WORD_LEN+1] = { '\0' }; static char gCurrentWordBuf[MAX_WORD_LEN+1] = { '\0' };
@ -92,10 +94,12 @@ bool gForceFour = false; // use four bytes regardless of need?
static int gFileSize = 0; static int gFileSize = 0;
int gNBytesPerNode; int gNBytesPerNode;
bool gUseUnicode; bool gUseUnicode;
int gLimLow = 2;
int gLimHigh = MAX_WORD_LEN;
// OWL is 1.7M // OWL is 1.7M
#define MAX_POOL_SIZE (3 * 0x100000) #define MAX_POOL_SIZE (5 * 0x100000)
#define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ ); #define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ );
static char* parseARGV( int argc, char** argv, const char** inFileName ); static char* parseARGV( int argc, char** argv, const char** inFileName );
@ -182,7 +186,8 @@ main( int argc, char** argv )
unsigned long be = htonl( gWordCount ); unsigned long be = htonl( gWordCount );
fwrite( &be, sizeof(be), 1, OFILE ); fwrite( &be, sizeof(be), 1, OFILE );
fclose( OFILE ); fclose( OFILE );
fprintf( stderr, "wrote out: got %d words\n", gWordCount ); fprintf( stderr, "Wrote %d (word count) to %s\n", gWordCount,
gCountFile );
} }
if ( gOutFileBase ) { if ( gOutFileBase ) {
@ -393,49 +398,62 @@ readFromSortedArray( void )
#endif #endif
} }
char* word = ""; for ( ; ; ) {
char* word = "";
if ( !gDone ) {
gDone = gNextWordIndex == sInputStrings->size();
if ( !gDone ) { if ( !gDone ) {
word = sInputStrings->at(gNextWordIndex++); gDone = gNextWordIndex == sInputStrings->size();
if ( !gDone ) {
word = sInputStrings->at(gNextWordIndex++);
#ifdef DEBUG #ifdef DEBUG
} else if ( gDebug ) { } else if ( gDebug ) {
fprintf( stderr, "gDone set to true\n" ); fprintf( stderr, "gDone set to true\n" );
#endif
}
#ifdef DEBUG
if ( gDebug ) {
char buf[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr, "%s: got word: %s\n", __func__,
tileToAscii( buf, sizeof(buf), word ) );
}
#endif #endif
} }
#ifdef DEBUG int numCommonLetters = 0;
if ( gDebug ) { int len = strlen( word );
fprintf( stderr, "got word: %s\n", word ); if ( gCurrentWordLen < len ) {
len = gCurrentWordLen;
} }
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
&& numCommonLetters < len ) {
++numCommonLetters;
}
gFirstDiff = numCommonLetters;
if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
&& !firstBeforeSecond( gCurrentWord, word ) ) {
#ifdef DEBUG
if ( gDebug ) {
char buf1[T2ABUFLEN(MAX_WORD_LEN)];
char buf2[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr,
"%s: words %s and %s are the same or out of order\n",
__func__,
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
tileToAscii( buf2, sizeof(buf2), word ) );
}
#endif #endif
continue;
}
gCurrentWord = word;
gCurrentWordLen = strlen(word);
break;
} }
int numCommonLetters = 0;
int len = strlen( word );
if ( gCurrentWordLen < len ) {
len = gCurrentWordLen;
}
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
&& numCommonLetters < len ) {
++numCommonLetters;
}
gFirstDiff = numCommonLetters;
if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
&& !firstBeforeSecond( gCurrentWord, word ) ) {
char buf1[MAX_WORD_LEN+1];
char buf2[MAX_WORD_LEN+1];
ERROR_EXIT( "words %s and %s are out of order\n",
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
tileToAscii( buf2, sizeof(buf2), word ) );
}
gCurrentWord = word;
gCurrentWordLen = strlen(word);
#ifdef DEBUG #ifdef DEBUG
if ( gDebug ) { if ( gDebug ) {
char buf[MAX_WORD_LEN+1]; char buf[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr, "gCurrentWord now %s\n", fprintf( stderr, "gCurrentWord now %s\n",
tileToAscii( buf, sizeof(buf), gCurrentWord) ); tileToAscii( buf, sizeof(buf), gCurrentWord) );
} }
@ -450,47 +468,77 @@ readOneWord( char* wordBuf, int bufLen, int* lenp, bool* gotEOF )
bool dropWord = false; bool dropWord = false;
bool done = false; bool done = false;
// for each byte // for each byte, append to an internal buffer up to size limit.
// On reaching an end-of-word or EOF, check if the word formed is
// within the length range and contains no unknown chars. If yes,
// return it. If no, start over ONLY IF the terminator was not
// EOF.
for ( ; ; ) { for ( ; ; ) {
int byt = getc( gInFile ); int byt = getc( gInFile );
// EOF is special: we don't try for another word even if // EOF is special: we don't try for another word even if
// dropWord is true; we must leave now. // dropWord is true; we must leave now.
if ( byt == EOF || byt == gTermChar ) { if ( byt == EOF || byt == gTermChar ) {
*gotEOF = byt == EOF; bool isEOF = byt == EOF;
*gotEOF = isEOF;
if ( !dropWord || *gotEOF ) { assert( isEOF || count < bufLen );
if ( count != 0 ) { if ( !dropWord && (count >= gLimLow) && (count <= gLimHigh) ) {
wordBuf[count] = '\0'; assert( count < bufLen );
result = wordBuf; wordBuf[count] = '\0';
*lenp = count; result = wordBuf;
++gWordCount; *lenp = count;
} ++gWordCount;
break; // we've finished a word
} else if ( *gotEOF ) {
break; break;
} else if ( isEOF ) {
assert( !result );
break;
}
#ifdef DEBUG
if ( gDebug ) {
char buf[T2ABUFLEN(count)];
wordBuf[count] = '\0';
fprintf( stderr, "%s: dropping word (len=%d): %s\n", __func__,
count, tileToAscii( buf, sizeof(buf), wordBuf ) );
} }
#endif
count = 0; // we'll start over
dropWord = false;
} else if ( count >= bufLen ) {
// Just drop it...
dropWord = true;
// Don't call into the hashtable twice here!! // Don't call into the hashtable twice here!!
} else if ( gTableHash.find(byt) != gTableHash.end() ) { } else if ( gTableHash.find(byt) != gTableHash.end() ) {
if ( !dropWord ) { assert( count < bufLen );
wordBuf[count++] = (char)gTableHash[byt]; wordBuf[count++] = (char)gTableHash[byt];
if ( count >= bufLen ) { if ( count >= bufLen ) {
char buf[MAX_WORD_LEN+1]; char buf[T2ABUFLEN(count)];
ERROR_EXIT( "no space for word %d (starting \"%s\")", ERROR_EXIT( "no space for word %d (starting \"%s\")",
gWordCount, gWordCount,
tileToAscii( buf, sizeof(buf), wordBuf )); tileToAscii( buf, sizeof(buf), wordBuf ));
} }
} else if ( gKillIfMissing || !dropWord ) {
char buf[T2ABUFLEN(count)];
wordBuf[count] = '\0';
tileToAscii( buf, sizeof(buf), wordBuf );
if ( gKillIfMissing ) {
ERROR_EXIT( "chr %c (%d) not in map file %s\n"
"last word was %s\n",
(char)byt, (int)byt, gTableFile, buf );
} else if ( !dropWord ) {
#ifdef DEBUG
if ( gDebug ) {
fprintf( stderr, "%s: chr %c (%d) not in map file %s\n"
"dropping partial word %s\n", __func__,
(char)byt, (int)byt, gTableFile, buf );
}
#endif
dropWord = true;
} }
} else if ( gKillIfMissing ) {
char buf[MAX_WORD_LEN+1];
ERROR_EXIT( "chr %c (%d) not in map file %s\n"
"last word was %s\n",
byt, (int)byt, gTableFile,
tileToAscii( buf, sizeof(buf), wordBuf ) );
} else {
dropWord = true;
count = 0; // lose anything we already have
} }
} }
@ -511,40 +559,55 @@ readFromFile( void )
int len; int len;
gDone = s_eof; gDone = s_eof;
if ( !gDone ) {
word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof ); // Repeat until we get a new word that's not "out-of-order". When
gDone = NULL == word; // we see this the problem isn't failure to sort, it's duplicates.
} // So dropping is ok. The alternative would be detecting dupes
if ( gDone ) { // during the sort. This seems easier.
word = ""; for ( ; ; ) {
len = 0; if ( !gDone ) {
} word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof );
gDone = NULL == word;
}
if ( gDone ) {
word = "";
len = 0;
}
int numCommonLetters = 0; int numCommonLetters = 0;
if ( gCurrentWordLen < len ) { if ( gCurrentWordLen < len ) {
len = gCurrentWordLen; len = gCurrentWordLen;
} }
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters] while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
&& numCommonLetters < len ) { && numCommonLetters < len ) {
++numCommonLetters; ++numCommonLetters;
} }
gFirstDiff = numCommonLetters; gFirstDiff = numCommonLetters;
if ( (gCurrentWordLen > 0) && (strlen(word) > 0) if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
&& !firstBeforeSecond( gCurrentWord, word ) ) { && !firstBeforeSecond( gCurrentWord, word ) ) {
char buf1[MAX_WORD_LEN+1]; #ifdef DEBUG
char buf2[MAX_WORD_LEN+1]; if ( gDebug ) {
ERROR_EXIT( "words %s and %s are out of order\n", char buf1[T2ABUFLEN(MAX_WORD_LEN)];
tileToAscii( buf1, sizeof(buf1), gCurrentWord ), char buf2[T2ABUFLEN(MAX_WORD_LEN)];
tileToAscii( buf2, sizeof(buf2), word ) ); fprintf( stderr,
"%s: words %s and %s are the smae or out of order\n",
__func__,
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
tileToAscii( buf2, sizeof(buf2), word ) );
}
#endif
continue;
}
break;
} }
gCurrentWordLen = strlen(word); gCurrentWordLen = strlen(word);
strncpy( gCurrentWordBuf, word, sizeof(gCurrentWordBuf) ); strncpy( gCurrentWordBuf, word, sizeof(gCurrentWordBuf) );
#ifdef DEBUG #ifdef DEBUG
if ( gDebug ) { if ( gDebug ) {
char buf[MAX_WORD_LEN+1]; char buf[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr, "gCurrentWord now %s\n", fprintf( stderr, "gCurrentWord now %s\n",
tileToAscii( buf, sizeof(buf), gCurrentWord) ); tileToAscii( buf, sizeof(buf), gCurrentWord) );
} }
@ -561,17 +624,26 @@ firstBeforeSecond( const char* lhs, const char* rhs )
static char* static char*
tileToAscii( char* out, int outSize, const char* in ) tileToAscii( char* out, int outSize, const char* in )
{ {
char tiles[outSize];
int tilesLen = 1;
tiles[0] = '[';
char* orig = out; char* orig = out;
for ( ; ; ) { for ( ; ; ) {
char ch = *in++; char ch = *in++;
if ( '\0' == ch ) { if ( '\0' == ch ) {
*out = '\0';
break; break;
} }
assert( ch < gRevMap.size() ); assert( ch < gRevMap.size() );
*out++ = gRevMap[ch]; *out++ = gRevMap[ch];
tilesLen += sprintf( &tiles[tilesLen], "%d,", ch );
assert( (out - orig) < outSize ); assert( (out - orig) < outSize );
} }
tiles[tilesLen] = ']';
tiles[tilesLen+1] = '\0';
strcpy( out, tiles );
return orig; return orig;
} }
@ -636,7 +708,7 @@ printWords( std::vector<char*>* strings )
{ {
std::vector<char*>::iterator iter = strings->begin(); std::vector<char*>::iterator iter = strings->begin();
while ( iter != strings->end() ) { while ( iter != strings->end() ) {
char buf[MAX_WORD_LEN+1]; char buf[T2ABUFLEN(MAX_WORD_LEN)];
tileToAscii( buf, sizeof(buf), *iter ); tileToAscii( buf, sizeof(buf), *iter );
fprintf( stderr, "%s\n", buf ); fprintf( stderr, "%s\n", buf );
++iter; ++iter;
@ -760,6 +832,9 @@ makeTableHash( void )
if ( NULL == TABLEFILE ) { if ( NULL == TABLEFILE ) {
ERROR_EXIT( "unable to open %s\n", gTableFile ); ERROR_EXIT( "unable to open %s\n", gTableFile );
} }
// Fill the 0th space since references are one-based
gRevMap.push_back(0);
for ( ii = 0; ; ++ii ) { for ( ii = 0; ; ++ii ) {
int ch = getc(TABLEFILE); int ch = getc(TABLEFILE);
@ -817,7 +892,6 @@ emitNodes( unsigned int nBytesPerOutfile, const char* outFileBase )
gNBytesPerNode = 3; gNBytesPerNode = 3;
} else { } else {
if ( gBlankIndex == 32 ) { // blank if ( gBlankIndex == 32 ) { // blank
fprintf( stderr, "blank's at 32; 3-byte-nodes still ok\n" );
gNBytesPerNode = 3; gNBytesPerNode = 3;
} else { } else {
ERROR_EXIT( "move blank to last position in info.txt " ERROR_EXIT( "move blank to last position in info.txt "
@ -994,8 +1068,10 @@ usage( const char* name )
{ {
fprintf( stderr, "usage: %s \n" fprintf( stderr, "usage: %s \n"
"\t[-v] (print version and exit)\n" "\t[-v] (print version and exit)\n"
"\t[-poolsize] (print size of hardcoded pool and exit)\n" "\t[-poolsize] (print hardcoded size of pool and exit)\n"
"\t[-b bytesPerFile] (default = 0xFFFFFFFF)\n" "\t[-b bytesPerFile] (default = 0xFFFFFFFF)\n"
"\t[-min <num in 0..15>]\n"
"\t[-max <num in 0..15>]\n"
"\t-m mapFile\n" "\t-m mapFile\n"
"\t-mn mapFile (unicode)\n" "\t-mn mapFile (unicode)\n"
"\t-ob outFileBase\n" "\t-ob outFileBase\n"
@ -1048,6 +1124,10 @@ parseARGV( int argc, char** argv, const char** inFileName )
} else if ( 0 == strcmp( arg, "-mn" ) ) { } else if ( 0 == strcmp( arg, "-mn" ) ) {
gTableFile = argv[index++]; gTableFile = argv[index++];
gUseUnicode = true; gUseUnicode = true;
} else if ( 0 == strcmp( arg, "-min" ) ) {
gLimLow = atoi(argv[index++]);
} else if ( 0 == strcmp( arg, "-max" ) ) {
gLimHigh = atoi(argv[index++]);
} else if ( 0 == strcmp( arg, "-m" ) ) { } else if ( 0 == strcmp( arg, "-m" ) ) {
gTableFile = argv[index++]; gTableFile = argv[index++];
} else if ( 0 == strcmp( arg, "-ob" ) ) { } else if ( 0 == strcmp( arg, "-ob" ) ) {
@ -1079,17 +1159,25 @@ parseARGV( int argc, char** argv, const char** inFileName )
gDebug = true; gDebug = true;
#endif #endif
} else { } else {
ERROR_EXIT( "unexpected arg %s", arg ); ERROR_EXIT( "%s: unexpected arg %s", __func__, arg );
} }
} }
if ( gLimHigh > MAX_WORD_LEN || gLimLow > MAX_WORD_LEN ) {
usage( argv[0] );
exit(1);
}
#ifdef DEBUG #ifdef DEBUG
if ( gDebug ) { if ( gDebug ) {
fprintf( stderr, "gNBytesPerOutfile=$gNBytesPerOutfile\n" ); fprintf( stderr, "gNBytesPerOutfile=%d\n", gNBytesPerOutfile );
fprintf( stderr, "gTableFile=$gTableFile\n" ); fprintf( stderr, "gTableFile=%s\n", gTableFile );
fprintf( stderr, "gOutFileBase=$gOutFileBase\n" ); fprintf( stderr, "gOutFileBase=%s\n", gOutFileBase );
fprintf( stderr, "gStartNodeOut=$gStartNodeOut\n" ); fprintf( stderr, "gStartNodeOut=%s\n", gStartNodeOut );
fprintf( stderr, "gTermChar=%c(%d)\n", gTermChar, (int)gTermChar ); fprintf( stderr, "gTermChar=%c(%d)\n", gTermChar, (int)gTermChar );
fprintf( stderr, "gFileSize=%d\n", gFileSize );
fprintf( stderr, "gLimLow=%d\n", gLimLow );
fprintf( stderr, "gLimHigh=%d\n", gLimHigh );
} }
#endif #endif
return gTableFile; return gTableFile;

View file

@ -117,7 +117,7 @@ sub WriteMapFile($$$) {
} elsif ( $str =~ /(\d+)/ ) { } elsif ( $str =~ /(\d+)/ ) {
print $fhr pack( $packStr, $1 ); print $fhr pack( $packStr, $1 );
} else { } else {
die "WriteMapFile: unrecognized face format $str"; die "WriteMapFile: unrecognized face format $str, elem $i";
} }
} }
} # WriteMapFile } # WriteMapFile