Add support for Russian. So that Russian text can be processed on systems without setting LANG=ru_RU.CP1251, modify dict2dawg to skip duplicates and words outside of specified lengths. Modify all info.txt files for the new scheme (which includes change to byod.cgi not kept on sourceforge.)

This commit is contained in:
ehouse 2007-02-17 17:06:05 +00:00
parent 326ecb00f4
commit 3bb2fb018f
15 changed files with 366 additions and 181 deletions

View file

@ -18,23 +18,21 @@ LANGCODE:da_DK
# deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' |
LANGFILTER: tr -d '\r'
# uppercase all
LANGFILTER_POSTCLIP: | tr [a-zåæø] [A-ZÅÆØ]
LANGFILTER: | tr [a-zåæø] [A-ZÅÆØ]
# no words not containing a vowel
LANGFILTER_POSTCLIP: | grep '[AEIOUYÅÆØ]'
LANGFILTER: | grep '[AEIOUYÅÆØ]'
# none with illegal chars
LANGFILTER_POSTCLIP: | grep '^[A-PR-VX-ZÅÆØ]\+$'
LANGFILTER: | grep '^[A-PR-VX-ZÅÆØ]\+$'
# remove duplicates
LANGFILTER_POSTCLIP: | sort -u
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
LANGFILTER: | sort -u
# Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must
# leave the sorting work to dict2dawg.pl.
NEEDSSORT:true
D2DARGS: -r -term 10
LANGINFO: <p>Danish uses all English letters except Q and W. There
LANGINFO: are three non-English letters: 'Å', 'Æ' and 'Ø'. </p>

View file

@ -18,22 +18,19 @@ LANGCODE:nl_NL
# deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' |
LANGFILTER: tr -d '\r'
# uppercase all
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z]
LANGFILTER: | tr [a-z] [A-Z]
# no words not containing a vowel
LANGFILTER_POSTCLIP: | grep '[AEIOU]'
LANGFILTER: | grep '[AEIOU]'
# none with illegal chars
LANGFILTER_POSTCLIP: | grep '^[A-Z]\+$'
LANGFILTER_POSTCLIP: | sort -u
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
LANGFILTER: | grep '^[A-Z]\+$'
LANGFILTER: | sort -u
# Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must
# leave the sorting work to dict2dawg.pl.
NEEDSSORT:false
D2DARGS: -r -term 10
LANGINFO: <p>Dutch has the same 26 letters as English, though of
LANGINFO: course the counts and values are different. Filtering rules

View file

@ -1,4 +1,4 @@
# -*-mode: Makefile -*-
# -*-mode: Makefile; compile-command: "make -f Makefile.BasEnglish"; -*-
# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
#
# This program is free software; you can redistribute it and/or
@ -17,7 +17,7 @@
XWLANG=BasEnglish
LANGCODE=en_US
#NEWDAWG=1
DICT2DAWGARGS = -r -nosort
TARGET_TYPE ?= PALM

View file

@ -17,16 +17,16 @@
LANGCODE:en_US
# deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' |
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z]
LANGFILTER_POSTCLIP: | grep '^[A-Z]*$'
LANGFILTER_POSTCLIP: | sort -u
LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-z] [A-Z]
LANGFILTER: | grep '^[A-Z]*$'
LANGFILTER: | sort -u
# We can trust sort (above) to do the right thing since there's no
# high ascii. dict2dawg.pl is much faster if I can trust that its
# input is in sorted order.
NEEDSSORT:false
D2DARGS: -nosort -term 10
LANGINFO: <p>English dictionaries can contain words with any of the 26
LANGINFO: letters you think of as making up the alphabet: A-Z. At

View file

@ -17,16 +17,14 @@
LANGCODE:fr_FR
# deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' |
LANGFILTER: tr -d '\r'
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z]
LANGFILTER_POSTCLIP: | grep '^[A-Z]*$'
LANGFILTER_POSTCLIP: | tr '\n' '\000'
LANGFILTER_POSTCLIP: | sort -u -z
NEEDSSORT:false
LANGFILTER: | tr [a-z] [A-Z]
LANGFILTER: | grep '^[A-Z]*$'
LANGFILTER: | tr '\n' '\000'
LANGFILTER: | sort -u -z
D2DARGS: -r -nosort -term 0
LANGINFO: <p>At this point French is getting treated the same as
LANGINFO: English. But I think I should be transforming accented

View file

@ -16,26 +16,21 @@
LANGCODE:de_DE
# deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' |
LANGFILTER: tr -d '\r'
# substitute for sharfes-s
LANGFILTER_PRECLIP: sed -e 's/ß/SS/g' |
LANGFILTER: | sed -e 's/ß/SS/g'
# uppercase all
LANGFILTER_POSTCLIP: | tr [a-zäöü] [A-ZÄÖÜ]
LANGFILTER: | tr [a-zäöü] [A-ZÄÖÜ]
# no words not containing a vowel
LANGFILTER_POSTCLIP: | grep '[AEIOUÄÖÜ]'
LANGFILTER: | grep '[AEIOUÄÖÜ]'
# none with illegal chars
LANGFILTER_POSTCLIP: | grep '^[A-ZÄÖÜ]\+$'
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
LANGFILTER: | grep '^[A-ZÄÖÜ]\+$'
# Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must
# leave the sorting work to dict2dawg.pl.
NEEDSSORT:true
D2DARGS: -r -term 10
LANGINFO: <p>German has the 26 English letters plus the three umlaut
LANGINFO: vowels. Scharfes-s is not a legal tile, but if present in

View file

@ -18,16 +18,12 @@
LANGCODE:it_IT
# deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' |
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z]
LANGFILTER_POSTCLIP: | grep '^[A-IL-VZ]*$'
LANGFILTER_POSTCLIP: | tr '\n' '\000'
LANGFILTER_POSTCLIP: | sort -u -z
NEEDSSORT:false
LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-z] [A-Z]
LANGFILTER: | grep '^[A-IL-VZ]*$'
LANGFILTER: | sort -u
D2DARGS: -r -term 10 -nosort
LANGINFO: <p>Italian is treated the same as English but for
LANGINFO: missing letters J, K, W, X and Y.</p>

View file

@ -197,7 +197,7 @@ frankspecials.bin: ../frank_mkspecials.pl $(BMPFILES)
# a binary file (one byte) giving the number of tiles in the dict
charcount.bin: table.bin
ifdef NEWDAWG
siz=$$(wc $< | awk '{print $$3}'); \
siz=$$(ls -l $< | awk '{print $$5}'); \
perl -e "print pack(\"c\",$$siz/2)" > $@
else
siz=$$(wc -c $< | sed -e 's/$<//'); \
@ -240,11 +240,10 @@ dawg$(XWLANG)%.stamp: $(XWLANG)Main.dict.gz $(DICT2DAWG) table.bin ../Makefile.l
start=$$(echo $@ | sed -e 's/dawg$(XWLANG)\([0-9]*\)to[0-9]*.stamp/\1/'); \
end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \
echo $${start} and $$end; \
zcat $< | grep "^.\{$${start},$${end}\}$$" | \
sort -u | $(DICT2DAWG) $(TABLE_ARG) table.bin -b 28000 \
zcat $< | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \
-ob dawg$(XWLANG)$* \
-sn $(XWLANG)StartLoc.bin -k -term 10 -wc $(XWLANG)$*_wordcount.bin \
$(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
-sn $(XWLANG)StartLoc.bin -min $$start -max $$end \
-wc $(XWLANG)$*_wordcount.bin $(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
touch $@
$(XWLANG)%_wordcount.bin: dawg$(XWLANG)%.stamp

View file

@ -17,22 +17,19 @@
LANGCODE:pt_PT
# deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' |
LANGFILTER: tr -d '\r'
# uppercase all
LANGFILTER_POSTCLIP: | tr [a-zç] [A-ZÇ]
LANGFILTER: | tr [a-zç] [A-ZÇ]
# no words not containing a vowel
LANGFILTER_POSTCLIP: | grep '[AEIOU]'
LANGFILTER: | grep '[AEIOU]'
# none with illegal chars
LANGFILTER_POSTCLIP: | grep '^[A-JL-VXZÇ]\+$'
LANGFILTER_POSTCLIP: | sort -u
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
LANGFILTER: | grep '^[A-JL-VXZÇ]\+$'
# Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must
# leave the sorting work to dict2dawg.pl.
D2DARGS: -r -term 10
NEEDSSORT:true
LANGINFO: <p>Portuguese uses the letter A-Z, excluding K, W and Y, and adds
LANGINFO: Ç. Words containing any other letters are dropped. </p>

41
dawg/Russian/Makefile Normal file
View file

@ -0,0 +1,41 @@
# -*- mode: makefile -*-
# Copyright 2002-2007 by Eric House (xwords@eehouse.org). All rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
XWLANG=Russian
LANGCODE=ru_RU
DICT2DAWGARGS = -r
TARGET_TYPE ?= WINCE
include ../Makefile.2to8
include ../Makefile.langcommon
SOURCEDICT ?= $(XWDICTPATH)/$(XWLANG)/RU5000.txt.gz
$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile
zcat $< | tr -d '\r' | \
tr [àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ] [ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß] | \
gzip -c > $@
# Everything but creating of the Main.dict file is inherited from the
# "parent" Makefile.langcommon in the parent directory.
clean: clean_common
rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb

76
dawg/Russian/info.txt Normal file
View file

@ -0,0 +1,76 @@
# Copyright 2002,2007 by Eric House (xwords@eehouse.org). All rights
# reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
LANGCODE:ru_RU
CHARSET:windows-1251
# deal with DOS files
LANGFILTER: tr -d '\r'
# uppercase all
LANGFILTER: | tr [àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ] [ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß]
# LANGFILTER: | tr -s '\n' '\000'
# note: don't turn off sorting! Can't do it with GNU 'sort' without
# setting LANG
D2DARGS: -r -term 10
LANGINFO: <p>Russian wordlists must be in the Windows-1251
LANGINFO: codepage. Lower-case letters are converted to upper case and
LANGINFO: any words that contain letters not listed below are
LANGINFO: removed.</p>
# High bit means "official". Next 7 bits are an enum where
# Russian==0x0F. Low byte is padding.
XLOC_HEADER:0x8F00
<BEGIN_TILES>
8 1 'À'
2 3 'Á'
4 1 'Â'
2 3 'Ã'
2 2 'Ä'
7 1 'Å'
1 4 'Æ'
1 3 'Ç'
7 1 'È'
1 2 'É'
4 2 'Ê'
4 2 'Ë'
2 3 'Ì'
4 1 'Í'
9 1 'Î'
4 2 'Ï'
5 1 'Ð'
5 1 'Ñ'
7 1 'Ò'
4 2 'Ó'
1 5 'Ô'
1 4 'Õ'
1 4 'Ö'
1 3 '×'
1 4 'Ø'
1 5 'Ù'
1 10 'Ú'
2 2 'Û'
4 1 'Ü'
1 8 'Ý'
1 5 'Þ'
2 2 'ß'
2 0 {"_"}
<END_TILES>
# should ignore all after the <END_TILES> above

View file

@ -21,24 +21,25 @@
NEEDSSORT:true
# MSDos LF chars go bye-bye
LANGFILTER_PRECLIP: tr -d '\r' |
LANGFILTER: tr -d '\r'
# convert accented vowels
LANGFILTER_POSTCLIP: | tr '\207\216\222\227\234\237\226' 'aeiouu\321'
LANGFILTER: | tr '\207\216\222\227\234\237\226' 'aeiouu\321'
# uppercase
LANGFILTER_POSTCLIP: | tr [a-zñ] [A-ZÑ]
LANGFILTER: | tr [a-zñ] [A-ZÑ]
# remove words with illegal letters
LANGFILTER_POSTCLIP: | grep '^[[A-JL-VX-ZÑ]*$'
LANGFILTER: | grep '^[[A-JL-VX-ZÑ]*$'
# substitute pairs (can't figure out how to use octal values)
LANGFILTER_POSTCLIP: | sed 's/CH/1/g'
LANGFILTER_POSTCLIP: | sed 's/LL/2/g'
LANGFILTER_POSTCLIP: | sed 's/RR/3/g'
LANGFILTER: | sed 's/CH/1/g'
LANGFILTER: | sed 's/LL/2/g'
LANGFILTER: | sed 's/RR/3/g'
# substitute in the octal control character values
LANGFILTER_POSTCLIP: | tr '123' '\001\002\003'
LANGFILTER: | tr '123' '\001\002\003'
# now add nulls as terminators
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
LANGFILTER_POSTCLIP: | sort -u -z
LANGFILTER: | tr -s '\n' '\000'
LANGFILTER: | sort -u -z
D2DARGS: -r -term 0
LANGINFO: <p>Spanish words include all letters in the English alphabet
LANGINFO: except "K" and "W", and with "Ñ" added. Since there are no

View file

@ -16,12 +16,11 @@
LANGCODE:sv_SE
LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ]
LANGFILTER: | grep '^[A-ZÄÅÆÖÜ]*$'
LANGFILTER_POSTCLIP: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ]
LANGFILTER_POSTCLIP: | grep '^[A-ZÄÅÆÖÜ]*$'
LANGFILTER_POSTCLIP: | tr '\n' '\000'
NEEDSSORT:true
D2DARGS: -r -term 10
LANGINFO: <p>From an English-speaker's perspective, Swedish drops Q
LANGINFO: and W, and adds Ä, Å, Æ, Ö and Ü.</p>

View file

@ -1,9 +1,9 @@
/* -*- compile-command: "g++ -O -o dict2dawg dict2dawg.cpp"; -*- */
/* -*- compile-command: "g++ -DDEBUG -O -o dict2dawg dict2dawg.cpp"; -*- */
/*************************************************************************
* adapted from perl code that was itself adapted from C++ code
* Copyright (C) 2000 Falk Hueffner
* This version Copyright (C) 2002,2006 Eric House (xwords@eehouse.org)
* This version Copyright (C) 2002,2006-2007 Eric House (xwords@eehouse.org)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -54,9 +54,11 @@ typedef unsigned int Node;
typedef std::vector<Node> NodeList;
typedef std::vector<char*> WordList;
#define MAX_WORD_LEN 15
#define VERSION_STR "$Rev$"
#define MAX_WORD_LEN 15
#define T2ABUFLEN(s) (((s)*4)+3)
int gFirstDiff;
static char gCurrentWordBuf[MAX_WORD_LEN+1] = { '\0' };
@ -92,10 +94,12 @@ bool gForceFour = false; // use four bytes regardless of need?
static int gFileSize = 0;
int gNBytesPerNode;
bool gUseUnicode;
int gLimLow = 2;
int gLimHigh = MAX_WORD_LEN;
// OWL is 1.7M
#define MAX_POOL_SIZE (3 * 0x100000)
#define MAX_POOL_SIZE (5 * 0x100000)
#define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ );
static char* parseARGV( int argc, char** argv, const char** inFileName );
@ -182,7 +186,8 @@ main( int argc, char** argv )
unsigned long be = htonl( gWordCount );
fwrite( &be, sizeof(be), 1, OFILE );
fclose( OFILE );
fprintf( stderr, "wrote out: got %d words\n", gWordCount );
fprintf( stderr, "Wrote %d (word count) to %s\n", gWordCount,
gCountFile );
}
if ( gOutFileBase ) {
@ -393,49 +398,62 @@ readFromSortedArray( void )
#endif
}
char* word = "";
for ( ; ; ) {
char* word = "";
if ( !gDone ) {
gDone = gNextWordIndex == sInputStrings->size();
if ( !gDone ) {
word = sInputStrings->at(gNextWordIndex++);
gDone = gNextWordIndex == sInputStrings->size();
if ( !gDone ) {
word = sInputStrings->at(gNextWordIndex++);
#ifdef DEBUG
} else if ( gDebug ) {
fprintf( stderr, "gDone set to true\n" );
} else if ( gDebug ) {
fprintf( stderr, "gDone set to true\n" );
#endif
}
#ifdef DEBUG
if ( gDebug ) {
char buf[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr, "%s: got word: %s\n", __func__,
tileToAscii( buf, sizeof(buf), word ) );
}
#endif
}
#ifdef DEBUG
if ( gDebug ) {
fprintf( stderr, "got word: %s\n", word );
int numCommonLetters = 0;
int len = strlen( word );
if ( gCurrentWordLen < len ) {
len = gCurrentWordLen;
}
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
&& numCommonLetters < len ) {
++numCommonLetters;
}
gFirstDiff = numCommonLetters;
if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
&& !firstBeforeSecond( gCurrentWord, word ) ) {
#ifdef DEBUG
if ( gDebug ) {
char buf1[T2ABUFLEN(MAX_WORD_LEN)];
char buf2[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr,
"%s: words %s and %s are the same or out of order\n",
__func__,
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
tileToAscii( buf2, sizeof(buf2), word ) );
}
#endif
continue;
}
gCurrentWord = word;
gCurrentWordLen = strlen(word);
break;
}
int numCommonLetters = 0;
int len = strlen( word );
if ( gCurrentWordLen < len ) {
len = gCurrentWordLen;
}
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
&& numCommonLetters < len ) {
++numCommonLetters;
}
gFirstDiff = numCommonLetters;
if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
&& !firstBeforeSecond( gCurrentWord, word ) ) {
char buf1[MAX_WORD_LEN+1];
char buf2[MAX_WORD_LEN+1];
ERROR_EXIT( "words %s and %s are out of order\n",
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
tileToAscii( buf2, sizeof(buf2), word ) );
}
gCurrentWord = word;
gCurrentWordLen = strlen(word);
#ifdef DEBUG
if ( gDebug ) {
char buf[MAX_WORD_LEN+1];
char buf[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr, "gCurrentWord now %s\n",
tileToAscii( buf, sizeof(buf), gCurrentWord) );
}
@ -450,47 +468,77 @@ readOneWord( char* wordBuf, int bufLen, int* lenp, bool* gotEOF )
bool dropWord = false;
bool done = false;
// for each byte
// for each byte, append to an internal buffer up to size limit.
// On reaching an end-of-word or EOF, check if the word formed is
// within the length range and contains no unknown chars. If yes,
// return it. If no, start over ONLY IF the terminator was not
// EOF.
for ( ; ; ) {
int byt = getc( gInFile );
// EOF is special: we don't try for another word even if
// dropWord is true; we must leave now.
if ( byt == EOF || byt == gTermChar ) {
*gotEOF = byt == EOF;
bool isEOF = byt == EOF;
*gotEOF = isEOF;
if ( !dropWord || *gotEOF ) {
if ( count != 0 ) {
wordBuf[count] = '\0';
result = wordBuf;
*lenp = count;
++gWordCount;
}
break; // we've finished a word
} else if ( *gotEOF ) {
assert( isEOF || count < bufLen );
if ( !dropWord && (count >= gLimLow) && (count <= gLimHigh) ) {
assert( count < bufLen );
wordBuf[count] = '\0';
result = wordBuf;
*lenp = count;
++gWordCount;
break;
} else if ( isEOF ) {
assert( !result );
break;
}
#ifdef DEBUG
if ( gDebug ) {
char buf[T2ABUFLEN(count)];
wordBuf[count] = '\0';
fprintf( stderr, "%s: dropping word (len=%d): %s\n", __func__,
count, tileToAscii( buf, sizeof(buf), wordBuf ) );
}
#endif
count = 0; // we'll start over
dropWord = false;
} else if ( count >= bufLen ) {
// Just drop it...
dropWord = true;
// Don't call into the hashtable twice here!!
} else if ( gTableHash.find(byt) != gTableHash.end() ) {
if ( !dropWord ) {
wordBuf[count++] = (char)gTableHash[byt];
if ( count >= bufLen ) {
char buf[MAX_WORD_LEN+1];
ERROR_EXIT( "no space for word %d (starting \"%s\")",
gWordCount,
tileToAscii( buf, sizeof(buf), wordBuf ));
}
assert( count < bufLen );
wordBuf[count++] = (char)gTableHash[byt];
if ( count >= bufLen ) {
char buf[T2ABUFLEN(count)];
ERROR_EXIT( "no space for word %d (starting \"%s\")",
gWordCount,
tileToAscii( buf, sizeof(buf), wordBuf ));
}
} else if ( gKillIfMissing || !dropWord ) {
char buf[T2ABUFLEN(count)];
wordBuf[count] = '\0';
tileToAscii( buf, sizeof(buf), wordBuf );
if ( gKillIfMissing ) {
ERROR_EXIT( "chr %c (%d) not in map file %s\n"
"last word was %s\n",
(char)byt, (int)byt, gTableFile, buf );
} else if ( !dropWord ) {
#ifdef DEBUG
if ( gDebug ) {
fprintf( stderr, "%s: chr %c (%d) not in map file %s\n"
"dropping partial word %s\n", __func__,
(char)byt, (int)byt, gTableFile, buf );
}
#endif
dropWord = true;
}
} else if ( gKillIfMissing ) {
char buf[MAX_WORD_LEN+1];
ERROR_EXIT( "chr %c (%d) not in map file %s\n"
"last word was %s\n",
byt, (int)byt, gTableFile,
tileToAscii( buf, sizeof(buf), wordBuf ) );
} else {
dropWord = true;
count = 0; // lose anything we already have
}
}
@ -511,40 +559,55 @@ readFromFile( void )
int len;
gDone = s_eof;
if ( !gDone ) {
word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof );
gDone = NULL == word;
}
if ( gDone ) {
word = "";
len = 0;
}
// Repeat until we get a new word that's not "out-of-order". When
// we see this the problem isn't failure to sort, it's duplicates.
// So dropping is ok. The alternative would be detecting dupes
// during the sort. This seems easier.
for ( ; ; ) {
if ( !gDone ) {
word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof );
gDone = NULL == word;
}
if ( gDone ) {
word = "";
len = 0;
}
int numCommonLetters = 0;
if ( gCurrentWordLen < len ) {
len = gCurrentWordLen;
}
int numCommonLetters = 0;
if ( gCurrentWordLen < len ) {
len = gCurrentWordLen;
}
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
&& numCommonLetters < len ) {
++numCommonLetters;
}
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
&& numCommonLetters < len ) {
++numCommonLetters;
}
gFirstDiff = numCommonLetters;
if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
&& !firstBeforeSecond( gCurrentWord, word ) ) {
char buf1[MAX_WORD_LEN+1];
char buf2[MAX_WORD_LEN+1];
ERROR_EXIT( "words %s and %s are out of order\n",
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
tileToAscii( buf2, sizeof(buf2), word ) );
gFirstDiff = numCommonLetters;
if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
&& !firstBeforeSecond( gCurrentWord, word ) ) {
#ifdef DEBUG
if ( gDebug ) {
char buf1[T2ABUFLEN(MAX_WORD_LEN)];
char buf2[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr,
"%s: words %s and %s are the same or out of order\n",
__func__,
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
tileToAscii( buf2, sizeof(buf2), word ) );
}
#endif
continue;
}
break;
}
gCurrentWordLen = strlen(word);
strncpy( gCurrentWordBuf, word, sizeof(gCurrentWordBuf) );
#ifdef DEBUG
if ( gDebug ) {
char buf[MAX_WORD_LEN+1];
char buf[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr, "gCurrentWord now %s\n",
tileToAscii( buf, sizeof(buf), gCurrentWord) );
}
@ -561,17 +624,26 @@ firstBeforeSecond( const char* lhs, const char* rhs )
static char*
tileToAscii( char* out, int outSize, const char* in )
{
char tiles[outSize];
int tilesLen = 1;
tiles[0] = '[';
char* orig = out;
for ( ; ; ) {
char ch = *in++;
if ( '\0' == ch ) {
*out = '\0';
break;
}
assert( ch < gRevMap.size() );
*out++ = gRevMap[ch];
tilesLen += sprintf( &tiles[tilesLen], "%d,", ch );
assert( (out - orig) < outSize );
}
tiles[tilesLen] = ']';
tiles[tilesLen+1] = '\0';
strcpy( out, tiles );
return orig;
}
@ -636,7 +708,7 @@ printWords( std::vector<char*>* strings )
{
std::vector<char*>::iterator iter = strings->begin();
while ( iter != strings->end() ) {
char buf[MAX_WORD_LEN+1];
char buf[T2ABUFLEN(MAX_WORD_LEN)];
tileToAscii( buf, sizeof(buf), *iter );
fprintf( stderr, "%s\n", buf );
++iter;
@ -760,6 +832,9 @@ makeTableHash( void )
if ( NULL == TABLEFILE ) {
ERROR_EXIT( "unable to open %s\n", gTableFile );
}
// Fill the 0th space since references are one-based
gRevMap.push_back(0);
for ( ii = 0; ; ++ii ) {
int ch = getc(TABLEFILE);
@ -817,7 +892,6 @@ emitNodes( unsigned int nBytesPerOutfile, const char* outFileBase )
gNBytesPerNode = 3;
} else {
if ( gBlankIndex == 32 ) { // blank
fprintf( stderr, "blank's at 32; 3-byte-nodes still ok\n" );
gNBytesPerNode = 3;
} else {
ERROR_EXIT( "move blank to last position in info.txt "
@ -994,8 +1068,10 @@ usage( const char* name )
{
fprintf( stderr, "usage: %s \n"
"\t[-v] (print version and exit)\n"
"\t[-poolsize] (print size of hardcoded pool and exit)\n"
"\t[-poolsize] (print hardcoded size of pool and exit)\n"
"\t[-b bytesPerFile] (default = 0xFFFFFFFF)\n"
"\t[-min <num in 0..15>]\n"
"\t[-max <num in 0..15>]\n"
"\t-m mapFile\n"
"\t-mn mapFile (unicode)\n"
"\t-ob outFileBase\n"
@ -1048,6 +1124,10 @@ parseARGV( int argc, char** argv, const char** inFileName )
} else if ( 0 == strcmp( arg, "-mn" ) ) {
gTableFile = argv[index++];
gUseUnicode = true;
} else if ( 0 == strcmp( arg, "-min" ) ) {
gLimLow = atoi(argv[index++]);
} else if ( 0 == strcmp( arg, "-max" ) ) {
gLimHigh = atoi(argv[index++]);
} else if ( 0 == strcmp( arg, "-m" ) ) {
gTableFile = argv[index++];
} else if ( 0 == strcmp( arg, "-ob" ) ) {
@ -1079,17 +1159,25 @@ parseARGV( int argc, char** argv, const char** inFileName )
gDebug = true;
#endif
} else {
ERROR_EXIT( "unexpected arg %s", arg );
ERROR_EXIT( "%s: unexpected arg %s", __func__, arg );
}
}
if ( gLimHigh > MAX_WORD_LEN || gLimLow > MAX_WORD_LEN ) {
usage( argv[0] );
exit(1);
}
#ifdef DEBUG
if ( gDebug ) {
fprintf( stderr, "gNBytesPerOutfile=$gNBytesPerOutfile\n" );
fprintf( stderr, "gTableFile=$gTableFile\n" );
fprintf( stderr, "gOutFileBase=$gOutFileBase\n" );
fprintf( stderr, "gStartNodeOut=$gStartNodeOut\n" );
fprintf( stderr, "gNBytesPerOutfile=%d\n", gNBytesPerOutfile );
fprintf( stderr, "gTableFile=%s\n", gTableFile );
fprintf( stderr, "gOutFileBase=%s\n", gOutFileBase );
fprintf( stderr, "gStartNodeOut=%s\n", gStartNodeOut );
fprintf( stderr, "gTermChar=%c(%d)\n", gTermChar, (int)gTermChar );
fprintf( stderr, "gFileSize=%d\n", gFileSize );
fprintf( stderr, "gLimLow=%d\n", gLimLow );
fprintf( stderr, "gLimHigh=%d\n", gLimHigh );
}
#endif
return gTableFile;

View file

@ -117,7 +117,7 @@ sub WriteMapFile($$$) {
} elsif ( $str =~ /(\d+)/ ) {
print $fhr pack( $packStr, $1 );
} else {
die "WriteMapFile: unrecognized face format $str";
die "WriteMapFile: unrecognized face format $str, elem $i";
}
}
} # WriteMapFile