From 3bb2fb018f9d4e97c0a532761adb472ec549c409 Mon Sep 17 00:00:00 2001
From: ehouse <ehouse@0782aaa5-4710-0410-8820-a96bf9123855>
Date: Sat, 17 Feb 2007 17:06:05 +0000
Subject: [PATCH] Add support for Russian.  So that Russian text can be
 processed on systems without setting LANG=ru_RU.CP1251, modify dict2dawg to
 skip duplicates and words outside of specified lengths.  Modify all info.txt
 files for the new scheme (which includes a change to byod.cgi, which is not
 kept on SourceForge).

---
 dawg/Danish/info.txt             |  14 +-
 dawg/Dutch/info.txt              |  15 +-
 dawg/English/Makefile.BasEnglish |   4 +-
 dawg/English/info.txt            |  12 +-
 dawg/French/info.txt             |  14 +-
 dawg/German/info.txt             |  17 +-
 dawg/Italian/info.txt            |  14 +-
 dawg/Makefile.langcommon         |   9 +-
 dawg/Portuguese/info.txt         |  13 +-
 dawg/Russian/Makefile            |  41 +++++
 dawg/Russian/info.txt            |  76 ++++++++
 dawg/Spanish/info.txt            |  21 +--
 dawg/Swedish/info.txt            |   9 +-
 dawg/dict2dawg.cpp               | 286 ++++++++++++++++++++-----------
 dawg/xloc.pm                     |   2 +-
 15 files changed, 366 insertions(+), 181 deletions(-)
 create mode 100644 dawg/Russian/Makefile
 create mode 100644 dawg/Russian/info.txt
diff --git a/dawg/Danish/info.txt b/dawg/Danish/info.txt
index ae6d6b5b1..7e8353249 100644
--- a/dawg/Danish/info.txt
+++ b/dawg/Danish/info.txt
@@ -18,23 +18,21 @@ LANGCODE:da_DK
 
 
 # deal with DOS files
-LANGFILTER_PRECLIP: tr -d '\r' |
-
+LANGFILTER: tr -d '\r'
 # uppercase all
-LANGFILTER_POSTCLIP: | tr [a-zåæø] [A-ZÅÆØ]
+LANGFILTER: | tr [a-zåæø] [A-ZÅÆØ]
 # no words not containing a vowel
-LANGFILTER_POSTCLIP: | grep '[AEIOUYÅÆØ]' 
+LANGFILTER: | grep '[AEIOUYÅÆØ]' 
 # none with illegal chars
-LANGFILTER_POSTCLIP: | grep '^[A-PR-VX-ZÅÆØ]\+$'
+LANGFILTER: | grep '^[A-PR-VX-ZÅÆØ]\+$'
 # remove duplicates
-LANGFILTER_POSTCLIP: | sort -u
-LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
+LANGFILTER: | sort -u
 
 # Until I can figure out how to force sort to use a locale's collation
 # rules we can't trust sort in the filtering rules above and so must
 # leave the sorting work to dict2dawg.pl.
 
-NEEDSSORT:true
+D2DARGS: -r -term 10
 
 LANGINFO: <p>Danish uses all English letters except Q and W.  There
 LANGINFO: are three non-English letters: 'Å', 'Æ' and 'Ø'.  </p>
diff --git a/dawg/Dutch/info.txt b/dawg/Dutch/info.txt
index 27e7a2679..6a4e03c12 100644
--- a/dawg/Dutch/info.txt
+++ b/dawg/Dutch/info.txt
@@ -18,22 +18,19 @@ LANGCODE:nl_NL
 
 
 # deal with DOS files
-LANGFILTER_PRECLIP: tr -d '\r' |
-
+LANGFILTER: tr -d '\r'
 # uppercase all
-LANGFILTER_POSTCLIP: | tr [a-z] [A-Z]
+LANGFILTER: | tr [a-z] [A-Z]
 # no words not containing a vowel
-LANGFILTER_POSTCLIP: | grep '[AEIOU]' 
+LANGFILTER: | grep '[AEIOU]' 
 # none with illegal chars
-LANGFILTER_POSTCLIP: | grep '^[A-Z]\+$'
-LANGFILTER_POSTCLIP: | sort -u
-LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
+LANGFILTER: | grep '^[A-Z]\+$'
+LANGFILTER: | sort -u
 
 # Until I can figure out how to force sort to use a locale's collation
 # rules we can't trust sort in the filtering rules above and so must
 # leave the sorting work to dict2dawg.pl.
-
-NEEDSSORT:false
+D2DARGS: -r -term 10
 
 LANGINFO: <p>Dutch has the same 26 letters as English, though of
 LANGINFO: course the counts and values are different.  Filtering rules
diff --git a/dawg/English/Makefile.BasEnglish b/dawg/English/Makefile.BasEnglish
index 89f1567ec..03019e7a2 100644
--- a/dawg/English/Makefile.BasEnglish
+++ b/dawg/English/Makefile.BasEnglish
@@ -1,4 +1,4 @@
-# -*-mode: Makefile -*-
+# -*-mode: Makefile; compile-command: "make -f Makefile.BasEnglish"; -*-
 # Copyright 2002 by Eric House (xwords@eehouse.org).  All rights reserved.
 #
 # This program is free software; you can redistribute it and/or
@@ -17,7 +17,7 @@
 
 XWLANG=BasEnglish
 LANGCODE=en_US
-#NEWDAWG=1
+DICT2DAWGARGS = -r -nosort
 
 TARGET_TYPE ?= PALM
 
diff --git a/dawg/English/info.txt b/dawg/English/info.txt
index a8eeb5128..70f8d8a49 100644
--- a/dawg/English/info.txt
+++ b/dawg/English/info.txt
@@ -17,16 +17,16 @@
 LANGCODE:en_US
 
 # deal with DOS files
-LANGFILTER_PRECLIP: tr -d '\r' |
-
-LANGFILTER_POSTCLIP: | tr [a-z] [A-Z]
-LANGFILTER_POSTCLIP: | grep '^[A-Z]*$'
-LANGFILTER_POSTCLIP: | sort -u
+LANGFILTER: tr -d '\r'
+LANGFILTER: | tr [a-z] [A-Z]
+LANGFILTER: | grep '^[A-Z]*$'
+LANGFILTER: | sort -u
 
 # We can trust sort (above) to do the right thing since there's no
 # high ascii.  dict2dawg.pl is much faster if I can trust that its
 # input is in sorted order.
-NEEDSSORT:false
+D2DARGS: -nosort -term 10
+
 
 LANGINFO: <p>English dictionaries can contain words with any of the 26
 LANGINFO: letters you think of as making up the alphabet: A-Z.  At
diff --git a/dawg/French/info.txt b/dawg/French/info.txt
index b961cdd62..00c5bfc0f 100755
--- a/dawg/French/info.txt
+++ b/dawg/French/info.txt
@@ -17,16 +17,14 @@
 LANGCODE:fr_FR
 
 # deal with DOS files
-LANGFILTER_PRECLIP: tr -d '\r' |
+LANGFILTER: tr -d '\r'
 
-LANGFILTER_POSTCLIP: | tr [a-z] [A-Z]
-LANGFILTER_POSTCLIP: | grep '^[A-Z]*$'
-LANGFILTER_POSTCLIP: | tr '\n' '\000'
-LANGFILTER_POSTCLIP: | sort -u -z
-
-
-NEEDSSORT:false
+LANGFILTER: | tr [a-z] [A-Z]
+LANGFILTER: | grep '^[A-Z]*$'
+LANGFILTER: | tr '\n' '\000'
+LANGFILTER: | sort -u -z
 
+D2DARGS: -r -nosort -term 0
 
 LANGINFO: <p>At this point French is getting treated the same as
 LANGINFO: English.  But I think I should be transforming accented
diff --git a/dawg/German/info.txt b/dawg/German/info.txt
index 7c21c94ad..f6321981d 100644
--- a/dawg/German/info.txt
+++ b/dawg/German/info.txt
@@ -16,26 +16,21 @@
 
 LANGCODE:de_DE
 
-
 # deal with DOS files
-LANGFILTER_PRECLIP: tr -d '\r' |
-
+LANGFILTER: tr -d '\r'
 # substitute for sharfes-s
-LANGFILTER_PRECLIP: sed -e 's/ß/SS/g' |
-
+LANGFILTER: | sed -e 's/ß/SS/g'
 # uppercase all
-LANGFILTER_POSTCLIP: | tr [a-zäöü] [A-ZÄÖÜ]
+LANGFILTER: | tr [a-zäöü] [A-ZÄÖÜ]
 # no words not containing a vowel
-LANGFILTER_POSTCLIP: | grep '[AEIOUÄÖÜ]' 
+LANGFILTER: | grep '[AEIOUÄÖÜ]' 
 # none with illegal chars
-LANGFILTER_POSTCLIP: | grep '^[A-ZÄÖÜ]\+$'
-LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
+LANGFILTER: | grep '^[A-ZÄÖÜ]\+$'
 
 # Until I can figure out how to force sort to use a locale's collation
 # rules we can't trust sort in the filtering rules above and so must
 # leave the sorting work to dict2dawg.pl.
-
-NEEDSSORT:true
+D2DARGS: -r -term 10
 
 LANGINFO: <p>German has the 26 English letters plus the three umlaut
 LANGINFO: vowels.  Scharfes-s is not a legal tile, but if present in
diff --git a/dawg/Italian/info.txt b/dawg/Italian/info.txt
index 001bf6130..8b60c6478 100755
--- a/dawg/Italian/info.txt
+++ b/dawg/Italian/info.txt
@@ -18,16 +18,12 @@
 LANGCODE:it_IT
 
 # deal with DOS files
-LANGFILTER_PRECLIP: tr -d '\r' |
-
-LANGFILTER_POSTCLIP: | tr [a-z] [A-Z]
-LANGFILTER_POSTCLIP: | grep '^[A-IL-VZ]*$'
-LANGFILTER_POSTCLIP: | tr '\n' '\000'
-LANGFILTER_POSTCLIP: | sort -u -z
-
-
-NEEDSSORT:false
+LANGFILTER: tr -d '\r'
+LANGFILTER: | tr [a-z] [A-Z]
+LANGFILTER: | grep '^[A-IL-VZ]*$'
+LANGFILTER: | sort -u
 
+D2DARGS: -r -term 10 -nosort
 
 LANGINFO: <p>Italian is treated the same as English but for
 LANGINFO: missing letters J, K, W, X and Y.</p>
diff --git a/dawg/Makefile.langcommon b/dawg/Makefile.langcommon
index 5a1e1ffeb..767d58a8d 100644
--- a/dawg/Makefile.langcommon
+++ b/dawg/Makefile.langcommon
@@ -197,7 +197,7 @@ frankspecials.bin: ../frank_mkspecials.pl  $(BMPFILES)
 # a binary file (one byte) giving the number of tiles in the dict
 charcount.bin: table.bin
 ifdef NEWDAWG
-	siz=$$(wc $< | awk '{print $$3}'); \
+	siz=$$(ls -l $< | awk '{print $$5}'); \
 	perl -e "print pack(\"c\",$$siz/2)" > $@
 else
 	siz=$$(wc -c $< | sed -e 's/$<//'); \
@@ -240,11 +240,10 @@ dawg$(XWLANG)%.stamp: $(XWLANG)Main.dict.gz $(DICT2DAWG) table.bin ../Makefile.l
 	start=$$(echo $@ | sed -e 's/dawg$(XWLANG)\([0-9]*\)to[0-9]*.stamp/\1/'); \
 	end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \
 	echo $${start} and $$end; \
-	zcat $< | grep "^.\{$${start},$${end}\}$$" | \
-		sort -u | $(DICT2DAWG) $(TABLE_ARG) table.bin -b 28000 \
+	zcat $< | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \
 		-ob dawg$(XWLANG)$* \
-		-sn $(XWLANG)StartLoc.bin -k -term 10 -wc $(XWLANG)$*_wordcount.bin \
-		$(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
+		-sn $(XWLANG)StartLoc.bin -min $$start -max $$end \
+		-wc $(XWLANG)$*_wordcount.bin $(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
 	touch $@
 
 $(XWLANG)%_wordcount.bin: dawg$(XWLANG)%.stamp
diff --git a/dawg/Portuguese/info.txt b/dawg/Portuguese/info.txt
index 13967f137..1c52afbb5 100644
--- a/dawg/Portuguese/info.txt
+++ b/dawg/Portuguese/info.txt
@@ -17,22 +17,19 @@
 LANGCODE:pt_PT
 
 # deal with DOS files
-LANGFILTER_PRECLIP: tr -d '\r' |
-
+LANGFILTER: tr -d '\r'
 # uppercase all
-LANGFILTER_POSTCLIP: | tr [a-zç] [A-ZÇ]
+LANGFILTER: | tr [a-zç] [A-ZÇ]
 # no words not containing a vowel
-LANGFILTER_POSTCLIP: | grep '[AEIOU]' 
+LANGFILTER: | grep '[AEIOU]' 
 # none with illegal chars
-LANGFILTER_POSTCLIP: | grep '^[A-JL-VXZÇ]\+$'
-LANGFILTER_POSTCLIP: | sort -u 
-LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
+LANGFILTER: | grep '^[A-JL-VXZÇ]\+$'
 
 # Until I can figure out how to force sort to use a locale's collation
 # rules we can't trust sort in the filtering rules above and so must
 # leave the sorting work to dict2dawg.pl.
+D2DARGS: -r -term 10
 
-NEEDSSORT:true
 
 LANGINFO: <p>Portugese uses the letter A-Z, excluding K, W and Y, and adds
 LANGINFO: Ç.  Words containing any other letters are dropped. </p>
diff --git a/dawg/Russian/Makefile b/dawg/Russian/Makefile
new file mode 100644
index 000000000..e2d5ee127
--- /dev/null
+++ b/dawg/Russian/Makefile
@@ -0,0 +1,41 @@
+# -*- mode: makefile -*-
+# Copyright 2002-2007 by Eric House (xwords@eehouse.org).  All rights reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+XWLANG=Russian
+LANGCODE=ru_RU
+DICT2DAWGARGS = -r
+
+TARGET_TYPE ?= WINCE
+
+include ../Makefile.2to8
+
+include ../Makefile.langcommon
+
+SOURCEDICT ?= $(XWDICTPATH)/$(XWLANG)/RU5000.txt.gz
+
+# Strip DOS CRs and uppercase every lowercase letter of the source wordlist.
+$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile
+	zcat $< | tr -d '\r' | \
+	tr [àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ] [ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß] | \
+	gzip -c > $@
+
+# Everything except the creation of the Main.dict file is inherited from
+# the "parent" Makefile.langcommon in the parent directory.
+
+clean: clean_common
+	rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb
+
diff --git a/dawg/Russian/info.txt b/dawg/Russian/info.txt
new file mode 100644
index 000000000..912f508f4
--- /dev/null
+++ b/dawg/Russian/info.txt
@@ -0,0 +1,76 @@
+# Copyright 2002,2007 by Eric House (xwords@eehouse.org).  All rights
+# reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+LANGCODE:ru_RU
+CHARSET:windows-1251
+
+# deal with DOS files
+LANGFILTER: tr -d '\r'
+# uppercase all (the first set must list every lowercase letter)
+LANGFILTER: | tr [àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ] [ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß]
+# LANGFILTER: | tr -s '\n' '\000'
+
+# Note: don't turn off sorting in dict2dawg!  GNU 'sort' can't do the
+# sorting for us here unless LANG is set.
+D2DARGS: -r -term 10
+
+LANGINFO: <p>Russian wordlists must be in the Windows-1251
+LANGINFO: codepage. Lower-case letters are converted to upper case and
+LANGINFO: any words that contain letters not listed below are
+LANGINFO: removed.</p>
+
+# High bit means "official".  Next 7 bits are an enum where
+# Russian==0x0F.  Low byte is padding.
+XLOC_HEADER:0x8F00
+
+
+<BEGIN_TILES>
+8       1         'À'
+2       3         'Á'
+4       1         'Â'
+2       3         'Ã'
+2       2         'Ä'
+7       1         'Å'
+1       4         'Æ'
+1       3         'Ç'
+7       1         'È'
+1       2         'É'
+4       2         'Ê'
+4       2         'Ë'
+2       3         'Ì'
+4       1         'Í'
+9       1         'Î'
+4       2         'Ï'
+5       1         'Ð'
+5       1         'Ñ'
+7       1         'Ò'
+4       2         'Ó'
+1       5         'Ô'
+1       4         'Õ'
+1       4         'Ö'
+1       3         '×'
+1       4         'Ø'
+1       5         'Ù'
+1       10        'Ú'
+2       2         'Û'
+4       1         'Ü'
+1       8         'Ý'
+1       5         'Þ'
+2       2         'ß'
+2			0		{"_"}
+<END_TILES>
+# should ignore all after the <END_TILES> above
diff --git a/dawg/Spanish/info.txt b/dawg/Spanish/info.txt
index 8cabfa6d8..129af7d83 100644
--- a/dawg/Spanish/info.txt
+++ b/dawg/Spanish/info.txt
@@ -21,24 +21,25 @@
 NEEDSSORT:true
 
 # MSDos LF chars go bye-bye
-LANGFILTER_PRECLIP: tr -d '\r' |
+LANGFILTER: tr -d '\r'
 
 # convert accented vowels
-LANGFILTER_POSTCLIP: | tr '\207\216\222\227\234\237\226' 'aeiouu\321'
+LANGFILTER: | tr '\207\216\222\227\234\237\226' 'aeiouu\321'
 # uppercase
-LANGFILTER_POSTCLIP: | tr [a-zñ] [A-ZÑ]
+LANGFILTER: | tr [a-zñ] [A-ZÑ]
 # remove words with illegal letters
-LANGFILTER_POSTCLIP: | grep '^[[A-JL-VX-ZÑ]*$'
+LANGFILTER: | grep '^[[A-JL-VX-ZÑ]*$'
 # substitute pairs (can't figure out how to use octal values)
-LANGFILTER_POSTCLIP: | sed 's/CH/1/g'
-LANGFILTER_POSTCLIP: | sed 's/LL/2/g'
-LANGFILTER_POSTCLIP: | sed 's/RR/3/g'
+LANGFILTER: | sed 's/CH/1/g'
+LANGFILTER: | sed 's/LL/2/g'
+LANGFILTER: | sed 's/RR/3/g'
 # substitute in the octal control character values
-LANGFILTER_POSTCLIP: | tr '123' '\001\002\003'
+LANGFILTER: | tr '123' '\001\002\003'
 # now add nulls as terminators
-LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
-LANGFILTER_POSTCLIP: | sort -u -z
+LANGFILTER: | tr -s '\n' '\000'
+LANGFILTER: | sort -u -z
 
+D2DARGS: -r -term 0
 
 LANGINFO: <p>Spanish words include all letters in the English alphabet
 LANGINFO: except "K" and "W", and with "Ñ" added. Since there are no
diff --git a/dawg/Swedish/info.txt b/dawg/Swedish/info.txt
index ccd188410..161cbc060 100644
--- a/dawg/Swedish/info.txt
+++ b/dawg/Swedish/info.txt
@@ -16,12 +16,11 @@
 
 LANGCODE:sv_SE
 
+LANGFILTER: tr -d '\r'
+LANGFILTER: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ]
+LANGFILTER: | grep '^[A-ZÄÅÆÖÜ]*$'
 
-LANGFILTER_POSTCLIP: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ]
-LANGFILTER_POSTCLIP: | grep '^[A-ZÄÅÆÖÜ]*$'
-LANGFILTER_POSTCLIP: | tr '\n' '\000'
-
-NEEDSSORT:true
+D2DARGS: -r -term 10
 
 LANGINFO: <p>From an English-speaker's perspective, Swedish drops Q
 LANGINFO: and W, and adds Ä, Å, Æ, Ö and Ü.</p>
diff --git a/dawg/dict2dawg.cpp b/dawg/dict2dawg.cpp
index ba2475578..187728215 100644
--- a/dawg/dict2dawg.cpp
+++ b/dawg/dict2dawg.cpp
@@ -1,9 +1,9 @@
-/* -*- compile-command: "g++ -O -o dict2dawg dict2dawg.cpp"; -*- */
+/* -*- compile-command: "g++ -DDEBUG -O -o dict2dawg dict2dawg.cpp"; -*- */
 /*************************************************************************
  * adapted from perl code that was itself adapted from C++ code
  * Copyright (C) 2000 Falk Hueffner
 
- * This version Copyright (C) 2002,2006 Eric House (xwords@eehouse.org)
+ * This version Copyright (C) 2002,2006-2007 Eric House (xwords@eehouse.org)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -54,9 +54,11 @@ typedef unsigned int Node;
 typedef std::vector<Node> NodeList;
 typedef std::vector<char*> WordList;
 
-#define MAX_WORD_LEN 15
 #define VERSION_STR "$Rev$"
 
+#define MAX_WORD_LEN 15
+#define T2ABUFLEN(s) (((s)*4)+3)
+
 int gFirstDiff;
 
 static char gCurrentWordBuf[MAX_WORD_LEN+1] = { '\0' };
@@ -92,10 +94,12 @@ bool gForceFour = false;             // use four bytes regardless of need?
 static int gFileSize = 0;
 int gNBytesPerNode;
 bool gUseUnicode;
+int gLimLow = 2;
+int gLimHigh = MAX_WORD_LEN;
 
 
 // OWL is 1.7M
-#define MAX_POOL_SIZE (3 * 0x100000)
+#define MAX_POOL_SIZE (5 * 0x100000)
 #define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ );
 
 static char* parseARGV( int argc, char** argv, const char** inFileName );
@@ -182,7 +186,8 @@ main( int argc, char** argv )
         unsigned long be = htonl( gWordCount );
         fwrite( &be, sizeof(be), 1, OFILE );
         fclose( OFILE );
-        fprintf( stderr, "wrote out: got %d words\n", gWordCount );
+        fprintf( stderr, "Wrote %d (word count) to %s\n", gWordCount, 
+                 gCountFile );
     }
 
     if ( gOutFileBase ) {
@@ -393,49 +398,62 @@ readFromSortedArray( void )
 #endif
     }
 
-    char* word = "";
+    for ( ; ; ) {
+        char* word = "";
 
-    if ( !gDone ) {
-        gDone = gNextWordIndex == sInputStrings->size();
         if ( !gDone ) {
-            word = sInputStrings->at(gNextWordIndex++);
+            gDone = gNextWordIndex == sInputStrings->size();
+            if ( !gDone ) {
+                word = sInputStrings->at(gNextWordIndex++);
 #ifdef DEBUG
-        } else if ( gDebug ) {
-            fprintf( stderr, "gDone set to true\n" );
+            } else if ( gDebug ) {
+                fprintf( stderr, "gDone set to true\n" );
+#endif
+            }
+#ifdef DEBUG
+            if ( gDebug ) {
+                char buf[T2ABUFLEN(MAX_WORD_LEN)];
+                fprintf( stderr, "%s: got word: %s\n", __func__,
+                         tileToAscii( buf, sizeof(buf), word ) );
+            }
 #endif
         }
-#ifdef DEBUG
-        if ( gDebug ) {
-            fprintf( stderr, "got word: %s\n", word );
+        int numCommonLetters = 0;
+        int len = strlen( word );
+        if ( gCurrentWordLen < len ) {
+            len = gCurrentWordLen;
         }
+
+        while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
+                && numCommonLetters < len ) {
+            ++numCommonLetters;
+        }
+
+        gFirstDiff = numCommonLetters;
+        if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
+             && !firstBeforeSecond( gCurrentWord, word ) ) {
+#ifdef DEBUG
+            if ( gDebug ) {
+                char buf1[T2ABUFLEN(MAX_WORD_LEN)];
+                char buf2[T2ABUFLEN(MAX_WORD_LEN)];
+                fprintf( stderr,
+                         "%s: words %s and %s are the same or out of order\n",
+                         __func__, 
+                         tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
+                         tileToAscii( buf2, sizeof(buf2), word ) );
+            }
 #endif
+            continue;
+        }
+    
+        gCurrentWord = word;
+        gCurrentWordLen = strlen(word);
+        break;
     }
-    int numCommonLetters = 0;
-    int len = strlen( word );
-    if ( gCurrentWordLen < len ) {
-        len = gCurrentWordLen;
-    }
-
-    while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
-            && numCommonLetters < len ) {
-        ++numCommonLetters;
-    }
-
-    gFirstDiff = numCommonLetters;
-    if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
-         && !firstBeforeSecond( gCurrentWord, word ) ) {
-        char buf1[MAX_WORD_LEN+1];
-        char buf2[MAX_WORD_LEN+1];
-        ERROR_EXIT( "words %s and %s are out of order\n",
-                    tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
-                    tileToAscii( buf2, sizeof(buf2), word ) );
-    }
-    gCurrentWord = word;
-    gCurrentWordLen = strlen(word);
 
 #ifdef DEBUG
     if ( gDebug ) {
-        char buf[MAX_WORD_LEN+1];
+        char buf[T2ABUFLEN(MAX_WORD_LEN)];
         fprintf( stderr, "gCurrentWord now %s\n", 
                  tileToAscii( buf, sizeof(buf), gCurrentWord) );
     }
@@ -450,47 +468,77 @@ readOneWord( char* wordBuf, int bufLen, int* lenp, bool* gotEOF )
     bool dropWord = false;
     bool done = false;
 
-    // for each byte
+    // for each byte, append to an internal buffer up to size limit.
+    // On reaching an end-of-word or EOF, check if the word formed is
+    // within the length range and contains no unknown chars.  If yes,
+    // return it.  If no, start over ONLY IF the terminator was not
+    // EOF.
     for ( ; ; ) {
         int byt = getc( gInFile );
 
         // EOF is special: we don't try for another word even if
         // dropWord is true; we must leave now.
         if ( byt == EOF || byt == gTermChar ) {
-            *gotEOF = byt == EOF;
+            bool isEOF = byt == EOF;
+            *gotEOF = isEOF;
 
-            if ( !dropWord || *gotEOF ) {
-                if ( count != 0 ) {
-                    wordBuf[count] = '\0';
-                    result = wordBuf;
-                    *lenp = count;
-                    ++gWordCount;
-                }
-                break;          // we've finished a word
-            } else if ( *gotEOF ) {
+            assert( isEOF || count < bufLen );
+            if ( !dropWord && (count >= gLimLow) && (count <= gLimHigh) ) {
+                assert( count < bufLen );
+                wordBuf[count] = '\0';
+                result = wordBuf;
+                *lenp = count;
+                ++gWordCount;
                 break;
+            } else if ( isEOF ) {
+                assert( !result );
+                break;
+            } 
+#ifdef DEBUG
+            if ( gDebug ) {
+                char buf[T2ABUFLEN(count)];
+                wordBuf[count] = '\0';
+                fprintf( stderr, "%s: dropping word (len=%d): %s\n", __func__,
+                         count, tileToAscii( buf, sizeof(buf), wordBuf ) );
             }
+#endif
+            count = 0;  // we'll start over
+            dropWord = false;
+
+        } else if ( count >= bufLen ) {
+            // Just drop it...
+            dropWord = true;
 
             // Don't call into the hashtable twice here!!
         } else if ( gTableHash.find(byt) != gTableHash.end() ) {
-            if ( !dropWord ) {
-                wordBuf[count++] = (char)gTableHash[byt];
-                if ( count >= bufLen ) {
-                    char buf[MAX_WORD_LEN+1];
-                    ERROR_EXIT( "no space for word %d (starting \"%s\")", 
-                                gWordCount, 
-                                tileToAscii( buf, sizeof(buf), wordBuf ));
-                }
+            assert( count < bufLen );
+            wordBuf[count++] = (char)gTableHash[byt];
+            if ( count >= bufLen ) {
+                char buf[T2ABUFLEN(count)];
+                ERROR_EXIT( "no space for word %d (starting \"%s\")", 
+                            gWordCount, 
+                            tileToAscii( buf, sizeof(buf), wordBuf ));
+            }
+        } else if ( gKillIfMissing || !dropWord ) {
+            char buf[T2ABUFLEN(count)];
+            wordBuf[count] = '\0';
+
+            tileToAscii( buf, sizeof(buf), wordBuf );
+
+            if ( gKillIfMissing ) {
+                ERROR_EXIT( "chr %c (%d) not in map file %s\n"
+                            "last word was %s\n",
+                            (char)byt, (int)byt, gTableFile, buf );
+            } else if ( !dropWord ) {
+#ifdef DEBUG
+                if ( gDebug ) {
+                    fprintf( stderr, "%s: chr %c (%d) not in map file %s\n"
+                             "dropping partial word %s\n", __func__,
+                             (char)byt, (int)byt, gTableFile, buf );
+                }
+#endif
+                dropWord = true;
             }
-        } else if ( gKillIfMissing ) {
-            char buf[MAX_WORD_LEN+1];
-            ERROR_EXIT( "chr %c (%d) not in map file %s\n"
-                        "last word was %s\n",
-                        byt, (int)byt, gTableFile, 
-                        tileToAscii( buf, sizeof(buf), wordBuf ) );
-        } else {
-            dropWord = true;
-            count = 0;     // lose anything we already have
         }
     }
 
@@ -511,40 +559,55 @@ readFromFile( void )
     int len;
 
     gDone = s_eof;
-    if ( !gDone ) {
-        word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof );
-        gDone = NULL == word;
-    }
-    if ( gDone ) {
-        word = "";
-        len = 0;
-    }
+    
+    // Repeat until we get a new word that's not "out-of-order".  When
+    // we see this the problem isn't failure to sort, it's duplicates.
+    // So dropping is ok.  The alternative would be detecting dupes
+    // during the sort.  This seems easier.
+    for ( ; ; ) {
+        if ( !gDone ) {
+            word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof );
+            gDone = NULL == word;
+        }
+        if ( gDone ) {
+            word = "";
+            len = 0;
+        }
 
-    int numCommonLetters = 0;
-    if ( gCurrentWordLen < len ) {
-        len = gCurrentWordLen;
-    }
+        int numCommonLetters = 0;
+        if ( gCurrentWordLen < len ) {
+            len = gCurrentWordLen;
+        }
 
-    while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
-            && numCommonLetters < len ) {
-        ++numCommonLetters;
-    }
+        while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
+                && numCommonLetters < len ) {
+            ++numCommonLetters;
+        }
 
-    gFirstDiff = numCommonLetters;
-    if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
-         && !firstBeforeSecond( gCurrentWord, word ) ) {
-        char buf1[MAX_WORD_LEN+1];
-        char buf2[MAX_WORD_LEN+1];
-        ERROR_EXIT( "words %s and %s are out of order\n",
-                    tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
-                    tileToAscii( buf2, sizeof(buf2), word ) );
+        gFirstDiff = numCommonLetters;
+        if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
+             && !firstBeforeSecond( gCurrentWord, word ) ) {
+#ifdef DEBUG
+            if ( gDebug ) {
+                char buf1[T2ABUFLEN(MAX_WORD_LEN)];
+                char buf2[T2ABUFLEN(MAX_WORD_LEN)];
+                fprintf( stderr,
+                         "%s: words %s and %s are the smae or out of order\n",
+                         __func__, 
+                         tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
+                         tileToAscii( buf2, sizeof(buf2), word ) );
+            }
+#endif
+            continue;
+        }
+        break;
     }
     gCurrentWordLen = strlen(word);
     strncpy( gCurrentWordBuf, word, sizeof(gCurrentWordBuf) );
 
 #ifdef DEBUG
     if ( gDebug ) {
-        char buf[MAX_WORD_LEN+1];
+        char buf[T2ABUFLEN(MAX_WORD_LEN)];
         fprintf( stderr, "gCurrentWord now %s\n", 
                  tileToAscii( buf, sizeof(buf), gCurrentWord) );
     }
@@ -561,17 +624,26 @@ firstBeforeSecond( const char* lhs, const char* rhs )
 static char*
 tileToAscii( char* out, int outSize, const char* in )
 {
+    char tiles[outSize];
+    int tilesLen = 1;
+    tiles[0] = '[';
+
     char* orig = out;
     for ( ; ; ) {
         char ch = *in++;
         if ( '\0' == ch ) {
-            *out = '\0';
             break;
         }
         assert( ch < gRevMap.size() );
         *out++ = gRevMap[ch];
+        tilesLen += sprintf( &tiles[tilesLen], "%d,", ch );
         assert( (out - orig) < outSize );
     }
+
+    tiles[tilesLen] = ']';
+    tiles[tilesLen+1] = '\0';
+    strcpy( out, tiles );
+
     return orig;
 }
 
@@ -636,7 +708,7 @@ printWords( std::vector<char*>* strings )
 {
     std::vector<char*>::iterator iter = strings->begin();
     while ( iter != strings->end() ) {
-        char buf[MAX_WORD_LEN+1];
+        char buf[T2ABUFLEN(MAX_WORD_LEN)];
         tileToAscii( buf, sizeof(buf), *iter );
         fprintf( stderr, "%s\n", buf );
         ++iter;
@@ -760,6 +832,9 @@ makeTableHash( void )
     if ( NULL == TABLEFILE ) {
         ERROR_EXIT( "unable to open %s\n", gTableFile );
     }
+    
+    // Fill the 0th space since references are one-based
+    gRevMap.push_back(0);
 
     for ( ii = 0; ; ++ii ) {
         int ch = getc(TABLEFILE);
@@ -817,7 +892,6 @@ emitNodes( unsigned int nBytesPerOutfile, const char* outFileBase )
         gNBytesPerNode = 3;
     } else {
         if ( gBlankIndex == 32 ) { // blank
-            fprintf( stderr, "blank's at 32; 3-byte-nodes still ok\n" );
             gNBytesPerNode = 3;
         } else {
             ERROR_EXIT( "move blank to last position in info.txt "
@@ -994,8 +1068,10 @@ usage( const char* name )
 {
     fprintf( stderr, "usage: %s \n"
              "\t[-v]                 (print version and exit)\n"
-             "\t[-poolsize]          (print size of hardcoded pool and exit)\n"
+             "\t[-poolsize]          (print hardcoded size of pool and exit)\n"
              "\t[-b    bytesPerFile] (default = 0xFFFFFFFF)\n"
+             "\t[-min   <num in 0..15>]\n"
+             "\t[-max   <num in 0..15>]\n"
              "\t-m     mapFile\n"
              "\t-mn    mapFile (unicode)\n"
              "\t-ob    outFileBase\n"
@@ -1048,6 +1124,10 @@ parseARGV( int argc, char** argv, const char** inFileName )
         } else if ( 0 == strcmp( arg, "-mn" ) ) {
             gTableFile = argv[index++];
             gUseUnicode = true;
+        } else if ( 0 == strcmp( arg, "-min" ) ) {
+            gLimLow = atoi(argv[index++]);
+        } else if ( 0 == strcmp( arg, "-max" ) ) {
+            gLimHigh = atoi(argv[index++]);
         } else if ( 0 == strcmp( arg, "-m" ) ) {
             gTableFile = argv[index++];
         } else if ( 0 == strcmp( arg, "-ob" ) ) {
@@ -1079,17 +1159,25 @@ parseARGV( int argc, char** argv, const char** inFileName )
             gDebug = true;
 #endif
         } else {
-            ERROR_EXIT( "unexpected arg %s", arg );
+            ERROR_EXIT( "%s: unexpected arg %s", __func__, arg );
         }
     }
 
+    if ( gLimHigh > MAX_WORD_LEN || gLimLow > MAX_WORD_LEN ) {
+        usage( argv[0] );
+        exit(1);
+    }
+
 #ifdef DEBUG
     if ( gDebug ) {
-        fprintf( stderr, "gNBytesPerOutfile=$gNBytesPerOutfile\n" );
-        fprintf( stderr, "gTableFile=$gTableFile\n" );
-        fprintf( stderr, "gOutFileBase=$gOutFileBase\n" );
-        fprintf( stderr, "gStartNodeOut=$gStartNodeOut\n" );
+        fprintf( stderr, "gNBytesPerOutfile=%d\n", gNBytesPerOutfile );
+        fprintf( stderr, "gTableFile=%s\n", gTableFile );
+        fprintf( stderr, "gOutFileBase=%s\n", gOutFileBase );
+        fprintf( stderr, "gStartNodeOut=%s\n", gStartNodeOut );
         fprintf( stderr, "gTermChar=%c(%d)\n", gTermChar, (int)gTermChar );
+        fprintf( stderr, "gFileSize=%d\n", gFileSize );
+        fprintf( stderr, "gLimLow=%d\n", gLimLow );
+        fprintf( stderr, "gLimHigh=%d\n", gLimHigh );
     }
 #endif
     return gTableFile;
diff --git a/dawg/xloc.pm b/dawg/xloc.pm
index ad5c0fa8c..4aefe7440 100644
--- a/dawg/xloc.pm
+++ b/dawg/xloc.pm
@@ -117,7 +117,7 @@ sub WriteMapFile($$$) {
         } elsif ( $str =~ /(\d+)/ ) {
             print $fhr pack( $packStr, $1 );
         } else {
-            die "WriteMapFile: unrecognized face format $str";
+            die "WriteMapFile: unrecognized face format $str, elem $i";
         }
     }
 } # WriteMapFile