From 3bb2fb018f9d4e97c0a532761adb472ec549c409 Mon Sep 17 00:00:00 2001 From: ehouse Date: Sat, 17 Feb 2007 17:06:05 +0000 Subject: [PATCH] Add support for Russian. So that Russian text can be processed on systems without setting LANG=ru_RU.CP1251, modify dict2dawg to skip duplicates and words outside of specified lengths. Modify all info.txt files for the new scheme (which includes change to byod.cgi not kept on sourceforge.) --- dawg/Danish/info.txt | 14 +- dawg/Dutch/info.txt | 15 +- dawg/English/Makefile.BasEnglish | 4 +- dawg/English/info.txt | 12 +- dawg/French/info.txt | 14 +- dawg/German/info.txt | 17 +- dawg/Italian/info.txt | 14 +- dawg/Makefile.langcommon | 9 +- dawg/Portuguese/info.txt | 13 +- dawg/Russian/Makefile | 41 +++++ dawg/Russian/info.txt | 76 ++++++++ dawg/Spanish/info.txt | 21 +-- dawg/Swedish/info.txt | 9 +- dawg/dict2dawg.cpp | 286 ++++++++++++++++++++----------- dawg/xloc.pm | 2 +- 15 files changed, 366 insertions(+), 181 deletions(-) create mode 100644 dawg/Russian/Makefile create mode 100644 dawg/Russian/info.txt diff --git a/dawg/Danish/info.txt b/dawg/Danish/info.txt index ae6d6b5b1..7e8353249 100644 --- a/dawg/Danish/info.txt +++ b/dawg/Danish/info.txt @@ -18,23 +18,21 @@ LANGCODE:da_DK # deal with DOS files -LANGFILTER_PRECLIP: tr -d '\r' | - +LANGFILTER: tr -d '\r' # uppercase all -LANGFILTER_POSTCLIP: | tr [a-zåæø] [A-ZÅÆØ] +LANGFILTER: | tr [a-zåæø] [A-ZÅÆØ] # no words not containing a vowel -LANGFILTER_POSTCLIP: | grep '[AEIOUYÅÆØ]' +LANGFILTER: | grep '[AEIOUYÅÆØ]' # none with illegal chars -LANGFILTER_POSTCLIP: | grep '^[A-PR-VX-ZÅÆØ]\+$' +LANGFILTER: | grep '^[A-PR-VX-ZÅÆØ]\+$' # remove duplicates -LANGFILTER_POSTCLIP: | sort -u -LANGFILTER_POSTCLIP: | tr -s '\n' '\000' +LANGFILTER: | sort -u # Until I can figure out how to force sort to use a locale's collation # rules we can't trust sort in the filtering rules above and so must # leave the sorting work to dict2dawg.pl. -NEEDSSORT:true +D2DARGS: -r -term 10 LANGINFO:

Danish uses all English letters except Q and W. There LANGINFO: are three non-English letters: 'Å', 'Æ' and 'Ø'.

diff --git a/dawg/Dutch/info.txt b/dawg/Dutch/info.txt index 27e7a2679..6a4e03c12 100644 --- a/dawg/Dutch/info.txt +++ b/dawg/Dutch/info.txt @@ -18,22 +18,19 @@ LANGCODE:nl_NL # deal with DOS files -LANGFILTER_PRECLIP: tr -d '\r' | - +LANGFILTER: tr -d '\r' # uppercase all -LANGFILTER_POSTCLIP: | tr [a-z] [A-Z] +LANGFILTER: | tr [a-z] [A-Z] # no words not containing a vowel -LANGFILTER_POSTCLIP: | grep '[AEIOU]' +LANGFILTER: | grep '[AEIOU]' # none with illegal chars -LANGFILTER_POSTCLIP: | grep '^[A-Z]\+$' -LANGFILTER_POSTCLIP: | sort -u -LANGFILTER_POSTCLIP: | tr -s '\n' '\000' +LANGFILTER: | grep '^[A-Z]\+$' +LANGFILTER: | sort -u # Until I can figure out how to force sort to use a locale's collation # rules we can't trust sort in the filtering rules above and so must # leave the sorting work to dict2dawg.pl. - -NEEDSSORT:false +D2DARGS: -r -term 10 LANGINFO:

Dutch has the same 26 letters as English, though of LANGINFO: course the counts and values are different. Filtering rules diff --git a/dawg/English/Makefile.BasEnglish b/dawg/English/Makefile.BasEnglish index 89f1567ec..03019e7a2 100644 --- a/dawg/English/Makefile.BasEnglish +++ b/dawg/English/Makefile.BasEnglish @@ -1,4 +1,4 @@ -# -*-mode: Makefile -*- +# -*-mode: Makefile; compile-command: "make -f Makefile.BasEnglish"; -*- # Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. # # This program is free software; you can redistribute it and/or @@ -17,7 +17,7 @@ XWLANG=BasEnglish LANGCODE=en_US -#NEWDAWG=1 +DICT2DAWGARGS = -r -nosort TARGET_TYPE ?= PALM diff --git a/dawg/English/info.txt b/dawg/English/info.txt index a8eeb5128..70f8d8a49 100644 --- a/dawg/English/info.txt +++ b/dawg/English/info.txt @@ -17,16 +17,16 @@ LANGCODE:en_US # deal with DOS files -LANGFILTER_PRECLIP: tr -d '\r' | - -LANGFILTER_POSTCLIP: | tr [a-z] [A-Z] -LANGFILTER_POSTCLIP: | grep '^[A-Z]*$' -LANGFILTER_POSTCLIP: | sort -u +LANGFILTER: tr -d '\r' +LANGFILTER: | tr [a-z] [A-Z] +LANGFILTER: | grep '^[A-Z]*$' +LANGFILTER: | sort -u # We can trust sort (above) to do the right thing since there's no # high ascii. dict2dawg.pl is much faster if I can trust that its # input is in sorted order. -NEEDSSORT:false +D2DARGS: -nosort -term 10 + LANGINFO:

English dictionaries can contain words with any of the 26 LANGINFO: letters you think of as making up the alphabet: A-Z. At diff --git a/dawg/French/info.txt b/dawg/French/info.txt index b961cdd62..00c5bfc0f 100755 --- a/dawg/French/info.txt +++ b/dawg/French/info.txt @@ -17,16 +17,14 @@ LANGCODE:fr_FR # deal with DOS files -LANGFILTER_PRECLIP: tr -d '\r' | +LANGFILTER: tr -d '\r' -LANGFILTER_POSTCLIP: | tr [a-z] [A-Z] -LANGFILTER_POSTCLIP: | grep '^[A-Z]*$' -LANGFILTER_POSTCLIP: | tr '\n' '\000' -LANGFILTER_POSTCLIP: | sort -u -z - - -NEEDSSORT:false +LANGFILTER: | tr [a-z] [A-Z] +LANGFILTER: | grep '^[A-Z]*$' +LANGFILTER: | tr '\n' '\000' +LANGFILTER: | sort -u -z +D2DARGS: -r -nosort -term 0 LANGINFO:

At this point French is getting treated the same as LANGINFO: English. But I think I should be transforming accented diff --git a/dawg/German/info.txt b/dawg/German/info.txt index 7c21c94ad..f6321981d 100644 --- a/dawg/German/info.txt +++ b/dawg/German/info.txt @@ -16,26 +16,21 @@ LANGCODE:de_DE - # deal with DOS files -LANGFILTER_PRECLIP: tr -d '\r' | - +LANGFILTER: tr -d '\r' # substitute for sharfes-s -LANGFILTER_PRECLIP: sed -e 's/ß/SS/g' | - +LANGFILTER: | sed -e 's/ß/SS/g' # uppercase all -LANGFILTER_POSTCLIP: | tr [a-zäöü] [A-ZÄÖÜ] +LANGFILTER: | tr [a-zäöü] [A-ZÄÖÜ] # no words not containing a vowel -LANGFILTER_POSTCLIP: | grep '[AEIOUÄÖÜ]' +LANGFILTER: | grep '[AEIOUÄÖÜ]' # none with illegal chars -LANGFILTER_POSTCLIP: | grep '^[A-ZÄÖÜ]\+$' -LANGFILTER_POSTCLIP: | tr -s '\n' '\000' +LANGFILTER: | grep '^[A-ZÄÖÜ]\+$' # Until I can figure out how to force sort to use a locale's collation # rules we can't trust sort in the filtering rules above and so must # leave the sorting work to dict2dawg.pl. - -NEEDSSORT:true +D2DARGS: -r -term 10 LANGINFO:

German has the 26 English letters plus the three umlaut LANGINFO: vowels. Scharfes-s is not a legal tile, but if present in diff --git a/dawg/Italian/info.txt b/dawg/Italian/info.txt index 001bf6130..8b60c6478 100755 --- a/dawg/Italian/info.txt +++ b/dawg/Italian/info.txt @@ -18,16 +18,12 @@ LANGCODE:it_IT # deal with DOS files -LANGFILTER_PRECLIP: tr -d '\r' | - -LANGFILTER_POSTCLIP: | tr [a-z] [A-Z] -LANGFILTER_POSTCLIP: | grep '^[A-IL-VZ]*$' -LANGFILTER_POSTCLIP: | tr '\n' '\000' -LANGFILTER_POSTCLIP: | sort -u -z - - -NEEDSSORT:false +LANGFILTER: tr -d '\r' +LANGFILTER: | tr [a-z] [A-Z] +LANGFILTER: | grep '^[A-IL-VZ]*$' +LANGFILTER: | sort -u +D2DARGS: -r -term 10 -nosort LANGINFO:

Italian is treated the same as English but for LANGINFO: missing letters J, K, W, X and Y.

diff --git a/dawg/Makefile.langcommon b/dawg/Makefile.langcommon index 5a1e1ffeb..767d58a8d 100644 --- a/dawg/Makefile.langcommon +++ b/dawg/Makefile.langcommon @@ -197,7 +197,7 @@ frankspecials.bin: ../frank_mkspecials.pl $(BMPFILES) # a binary file (one byte) giving the number of tiles in the dict charcount.bin: table.bin ifdef NEWDAWG - siz=$$(wc $< | awk '{print $$3}'); \ + siz=$$(ls -l $< | awk '{print $$5}'); \ perl -e "print pack(\"c\",$$siz/2)" > $@ else siz=$$(wc -c $< | sed -e 's/$Portuguese uses the letters A-Z, excluding K, W and Y, and adds LANGINFO: Ç. Words containing any other letters are dropped.

diff --git a/dawg/Russian/Makefile b/dawg/Russian/Makefile new file mode 100644 index 000000000..e2d5ee127 --- /dev/null +++ b/dawg/Russian/Makefile @@ -0,0 +1,41 @@ +# -*- mode: makefile -*- +# Copyright 2002-2007 by Eric House (xwords@eehouse.org). All rights reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +XWLANG=Russian +LANGCODE=ru_RU +DICT2DAWGARGS = -r + +TARGET_TYPE ?= WINCE + +include ../Makefile.2to8 + +include ../Makefile.langcommon + +SOURCEDICT ?= $(XWDICTPATH)/$(XWLANG)/RU5000.txt.gz + +$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile + zcat $< | tr -d '\r' | \ + tr [àáâãäåæçèéêëìíîïðñòóôõö÷øùÚûüýþÿ] [ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß] | \ + gzip -c > $@ + + +# Everything but creating of the Main.dict file is inherited from the +# "parent" Makefile.langcommon in the parent directory. + +clean: clean_common + rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb + diff --git a/dawg/Russian/info.txt b/dawg/Russian/info.txt new file mode 100644 index 000000000..912f508f4 --- /dev/null +++ b/dawg/Russian/info.txt @@ -0,0 +1,76 @@ +# Copyright 2002,2007 by Eric House (xwords@eehouse.org). All rights +# reserved. 
+# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +LANGCODE:ru_RU +CHARSET:windows-1251 + +# deal with DOS files +LANGFILTER: tr -d '\r' +# uppercase all +LANGFILTER: | tr [àáâãäåæçèéêëìíîïðñòóôõö÷øùÚûüýþÿ] [ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß] +# LANGFILTER: | tr -s '\n' '\000' + +# note: don't turn off sorting! Can't do it with GNU 'sort' without +# setting LANG +D2DARGS: -r -term 10 + +LANGINFO:

Russian wordlists must be in the Windows-1251 +LANGINFO: codepage. Lower-case letters are converted to upper case and +LANGINFO: any words that contain letters not listed below are +LANGINFO: removed.

+ +# High bit means "official". Next 7 bits are an enum where +# Russian==0x0F. Low byte is padding. +XLOC_HEADER:0x8F00 + + + +8 1 'À' +2 3 'Á' +4 1 'Â' +2 3 'Ã' +2 2 'Ä' +7 1 'Å' +1 4 'Æ' +1 3 'Ç' +7 1 'È' +1 2 'É' +4 2 'Ê' +4 2 'Ë' +2 3 'Ì' +4 1 'Í' +9 1 'Î' +4 2 'Ï' +5 1 'Ð' +5 1 'Ñ' +7 1 'Ò' +4 2 'Ó' +1 5 'Ô' +1 4 'Õ' +1 4 'Ö' +1 3 '×' +1 4 'Ø' +1 5 'Ù' +1 10 'Ú' +2 2 'Û' +4 1 'Ü' +1 8 'Ý' +1 5 'Þ' +2 2 'ß' +2 0 {"_"} + +# should ignore all after the above diff --git a/dawg/Spanish/info.txt b/dawg/Spanish/info.txt index 8cabfa6d8..129af7d83 100644 --- a/dawg/Spanish/info.txt +++ b/dawg/Spanish/info.txt @@ -21,24 +21,25 @@ NEEDSSORT:true # MSDos LF chars go bye-bye -LANGFILTER_PRECLIP: tr -d '\r' | +LANGFILTER: tr -d '\r' # convert accented vowels -LANGFILTER_POSTCLIP: | tr '\207\216\222\227\234\237\226' 'aeiouu\321' +LANGFILTER: | tr '\207\216\222\227\234\237\226' 'aeiouu\321' # uppercase -LANGFILTER_POSTCLIP: | tr [a-zñ] [A-ZÑ] +LANGFILTER: | tr [a-zñ] [A-ZÑ] # remove words with illegal letters -LANGFILTER_POSTCLIP: | grep '^[[A-JL-VX-ZÑ]*$' +LANGFILTER: | grep '^[[A-JL-VX-ZÑ]*$' # substitute pairs (can't figure out how to use octal values) -LANGFILTER_POSTCLIP: | sed 's/CH/1/g' -LANGFILTER_POSTCLIP: | sed 's/LL/2/g' -LANGFILTER_POSTCLIP: | sed 's/RR/3/g' +LANGFILTER: | sed 's/CH/1/g' +LANGFILTER: | sed 's/LL/2/g' +LANGFILTER: | sed 's/RR/3/g' # substitute in the octal control character values -LANGFILTER_POSTCLIP: | tr '123' '\001\002\003' +LANGFILTER: | tr '123' '\001\002\003' # now add nulls as terminators -LANGFILTER_POSTCLIP: | tr -s '\n' '\000' -LANGFILTER_POSTCLIP: | sort -u -z +LANGFILTER: | tr -s '\n' '\000' +LANGFILTER: | sort -u -z +D2DARGS: -r -term 0 LANGINFO:

Spanish words include all letters in the English alphabet LANGINFO: except "K" and "W", and with "Ñ" added. Since there are no diff --git a/dawg/Swedish/info.txt b/dawg/Swedish/info.txt index ccd188410..161cbc060 100644 --- a/dawg/Swedish/info.txt +++ b/dawg/Swedish/info.txt @@ -16,12 +16,11 @@ LANGCODE:sv_SE +LANGFILTER: tr -d '\r' +LANGFILTER: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ] +LANGFILTER: | grep '^[A-ZÄÅÆÖÜ]*$' -LANGFILTER_POSTCLIP: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ] -LANGFILTER_POSTCLIP: | grep '^[A-ZÄÅÆÖÜ]*$' -LANGFILTER_POSTCLIP: | tr '\n' '\000' - -NEEDSSORT:true +D2DARGS: -r -term 10 LANGINFO:

From an English-speaker's perspective, Swedish drops Q LANGINFO: and W, and adds Ä, Å, Æ, Ö and Ü.

diff --git a/dawg/dict2dawg.cpp b/dawg/dict2dawg.cpp index ba2475578..187728215 100644 --- a/dawg/dict2dawg.cpp +++ b/dawg/dict2dawg.cpp @@ -1,9 +1,9 @@ -/* -*- compile-command: "g++ -O -o dict2dawg dict2dawg.cpp"; -*- */ +/* -*- compile-command: "g++ -DDEBUG -O -o dict2dawg dict2dawg.cpp"; -*- */ /************************************************************************* * adapted from perl code that was itself adapted from C++ code * Copyright (C) 2000 Falk Hueffner - * This version Copyright (C) 2002,2006 Eric House (xwords@eehouse.org) + * This version Copyright (C) 2002,2006-2007 Eric House (xwords@eehouse.org) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -54,9 +54,11 @@ typedef unsigned int Node; typedef std::vector NodeList; typedef std::vector WordList; -#define MAX_WORD_LEN 15 #define VERSION_STR "$Rev$" +#define MAX_WORD_LEN 15 +#define T2ABUFLEN(s) (((s)*4)+3) + int gFirstDiff; static char gCurrentWordBuf[MAX_WORD_LEN+1] = { '\0' }; @@ -92,10 +94,12 @@ bool gForceFour = false; // use four bytes regardless of need? static int gFileSize = 0; int gNBytesPerNode; bool gUseUnicode; +int gLimLow = 2; +int gLimHigh = MAX_WORD_LEN; // OWL is 1.7M -#define MAX_POOL_SIZE (3 * 0x100000) +#define MAX_POOL_SIZE (5 * 0x100000) #define ERROR_EXIT(...) 
error_exit( __LINE__, __VA_ARGS__ ); static char* parseARGV( int argc, char** argv, const char** inFileName ); @@ -182,7 +186,8 @@ main( int argc, char** argv ) unsigned long be = htonl( gWordCount ); fwrite( &be, sizeof(be), 1, OFILE ); fclose( OFILE ); - fprintf( stderr, "wrote out: got %d words\n", gWordCount ); + fprintf( stderr, "Wrote %d (word count) to %s\n", gWordCount, + gCountFile ); } if ( gOutFileBase ) { @@ -393,49 +398,62 @@ readFromSortedArray( void ) #endif } - char* word = ""; + for ( ; ; ) { + char* word = ""; - if ( !gDone ) { - gDone = gNextWordIndex == sInputStrings->size(); if ( !gDone ) { - word = sInputStrings->at(gNextWordIndex++); + gDone = gNextWordIndex == sInputStrings->size(); + if ( !gDone ) { + word = sInputStrings->at(gNextWordIndex++); #ifdef DEBUG - } else if ( gDebug ) { - fprintf( stderr, "gDone set to true\n" ); + } else if ( gDebug ) { + fprintf( stderr, "gDone set to true\n" ); +#endif + } +#ifdef DEBUG + if ( gDebug ) { + char buf[T2ABUFLEN(MAX_WORD_LEN)]; + fprintf( stderr, "%s: got word: %s\n", __func__, + tileToAscii( buf, sizeof(buf), word ) ); + } #endif } -#ifdef DEBUG - if ( gDebug ) { - fprintf( stderr, "got word: %s\n", word ); + int numCommonLetters = 0; + int len = strlen( word ); + if ( gCurrentWordLen < len ) { + len = gCurrentWordLen; } + + while ( gCurrentWord[numCommonLetters] == word[numCommonLetters] + && numCommonLetters < len ) { + ++numCommonLetters; + } + + gFirstDiff = numCommonLetters; + if ( (gCurrentWordLen > 0) && (strlen(word) > 0) + && !firstBeforeSecond( gCurrentWord, word ) ) { +#ifdef DEBUG + if ( gDebug ) { + char buf1[T2ABUFLEN(MAX_WORD_LEN)]; + char buf2[T2ABUFLEN(MAX_WORD_LEN)]; + fprintf( stderr, + "%s: words %s and %s are the same or out of order\n", + __func__, + tileToAscii( buf1, sizeof(buf1), gCurrentWord ), + tileToAscii( buf2, sizeof(buf2), word ) ); + } #endif + continue; + } + + gCurrentWord = word; + gCurrentWordLen = strlen(word); + break; } - int numCommonLetters = 0; - int 
len = strlen( word ); - if ( gCurrentWordLen < len ) { - len = gCurrentWordLen; - } - - while ( gCurrentWord[numCommonLetters] == word[numCommonLetters] - && numCommonLetters < len ) { - ++numCommonLetters; - } - - gFirstDiff = numCommonLetters; - if ( (gCurrentWordLen > 0) && (strlen(word) > 0) - && !firstBeforeSecond( gCurrentWord, word ) ) { - char buf1[MAX_WORD_LEN+1]; - char buf2[MAX_WORD_LEN+1]; - ERROR_EXIT( "words %s and %s are out of order\n", - tileToAscii( buf1, sizeof(buf1), gCurrentWord ), - tileToAscii( buf2, sizeof(buf2), word ) ); - } - gCurrentWord = word; - gCurrentWordLen = strlen(word); #ifdef DEBUG if ( gDebug ) { - char buf[MAX_WORD_LEN+1]; + char buf[T2ABUFLEN(MAX_WORD_LEN)]; fprintf( stderr, "gCurrentWord now %s\n", tileToAscii( buf, sizeof(buf), gCurrentWord) ); } @@ -450,47 +468,77 @@ readOneWord( char* wordBuf, int bufLen, int* lenp, bool* gotEOF ) bool dropWord = false; bool done = false; - // for each byte + // for each byte, append to an internal buffer up to size limit. + // On reaching an end-of-word or EOF, check if the word formed is + // within the length range and contains no unknown chars. If yes, + // return it. If no, start over ONLY IF the terminator was not + // EOF. for ( ; ; ) { int byt = getc( gInFile ); // EOF is special: we don't try for another word even if // dropWord is true; we must leave now. 
if ( byt == EOF || byt == gTermChar ) { - *gotEOF = byt == EOF; + bool isEOF = byt == EOF; + *gotEOF = isEOF; - if ( !dropWord || *gotEOF ) { - if ( count != 0 ) { - wordBuf[count] = '\0'; - result = wordBuf; - *lenp = count; - ++gWordCount; - } - break; // we've finished a word - } else if ( *gotEOF ) { + assert( isEOF || count < bufLen ); + if ( !dropWord && (count >= gLimLow) && (count <= gLimHigh) ) { + assert( count < bufLen ); + wordBuf[count] = '\0'; + result = wordBuf; + *lenp = count; + ++gWordCount; break; + } else if ( isEOF ) { + assert( !result ); + break; + } +#ifdef DEBUG + if ( gDebug ) { + char buf[T2ABUFLEN(count)]; + wordBuf[count] = '\0'; + fprintf( stderr, "%s: dropping word (len=%d): %s\n", __func__, + count, tileToAscii( buf, sizeof(buf), wordBuf ) ); } +#endif + count = 0; // we'll start over + dropWord = false; + + } else if ( count >= bufLen ) { + // Just drop it... + dropWord = true; // Don't call into the hashtable twice here!! } else if ( gTableHash.find(byt) != gTableHash.end() ) { - if ( !dropWord ) { - wordBuf[count++] = (char)gTableHash[byt]; - if ( count >= bufLen ) { - char buf[MAX_WORD_LEN+1]; - ERROR_EXIT( "no space for word %d (starting \"%s\")", - gWordCount, - tileToAscii( buf, sizeof(buf), wordBuf )); - } + assert( count < bufLen ); + wordBuf[count++] = (char)gTableHash[byt]; + if ( count >= bufLen ) { + char buf[T2ABUFLEN(count)]; + ERROR_EXIT( "no space for word %d (starting \"%s\")", + gWordCount, + tileToAscii( buf, sizeof(buf), wordBuf )); + } + } else if ( gKillIfMissing || !dropWord ) { + char buf[T2ABUFLEN(count)]; + wordBuf[count] = '\0'; + + tileToAscii( buf, sizeof(buf), wordBuf ); + + if ( gKillIfMissing ) { + ERROR_EXIT( "chr %c (%d) not in map file %s\n" + "last word was %s\n", + (char)byt, (int)byt, gTableFile, buf ); + } else if ( !dropWord ) { +#ifdef DEBUG + if ( gDebug ) { + fprintf( stderr, "%s: chr %c (%d) not in map file %s\n" + "dropping partial word %s\n", __func__, + (char)byt, (int)byt, gTableFile, 
buf ); + } +#endif + dropWord = true; } - } else if ( gKillIfMissing ) { - char buf[MAX_WORD_LEN+1]; - ERROR_EXIT( "chr %c (%d) not in map file %s\n" - "last word was %s\n", - byt, (int)byt, gTableFile, - tileToAscii( buf, sizeof(buf), wordBuf ) ); - } else { - dropWord = true; - count = 0; // lose anything we already have } } @@ -511,40 +559,55 @@ readFromFile( void ) int len; gDone = s_eof; - if ( !gDone ) { - word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof ); - gDone = NULL == word; - } - if ( gDone ) { - word = ""; - len = 0; - } + + // Repeat until we get a new word that's not "out-of-order". When + // we see this the problem isn't failure to sort, it's duplicates. + // So dropping is ok. The alternative would be detecting dupes + // during the sort. This seems easier. + for ( ; ; ) { + if ( !gDone ) { + word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof ); + gDone = NULL == word; + } + if ( gDone ) { + word = ""; + len = 0; + } - int numCommonLetters = 0; - if ( gCurrentWordLen < len ) { - len = gCurrentWordLen; - } + int numCommonLetters = 0; + if ( gCurrentWordLen < len ) { + len = gCurrentWordLen; + } - while ( gCurrentWord[numCommonLetters] == word[numCommonLetters] - && numCommonLetters < len ) { - ++numCommonLetters; - } + while ( gCurrentWord[numCommonLetters] == word[numCommonLetters] + && numCommonLetters < len ) { + ++numCommonLetters; + } - gFirstDiff = numCommonLetters; - if ( (gCurrentWordLen > 0) && (strlen(word) > 0) - && !firstBeforeSecond( gCurrentWord, word ) ) { - char buf1[MAX_WORD_LEN+1]; - char buf2[MAX_WORD_LEN+1]; - ERROR_EXIT( "words %s and %s are out of order\n", - tileToAscii( buf1, sizeof(buf1), gCurrentWord ), - tileToAscii( buf2, sizeof(buf2), word ) ); + gFirstDiff = numCommonLetters; + if ( (gCurrentWordLen > 0) && (strlen(word) > 0) + && !firstBeforeSecond( gCurrentWord, word ) ) { +#ifdef DEBUG + if ( gDebug ) { + char buf1[T2ABUFLEN(MAX_WORD_LEN)]; + char buf2[T2ABUFLEN(MAX_WORD_LEN)]; + fprintf( stderr, + 
"%s: words %s and %s are the smae or out of order\n", + __func__, + tileToAscii( buf1, sizeof(buf1), gCurrentWord ), + tileToAscii( buf2, sizeof(buf2), word ) ); + } +#endif + continue; + } + break; } gCurrentWordLen = strlen(word); strncpy( gCurrentWordBuf, word, sizeof(gCurrentWordBuf) ); #ifdef DEBUG if ( gDebug ) { - char buf[MAX_WORD_LEN+1]; + char buf[T2ABUFLEN(MAX_WORD_LEN)]; fprintf( stderr, "gCurrentWord now %s\n", tileToAscii( buf, sizeof(buf), gCurrentWord) ); } @@ -561,17 +624,26 @@ firstBeforeSecond( const char* lhs, const char* rhs ) static char* tileToAscii( char* out, int outSize, const char* in ) { + char tiles[outSize]; + int tilesLen = 1; + tiles[0] = '['; + char* orig = out; for ( ; ; ) { char ch = *in++; if ( '\0' == ch ) { - *out = '\0'; break; } assert( ch < gRevMap.size() ); *out++ = gRevMap[ch]; + tilesLen += sprintf( &tiles[tilesLen], "%d,", ch ); assert( (out - orig) < outSize ); } + + tiles[tilesLen] = ']'; + tiles[tilesLen+1] = '\0'; + strcpy( out, tiles ); + return orig; } @@ -636,7 +708,7 @@ printWords( std::vector* strings ) { std::vector::iterator iter = strings->begin(); while ( iter != strings->end() ) { - char buf[MAX_WORD_LEN+1]; + char buf[T2ABUFLEN(MAX_WORD_LEN)]; tileToAscii( buf, sizeof(buf), *iter ); fprintf( stderr, "%s\n", buf ); ++iter; @@ -760,6 +832,9 @@ makeTableHash( void ) if ( NULL == TABLEFILE ) { ERROR_EXIT( "unable to open %s\n", gTableFile ); } + + // Fill the 0th space since references are one-based + gRevMap.push_back(0); for ( ii = 0; ; ++ii ) { int ch = getc(TABLEFILE); @@ -817,7 +892,6 @@ emitNodes( unsigned int nBytesPerOutfile, const char* outFileBase ) gNBytesPerNode = 3; } else { if ( gBlankIndex == 32 ) { // blank - fprintf( stderr, "blank's at 32; 3-byte-nodes still ok\n" ); gNBytesPerNode = 3; } else { ERROR_EXIT( "move blank to last position in info.txt " @@ -994,8 +1068,10 @@ usage( const char* name ) { fprintf( stderr, "usage: %s \n" "\t[-v] (print version and exit)\n" - "\t[-poolsize] (print 
size of hardcoded pool and exit)\n" + "\t[-poolsize] (print hardcoded size of pool and exit)\n" "\t[-b bytesPerFile] (default = 0xFFFFFFFF)\n" + "\t[-min ]\n" + "\t[-max ]\n" "\t-m mapFile\n" "\t-mn mapFile (unicode)\n" "\t-ob outFileBase\n" @@ -1048,6 +1124,10 @@ parseARGV( int argc, char** argv, const char** inFileName ) } else if ( 0 == strcmp( arg, "-mn" ) ) { gTableFile = argv[index++]; gUseUnicode = true; + } else if ( 0 == strcmp( arg, "-min" ) ) { + gLimLow = atoi(argv[index++]); + } else if ( 0 == strcmp( arg, "-max" ) ) { + gLimHigh = atoi(argv[index++]); } else if ( 0 == strcmp( arg, "-m" ) ) { gTableFile = argv[index++]; } else if ( 0 == strcmp( arg, "-ob" ) ) { @@ -1079,17 +1159,25 @@ parseARGV( int argc, char** argv, const char** inFileName ) gDebug = true; #endif } else { - ERROR_EXIT( "unexpected arg %s", arg ); + ERROR_EXIT( "%s: unexpected arg %s", __func__, arg ); } } + if ( gLimHigh > MAX_WORD_LEN || gLimLow > MAX_WORD_LEN ) { + usage( argv[0] ); + exit(1); + } + #ifdef DEBUG if ( gDebug ) { - fprintf( stderr, "gNBytesPerOutfile=$gNBytesPerOutfile\n" ); - fprintf( stderr, "gTableFile=$gTableFile\n" ); - fprintf( stderr, "gOutFileBase=$gOutFileBase\n" ); - fprintf( stderr, "gStartNodeOut=$gStartNodeOut\n" ); + fprintf( stderr, "gNBytesPerOutfile=%d\n", gNBytesPerOutfile ); + fprintf( stderr, "gTableFile=%s\n", gTableFile ); + fprintf( stderr, "gOutFileBase=%s\n", gOutFileBase ); + fprintf( stderr, "gStartNodeOut=%s\n", gStartNodeOut ); fprintf( stderr, "gTermChar=%c(%d)\n", gTermChar, (int)gTermChar ); + fprintf( stderr, "gFileSize=%d\n", gFileSize ); + fprintf( stderr, "gLimLow=%d\n", gLimLow ); + fprintf( stderr, "gLimHigh=%d\n", gLimHigh ); } #endif return gTableFile; diff --git a/dawg/xloc.pm b/dawg/xloc.pm index ad5c0fa8c..4aefe7440 100644 --- a/dawg/xloc.pm +++ b/dawg/xloc.pm @@ -117,7 +117,7 @@ sub WriteMapFile($$$) { } elsif ( $str =~ /(\d+)/ ) { print $fhr pack( $packStr, $1 ); } else { - die "WriteMapFile: unrecognized face format 
$str"; + die "WriteMapFile: unrecognized face format $str, elem $i"; } } } # WriteMapFile