Add support for Russian. So that Russian text can be processed on systems without setting LANG=ru_RU.CP1251, modify dict2dawg to skip duplicates and words outside of specified lengths. Modify all info.txt files for the new scheme (which includes change to byod.cgi not kept on sourceforge.)

This commit is contained in:
ehouse 2007-02-17 17:06:05 +00:00
parent 326ecb00f4
commit 3bb2fb018f
15 changed files with 366 additions and 181 deletions

View file

@ -18,23 +18,21 @@ LANGCODE:da_DK
# deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' |
LANGFILTER: tr -d '\r'
# uppercase all
LANGFILTER_POSTCLIP: | tr [a-zåæø] [A-ZÅÆØ]
LANGFILTER: | tr [a-zåæø] [A-ZÅÆØ]
# no words not containing a vowel
LANGFILTER_POSTCLIP: | grep '[AEIOUYÅÆØ]'
LANGFILTER: | grep '[AEIOUYÅÆØ]'
# none with illegal chars
LANGFILTER_POSTCLIP: | grep '^[A-PR-VX-ZÅÆØ]\+$'
LANGFILTER: | grep '^[A-PR-VX-ZÅÆØ]\+$'
# remove duplicates
LANGFILTER_POSTCLIP: | sort -u
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
LANGFILTER: | sort -u
# Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must
# leave the sorting work to dict2dawg.pl.
NEEDSSORT:true
D2DARGS: -r -term 10
LANGINFO: <p>Danish uses all English letters except Q and W. There
LANGINFO: are three non-English letters: 'Å', 'Æ' and 'Ø'. </p>

View file

@ -18,22 +18,19 @@ LANGCODE:nl_NL
# deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' |
LANGFILTER: tr -d '\r'
# uppercase all
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z]
LANGFILTER: | tr [a-z] [A-Z]
# no words not containing a vowel
LANGFILTER_POSTCLIP: | grep '[AEIOU]'
LANGFILTER: | grep '[AEIOU]'
# none with illegal chars
LANGFILTER_POSTCLIP: | grep '^[A-Z]\+$'
LANGFILTER_POSTCLIP: | sort -u
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
LANGFILTER: | grep '^[A-Z]\+$'
LANGFILTER: | sort -u
# Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must
# leave the sorting work to dict2dawg.pl.
NEEDSSORT:false
D2DARGS: -r -term 10
LANGINFO: <p>Dutch has the same 26 letters as English, though of
LANGINFO: course the counts and values are different. Filtering rules

View file

@ -1,4 +1,4 @@
# -*-mode: Makefile -*-
# -*-mode: Makefile; compile-command: "make -f Makefile.BasEnglish"; -*-
# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
#
# This program is free software; you can redistribute it and/or
@ -17,7 +17,7 @@
XWLANG=BasEnglish
LANGCODE=en_US
#NEWDAWG=1
DICT2DAWGARGS = -r -nosort
TARGET_TYPE ?= PALM

View file

@ -17,16 +17,16 @@
LANGCODE:en_US
# deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' |
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z]
LANGFILTER_POSTCLIP: | grep '^[A-Z]*$'
LANGFILTER_POSTCLIP: | sort -u
LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-z] [A-Z]
LANGFILTER: | grep '^[A-Z]*$'
LANGFILTER: | sort -u
# We can trust sort (above) to do the right thing since there's no
# high ascii. dict2dawg.pl is much faster if I can trust that its
# input is in sorted order.
NEEDSSORT:false
D2DARGS: -nosort -term 10
LANGINFO: <p>English dictionaries can contain words with any of the 26
LANGINFO: letters you think of as making up the alphabet: A-Z. At

View file

@ -17,16 +17,14 @@
LANGCODE:fr_FR
# deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' |
LANGFILTER: tr -d '\r'
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z]
LANGFILTER_POSTCLIP: | grep '^[A-Z]*$'
LANGFILTER_POSTCLIP: | tr '\n' '\000'
LANGFILTER_POSTCLIP: | sort -u -z
NEEDSSORT:false
LANGFILTER: | tr [a-z] [A-Z]
LANGFILTER: | grep '^[A-Z]*$'
LANGFILTER: | tr '\n' '\000'
LANGFILTER: | sort -u -z
D2DARGS: -r -nosort -term 0
LANGINFO: <p>At this point French is getting treated the same as
LANGINFO: English. But I think I should be transforming accented

View file

@ -16,26 +16,21 @@
LANGCODE:de_DE
# deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' |
LANGFILTER: tr -d '\r'
# substitute for sharfes-s
LANGFILTER_PRECLIP: sed -e 's/ß/SS/g' |
LANGFILTER: | sed -e 's/ß/SS/g'
# uppercase all
LANGFILTER_POSTCLIP: | tr [a-zäöü] [A-ZÄÖÜ]
LANGFILTER: | tr [a-zäöü] [A-ZÄÖÜ]
# no words not containing a vowel
LANGFILTER_POSTCLIP: | grep '[AEIOUÄÖÜ]'
LANGFILTER: | grep '[AEIOUÄÖÜ]'
# none with illegal chars
LANGFILTER_POSTCLIP: | grep '^[A-ZÄÖÜ]\+$'
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
LANGFILTER: | grep '^[A-ZÄÖÜ]\+$'
# Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must
# leave the sorting work to dict2dawg.pl.
NEEDSSORT:true
D2DARGS: -r -term 10
LANGINFO: <p>German has the 26 English letters plus the three umlaut
LANGINFO: vowels. Scharfes-s is not a legal tile, but if present in

View file

@ -18,16 +18,12 @@
LANGCODE:it_IT
# deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' |
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z]
LANGFILTER_POSTCLIP: | grep '^[A-IL-VZ]*$'
LANGFILTER_POSTCLIP: | tr '\n' '\000'
LANGFILTER_POSTCLIP: | sort -u -z
NEEDSSORT:false
LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-z] [A-Z]
LANGFILTER: | grep '^[A-IL-VZ]*$'
LANGFILTER: | sort -u
D2DARGS: -r -term 10 -nosort
LANGINFO: <p>Italian is treated the same as English but for
LANGINFO: missing letters J, K, W, X and Y.</p>

View file

@ -197,7 +197,7 @@ frankspecials.bin: ../frank_mkspecials.pl $(BMPFILES)
# a binary file (one byte) giving the number of tiles in the dict
charcount.bin: table.bin
ifdef NEWDAWG
siz=$$(wc $< | awk '{print $$3}'); \
siz=$$(ls -l $< | awk '{print $$5}'); \
perl -e "print pack(\"c\",$$siz/2)" > $@
else
siz=$$(wc -c $< | sed -e 's/$<//'); \
@ -240,11 +240,10 @@ dawg$(XWLANG)%.stamp: $(XWLANG)Main.dict.gz $(DICT2DAWG) table.bin ../Makefile.l
start=$$(echo $@ | sed -e 's/dawg$(XWLANG)\([0-9]*\)to[0-9]*.stamp/\1/'); \
end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \
echo $${start} and $$end; \
zcat $< | grep "^.\{$${start},$${end}\}$$" | \
sort -u | $(DICT2DAWG) $(TABLE_ARG) table.bin -b 28000 \
zcat $< | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \
-ob dawg$(XWLANG)$* \
-sn $(XWLANG)StartLoc.bin -k -term 10 -wc $(XWLANG)$*_wordcount.bin \
$(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
-sn $(XWLANG)StartLoc.bin -min $$start -max $$end \
-wc $(XWLANG)$*_wordcount.bin $(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
touch $@
$(XWLANG)%_wordcount.bin: dawg$(XWLANG)%.stamp

View file

@ -17,22 +17,19 @@
LANGCODE:pt_PT
# deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' |
LANGFILTER: tr -d '\r'
# uppercase all
LANGFILTER_POSTCLIP: | tr [a-zç] [A-ZÇ]
LANGFILTER: | tr [a-zç] [A-ZÇ]
# no words not containing a vowel
LANGFILTER_POSTCLIP: | grep '[AEIOU]'
LANGFILTER: | grep '[AEIOU]'
# none with illegal chars
LANGFILTER_POSTCLIP: | grep '^[A-JL-VXZÇ]\+$'
LANGFILTER_POSTCLIP: | sort -u
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
LANGFILTER: | grep '^[A-JL-VXZÇ]\+$'
# Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must
# leave the sorting work to dict2dawg.pl.
D2DARGS: -r -term 10
NEEDSSORT:true
LANGINFO: <p>Portuguese uses the letter A-Z, excluding K, W and Y, and adds
LANGINFO: Ç. Words containing any other letters are dropped. </p>

41
dawg/Russian/Makefile Normal file
View file

@ -0,0 +1,41 @@
# -*- mode: makefile -*-
# Copyright 2002-2007 by Eric House (xwords@eehouse.org). All rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
XWLANG=Russian
LANGCODE=ru_RU
DICT2DAWGARGS = -r
TARGET_TYPE ?= WINCE
include ../Makefile.2to8
include ../Makefile.langcommon
SOURCEDICT ?= $(XWDICTPATH)/$(XWLANG)/RU5000.txt.gz
$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile
zcat $< | tr -d '\r' | \
tr [àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ] [ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß] | \
gzip -c > $@
# Everything but creating of the Main.dict file is inherited from the
# "parent" Makefile.langcommon in the parent directory.
clean: clean_common
rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb

76
dawg/Russian/info.txt Normal file
View file

@ -0,0 +1,76 @@
# Copyright 2002,2007 by Eric House (xwords@eehouse.org). All rights
# reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
LANGCODE:ru_RU
CHARSET:windows-1251
# deal with DOS files
LANGFILTER: tr -d '\r'
# uppercase all
LANGFILTER: | tr [àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ] [ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß]
# LANGFILTER: | tr -s '\n' '\000'
# note: don't turn off sorting! Can't do it with GNU 'sort' without
# setting LANG
D2DARGS: -r -term 10
LANGINFO: <p>Russian wordlists must be in the Windows-1251
LANGINFO: codepage. Lower-case letters are converted to upper case and
LANGINFO: any words that contain letters not listed below are
LANGINFO: removed.</p>
# High bit means "official". Next 7 bits are an enum where
# Russian==0x0F. Low byte is padding.
XLOC_HEADER:0x8F00
<BEGIN_TILES>
8 1 'À'
2 3 'Á'
4 1 'Â'
2 3 'Ã'
2 2 'Ä'
7 1 'Å'
1 4 'Æ'
1 3 'Ç'
7 1 'È'
1 2 'É'
4 2 'Ê'
4 2 'Ë'
2 3 'Ì'
4 1 'Í'
9 1 'Î'
4 2 'Ï'
5 1 'Ð'
5 1 'Ñ'
7 1 'Ò'
4 2 'Ó'
1 5 'Ô'
1 4 'Õ'
1 4 'Ö'
1 3 '×'
1 4 'Ø'
1 5 'Ù'
1 10 'Ú'
2 2 'Û'
4 1 'Ü'
1 8 'Ý'
1 5 'Þ'
2 2 'ß'
2 0 {"_"}
<END_TILES>
# should ignore all after the <END_TILES> above

View file

@ -21,24 +21,25 @@
NEEDSSORT:true
# MSDos LF chars go bye-bye
LANGFILTER_PRECLIP: tr -d '\r' |
LANGFILTER: tr -d '\r'
# convert accented vowels
LANGFILTER_POSTCLIP: | tr '\207\216\222\227\234\237\226' 'aeiouu\321'
LANGFILTER: | tr '\207\216\222\227\234\237\226' 'aeiouu\321'
# uppercase
LANGFILTER_POSTCLIP: | tr [a-zñ] [A-ZÑ]
LANGFILTER: | tr [a-zñ] [A-ZÑ]
# remove words with illegal letters
LANGFILTER_POSTCLIP: | grep '^[[A-JL-VX-ZÑ]*$'
LANGFILTER: | grep '^[[A-JL-VX-ZÑ]*$'
# substitute pairs (can't figure out how to use octal values)
LANGFILTER_POSTCLIP: | sed 's/CH/1/g'
LANGFILTER_POSTCLIP: | sed 's/LL/2/g'
LANGFILTER_POSTCLIP: | sed 's/RR/3/g'
LANGFILTER: | sed 's/CH/1/g'
LANGFILTER: | sed 's/LL/2/g'
LANGFILTER: | sed 's/RR/3/g'
# substitute in the octal control character values
LANGFILTER_POSTCLIP: | tr '123' '\001\002\003'
LANGFILTER: | tr '123' '\001\002\003'
# now add nulls as terminators
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
LANGFILTER_POSTCLIP: | sort -u -z
LANGFILTER: | tr -s '\n' '\000'
LANGFILTER: | sort -u -z
D2DARGS: -r -term 0
LANGINFO: <p>Spanish words include all letters in the English alphabet
LANGINFO: except "K" and "W", and with "Ñ" added. Since there are no

View file

@ -16,12 +16,11 @@
LANGCODE:sv_SE
LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ]
LANGFILTER: | grep '^[A-ZÄÅÆÖÜ]*$'
LANGFILTER_POSTCLIP: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ]
LANGFILTER_POSTCLIP: | grep '^[A-ZÄÅÆÖÜ]*$'
LANGFILTER_POSTCLIP: | tr '\n' '\000'
NEEDSSORT:true
D2DARGS: -r -term 10
LANGINFO: <p>From an English-speaker's perspective, Swedish drops Q
LANGINFO: and W, and adds Ä, Å, Æ, Ö and Ü.</p>

View file

@ -1,9 +1,9 @@
/* -*- compile-command: "g++ -O -o dict2dawg dict2dawg.cpp"; -*- */
/* -*- compile-command: "g++ -DDEBUG -O -o dict2dawg dict2dawg.cpp"; -*- */
/*************************************************************************
* adapted from perl code that was itself adapted from C++ code
* Copyright (C) 2000 Falk Hueffner
* This version Copyright (C) 2002,2006 Eric House (xwords@eehouse.org)
* This version Copyright (C) 2002,2006-2007 Eric House (xwords@eehouse.org)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -54,9 +54,11 @@ typedef unsigned int Node;
typedef std::vector<Node> NodeList;
typedef std::vector<char*> WordList;
#define MAX_WORD_LEN 15
#define VERSION_STR "$Rev$"
#define MAX_WORD_LEN 15
#define T2ABUFLEN(s) (((s)*4)+3)
int gFirstDiff;
static char gCurrentWordBuf[MAX_WORD_LEN+1] = { '\0' };
@ -92,10 +94,12 @@ bool gForceFour = false; // use four bytes regardless of need?
static int gFileSize = 0;
int gNBytesPerNode;
bool gUseUnicode;
int gLimLow = 2;
int gLimHigh = MAX_WORD_LEN;
// OWL is 1.7M
#define MAX_POOL_SIZE (3 * 0x100000)
#define MAX_POOL_SIZE (5 * 0x100000)
#define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ );
static char* parseARGV( int argc, char** argv, const char** inFileName );
@ -182,7 +186,8 @@ main( int argc, char** argv )
unsigned long be = htonl( gWordCount );
fwrite( &be, sizeof(be), 1, OFILE );
fclose( OFILE );
fprintf( stderr, "wrote out: got %d words\n", gWordCount );
fprintf( stderr, "Wrote %d (word count) to %s\n", gWordCount,
gCountFile );
}
if ( gOutFileBase ) {
@ -393,49 +398,62 @@ readFromSortedArray( void )
#endif
}
char* word = "";
for ( ; ; ) {
char* word = "";
if ( !gDone ) {
gDone = gNextWordIndex == sInputStrings->size();
if ( !gDone ) {
word = sInputStrings->at(gNextWordIndex++);
gDone = gNextWordIndex == sInputStrings->size();
if ( !gDone ) {
word = sInputStrings->at(gNextWordIndex++);
#ifdef DEBUG
} else if ( gDebug ) {
fprintf( stderr, "gDone set to true\n" );
} else if ( gDebug ) {
fprintf( stderr, "gDone set to true\n" );
#endif
}
#ifdef DEBUG
if ( gDebug ) {
char buf[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr, "%s: got word: %s\n", __func__,
tileToAscii( buf, sizeof(buf), word ) );
}
#endif
}
#ifdef DEBUG
if ( gDebug ) {
fprintf( stderr, "got word: %s\n", word );
int numCommonLetters = 0;
int len = strlen( word );
if ( gCurrentWordLen < len ) {
len = gCurrentWordLen;
}
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
&& numCommonLetters < len ) {
++numCommonLetters;
}
gFirstDiff = numCommonLetters;
if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
&& !firstBeforeSecond( gCurrentWord, word ) ) {
#ifdef DEBUG
if ( gDebug ) {
char buf1[T2ABUFLEN(MAX_WORD_LEN)];
char buf2[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr,
"%s: words %s and %s are the same or out of order\n",
__func__,
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
tileToAscii( buf2, sizeof(buf2), word ) );
}
#endif
continue;
}
gCurrentWord = word;
gCurrentWordLen = strlen(word);
break;
}
int numCommonLetters = 0;
int len = strlen( word );
if ( gCurrentWordLen < len ) {
len = gCurrentWordLen;
}
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
&& numCommonLetters < len ) {
++numCommonLetters;
}
gFirstDiff = numCommonLetters;
if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
&& !firstBeforeSecond( gCurrentWord, word ) ) {
char buf1[MAX_WORD_LEN+1];
char buf2[MAX_WORD_LEN+1];
ERROR_EXIT( "words %s and %s are out of order\n",
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
tileToAscii( buf2, sizeof(buf2), word ) );
}
gCurrentWord = word;
gCurrentWordLen = strlen(word);
#ifdef DEBUG
if ( gDebug ) {
char buf[MAX_WORD_LEN+1];
char buf[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr, "gCurrentWord now %s\n",
tileToAscii( buf, sizeof(buf), gCurrentWord) );
}
@ -450,47 +468,77 @@ readOneWord( char* wordBuf, int bufLen, int* lenp, bool* gotEOF )
bool dropWord = false;
bool done = false;
// for each byte
// for each byte, append to an internal buffer up to size limit.
// On reaching an end-of-word or EOF, check if the word formed is
// within the length range and contains no unknown chars. If yes,
// return it. If no, start over ONLY IF the terminator was not
// EOF.
for ( ; ; ) {
int byt = getc( gInFile );
// EOF is special: we don't try for another word even if
// dropWord is true; we must leave now.
if ( byt == EOF || byt == gTermChar ) {
*gotEOF = byt == EOF;
bool isEOF = byt == EOF;
*gotEOF = isEOF;
if ( !dropWord || *gotEOF ) {
if ( count != 0 ) {
wordBuf[count] = '\0';
result = wordBuf;
*lenp = count;
++gWordCount;
}
break; // we've finished a word
} else if ( *gotEOF ) {
assert( isEOF || count < bufLen );
if ( !dropWord && (count >= gLimLow) && (count <= gLimHigh) ) {
assert( count < bufLen );
wordBuf[count] = '\0';
result = wordBuf;
*lenp = count;
++gWordCount;
break;
} else if ( isEOF ) {
assert( !result );
break;
}
#ifdef DEBUG
if ( gDebug ) {
char buf[T2ABUFLEN(count)];
wordBuf[count] = '\0';
fprintf( stderr, "%s: dropping word (len=%d): %s\n", __func__,
count, tileToAscii( buf, sizeof(buf), wordBuf ) );
}
#endif
count = 0; // we'll start over
dropWord = false;
} else if ( count >= bufLen ) {
// Just drop it...
dropWord = true;
// Don't call into the hashtable twice here!!
} else if ( gTableHash.find(byt) != gTableHash.end() ) {
if ( !dropWord ) {
wordBuf[count++] = (char)gTableHash[byt];
if ( count >= bufLen ) {
char buf[MAX_WORD_LEN+1];
ERROR_EXIT( "no space for word %d (starting \"%s\")",
gWordCount,
tileToAscii( buf, sizeof(buf), wordBuf ));
}
assert( count < bufLen );
wordBuf[count++] = (char)gTableHash[byt];
if ( count >= bufLen ) {
char buf[T2ABUFLEN(count)];
ERROR_EXIT( "no space for word %d (starting \"%s\")",
gWordCount,
tileToAscii( buf, sizeof(buf), wordBuf ));
}
} else if ( gKillIfMissing || !dropWord ) {
char buf[T2ABUFLEN(count)];
wordBuf[count] = '\0';
tileToAscii( buf, sizeof(buf), wordBuf );
if ( gKillIfMissing ) {
ERROR_EXIT( "chr %c (%d) not in map file %s\n"
"last word was %s\n",
(char)byt, (int)byt, gTableFile, buf );
} else if ( !dropWord ) {
#ifdef DEBUG
if ( gDebug ) {
fprintf( stderr, "%s: chr %c (%d) not in map file %s\n"
"dropping partial word %s\n", __func__,
(char)byt, (int)byt, gTableFile, buf );
}
#endif
dropWord = true;
}
} else if ( gKillIfMissing ) {
char buf[MAX_WORD_LEN+1];
ERROR_EXIT( "chr %c (%d) not in map file %s\n"
"last word was %s\n",
byt, (int)byt, gTableFile,
tileToAscii( buf, sizeof(buf), wordBuf ) );
} else {
dropWord = true;
count = 0; // lose anything we already have
}
}
@ -511,40 +559,55 @@ readFromFile( void )
int len;
gDone = s_eof;
if ( !gDone ) {
word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof );
gDone = NULL == word;
}
if ( gDone ) {
word = "";
len = 0;
}
// Repeat until we get a new word that's not "out-of-order". When
// we see this the problem isn't failure to sort, it's duplicates.
// So dropping is ok. The alternative would be detecting dupes
// during the sort. This seems easier.
for ( ; ; ) {
if ( !gDone ) {
word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof );
gDone = NULL == word;
}
if ( gDone ) {
word = "";
len = 0;
}
int numCommonLetters = 0;
if ( gCurrentWordLen < len ) {
len = gCurrentWordLen;
}
int numCommonLetters = 0;
if ( gCurrentWordLen < len ) {
len = gCurrentWordLen;
}
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
&& numCommonLetters < len ) {
++numCommonLetters;
}
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
&& numCommonLetters < len ) {
++numCommonLetters;
}
gFirstDiff = numCommonLetters;
if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
&& !firstBeforeSecond( gCurrentWord, word ) ) {
char buf1[MAX_WORD_LEN+1];
char buf2[MAX_WORD_LEN+1];
ERROR_EXIT( "words %s and %s are out of order\n",
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
tileToAscii( buf2, sizeof(buf2), word ) );
gFirstDiff = numCommonLetters;
if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
&& !firstBeforeSecond( gCurrentWord, word ) ) {
#ifdef DEBUG
if ( gDebug ) {
char buf1[T2ABUFLEN(MAX_WORD_LEN)];
char buf2[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr,
"%s: words %s and %s are the same or out of order\n",
__func__,
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
tileToAscii( buf2, sizeof(buf2), word ) );
}
#endif
continue;
}
break;
}
gCurrentWordLen = strlen(word);
strncpy( gCurrentWordBuf, word, sizeof(gCurrentWordBuf) );
#ifdef DEBUG
if ( gDebug ) {
char buf[MAX_WORD_LEN+1];
char buf[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr, "gCurrentWord now %s\n",
tileToAscii( buf, sizeof(buf), gCurrentWord) );
}
@ -561,17 +624,26 @@ firstBeforeSecond( const char* lhs, const char* rhs )
static char*
tileToAscii( char* out, int outSize, const char* in )
{
char tiles[outSize];
int tilesLen = 1;
tiles[0] = '[';
char* orig = out;
for ( ; ; ) {
char ch = *in++;
if ( '\0' == ch ) {
*out = '\0';
break;
}
assert( ch < gRevMap.size() );
*out++ = gRevMap[ch];
tilesLen += sprintf( &tiles[tilesLen], "%d,", ch );
assert( (out - orig) < outSize );
}
tiles[tilesLen] = ']';
tiles[tilesLen+1] = '\0';
strcpy( out, tiles );
return orig;
}
@ -636,7 +708,7 @@ printWords( std::vector<char*>* strings )
{
std::vector<char*>::iterator iter = strings->begin();
while ( iter != strings->end() ) {
char buf[MAX_WORD_LEN+1];
char buf[T2ABUFLEN(MAX_WORD_LEN)];
tileToAscii( buf, sizeof(buf), *iter );
fprintf( stderr, "%s\n", buf );
++iter;
@ -760,6 +832,9 @@ makeTableHash( void )
if ( NULL == TABLEFILE ) {
ERROR_EXIT( "unable to open %s\n", gTableFile );
}
// Fill the 0th space since references are one-based
gRevMap.push_back(0);
for ( ii = 0; ; ++ii ) {
int ch = getc(TABLEFILE);
@ -817,7 +892,6 @@ emitNodes( unsigned int nBytesPerOutfile, const char* outFileBase )
gNBytesPerNode = 3;
} else {
if ( gBlankIndex == 32 ) { // blank
fprintf( stderr, "blank's at 32; 3-byte-nodes still ok\n" );
gNBytesPerNode = 3;
} else {
ERROR_EXIT( "move blank to last position in info.txt "
@ -994,8 +1068,10 @@ usage( const char* name )
{
fprintf( stderr, "usage: %s \n"
"\t[-v] (print version and exit)\n"
"\t[-poolsize] (print size of hardcoded pool and exit)\n"
"\t[-poolsize] (print hardcoded size of pool and exit)\n"
"\t[-b bytesPerFile] (default = 0xFFFFFFFF)\n"
"\t[-min <num in 0..15>]\n"
"\t[-max <num in 0..15>]\n"
"\t-m mapFile\n"
"\t-mn mapFile (unicode)\n"
"\t-ob outFileBase\n"
@ -1048,6 +1124,10 @@ parseARGV( int argc, char** argv, const char** inFileName )
} else if ( 0 == strcmp( arg, "-mn" ) ) {
gTableFile = argv[index++];
gUseUnicode = true;
} else if ( 0 == strcmp( arg, "-min" ) ) {
gLimLow = atoi(argv[index++]);
} else if ( 0 == strcmp( arg, "-max" ) ) {
gLimHigh = atoi(argv[index++]);
} else if ( 0 == strcmp( arg, "-m" ) ) {
gTableFile = argv[index++];
} else if ( 0 == strcmp( arg, "-ob" ) ) {
@ -1079,17 +1159,25 @@ parseARGV( int argc, char** argv, const char** inFileName )
gDebug = true;
#endif
} else {
ERROR_EXIT( "unexpected arg %s", arg );
ERROR_EXIT( "%s: unexpected arg %s", __func__, arg );
}
}
if ( gLimHigh > MAX_WORD_LEN || gLimLow > MAX_WORD_LEN ) {
usage( argv[0] );
exit(1);
}
#ifdef DEBUG
if ( gDebug ) {
fprintf( stderr, "gNBytesPerOutfile=$gNBytesPerOutfile\n" );
fprintf( stderr, "gTableFile=$gTableFile\n" );
fprintf( stderr, "gOutFileBase=$gOutFileBase\n" );
fprintf( stderr, "gStartNodeOut=$gStartNodeOut\n" );
fprintf( stderr, "gNBytesPerOutfile=%d\n", gNBytesPerOutfile );
fprintf( stderr, "gTableFile=%s\n", gTableFile );
fprintf( stderr, "gOutFileBase=%s\n", gOutFileBase );
fprintf( stderr, "gStartNodeOut=%s\n", gStartNodeOut );
fprintf( stderr, "gTermChar=%c(%d)\n", gTermChar, (int)gTermChar );
fprintf( stderr, "gFileSize=%d\n", gFileSize );
fprintf( stderr, "gLimLow=%d\n", gLimLow );
fprintf( stderr, "gLimHigh=%d\n", gLimHigh );
}
#endif
return gTableFile;

View file

@ -117,7 +117,7 @@ sub WriteMapFile($$$) {
} elsif ( $str =~ /(\d+)/ ) {
print $fhr pack( $packStr, $1 );
} else {
die "WriteMapFile: unrecognized face format $str";
die "WriteMapFile: unrecognized face format $str, elem $i";
}
}
} # WriteMapFile