Add support for Russian. So that Russian text can be processed on systems without setting LANG=ru_RU.CP1251, dict2dawg is modified to skip duplicate words and words outside the specified length range. All info.txt files are updated for the new scheme (which also involves a change to byod.cgi, which is not kept on SourceForge).

This commit is contained in:
ehouse 2007-02-17 17:06:05 +00:00
parent 326ecb00f4
commit 3bb2fb018f
15 changed files with 366 additions and 181 deletions

View file

@ -18,23 +18,21 @@ LANGCODE:da_DK
# deal with DOS files # deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' | LANGFILTER: tr -d '\r'
# uppercase all # uppercase all
LANGFILTER_POSTCLIP: | tr [a-zĺćř] [A-ZĹĆŘ] LANGFILTER: | tr [a-zåæø] [A-ZÅÆØ]
# no words not containing a vowel # no words not containing a vowel
LANGFILTER_POSTCLIP: | grep '[AEIOUYĹĆŘ]' LANGFILTER: | grep '[AEIOUYÅÆØ]'
# none with illegal chars # none with illegal chars
LANGFILTER_POSTCLIP: | grep '^[A-PR-VX-ZĹĆŘ]\+$' LANGFILTER: | grep '^[A-PR-VX-ZÅÆØ]\+$'
# remove duplicates # remove duplicates
LANGFILTER_POSTCLIP: | sort -u LANGFILTER: | sort -u
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
# Until I can figure out how to force sort to use a locale's collation # Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must # rules we can't trust sort in the filtering rules above and so must
# leave the sorting work to dict2dawg.pl. # leave the sorting work to dict2dawg.pl.
NEEDSSORT:true D2DARGS: -r -term 10
LANGINFO: <p>Danish uses all English letters except Q and W. There LANGINFO: <p>Danish uses all English letters except Q and W. There
LANGINFO: are three non-English letters: 'Å', 'Æ' and 'Ø'. </p> LANGINFO: are three non-English letters: 'Å', 'Æ' and 'Ø'. </p>

View file

@ -18,22 +18,19 @@ LANGCODE:nl_NL
# deal with DOS files # deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' | LANGFILTER: tr -d '\r'
# uppercase all # uppercase all
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z] LANGFILTER: | tr [a-z] [A-Z]
# no words not containing a vowel # no words not containing a vowel
LANGFILTER_POSTCLIP: | grep '[AEIOU]' LANGFILTER: | grep '[AEIOU]'
# none with illegal chars # none with illegal chars
LANGFILTER_POSTCLIP: | grep '^[A-Z]\+$' LANGFILTER: | grep '^[A-Z]\+$'
LANGFILTER_POSTCLIP: | sort -u LANGFILTER: | sort -u
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
# Until I can figure out how to force sort to use a locale's collation # Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must # rules we can't trust sort in the filtering rules above and so must
# leave the sorting work to dict2dawg.pl. # leave the sorting work to dict2dawg.pl.
D2DARGS: -r -term 10
NEEDSSORT:false
LANGINFO: <p>Dutch has the same 26 letters as English, though of LANGINFO: <p>Dutch has the same 26 letters as English, though of
LANGINFO: course the counts and values are different. Filtering rules LANGINFO: course the counts and values are different. Filtering rules

View file

@ -1,4 +1,4 @@
# -*-mode: Makefile -*- # -*-mode: Makefile; compile-command: "make -f Makefile.BasEnglish"; -*-
# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. # Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
# #
# This program is free software; you can redistribute it and/or # This program is free software; you can redistribute it and/or
@ -17,7 +17,7 @@
XWLANG=BasEnglish XWLANG=BasEnglish
LANGCODE=en_US LANGCODE=en_US
#NEWDAWG=1 DICT2DAWGARGS = -r -nosort
TARGET_TYPE ?= PALM TARGET_TYPE ?= PALM

View file

@ -17,16 +17,16 @@
LANGCODE:en_US LANGCODE:en_US
# deal with DOS files # deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' | LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-z] [A-Z]
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z] LANGFILTER: | grep '^[A-Z]*$'
LANGFILTER_POSTCLIP: | grep '^[A-Z]*$' LANGFILTER: | sort -u
LANGFILTER_POSTCLIP: | sort -u
# We can trust sort (above) to do the right thing since there's no # We can trust sort (above) to do the right thing since there's no
# high ascii. dict2dawg.pl is much faster if I can trust that its # high ascii. dict2dawg.pl is much faster if I can trust that its
# input is in sorted order. # input is in sorted order.
NEEDSSORT:false D2DARGS: -nosort -term 10
LANGINFO: <p>English dictionaries can contain words with any of the 26 LANGINFO: <p>English dictionaries can contain words with any of the 26
LANGINFO: letters you think of as making up the alphabet: A-Z. At LANGINFO: letters you think of as making up the alphabet: A-Z. At

View file

@ -17,16 +17,14 @@
LANGCODE:fr_FR LANGCODE:fr_FR
# deal with DOS files # deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' | LANGFILTER: tr -d '\r'
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z] LANGFILTER: | tr [a-z] [A-Z]
LANGFILTER_POSTCLIP: | grep '^[A-Z]*$' LANGFILTER: | grep '^[A-Z]*$'
LANGFILTER_POSTCLIP: | tr '\n' '\000' LANGFILTER: | tr '\n' '\000'
LANGFILTER_POSTCLIP: | sort -u -z LANGFILTER: | sort -u -z
NEEDSSORT:false
D2DARGS: -r -nosort -term 0
LANGINFO: <p>At this point French is getting treated the same as LANGINFO: <p>At this point French is getting treated the same as
LANGINFO: English. But I think I should be transforming accented LANGINFO: English. But I think I should be transforming accented

View file

@ -16,26 +16,21 @@
LANGCODE:de_DE LANGCODE:de_DE
# deal with DOS files # deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' | LANGFILTER: tr -d '\r'
# substitute for sharfes-s # substitute for sharfes-s
LANGFILTER_PRECLIP: sed -e 's/ß/SS/g' | LANGFILTER: | sed -e 's/ß/SS/g'
# uppercase all # uppercase all
LANGFILTER_POSTCLIP: | tr [a-zäöü] [A-ZÄÖÜ] LANGFILTER: | tr [a-zäöü] [A-ZÄÖÜ]
# no words not containing a vowel # no words not containing a vowel
LANGFILTER_POSTCLIP: | grep '[AEIOUÄÖÜ]' LANGFILTER: | grep '[AEIOUÄÖÜ]'
# none with illegal chars # none with illegal chars
LANGFILTER_POSTCLIP: | grep '^[A-ZÄÖÜ]\+$' LANGFILTER: | grep '^[A-ZÄÖÜ]\+$'
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
# Until I can figure out how to force sort to use a locale's collation # Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must # rules we can't trust sort in the filtering rules above and so must
# leave the sorting work to dict2dawg.pl. # leave the sorting work to dict2dawg.pl.
D2DARGS: -r -term 10
NEEDSSORT:true
LANGINFO: <p>German has the 26 English letters plus the three umlaut LANGINFO: <p>German has the 26 English letters plus the three umlaut
LANGINFO: vowels. Scharfes-s is not a legal tile, but if present in LANGINFO: vowels. Scharfes-s is not a legal tile, but if present in

View file

@ -18,16 +18,12 @@
LANGCODE:it_IT LANGCODE:it_IT
# deal with DOS files # deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' | LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-z] [A-Z]
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z] LANGFILTER: | grep '^[A-IL-VZ]*$'
LANGFILTER_POSTCLIP: | grep '^[A-IL-VZ]*$' LANGFILTER: | sort -u
LANGFILTER_POSTCLIP: | tr '\n' '\000'
LANGFILTER_POSTCLIP: | sort -u -z
NEEDSSORT:false
D2DARGS: -r -term 10 -nosort
LANGINFO: <p>Italian is treated the same as English but for LANGINFO: <p>Italian is treated the same as English but for
LANGINFO: missing letters J, K, W, X and Y.</p> LANGINFO: missing letters J, K, W, X and Y.</p>

View file

@ -197,7 +197,7 @@ frankspecials.bin: ../frank_mkspecials.pl $(BMPFILES)
# a binary file (one byte) giving the number of tiles in the dict # a binary file (one byte) giving the number of tiles in the dict
charcount.bin: table.bin charcount.bin: table.bin
ifdef NEWDAWG ifdef NEWDAWG
siz=$$(wc $< | awk '{print $$3}'); \ siz=$$(ls -l $< | awk '{print $$5}'); \
perl -e "print pack(\"c\",$$siz/2)" > $@ perl -e "print pack(\"c\",$$siz/2)" > $@
else else
siz=$$(wc -c $< | sed -e 's/$<//'); \ siz=$$(wc -c $< | sed -e 's/$<//'); \
@ -240,11 +240,10 @@ dawg$(XWLANG)%.stamp: $(XWLANG)Main.dict.gz $(DICT2DAWG) table.bin ../Makefile.l
start=$$(echo $@ | sed -e 's/dawg$(XWLANG)\([0-9]*\)to[0-9]*.stamp/\1/'); \ start=$$(echo $@ | sed -e 's/dawg$(XWLANG)\([0-9]*\)to[0-9]*.stamp/\1/'); \
end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \ end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \
echo $${start} and $$end; \ echo $${start} and $$end; \
zcat $< | grep "^.\{$${start},$${end}\}$$" | \ zcat $< | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \
sort -u | $(DICT2DAWG) $(TABLE_ARG) table.bin -b 28000 \
-ob dawg$(XWLANG)$* \ -ob dawg$(XWLANG)$* \
-sn $(XWLANG)StartLoc.bin -k -term 10 -wc $(XWLANG)$*_wordcount.bin \ -sn $(XWLANG)StartLoc.bin -min $$start -max $$end \
$(FORCE_4) -ns $(XWLANG)$*_nodesize.bin -wc $(XWLANG)$*_wordcount.bin $(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
touch $@ touch $@
$(XWLANG)%_wordcount.bin: dawg$(XWLANG)%.stamp $(XWLANG)%_wordcount.bin: dawg$(XWLANG)%.stamp

View file

@ -17,22 +17,19 @@
LANGCODE:pt_PT LANGCODE:pt_PT
# deal with DOS files # deal with DOS files
LANGFILTER_PRECLIP: tr -d '\r' | LANGFILTER: tr -d '\r'
# uppercase all # uppercase all
LANGFILTER_POSTCLIP: | tr [a-zç] [A-ZÇ] LANGFILTER: | tr [a-zç] [A-ZÇ]
# no words not containing a vowel # no words not containing a vowel
LANGFILTER_POSTCLIP: | grep '[AEIOU]' LANGFILTER: | grep '[AEIOU]'
# none with illegal chars # none with illegal chars
LANGFILTER_POSTCLIP: | grep '^[A-JL-VXZÇ]\+$' LANGFILTER: | grep '^[A-JL-VXZÇ]\+$'
LANGFILTER_POSTCLIP: | sort -u
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
# Until I can figure out how to force sort to use a locale's collation # Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must # rules we can't trust sort in the filtering rules above and so must
# leave the sorting work to dict2dawg.pl. # leave the sorting work to dict2dawg.pl.
D2DARGS: -r -term 10
NEEDSSORT:true
LANGINFO: <p>Portugese uses the letter A-Z, excluding K, W and Y, and adds LANGINFO: <p>Portugese uses the letter A-Z, excluding K, W and Y, and adds
LANGINFO: Ç. Words containing any other letters are dropped. </p> LANGINFO: Ç. Words containing any other letters are dropped. </p>

41
dawg/Russian/Makefile Normal file
View file

@ -0,0 +1,41 @@
# -*- mode: makefile -*-
# Copyright 2002-2007 by Eric House (xwords@eehouse.org). All rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
XWLANG=Russian
LANGCODE=ru_RU
DICT2DAWGARGS = -r
TARGET_TYPE ?= WINCE
include ../Makefile.2to8
include ../Makefile.langcommon
SOURCEDICT ?= $(XWDICTPATH)/$(XWLANG)/RU5000.txt.gz
$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile
zcat $< | tr -d '\r' | \
tr [àáâãäåæçèéêëìíîïðñòóôõö÷øùÚûüýþÿ] [ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß] | \
gzip -c > $@
# Everything but creating of the Main.dict file is inherited from the
# "parent" Makefile.langcommon in the parent directory.
clean: clean_common
rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb

76
dawg/Russian/info.txt Normal file
View file

@ -0,0 +1,76 @@
# Copyright 2002,2007 by Eric House (xwords@eehouse.org). All rights
# reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
LANGCODE:ru_RU
CHARSET:windows-1251
# deal with DOS files
LANGFILTER: tr -d '\r'
# uppercase all
LANGFILTER: | tr [àáâãäåæçèéêëìíîïðñòóôõö÷øùÚûüýþÿ] [ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß]
# LANGFILTER: | tr -s '\n' '\000'
# note: don't turn off sorting! Can't do it with GNU 'sort' without
# setting LANG
D2DARGS: -r -term 10
LANGINFO: <p>Russian wordlists must be in the Windows-1251
LANGINFO: codepage. Lower-case letters are converted to upper case and
LANGINFO: any words that contain letters not listed below are
LANGINFO: removed.</p>
# High bit means "official". Next 7 bits are an enum where
# Russian==0x0F. Low byte is padding.
XLOC_HEADER:0x8F00
<BEGIN_TILES>
8 1 'À'
2 3 'Á'
4 1 'Â'
2 3 'Ã'
2 2 'Ä'
7 1 'Å'
1 4 'Æ'
1 3 'Ç'
7 1 'È'
1 2 'É'
4 2 'Ê'
4 2 'Ë'
2 3 'Ì'
4 1 'Í'
9 1 'Î'
4 2 'Ï'
5 1 'Ð'
5 1 'Ñ'
7 1 'Ò'
4 2 'Ó'
1 5 'Ô'
1 4 'Õ'
1 4 'Ö'
1 3 '×'
1 4 'Ø'
1 5 'Ù'
1 10 'Ú'
2 2 'Û'
4 1 'Ü'
1 8 'Ý'
1 5 'Þ'
2 2 'ß'
2 0 {"_"}
<END_TILES>
# should ignore all after the <END_TILES> above

View file

@ -21,24 +21,25 @@
NEEDSSORT:true NEEDSSORT:true
# MSDos LF chars go bye-bye # MSDos LF chars go bye-bye
LANGFILTER_PRECLIP: tr -d '\r' | LANGFILTER: tr -d '\r'
# convert accented vowels # convert accented vowels
LANGFILTER_POSTCLIP: | tr '\207\216\222\227\234\237\226' 'aeiouu\321' LANGFILTER: | tr '\207\216\222\227\234\237\226' 'aeiouu\321'
# uppercase # uppercase
LANGFILTER_POSTCLIP: | tr [a-zń] [A-ZŃ] LANGFILTER: | tr [a-zñ] [A-ZÑ]
# remove words with illegal letters # remove words with illegal letters
LANGFILTER_POSTCLIP: | grep '^[[A-JL-VX-ZŃ]*$' LANGFILTER: | grep '^[[A-JL-VX-ZÑ]*$'
# substitute pairs (can't figure out how to use octal values) # substitute pairs (can't figure out how to use octal values)
LANGFILTER_POSTCLIP: | sed 's/CH/1/g' LANGFILTER: | sed 's/CH/1/g'
LANGFILTER_POSTCLIP: | sed 's/LL/2/g' LANGFILTER: | sed 's/LL/2/g'
LANGFILTER_POSTCLIP: | sed 's/RR/3/g' LANGFILTER: | sed 's/RR/3/g'
# substitute in the octal control character values # substitute in the octal control character values
LANGFILTER_POSTCLIP: | tr '123' '\001\002\003' LANGFILTER: | tr '123' '\001\002\003'
# now add nulls as terminators # now add nulls as terminators
LANGFILTER_POSTCLIP: | tr -s '\n' '\000' LANGFILTER: | tr -s '\n' '\000'
LANGFILTER_POSTCLIP: | sort -u -z LANGFILTER: | sort -u -z
D2DARGS: -r -term 0
LANGINFO: <p>Spanish words include all letters in the English alphabet LANGINFO: <p>Spanish words include all letters in the English alphabet
LANGINFO: except "K" and "W", and with "Ñ" added. Since there are no LANGINFO: except "K" and "W", and with "Ñ" added. Since there are no

View file

@ -16,12 +16,11 @@
LANGCODE:sv_SE LANGCODE:sv_SE
LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ]
LANGFILTER: | grep '^[A-ZÄÅÆÖÜ]*$'
LANGFILTER_POSTCLIP: | tr [a-zäĺćöü] [A-ZÄĹĆÖÜ] D2DARGS: -r -term 10
LANGFILTER_POSTCLIP: | grep '^[A-ZÄĹĆÖÜ]*$'
LANGFILTER_POSTCLIP: | tr '\n' '\000'
NEEDSSORT:true
LANGINFO: <p>From an English-speaker's perspective, Swedish drops Q LANGINFO: <p>From an English-speaker's perspective, Swedish drops Q
LANGINFO: and W, and adds Ä, Å, Æ, Ö and Ü.</p> LANGINFO: and W, and adds Ä, Å, Æ, Ö and Ü.</p>

View file

@ -1,9 +1,9 @@
/* -*- compile-command: "g++ -O -o dict2dawg dict2dawg.cpp"; -*- */ /* -*- compile-command: "g++ -DDEBUG -O -o dict2dawg dict2dawg.cpp"; -*- */
/************************************************************************* /*************************************************************************
* adapted from perl code that was itself adapted from C++ code * adapted from perl code that was itself adapted from C++ code
* Copyright (C) 2000 Falk Hueffner * Copyright (C) 2000 Falk Hueffner
* This version Copyright (C) 2002,2006 Eric House (xwords@eehouse.org) * This version Copyright (C) 2002,2006-2007 Eric House (xwords@eehouse.org)
* *
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@ -54,9 +54,11 @@ typedef unsigned int Node;
typedef std::vector<Node> NodeList; typedef std::vector<Node> NodeList;
typedef std::vector<char*> WordList; typedef std::vector<char*> WordList;
#define MAX_WORD_LEN 15
#define VERSION_STR "$Rev$" #define VERSION_STR "$Rev$"
#define MAX_WORD_LEN 15
#define T2ABUFLEN(s) (((s)*4)+3)
int gFirstDiff; int gFirstDiff;
static char gCurrentWordBuf[MAX_WORD_LEN+1] = { '\0' }; static char gCurrentWordBuf[MAX_WORD_LEN+1] = { '\0' };
@ -92,10 +94,12 @@ bool gForceFour = false; // use four bytes regardless of need?
static int gFileSize = 0; static int gFileSize = 0;
int gNBytesPerNode; int gNBytesPerNode;
bool gUseUnicode; bool gUseUnicode;
int gLimLow = 2;
int gLimHigh = MAX_WORD_LEN;
// OWL is 1.7M // OWL is 1.7M
#define MAX_POOL_SIZE (3 * 0x100000) #define MAX_POOL_SIZE (5 * 0x100000)
#define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ ); #define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ );
static char* parseARGV( int argc, char** argv, const char** inFileName ); static char* parseARGV( int argc, char** argv, const char** inFileName );
@ -182,7 +186,8 @@ main( int argc, char** argv )
unsigned long be = htonl( gWordCount ); unsigned long be = htonl( gWordCount );
fwrite( &be, sizeof(be), 1, OFILE ); fwrite( &be, sizeof(be), 1, OFILE );
fclose( OFILE ); fclose( OFILE );
fprintf( stderr, "wrote out: got %d words\n", gWordCount ); fprintf( stderr, "Wrote %d (word count) to %s\n", gWordCount,
gCountFile );
} }
if ( gOutFileBase ) { if ( gOutFileBase ) {
@ -393,49 +398,62 @@ readFromSortedArray( void )
#endif #endif
} }
char* word = ""; for ( ; ; ) {
char* word = "";
if ( !gDone ) {
gDone = gNextWordIndex == sInputStrings->size();
if ( !gDone ) { if ( !gDone ) {
word = sInputStrings->at(gNextWordIndex++); gDone = gNextWordIndex == sInputStrings->size();
if ( !gDone ) {
word = sInputStrings->at(gNextWordIndex++);
#ifdef DEBUG #ifdef DEBUG
} else if ( gDebug ) { } else if ( gDebug ) {
fprintf( stderr, "gDone set to true\n" ); fprintf( stderr, "gDone set to true\n" );
#endif
}
#ifdef DEBUG
if ( gDebug ) {
char buf[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr, "%s: got word: %s\n", __func__,
tileToAscii( buf, sizeof(buf), word ) );
}
#endif #endif
} }
#ifdef DEBUG int numCommonLetters = 0;
if ( gDebug ) { int len = strlen( word );
fprintf( stderr, "got word: %s\n", word ); if ( gCurrentWordLen < len ) {
len = gCurrentWordLen;
} }
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
&& numCommonLetters < len ) {
++numCommonLetters;
}
gFirstDiff = numCommonLetters;
if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
&& !firstBeforeSecond( gCurrentWord, word ) ) {
#ifdef DEBUG
if ( gDebug ) {
char buf1[T2ABUFLEN(MAX_WORD_LEN)];
char buf2[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr,
"%s: words %s and %s are the same or out of order\n",
__func__,
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
tileToAscii( buf2, sizeof(buf2), word ) );
}
#endif #endif
continue;
}
gCurrentWord = word;
gCurrentWordLen = strlen(word);
break;
} }
int numCommonLetters = 0;
int len = strlen( word );
if ( gCurrentWordLen < len ) {
len = gCurrentWordLen;
}
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
&& numCommonLetters < len ) {
++numCommonLetters;
}
gFirstDiff = numCommonLetters;
if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
&& !firstBeforeSecond( gCurrentWord, word ) ) {
char buf1[MAX_WORD_LEN+1];
char buf2[MAX_WORD_LEN+1];
ERROR_EXIT( "words %s and %s are out of order\n",
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
tileToAscii( buf2, sizeof(buf2), word ) );
}
gCurrentWord = word;
gCurrentWordLen = strlen(word);
#ifdef DEBUG #ifdef DEBUG
if ( gDebug ) { if ( gDebug ) {
char buf[MAX_WORD_LEN+1]; char buf[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr, "gCurrentWord now %s\n", fprintf( stderr, "gCurrentWord now %s\n",
tileToAscii( buf, sizeof(buf), gCurrentWord) ); tileToAscii( buf, sizeof(buf), gCurrentWord) );
} }
@ -450,47 +468,77 @@ readOneWord( char* wordBuf, int bufLen, int* lenp, bool* gotEOF )
bool dropWord = false; bool dropWord = false;
bool done = false; bool done = false;
// for each byte // for each byte, append to an internal buffer up to size limit.
// On reaching an end-of-word or EOF, check if the word formed is
// within the length range and contains no unknown chars. If yes,
// return it. If no, start over ONLY IF the terminator was not
// EOF.
for ( ; ; ) { for ( ; ; ) {
int byt = getc( gInFile ); int byt = getc( gInFile );
// EOF is special: we don't try for another word even if // EOF is special: we don't try for another word even if
// dropWord is true; we must leave now. // dropWord is true; we must leave now.
if ( byt == EOF || byt == gTermChar ) { if ( byt == EOF || byt == gTermChar ) {
*gotEOF = byt == EOF; bool isEOF = byt == EOF;
*gotEOF = isEOF;
if ( !dropWord || *gotEOF ) { assert( isEOF || count < bufLen );
if ( count != 0 ) { if ( !dropWord && (count >= gLimLow) && (count <= gLimHigh) ) {
wordBuf[count] = '\0'; assert( count < bufLen );
result = wordBuf; wordBuf[count] = '\0';
*lenp = count; result = wordBuf;
++gWordCount; *lenp = count;
} ++gWordCount;
break; // we've finished a word
} else if ( *gotEOF ) {
break; break;
} else if ( isEOF ) {
assert( !result );
break;
}
#ifdef DEBUG
if ( gDebug ) {
char buf[T2ABUFLEN(count)];
wordBuf[count] = '\0';
fprintf( stderr, "%s: dropping word (len=%d): %s\n", __func__,
count, tileToAscii( buf, sizeof(buf), wordBuf ) );
} }
#endif
count = 0; // we'll start over
dropWord = false;
} else if ( count >= bufLen ) {
// Just drop it...
dropWord = true;
// Don't call into the hashtable twice here!! // Don't call into the hashtable twice here!!
} else if ( gTableHash.find(byt) != gTableHash.end() ) { } else if ( gTableHash.find(byt) != gTableHash.end() ) {
if ( !dropWord ) { assert( count < bufLen );
wordBuf[count++] = (char)gTableHash[byt]; wordBuf[count++] = (char)gTableHash[byt];
if ( count >= bufLen ) { if ( count >= bufLen ) {
char buf[MAX_WORD_LEN+1]; char buf[T2ABUFLEN(count)];
ERROR_EXIT( "no space for word %d (starting \"%s\")", ERROR_EXIT( "no space for word %d (starting \"%s\")",
gWordCount, gWordCount,
tileToAscii( buf, sizeof(buf), wordBuf )); tileToAscii( buf, sizeof(buf), wordBuf ));
} }
} else if ( gKillIfMissing || !dropWord ) {
char buf[T2ABUFLEN(count)];
wordBuf[count] = '\0';
tileToAscii( buf, sizeof(buf), wordBuf );
if ( gKillIfMissing ) {
ERROR_EXIT( "chr %c (%d) not in map file %s\n"
"last word was %s\n",
(char)byt, (int)byt, gTableFile, buf );
} else if ( !dropWord ) {
#ifdef DEBUG
if ( gDebug ) {
fprintf( stderr, "%s: chr %c (%d) not in map file %s\n"
"dropping partial word %s\n", __func__,
(char)byt, (int)byt, gTableFile, buf );
}
#endif
dropWord = true;
} }
} else if ( gKillIfMissing ) {
char buf[MAX_WORD_LEN+1];
ERROR_EXIT( "chr %c (%d) not in map file %s\n"
"last word was %s\n",
byt, (int)byt, gTableFile,
tileToAscii( buf, sizeof(buf), wordBuf ) );
} else {
dropWord = true;
count = 0; // lose anything we already have
} }
} }
@ -511,40 +559,55 @@ readFromFile( void )
int len; int len;
gDone = s_eof; gDone = s_eof;
if ( !gDone ) {
word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof ); // Repeat until we get a new word that's not "out-of-order". When
gDone = NULL == word; // we see this the problem isn't failure to sort, it's duplicates.
} // So dropping is ok. The alternative would be detecting dupes
if ( gDone ) { // during the sort. This seems easier.
word = ""; for ( ; ; ) {
len = 0; if ( !gDone ) {
} word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof );
gDone = NULL == word;
}
if ( gDone ) {
word = "";
len = 0;
}
int numCommonLetters = 0; int numCommonLetters = 0;
if ( gCurrentWordLen < len ) { if ( gCurrentWordLen < len ) {
len = gCurrentWordLen; len = gCurrentWordLen;
} }
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters] while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
&& numCommonLetters < len ) { && numCommonLetters < len ) {
++numCommonLetters; ++numCommonLetters;
} }
gFirstDiff = numCommonLetters; gFirstDiff = numCommonLetters;
if ( (gCurrentWordLen > 0) && (strlen(word) > 0) if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
&& !firstBeforeSecond( gCurrentWord, word ) ) { && !firstBeforeSecond( gCurrentWord, word ) ) {
char buf1[MAX_WORD_LEN+1]; #ifdef DEBUG
char buf2[MAX_WORD_LEN+1]; if ( gDebug ) {
ERROR_EXIT( "words %s and %s are out of order\n", char buf1[T2ABUFLEN(MAX_WORD_LEN)];
tileToAscii( buf1, sizeof(buf1), gCurrentWord ), char buf2[T2ABUFLEN(MAX_WORD_LEN)];
tileToAscii( buf2, sizeof(buf2), word ) ); fprintf( stderr,
"%s: words %s and %s are the smae or out of order\n",
__func__,
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
tileToAscii( buf2, sizeof(buf2), word ) );
}
#endif
continue;
}
break;
} }
gCurrentWordLen = strlen(word); gCurrentWordLen = strlen(word);
strncpy( gCurrentWordBuf, word, sizeof(gCurrentWordBuf) ); strncpy( gCurrentWordBuf, word, sizeof(gCurrentWordBuf) );
#ifdef DEBUG #ifdef DEBUG
if ( gDebug ) { if ( gDebug ) {
char buf[MAX_WORD_LEN+1]; char buf[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr, "gCurrentWord now %s\n", fprintf( stderr, "gCurrentWord now %s\n",
tileToAscii( buf, sizeof(buf), gCurrentWord) ); tileToAscii( buf, sizeof(buf), gCurrentWord) );
} }
@ -561,17 +624,26 @@ firstBeforeSecond( const char* lhs, const char* rhs )
static char* static char*
tileToAscii( char* out, int outSize, const char* in ) tileToAscii( char* out, int outSize, const char* in )
{ {
char tiles[outSize];
int tilesLen = 1;
tiles[0] = '[';
char* orig = out; char* orig = out;
for ( ; ; ) { for ( ; ; ) {
char ch = *in++; char ch = *in++;
if ( '\0' == ch ) { if ( '\0' == ch ) {
*out = '\0';
break; break;
} }
assert( ch < gRevMap.size() ); assert( ch < gRevMap.size() );
*out++ = gRevMap[ch]; *out++ = gRevMap[ch];
tilesLen += sprintf( &tiles[tilesLen], "%d,", ch );
assert( (out - orig) < outSize ); assert( (out - orig) < outSize );
} }
tiles[tilesLen] = ']';
tiles[tilesLen+1] = '\0';
strcpy( out, tiles );
return orig; return orig;
} }
@ -636,7 +708,7 @@ printWords( std::vector<char*>* strings )
{ {
std::vector<char*>::iterator iter = strings->begin(); std::vector<char*>::iterator iter = strings->begin();
while ( iter != strings->end() ) { while ( iter != strings->end() ) {
char buf[MAX_WORD_LEN+1]; char buf[T2ABUFLEN(MAX_WORD_LEN)];
tileToAscii( buf, sizeof(buf), *iter ); tileToAscii( buf, sizeof(buf), *iter );
fprintf( stderr, "%s\n", buf ); fprintf( stderr, "%s\n", buf );
++iter; ++iter;
@ -760,6 +832,9 @@ makeTableHash( void )
if ( NULL == TABLEFILE ) { if ( NULL == TABLEFILE ) {
ERROR_EXIT( "unable to open %s\n", gTableFile ); ERROR_EXIT( "unable to open %s\n", gTableFile );
} }
// Fill the 0th space since references are one-based
gRevMap.push_back(0);
for ( ii = 0; ; ++ii ) { for ( ii = 0; ; ++ii ) {
int ch = getc(TABLEFILE); int ch = getc(TABLEFILE);
@ -817,7 +892,6 @@ emitNodes( unsigned int nBytesPerOutfile, const char* outFileBase )
gNBytesPerNode = 3; gNBytesPerNode = 3;
} else { } else {
if ( gBlankIndex == 32 ) { // blank if ( gBlankIndex == 32 ) { // blank
fprintf( stderr, "blank's at 32; 3-byte-nodes still ok\n" );
gNBytesPerNode = 3; gNBytesPerNode = 3;
} else { } else {
ERROR_EXIT( "move blank to last position in info.txt " ERROR_EXIT( "move blank to last position in info.txt "
@ -994,8 +1068,10 @@ usage( const char* name )
{ {
fprintf( stderr, "usage: %s \n" fprintf( stderr, "usage: %s \n"
"\t[-v] (print version and exit)\n" "\t[-v] (print version and exit)\n"
"\t[-poolsize] (print size of hardcoded pool and exit)\n" "\t[-poolsize] (print hardcoded size of pool and exit)\n"
"\t[-b bytesPerFile] (default = 0xFFFFFFFF)\n" "\t[-b bytesPerFile] (default = 0xFFFFFFFF)\n"
"\t[-min <num in 0..15>]\n"
"\t[-max <num in 0..15>]\n"
"\t-m mapFile\n" "\t-m mapFile\n"
"\t-mn mapFile (unicode)\n" "\t-mn mapFile (unicode)\n"
"\t-ob outFileBase\n" "\t-ob outFileBase\n"
@ -1048,6 +1124,10 @@ parseARGV( int argc, char** argv, const char** inFileName )
} else if ( 0 == strcmp( arg, "-mn" ) ) { } else if ( 0 == strcmp( arg, "-mn" ) ) {
gTableFile = argv[index++]; gTableFile = argv[index++];
gUseUnicode = true; gUseUnicode = true;
} else if ( 0 == strcmp( arg, "-min" ) ) {
gLimLow = atoi(argv[index++]);
} else if ( 0 == strcmp( arg, "-max" ) ) {
gLimHigh = atoi(argv[index++]);
} else if ( 0 == strcmp( arg, "-m" ) ) { } else if ( 0 == strcmp( arg, "-m" ) ) {
gTableFile = argv[index++]; gTableFile = argv[index++];
} else if ( 0 == strcmp( arg, "-ob" ) ) { } else if ( 0 == strcmp( arg, "-ob" ) ) {
@ -1079,17 +1159,25 @@ parseARGV( int argc, char** argv, const char** inFileName )
gDebug = true; gDebug = true;
#endif #endif
} else { } else {
ERROR_EXIT( "unexpected arg %s", arg ); ERROR_EXIT( "%s: unexpected arg %s", __func__, arg );
} }
} }
if ( gLimHigh > MAX_WORD_LEN || gLimLow > MAX_WORD_LEN ) {
usage( argv[0] );
exit(1);
}
#ifdef DEBUG #ifdef DEBUG
if ( gDebug ) { if ( gDebug ) {
fprintf( stderr, "gNBytesPerOutfile=$gNBytesPerOutfile\n" ); fprintf( stderr, "gNBytesPerOutfile=%d\n", gNBytesPerOutfile );
fprintf( stderr, "gTableFile=$gTableFile\n" ); fprintf( stderr, "gTableFile=%s\n", gTableFile );
fprintf( stderr, "gOutFileBase=$gOutFileBase\n" ); fprintf( stderr, "gOutFileBase=%s\n", gOutFileBase );
fprintf( stderr, "gStartNodeOut=$gStartNodeOut\n" ); fprintf( stderr, "gStartNodeOut=%s\n", gStartNodeOut );
fprintf( stderr, "gTermChar=%c(%d)\n", gTermChar, (int)gTermChar ); fprintf( stderr, "gTermChar=%c(%d)\n", gTermChar, (int)gTermChar );
fprintf( stderr, "gFileSize=%d\n", gFileSize );
fprintf( stderr, "gLimLow=%d\n", gLimLow );
fprintf( stderr, "gLimHigh=%d\n", gLimHigh );
} }
#endif #endif
return gTableFile; return gTableFile;

View file

@ -117,7 +117,7 @@ sub WriteMapFile($$$) {
} elsif ( $str =~ /(\d+)/ ) { } elsif ( $str =~ /(\d+)/ ) {
print $fhr pack( $packStr, $1 ); print $fhr pack( $packStr, $1 );
} else { } else {
die "WriteMapFile: unrecognized face format $str"; die "WriteMapFile: unrecognized face format $str, elem $i";
} }
} }
} # WriteMapFile } # WriteMapFile