mirror of
git://xwords.git.sourceforge.net/gitroot/xwords/xwords
synced 2025-01-28 07:58:08 +01:00
Add support for Russian. So that Russian text can be processed on systems without setting LANG=ru_RU.CP1251, modify dict2dawg to skip duplicates and words outside of specified lengths. Modify all info.txt files for the new scheme (which includes change to byod.cgi not kept on sourceforge.)
This commit is contained in:
parent
326ecb00f4
commit
3bb2fb018f
15 changed files with 366 additions and 181 deletions
|
@ -18,23 +18,21 @@ LANGCODE:da_DK
|
|||
|
||||
|
||||
# deal with DOS files
|
||||
LANGFILTER_PRECLIP: tr -d '\r' |
|
||||
|
||||
LANGFILTER: tr -d '\r'
|
||||
# uppercase all
|
||||
LANGFILTER_POSTCLIP: | tr [a-zĺćř] [A-ZĹĆŘ]
|
||||
LANGFILTER: | tr [a-zåæø] [A-ZÅÆØ]
|
||||
# no words not containing a vowel
|
||||
LANGFILTER_POSTCLIP: | grep '[AEIOUYĹĆŘ]'
|
||||
LANGFILTER: | grep '[AEIOUYÅÆØ]'
|
||||
# none with illegal chars
|
||||
LANGFILTER_POSTCLIP: | grep '^[A-PR-VX-ZĹĆŘ]\+$'
|
||||
LANGFILTER: | grep '^[A-PR-VX-ZÅÆØ]\+$'
|
||||
# remove duplicates
|
||||
LANGFILTER_POSTCLIP: | sort -u
|
||||
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
|
||||
LANGFILTER: | sort -u
|
||||
|
||||
# Until I can figure out how to force sort to use a locale's collation
|
||||
# rules we can't trust sort in the filtering rules above and so must
|
||||
# leave the sorting work to dict2dawg.pl.
|
||||
|
||||
NEEDSSORT:true
|
||||
D2DARGS: -r -term 10
|
||||
|
||||
LANGINFO: <p>Danish uses all English letters except Q and W. There
|
||||
LANGINFO: are three non-English letters: 'Å', 'Æ' and 'Ø'. </p>
|
||||
|
|
|
@ -18,22 +18,19 @@ LANGCODE:nl_NL
|
|||
|
||||
|
||||
# deal with DOS files
|
||||
LANGFILTER_PRECLIP: tr -d '\r' |
|
||||
|
||||
LANGFILTER: tr -d '\r'
|
||||
# uppercase all
|
||||
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z]
|
||||
LANGFILTER: | tr [a-z] [A-Z]
|
||||
# no words not containing a vowel
|
||||
LANGFILTER_POSTCLIP: | grep '[AEIOU]'
|
||||
LANGFILTER: | grep '[AEIOU]'
|
||||
# none with illegal chars
|
||||
LANGFILTER_POSTCLIP: | grep '^[A-Z]\+$'
|
||||
LANGFILTER_POSTCLIP: | sort -u
|
||||
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
|
||||
LANGFILTER: | grep '^[A-Z]\+$'
|
||||
LANGFILTER: | sort -u
|
||||
|
||||
# Until I can figure out how to force sort to use a locale's collation
|
||||
# rules we can't trust sort in the filtering rules above and so must
|
||||
# leave the sorting work to dict2dawg.pl.
|
||||
|
||||
NEEDSSORT:false
|
||||
D2DARGS: -r -term 10
|
||||
|
||||
LANGINFO: <p>Dutch has the same 26 letters as English, though of
|
||||
LANGINFO: course the counts and values are different. Filtering rules
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# -*-mode: Makefile -*-
|
||||
# -*-mode: Makefile; compile-command: "make -f Makefile.BasEnglish"; -*-
|
||||
# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
|
@ -17,7 +17,7 @@
|
|||
|
||||
XWLANG=BasEnglish
|
||||
LANGCODE=en_US
|
||||
#NEWDAWG=1
|
||||
DICT2DAWGARGS = -r -nosort
|
||||
|
||||
TARGET_TYPE ?= PALM
|
||||
|
||||
|
|
|
@ -17,16 +17,16 @@
|
|||
LANGCODE:en_US
|
||||
|
||||
# deal with DOS files
|
||||
LANGFILTER_PRECLIP: tr -d '\r' |
|
||||
|
||||
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z]
|
||||
LANGFILTER_POSTCLIP: | grep '^[A-Z]*$'
|
||||
LANGFILTER_POSTCLIP: | sort -u
|
||||
LANGFILTER: tr -d '\r'
|
||||
LANGFILTER: | tr [a-z] [A-Z]
|
||||
LANGFILTER: | grep '^[A-Z]*$'
|
||||
LANGFILTER: | sort -u
|
||||
|
||||
# We can trust sort (above) to do the right thing since there's no
|
||||
# high ascii. dict2dawg.pl is much faster if I can trust that its
|
||||
# input is in sorted order.
|
||||
NEEDSSORT:false
|
||||
D2DARGS: -nosort -term 10
|
||||
|
||||
|
||||
LANGINFO: <p>English dictionaries can contain words with any of the 26
|
||||
LANGINFO: letters you think of as making up the alphabet: A-Z. At
|
||||
|
|
|
@ -17,16 +17,14 @@
|
|||
LANGCODE:fr_FR
|
||||
|
||||
# deal with DOS files
|
||||
LANGFILTER_PRECLIP: tr -d '\r' |
|
||||
LANGFILTER: tr -d '\r'
|
||||
|
||||
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z]
|
||||
LANGFILTER_POSTCLIP: | grep '^[A-Z]*$'
|
||||
LANGFILTER_POSTCLIP: | tr '\n' '\000'
|
||||
LANGFILTER_POSTCLIP: | sort -u -z
|
||||
|
||||
|
||||
NEEDSSORT:false
|
||||
LANGFILTER: | tr [a-z] [A-Z]
|
||||
LANGFILTER: | grep '^[A-Z]*$'
|
||||
LANGFILTER: | tr '\n' '\000'
|
||||
LANGFILTER: | sort -u -z
|
||||
|
||||
D2DARGS: -r -nosort -term 0
|
||||
|
||||
LANGINFO: <p>At this point French is getting treated the same as
|
||||
LANGINFO: English. But I think I should be transforming accented
|
||||
|
|
|
@ -16,26 +16,21 @@
|
|||
|
||||
LANGCODE:de_DE
|
||||
|
||||
|
||||
# deal with DOS files
|
||||
LANGFILTER_PRECLIP: tr -d '\r' |
|
||||
|
||||
LANGFILTER: tr -d '\r'
|
||||
# substitute for scharfes-s
|
||||
LANGFILTER_PRECLIP: sed -e 's/ß/SS/g' |
|
||||
|
||||
LANGFILTER: | sed -e 's/ß/SS/g'
|
||||
# uppercase all
|
||||
LANGFILTER_POSTCLIP: | tr [a-zäöü] [A-ZÄÖÜ]
|
||||
LANGFILTER: | tr [a-zäöü] [A-ZÄÖÜ]
|
||||
# no words not containing a vowel
|
||||
LANGFILTER_POSTCLIP: | grep '[AEIOUÄÖÜ]'
|
||||
LANGFILTER: | grep '[AEIOUÄÖÜ]'
|
||||
# none with illegal chars
|
||||
LANGFILTER_POSTCLIP: | grep '^[A-ZÄÖÜ]\+$'
|
||||
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
|
||||
LANGFILTER: | grep '^[A-ZÄÖÜ]\+$'
|
||||
|
||||
# Until I can figure out how to force sort to use a locale's collation
|
||||
# rules we can't trust sort in the filtering rules above and so must
|
||||
# leave the sorting work to dict2dawg.pl.
|
||||
|
||||
NEEDSSORT:true
|
||||
D2DARGS: -r -term 10
|
||||
|
||||
LANGINFO: <p>German has the 26 English letters plus the three umlaut
|
||||
LANGINFO: vowels. Scharfes-s is not a legal tile, but if present in
|
||||
|
|
|
@ -18,16 +18,12 @@
|
|||
LANGCODE:it_IT
|
||||
|
||||
# deal with DOS files
|
||||
LANGFILTER_PRECLIP: tr -d '\r' |
|
||||
|
||||
LANGFILTER_POSTCLIP: | tr [a-z] [A-Z]
|
||||
LANGFILTER_POSTCLIP: | grep '^[A-IL-VZ]*$'
|
||||
LANGFILTER_POSTCLIP: | tr '\n' '\000'
|
||||
LANGFILTER_POSTCLIP: | sort -u -z
|
||||
|
||||
|
||||
NEEDSSORT:false
|
||||
LANGFILTER: tr -d '\r'
|
||||
LANGFILTER: | tr [a-z] [A-Z]
|
||||
LANGFILTER: | grep '^[A-IL-VZ]*$'
|
||||
LANGFILTER: | sort -u
|
||||
|
||||
D2DARGS: -r -term 10 -nosort
|
||||
|
||||
LANGINFO: <p>Italian is treated the same as English but for
|
||||
LANGINFO: missing letters J, K, W, X and Y.</p>
|
||||
|
|
|
@ -197,7 +197,7 @@ frankspecials.bin: ../frank_mkspecials.pl $(BMPFILES)
|
|||
# a binary file (one byte) giving the number of tiles in the dict
|
||||
charcount.bin: table.bin
|
||||
ifdef NEWDAWG
|
||||
siz=$$(wc $< | awk '{print $$3}'); \
|
||||
siz=$$(ls -l $< | awk '{print $$5}'); \
|
||||
perl -e "print pack(\"c\",$$siz/2)" > $@
|
||||
else
|
||||
siz=$$(wc -c $< | sed -e 's/$<//'); \
|
||||
|
@ -240,11 +240,10 @@ dawg$(XWLANG)%.stamp: $(XWLANG)Main.dict.gz $(DICT2DAWG) table.bin ../Makefile.l
|
|||
start=$$(echo $@ | sed -e 's/dawg$(XWLANG)\([0-9]*\)to[0-9]*.stamp/\1/'); \
|
||||
end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \
|
||||
echo $${start} and $$end; \
|
||||
zcat $< | grep "^.\{$${start},$${end}\}$$" | \
|
||||
sort -u | $(DICT2DAWG) $(TABLE_ARG) table.bin -b 28000 \
|
||||
zcat $< | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \
|
||||
-ob dawg$(XWLANG)$* \
|
||||
-sn $(XWLANG)StartLoc.bin -k -term 10 -wc $(XWLANG)$*_wordcount.bin \
|
||||
$(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
|
||||
-sn $(XWLANG)StartLoc.bin -min $$start -max $$end \
|
||||
-wc $(XWLANG)$*_wordcount.bin $(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
|
||||
touch $@
|
||||
|
||||
$(XWLANG)%_wordcount.bin: dawg$(XWLANG)%.stamp
|
||||
|
|
|
@ -17,22 +17,19 @@
|
|||
LANGCODE:pt_PT
|
||||
|
||||
# deal with DOS files
|
||||
LANGFILTER_PRECLIP: tr -d '\r' |
|
||||
|
||||
LANGFILTER: tr -d '\r'
|
||||
# uppercase all
|
||||
LANGFILTER_POSTCLIP: | tr [a-zç] [A-ZÇ]
|
||||
LANGFILTER: | tr [a-zç] [A-ZÇ]
|
||||
# no words not containing a vowel
|
||||
LANGFILTER_POSTCLIP: | grep '[AEIOU]'
|
||||
LANGFILTER: | grep '[AEIOU]'
|
||||
# none with illegal chars
|
||||
LANGFILTER_POSTCLIP: | grep '^[A-JL-VXZÇ]\+$'
|
||||
LANGFILTER_POSTCLIP: | sort -u
|
||||
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
|
||||
LANGFILTER: | grep '^[A-JL-VXZÇ]\+$'
|
||||
|
||||
# Until I can figure out how to force sort to use a locale's collation
|
||||
# rules we can't trust sort in the filtering rules above and so must
|
||||
# leave the sorting work to dict2dawg.pl.
|
||||
D2DARGS: -r -term 10
|
||||
|
||||
NEEDSSORT:true
|
||||
|
||||
LANGINFO: <p>Portuguese uses the letters A-Z, excluding K, W and Y, and adds
|
||||
LANGINFO: Ç. Words containing any other letters are dropped. </p>
|
||||
|
|
41
dawg/Russian/Makefile
Normal file
41
dawg/Russian/Makefile
Normal file
|
@ -0,0 +1,41 @@
|
|||
# -*- mode: makefile -*-
|
||||
# Copyright 2002-2007 by Eric House (xwords@eehouse.org). All rights reserved.
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
# as published by the Free Software Foundation; either version 2
|
||||
# of the License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
XWLANG=Russian
|
||||
LANGCODE=ru_RU
|
||||
DICT2DAWGARGS = -r
|
||||
|
||||
TARGET_TYPE ?= WINCE
|
||||
|
||||
include ../Makefile.2to8
|
||||
|
||||
include ../Makefile.langcommon
|
||||
|
||||
SOURCEDICT ?= $(XWDICTPATH)/$(XWLANG)/RU5000.txt.gz
|
||||
|
||||
# Build the working word list from the compressed source dictionary:
# strip DOS carriage returns, then map every lower-case letter to its
# upper-case form.  The two tr sets must list the same 32 letters in the
# same order; they are quoted so the shell cannot glob-expand the brackets.
$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile
	zcat $< | tr -d '\r' | \
		tr '[àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ]' '[ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß]' | \
		gzip -c > $@
|
||||
|
||||
|
||||
# Everything but creating of the Main.dict file is inherited from the
|
||||
# "parent" Makefile.langcommon in the parent directory.
|
||||
|
||||
clean: clean_common
|
||||
rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb
|
||||
|
76
dawg/Russian/info.txt
Normal file
76
dawg/Russian/info.txt
Normal file
|
@ -0,0 +1,76 @@
|
|||
# Copyright 2002,2007 by Eric House (xwords@eehouse.org). All rights
|
||||
# reserved.
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
# as published by the Free Software Foundation; either version 2
|
||||
# of the License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
LANGCODE:ru_RU
|
||||
CHARSET:windows-1251
|
||||
|
||||
# deal with DOS files
|
||||
LANGFILTER: tr -d '\r'
|
||||
# uppercase all
|
||||
LANGFILTER: | tr [àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ] [ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß]
|
||||
# LANGFILTER: | tr -s '\n' '\000'
|
||||
|
||||
# note: don't turn off sorting! Can't do it with GNU 'sort' without
|
||||
# setting LANG
|
||||
D2DARGS: -r -term 10
|
||||
|
||||
LANGINFO: <p>Russian wordlists must be in the Windows-1251
|
||||
LANGINFO: codepage. Lower-case letters are converted to upper case and
|
||||
LANGINFO: any words that contain letters not listed below are
|
||||
LANGINFO: removed.</p>
|
||||
|
||||
# High bit means "official". Next 7 bits are an enum where
|
||||
# Russian==0x0F. Low byte is padding.
|
||||
XLOC_HEADER:0x8F00
|
||||
|
||||
|
||||
<BEGIN_TILES>
|
||||
8 1 'À'
|
||||
2 3 'Á'
|
||||
4 1 'Â'
|
||||
2 3 'Ã'
|
||||
2 2 'Ä'
|
||||
7 1 'Å'
|
||||
1 4 'Æ'
|
||||
1 3 'Ç'
|
||||
7 1 'È'
|
||||
1 2 'É'
|
||||
4 2 'Ê'
|
||||
4 2 'Ë'
|
||||
2 3 'Ì'
|
||||
4 1 'Í'
|
||||
9 1 'Î'
|
||||
4 2 'Ï'
|
||||
5 1 'Ð'
|
||||
5 1 'Ñ'
|
||||
7 1 'Ò'
|
||||
4 2 'Ó'
|
||||
1 5 'Ô'
|
||||
1 4 'Õ'
|
||||
1 4 'Ö'
|
||||
1 3 '×'
|
||||
1 4 'Ø'
|
||||
1 5 'Ù'
|
||||
1 10 'Ú'
|
||||
2 2 'Û'
|
||||
4 1 'Ü'
|
||||
1 8 'Ý'
|
||||
1 5 'Þ'
|
||||
2 2 'ß'
|
||||
2 0 {"_"}
|
||||
<END_TILES>
|
||||
# should ignore all after the <END_TILES> above
|
|
@ -21,24 +21,25 @@
|
|||
NEEDSSORT:true
|
||||
|
||||
# MS-DOS CR chars go bye-bye
|
||||
LANGFILTER_PRECLIP: tr -d '\r' |
|
||||
LANGFILTER: tr -d '\r'
|
||||
|
||||
# convert accented vowels
|
||||
LANGFILTER_POSTCLIP: | tr '\207\216\222\227\234\237\226' 'aeiouu\321'
|
||||
LANGFILTER: | tr '\207\216\222\227\234\237\226' 'aeiouu\321'
|
||||
# uppercase
|
||||
LANGFILTER_POSTCLIP: | tr [a-zń] [A-ZŃ]
|
||||
LANGFILTER: | tr [a-zñ] [A-ZÑ]
|
||||
# remove words with illegal letters
|
||||
LANGFILTER_POSTCLIP: | grep '^[[A-JL-VX-ZŃ]*$'
|
||||
LANGFILTER: | grep '^[[A-JL-VX-ZÑ]*$'
|
||||
# substitute pairs (can't figure out how to use octal values)
|
||||
LANGFILTER_POSTCLIP: | sed 's/CH/1/g'
|
||||
LANGFILTER_POSTCLIP: | sed 's/LL/2/g'
|
||||
LANGFILTER_POSTCLIP: | sed 's/RR/3/g'
|
||||
LANGFILTER: | sed 's/CH/1/g'
|
||||
LANGFILTER: | sed 's/LL/2/g'
|
||||
LANGFILTER: | sed 's/RR/3/g'
|
||||
# substitute in the octal control character values
|
||||
LANGFILTER_POSTCLIP: | tr '123' '\001\002\003'
|
||||
LANGFILTER: | tr '123' '\001\002\003'
|
||||
# now add nulls as terminators
|
||||
LANGFILTER_POSTCLIP: | tr -s '\n' '\000'
|
||||
LANGFILTER_POSTCLIP: | sort -u -z
|
||||
LANGFILTER: | tr -s '\n' '\000'
|
||||
LANGFILTER: | sort -u -z
|
||||
|
||||
D2DARGS: -r -term 0
|
||||
|
||||
LANGINFO: <p>Spanish words include all letters in the English alphabet
|
||||
LANGINFO: except "K" and "W", and with "Ñ" added. Since there are no
|
||||
|
|
|
@ -16,12 +16,11 @@
|
|||
|
||||
LANGCODE:sv_SE
|
||||
|
||||
LANGFILTER: tr -d '\r'
|
||||
LANGFILTER: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ]
|
||||
LANGFILTER: | grep '^[A-ZÄÅÆÖÜ]*$'
|
||||
|
||||
LANGFILTER_POSTCLIP: | tr [a-zäĺćöü] [A-ZÄĹĆÖÜ]
|
||||
LANGFILTER_POSTCLIP: | grep '^[A-ZÄĹĆÖÜ]*$'
|
||||
LANGFILTER_POSTCLIP: | tr '\n' '\000'
|
||||
|
||||
NEEDSSORT:true
|
||||
D2DARGS: -r -term 10
|
||||
|
||||
LANGINFO: <p>From an English-speaker's perspective, Swedish drops Q
|
||||
LANGINFO: and W, and adds Ä, Å, Æ, Ö and Ü.</p>
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
/* -*- compile-command: "g++ -O -o dict2dawg dict2dawg.cpp"; -*- */
|
||||
/* -*- compile-command: "g++ -DDEBUG -O -o dict2dawg dict2dawg.cpp"; -*- */
|
||||
/*************************************************************************
|
||||
* adapted from perl code that was itself adapted from C++ code
|
||||
* Copyright (C) 2000 Falk Hueffner
|
||||
|
||||
* This version Copyright (C) 2002,2006 Eric House (xwords@eehouse.org)
|
||||
* This version Copyright (C) 2002,2006-2007 Eric House (xwords@eehouse.org)
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
|
@ -54,9 +54,11 @@ typedef unsigned int Node;
|
|||
typedef std::vector<Node> NodeList;
|
||||
typedef std::vector<char*> WordList;
|
||||
|
||||
#define MAX_WORD_LEN 15
|
||||
#define VERSION_STR "$Rev$"
|
||||
|
||||
#define MAX_WORD_LEN 15
|
||||
#define T2ABUFLEN(s) (((s)*4)+3)
|
||||
|
||||
int gFirstDiff;
|
||||
|
||||
static char gCurrentWordBuf[MAX_WORD_LEN+1] = { '\0' };
|
||||
|
@ -92,10 +94,12 @@ bool gForceFour = false; // use four bytes regardless of need?
|
|||
static int gFileSize = 0;
|
||||
int gNBytesPerNode;
|
||||
bool gUseUnicode;
|
||||
int gLimLow = 2;
|
||||
int gLimHigh = MAX_WORD_LEN;
|
||||
|
||||
|
||||
// OWL is 1.7M
|
||||
#define MAX_POOL_SIZE (3 * 0x100000)
|
||||
#define MAX_POOL_SIZE (5 * 0x100000)
|
||||
#define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ );
|
||||
|
||||
static char* parseARGV( int argc, char** argv, const char** inFileName );
|
||||
|
@ -182,7 +186,8 @@ main( int argc, char** argv )
|
|||
unsigned long be = htonl( gWordCount );
|
||||
fwrite( &be, sizeof(be), 1, OFILE );
|
||||
fclose( OFILE );
|
||||
fprintf( stderr, "wrote out: got %d words\n", gWordCount );
|
||||
fprintf( stderr, "Wrote %d (word count) to %s\n", gWordCount,
|
||||
gCountFile );
|
||||
}
|
||||
|
||||
if ( gOutFileBase ) {
|
||||
|
@ -393,49 +398,62 @@ readFromSortedArray( void )
|
|||
#endif
|
||||
}
|
||||
|
||||
char* word = "";
|
||||
for ( ; ; ) {
|
||||
char* word = "";
|
||||
|
||||
if ( !gDone ) {
|
||||
gDone = gNextWordIndex == sInputStrings->size();
|
||||
if ( !gDone ) {
|
||||
word = sInputStrings->at(gNextWordIndex++);
|
||||
gDone = gNextWordIndex == sInputStrings->size();
|
||||
if ( !gDone ) {
|
||||
word = sInputStrings->at(gNextWordIndex++);
|
||||
#ifdef DEBUG
|
||||
} else if ( gDebug ) {
|
||||
fprintf( stderr, "gDone set to true\n" );
|
||||
} else if ( gDebug ) {
|
||||
fprintf( stderr, "gDone set to true\n" );
|
||||
#endif
|
||||
}
|
||||
#ifdef DEBUG
|
||||
if ( gDebug ) {
|
||||
char buf[T2ABUFLEN(MAX_WORD_LEN)];
|
||||
fprintf( stderr, "%s: got word: %s\n", __func__,
|
||||
tileToAscii( buf, sizeof(buf), word ) );
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#ifdef DEBUG
|
||||
if ( gDebug ) {
|
||||
fprintf( stderr, "got word: %s\n", word );
|
||||
int numCommonLetters = 0;
|
||||
int len = strlen( word );
|
||||
if ( gCurrentWordLen < len ) {
|
||||
len = gCurrentWordLen;
|
||||
}
|
||||
|
||||
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
|
||||
&& numCommonLetters < len ) {
|
||||
++numCommonLetters;
|
||||
}
|
||||
|
||||
gFirstDiff = numCommonLetters;
|
||||
if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
|
||||
&& !firstBeforeSecond( gCurrentWord, word ) ) {
|
||||
#ifdef DEBUG
|
||||
if ( gDebug ) {
|
||||
char buf1[T2ABUFLEN(MAX_WORD_LEN)];
|
||||
char buf2[T2ABUFLEN(MAX_WORD_LEN)];
|
||||
fprintf( stderr,
|
||||
"%s: words %s and %s are the same or out of order\n",
|
||||
__func__,
|
||||
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
|
||||
tileToAscii( buf2, sizeof(buf2), word ) );
|
||||
}
|
||||
#endif
|
||||
continue;
|
||||
}
|
||||
|
||||
gCurrentWord = word;
|
||||
gCurrentWordLen = strlen(word);
|
||||
break;
|
||||
}
|
||||
int numCommonLetters = 0;
|
||||
int len = strlen( word );
|
||||
if ( gCurrentWordLen < len ) {
|
||||
len = gCurrentWordLen;
|
||||
}
|
||||
|
||||
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
|
||||
&& numCommonLetters < len ) {
|
||||
++numCommonLetters;
|
||||
}
|
||||
|
||||
gFirstDiff = numCommonLetters;
|
||||
if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
|
||||
&& !firstBeforeSecond( gCurrentWord, word ) ) {
|
||||
char buf1[MAX_WORD_LEN+1];
|
||||
char buf2[MAX_WORD_LEN+1];
|
||||
ERROR_EXIT( "words %s and %s are out of order\n",
|
||||
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
|
||||
tileToAscii( buf2, sizeof(buf2), word ) );
|
||||
}
|
||||
gCurrentWord = word;
|
||||
gCurrentWordLen = strlen(word);
|
||||
|
||||
#ifdef DEBUG
|
||||
if ( gDebug ) {
|
||||
char buf[MAX_WORD_LEN+1];
|
||||
char buf[T2ABUFLEN(MAX_WORD_LEN)];
|
||||
fprintf( stderr, "gCurrentWord now %s\n",
|
||||
tileToAscii( buf, sizeof(buf), gCurrentWord) );
|
||||
}
|
||||
|
@ -450,47 +468,77 @@ readOneWord( char* wordBuf, int bufLen, int* lenp, bool* gotEOF )
|
|||
bool dropWord = false;
|
||||
bool done = false;
|
||||
|
||||
// for each byte
|
||||
// for each byte, append to an internal buffer up to size limit.
|
||||
// On reaching an end-of-word or EOF, check if the word formed is
|
||||
// within the length range and contains no unknown chars. If yes,
|
||||
// return it. If no, start over ONLY IF the terminator was not
|
||||
// EOF.
|
||||
for ( ; ; ) {
|
||||
int byt = getc( gInFile );
|
||||
|
||||
// EOF is special: we don't try for another word even if
|
||||
// dropWord is true; we must leave now.
|
||||
if ( byt == EOF || byt == gTermChar ) {
|
||||
*gotEOF = byt == EOF;
|
||||
bool isEOF = byt == EOF;
|
||||
*gotEOF = isEOF;
|
||||
|
||||
if ( !dropWord || *gotEOF ) {
|
||||
if ( count != 0 ) {
|
||||
wordBuf[count] = '\0';
|
||||
result = wordBuf;
|
||||
*lenp = count;
|
||||
++gWordCount;
|
||||
}
|
||||
break; // we've finished a word
|
||||
} else if ( *gotEOF ) {
|
||||
assert( isEOF || count < bufLen );
|
||||
if ( !dropWord && (count >= gLimLow) && (count <= gLimHigh) ) {
|
||||
assert( count < bufLen );
|
||||
wordBuf[count] = '\0';
|
||||
result = wordBuf;
|
||||
*lenp = count;
|
||||
++gWordCount;
|
||||
break;
|
||||
} else if ( isEOF ) {
|
||||
assert( !result );
|
||||
break;
|
||||
}
|
||||
#ifdef DEBUG
|
||||
if ( gDebug ) {
|
||||
char buf[T2ABUFLEN(count)];
|
||||
wordBuf[count] = '\0';
|
||||
fprintf( stderr, "%s: dropping word (len=%d): %s\n", __func__,
|
||||
count, tileToAscii( buf, sizeof(buf), wordBuf ) );
|
||||
}
|
||||
#endif
|
||||
count = 0; // we'll start over
|
||||
dropWord = false;
|
||||
|
||||
} else if ( count >= bufLen ) {
|
||||
// Just drop it...
|
||||
dropWord = true;
|
||||
|
||||
// Don't call into the hashtable twice here!!
|
||||
} else if ( gTableHash.find(byt) != gTableHash.end() ) {
|
||||
if ( !dropWord ) {
|
||||
wordBuf[count++] = (char)gTableHash[byt];
|
||||
if ( count >= bufLen ) {
|
||||
char buf[MAX_WORD_LEN+1];
|
||||
ERROR_EXIT( "no space for word %d (starting \"%s\")",
|
||||
gWordCount,
|
||||
tileToAscii( buf, sizeof(buf), wordBuf ));
|
||||
}
|
||||
assert( count < bufLen );
|
||||
wordBuf[count++] = (char)gTableHash[byt];
|
||||
if ( count >= bufLen ) {
|
||||
char buf[T2ABUFLEN(count)];
|
||||
ERROR_EXIT( "no space for word %d (starting \"%s\")",
|
||||
gWordCount,
|
||||
tileToAscii( buf, sizeof(buf), wordBuf ));
|
||||
}
|
||||
} else if ( gKillIfMissing || !dropWord ) {
|
||||
char buf[T2ABUFLEN(count)];
|
||||
wordBuf[count] = '\0';
|
||||
|
||||
tileToAscii( buf, sizeof(buf), wordBuf );
|
||||
|
||||
if ( gKillIfMissing ) {
|
||||
ERROR_EXIT( "chr %c (%d) not in map file %s\n"
|
||||
"last word was %s\n",
|
||||
(char)byt, (int)byt, gTableFile, buf );
|
||||
} else if ( !dropWord ) {
|
||||
#ifdef DEBUG
|
||||
if ( gDebug ) {
|
||||
fprintf( stderr, "%s: chr %c (%d) not in map file %s\n"
|
||||
"dropping partial word %s\n", __func__,
|
||||
(char)byt, (int)byt, gTableFile, buf );
|
||||
}
|
||||
#endif
|
||||
dropWord = true;
|
||||
}
|
||||
} else if ( gKillIfMissing ) {
|
||||
char buf[MAX_WORD_LEN+1];
|
||||
ERROR_EXIT( "chr %c (%d) not in map file %s\n"
|
||||
"last word was %s\n",
|
||||
byt, (int)byt, gTableFile,
|
||||
tileToAscii( buf, sizeof(buf), wordBuf ) );
|
||||
} else {
|
||||
dropWord = true;
|
||||
count = 0; // lose anything we already have
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -511,40 +559,55 @@ readFromFile( void )
|
|||
int len;
|
||||
|
||||
gDone = s_eof;
|
||||
if ( !gDone ) {
|
||||
word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof );
|
||||
gDone = NULL == word;
|
||||
}
|
||||
if ( gDone ) {
|
||||
word = "";
|
||||
len = 0;
|
||||
}
|
||||
|
||||
// Repeat until we get a new word that's not "out-of-order". When
|
||||
// we see this the problem isn't failure to sort, it's duplicates.
|
||||
// So dropping is ok. The alternative would be detecting dupes
|
||||
// during the sort. This seems easier.
|
||||
for ( ; ; ) {
|
||||
if ( !gDone ) {
|
||||
word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof );
|
||||
gDone = NULL == word;
|
||||
}
|
||||
if ( gDone ) {
|
||||
word = "";
|
||||
len = 0;
|
||||
}
|
||||
|
||||
int numCommonLetters = 0;
|
||||
if ( gCurrentWordLen < len ) {
|
||||
len = gCurrentWordLen;
|
||||
}
|
||||
int numCommonLetters = 0;
|
||||
if ( gCurrentWordLen < len ) {
|
||||
len = gCurrentWordLen;
|
||||
}
|
||||
|
||||
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
|
||||
&& numCommonLetters < len ) {
|
||||
++numCommonLetters;
|
||||
}
|
||||
while ( gCurrentWord[numCommonLetters] == word[numCommonLetters]
|
||||
&& numCommonLetters < len ) {
|
||||
++numCommonLetters;
|
||||
}
|
||||
|
||||
gFirstDiff = numCommonLetters;
|
||||
if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
|
||||
&& !firstBeforeSecond( gCurrentWord, word ) ) {
|
||||
char buf1[MAX_WORD_LEN+1];
|
||||
char buf2[MAX_WORD_LEN+1];
|
||||
ERROR_EXIT( "words %s and %s are out of order\n",
|
||||
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
|
||||
tileToAscii( buf2, sizeof(buf2), word ) );
|
||||
gFirstDiff = numCommonLetters;
|
||||
if ( (gCurrentWordLen > 0) && (strlen(word) > 0)
|
||||
&& !firstBeforeSecond( gCurrentWord, word ) ) {
|
||||
#ifdef DEBUG
|
||||
if ( gDebug ) {
|
||||
char buf1[T2ABUFLEN(MAX_WORD_LEN)];
|
||||
char buf2[T2ABUFLEN(MAX_WORD_LEN)];
|
||||
fprintf( stderr,
|
||||
"%s: words %s and %s are the smae or out of order\n",
|
||||
__func__,
|
||||
tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
|
||||
tileToAscii( buf2, sizeof(buf2), word ) );
|
||||
}
|
||||
#endif
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
gCurrentWordLen = strlen(word);
|
||||
strncpy( gCurrentWordBuf, word, sizeof(gCurrentWordBuf) );
|
||||
|
||||
#ifdef DEBUG
|
||||
if ( gDebug ) {
|
||||
char buf[MAX_WORD_LEN+1];
|
||||
char buf[T2ABUFLEN(MAX_WORD_LEN)];
|
||||
fprintf( stderr, "gCurrentWord now %s\n",
|
||||
tileToAscii( buf, sizeof(buf), gCurrentWord) );
|
||||
}
|
||||
|
@ -561,17 +624,26 @@ firstBeforeSecond( const char* lhs, const char* rhs )
|
|||
static char*
|
||||
tileToAscii( char* out, int outSize, const char* in )
|
||||
{
|
||||
char tiles[outSize];
|
||||
int tilesLen = 1;
|
||||
tiles[0] = '[';
|
||||
|
||||
char* orig = out;
|
||||
for ( ; ; ) {
|
||||
char ch = *in++;
|
||||
if ( '\0' == ch ) {
|
||||
*out = '\0';
|
||||
break;
|
||||
}
|
||||
assert( ch < gRevMap.size() );
|
||||
*out++ = gRevMap[ch];
|
||||
tilesLen += sprintf( &tiles[tilesLen], "%d,", ch );
|
||||
assert( (out - orig) < outSize );
|
||||
}
|
||||
|
||||
tiles[tilesLen] = ']';
|
||||
tiles[tilesLen+1] = '\0';
|
||||
strcpy( out, tiles );
|
||||
|
||||
return orig;
|
||||
}
|
||||
|
||||
|
@ -636,7 +708,7 @@ printWords( std::vector<char*>* strings )
|
|||
{
|
||||
std::vector<char*>::iterator iter = strings->begin();
|
||||
while ( iter != strings->end() ) {
|
||||
char buf[MAX_WORD_LEN+1];
|
||||
char buf[T2ABUFLEN(MAX_WORD_LEN)];
|
||||
tileToAscii( buf, sizeof(buf), *iter );
|
||||
fprintf( stderr, "%s\n", buf );
|
||||
++iter;
|
||||
|
@ -760,6 +832,9 @@ makeTableHash( void )
|
|||
if ( NULL == TABLEFILE ) {
|
||||
ERROR_EXIT( "unable to open %s\n", gTableFile );
|
||||
}
|
||||
|
||||
// Fill the 0th space since references are one-based
|
||||
gRevMap.push_back(0);
|
||||
|
||||
for ( ii = 0; ; ++ii ) {
|
||||
int ch = getc(TABLEFILE);
|
||||
|
@ -817,7 +892,6 @@ emitNodes( unsigned int nBytesPerOutfile, const char* outFileBase )
|
|||
gNBytesPerNode = 3;
|
||||
} else {
|
||||
if ( gBlankIndex == 32 ) { // blank
|
||||
fprintf( stderr, "blank's at 32; 3-byte-nodes still ok\n" );
|
||||
gNBytesPerNode = 3;
|
||||
} else {
|
||||
ERROR_EXIT( "move blank to last position in info.txt "
|
||||
|
@ -994,8 +1068,10 @@ usage( const char* name )
|
|||
{
|
||||
fprintf( stderr, "usage: %s \n"
|
||||
"\t[-v] (print version and exit)\n"
|
||||
"\t[-poolsize] (print size of hardcoded pool and exit)\n"
|
||||
"\t[-poolsize] (print hardcoded size of pool and exit)\n"
|
||||
"\t[-b bytesPerFile] (default = 0xFFFFFFFF)\n"
|
||||
"\t[-min <num in 0..15>]\n"
|
||||
"\t[-max <num in 0..15>]\n"
|
||||
"\t-m mapFile\n"
|
||||
"\t-mn mapFile (unicode)\n"
|
||||
"\t-ob outFileBase\n"
|
||||
|
@ -1048,6 +1124,10 @@ parseARGV( int argc, char** argv, const char** inFileName )
|
|||
} else if ( 0 == strcmp( arg, "-mn" ) ) {
|
||||
gTableFile = argv[index++];
|
||||
gUseUnicode = true;
|
||||
} else if ( 0 == strcmp( arg, "-min" ) ) {
|
||||
gLimLow = atoi(argv[index++]);
|
||||
} else if ( 0 == strcmp( arg, "-max" ) ) {
|
||||
gLimHigh = atoi(argv[index++]);
|
||||
} else if ( 0 == strcmp( arg, "-m" ) ) {
|
||||
gTableFile = argv[index++];
|
||||
} else if ( 0 == strcmp( arg, "-ob" ) ) {
|
||||
|
@ -1079,17 +1159,25 @@ parseARGV( int argc, char** argv, const char** inFileName )
|
|||
gDebug = true;
|
||||
#endif
|
||||
} else {
|
||||
ERROR_EXIT( "unexpected arg %s", arg );
|
||||
ERROR_EXIT( "%s: unexpected arg %s", __func__, arg );
|
||||
}
|
||||
}
|
||||
|
||||
if ( gLimHigh > MAX_WORD_LEN || gLimLow > MAX_WORD_LEN ) {
|
||||
usage( argv[0] );
|
||||
exit(1);
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
if ( gDebug ) {
|
||||
fprintf( stderr, "gNBytesPerOutfile=$gNBytesPerOutfile\n" );
|
||||
fprintf( stderr, "gTableFile=$gTableFile\n" );
|
||||
fprintf( stderr, "gOutFileBase=$gOutFileBase\n" );
|
||||
fprintf( stderr, "gStartNodeOut=$gStartNodeOut\n" );
|
||||
fprintf( stderr, "gNBytesPerOutfile=%d\n", gNBytesPerOutfile );
|
||||
fprintf( stderr, "gTableFile=%s\n", gTableFile );
|
||||
fprintf( stderr, "gOutFileBase=%s\n", gOutFileBase );
|
||||
fprintf( stderr, "gStartNodeOut=%s\n", gStartNodeOut );
|
||||
fprintf( stderr, "gTermChar=%c(%d)\n", gTermChar, (int)gTermChar );
|
||||
fprintf( stderr, "gFileSize=%d\n", gFileSize );
|
||||
fprintf( stderr, "gLimLow=%d\n", gLimLow );
|
||||
fprintf( stderr, "gLimHigh=%d\n", gLimHigh );
|
||||
}
|
||||
#endif
|
||||
return gTableFile;
|
||||
|
|
|
@ -117,7 +117,7 @@ sub WriteMapFile($$$) {
|
|||
} elsif ( $str =~ /(\d+)/ ) {
|
||||
print $fhr pack( $packStr, $1 );
|
||||
} else {
|
||||
die "WriteMapFile: unrecognized face format $str";
|
||||
die "WriteMapFile: unrecognized face format $str, elem $i";
|
||||
}
|
||||
}
|
||||
} # WriteMapFile
|
||||
|
|
Loading…
Add table
Reference in a new issue