From 79990bc7b108dd7e009d89eea9f914bdc717a313 Mon Sep 17 00:00:00 2001
From: Andy2
Date: Tue, 30 Nov 2010 18:35:11 -0800
Subject: [PATCH] first set of changes formed by applying diff of
android_branch's dawg/ directory against unicode_branch's. The two branches
seem to have no common ancestor -- probably didn't survive translation from
svn -- so this is the best I can do.
This checkin is all the files that were modified by the patch plus a
couple of simple additions. Next I'll be adding directories that the
patch created. It also reintroduced a bunch of .cvsignore files; I
won't check those in.
---
xwords4/dawg/Catalan/Makefile | 2 +-
xwords4/dawg/Catalan/info.txt | 2 +-
xwords4/dawg/English/Makefile | 2 +-
xwords4/dawg/English/Makefile.CollegeEng | 36 ++++
xwords4/dawg/English/Makefile.OWL2 | 4 +-
xwords4/dawg/English/Makefile.TWL06 | 37 ++++
xwords4/dawg/German/Makefile | 17 +-
xwords4/dawg/German/info.txt | 24 +--
xwords4/dawg/Hex/Makefile | 12 +-
xwords4/dawg/Hex/info.txt | 16 +-
xwords4/dawg/Makefile.langcommon | 29 ++--
xwords4/dawg/Polish/Makefile | 14 +-
xwords4/dawg/Polish/info.txt | 30 ++--
xwords4/dawg/Spanish/Makefile | 11 +-
xwords4/dawg/Spanish/info.txt | 12 +-
xwords4/dawg/Swedish/Makefile | 10 +-
xwords4/dawg/Swedish/info.txt | 24 +--
xwords4/dawg/dawg2dict.pl | 103 +++++++++--
xwords4/dawg/dict2dawg.cpp | 207 +++++++++++++----------
xwords4/dawg/dictstats.pl | 28 +--
xwords4/dawg/xloc.pl | 15 +-
xwords4/dawg/xloc.pm | 14 +-
22 files changed, 423 insertions(+), 226 deletions(-)
create mode 100644 xwords4/dawg/English/Makefile.CollegeEng
create mode 100644 xwords4/dawg/English/Makefile.TWL06
diff --git a/xwords4/dawg/Catalan/Makefile b/xwords4/dawg/Catalan/Makefile
index 46815e993..da54778d7 100644
--- a/xwords4/dawg/Catalan/Makefile
+++ b/xwords4/dawg/Catalan/Makefile
@@ -33,7 +33,7 @@ endif
endif
LANG_SPECIAL_INFO = \
- "L-L" $(PBITMS)/large_ll.pbitm $(PBITMS)/small_ll.pbitm \
+ "L·L" $(PBITMS)/large_ll.pbitm $(PBITMS)/small_ll.pbitm \
"NY" $(PBITMS)/large_ny.pbitm $(PBITMS)/small_ny.pbitm \
"QU" $(PBITMS)/large_qu.pbitm $(PBITMS)/small_qu.pbitm \
diff --git a/xwords4/dawg/Catalan/info.txt b/xwords4/dawg/Catalan/info.txt
index 1cb56442d..9954db826 100644
--- a/xwords4/dawg/Catalan/info.txt
+++ b/xwords4/dawg/Catalan/info.txt
@@ -75,7 +75,7 @@ XLOC_HEADER:0x8C00
8 1 'I'
1 8 'J'
4 1 'L'
-1 10 {"L-L"}
+1 10 {"L·L"}
3 2 'M'
6 1 'N'
1 10 {"NY"}
diff --git a/xwords4/dawg/English/Makefile b/xwords4/dawg/English/Makefile
index 95b975643..21c4b89ac 100644
--- a/xwords4/dawg/English/Makefile
+++ b/xwords4/dawg/English/Makefile
@@ -15,7 +15,7 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-TARGET_TYPE ?= FRANK
+TARGET_TYPE ?= WINCE
include ../Makefile.langcommon
diff --git a/xwords4/dawg/English/Makefile.CollegeEng b/xwords4/dawg/English/Makefile.CollegeEng
new file mode 100644
index 000000000..80d4cbd24
--- /dev/null
+++ b/xwords4/dawg/English/Makefile.CollegeEng
@@ -0,0 +1,36 @@
+# -*- mode: makefile; compile-command: "make -f Makefile.CollegeEng"; -*-
+# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+XWLANG=CollegeEng
+LANGCODE=en_US
+TARGET_TYPE=WINCE
+
+include ../Makefile.2to8
+
+include ../Makefile.langcommon
+
+SOURCEDICT ?= $(XWDICTPATH)/English/CollegeEng.dict.gz
+
+$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile
+ zcat $< | tr -d '\r' | tr [a-z] [A-Z] | grep -e "^[A-Z]\{2,15\}$$" | \
+ gzip -c > $@
+
+# Everything but creating of the Main.dict file is inherited from the
+# "parent" Makefile.langcommon in the parent directory.
+
+clean: clean_common
+ rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb
diff --git a/xwords4/dawg/English/Makefile.OWL2 b/xwords4/dawg/English/Makefile.OWL2
index 41479a843..ce7988a5a 100644
--- a/xwords4/dawg/English/Makefile.OWL2
+++ b/xwords4/dawg/English/Makefile.OWL2
@@ -15,9 +15,9 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-XWLANG=OWL2_
+XWLANG=OWL2
LANGCODE=en_US
-TARGET_TYPE=PALM
+TARGET_TYPE?=PALM
include ../Makefile.2to8
diff --git a/xwords4/dawg/English/Makefile.TWL06 b/xwords4/dawg/English/Makefile.TWL06
new file mode 100644
index 000000000..28dd43ca1
--- /dev/null
+++ b/xwords4/dawg/English/Makefile.TWL06
@@ -0,0 +1,37 @@
+# -*- mode: makefile; compile-command: "make -f Makefile.TWL06"; -*-
+# Copyright 2002-2010 by Eric House (xwords@eehouse.org). All rights
+# reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+XWLANG=TWL06
+LANGCODE=en_US
+TARGET_TYPE=WINCE
+
+include ../Makefile.2to8
+
+include ../Makefile.langcommon
+
+# from http://www.3zsoftware.com/en/wordmagic/lists.php
+SOURCEDICT ?= $(XWDICTPATH)/English/twl06.zip
+
+$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile
+ zcat $< | grep -e "^[A-Z]\{2,15\}$$" | gzip -c > $@
+
+# Everything but creating of the Main.dict file is inherited from the
+# "parent" Makefile.langcommon in the parent directory.
+
+clean: clean_common
+ rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb
diff --git a/xwords4/dawg/German/Makefile b/xwords4/dawg/German/Makefile
index 62ec0be1e..b5e453d5a 100644
--- a/xwords4/dawg/German/Makefile
+++ b/xwords4/dawg/German/Makefile
@@ -1,5 +1,6 @@
-# -*- mode: makefile; coding: iso-8859-1; -*-
-# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
+# -*- mode: makefile; coding: utf-8; -*-
+# Copyright 2002 - 2010 by Eric House (xwords@eehouse.org). All
+# rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
@@ -17,7 +18,7 @@
XWLANG = German
LANGCODE = de_DE
-ENC = ISO-8859-1
+ENC = UTF-8
TARGET_TYPE ?= WINCE
@@ -28,11 +29,11 @@ include ../Makefile.langcommon
SOURCEDICT ?= $(XWDICTPATH)/German/HansGerman.dict.gz
$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile
- zcat $< | tr [a-z] [A-Z] | \
- LANG=$(LANGCODE):$(ENC) sed -e 's//SS/g' | \
- LANG=$(LANGCODE):$(ENC) grep '[AEIOU]' | \
- LANG=$(LANGCODE):$(ENC) grep '^[A-Z]\+$$' | \
- gzip -c > $@
+ zcat $< \
+ | tr [a-zäöü] [A-ZÄÖÜ] \
+ | sed -e 's/ß/SS/g' \
+ | grep '^[A-ZÄÖÜ]*$$' \
+ | gzip -c > $@
# Everything but creating of the Main.dict file is inherited from the
# "parent" Makefile.langcommon in the parent directory.
diff --git a/xwords4/dawg/German/info.txt b/xwords4/dawg/German/info.txt
index f6321981d..7c3ee588f 100644
--- a/xwords4/dawg/German/info.txt
+++ b/xwords4/dawg/German/info.txt
@@ -1,4 +1,6 @@
-# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
+# -*- mode: conf; coding: utf-8; -*-
+# Copyright 2002 - 2010 by Eric House (xwords@eehouse.org). All
+# rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
@@ -15,17 +17,18 @@
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
LANGCODE:de_DE
+CHARSET: utf-8
# deal with DOS files
LANGFILTER: tr -d '\r'
# substitute for sharfes-s
-LANGFILTER: | sed -e 's//SS/g'
+LANGFILTER: | sed -e 's/ß/SS/g'
# uppercase all
-LANGFILTER: | tr [a-z] [A-Z]
+LANGFILTER: | tr [a-zäöü] [A-ZÄÖÜ]
# no words not containing a vowel
-LANGFILTER: | grep '[AEIOU]'
+LANGFILTER: | grep '[AEIOUÄÖÜ]'
# none with illegal chars
-LANGFILTER: | grep '^[A-Z]\+$'
+LANGFILTER: | grep '^[A-ZÄÖÜ]\+$'
# Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must
@@ -46,9 +49,8 @@ XLOC_HEADER:0x8300
2 0 {"_"}
-5 1 'A'
-# A mit umlaut
-1 6 196
+5 1 'A'
+1 6 'Ä'
2 3 'B'
2 4 'C'
4 1 'D'
@@ -63,16 +65,14 @@ XLOC_HEADER:0x8300
4 3 'M'
9 1 'N'
3 2 'O'
-# O mit umlaut
-1 8 214
+1 8 'Ö'
1 4 'P'
1 10 'Q'
6 1 'R'
7 1 'S'
6 1 'T'
6 1 'U'
-# U mit umlaut
-1 6 220
+1 6 'Ü'
1 6 'V'
1 3 'W'
1 8 'X'
diff --git a/xwords4/dawg/Hex/Makefile b/xwords4/dawg/Hex/Makefile
index cf710fa59..96d54ff1b 100644
--- a/xwords4/dawg/Hex/Makefile
+++ b/xwords4/dawg/Hex/Makefile
@@ -1,4 +1,6 @@
-# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
+# -*- mode: makefile; -*-
+# Copyright 2002-2009 by Eric House (xwords@eehouse.org). All rights
+# reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
@@ -16,6 +18,7 @@
XWLANG = Hex
LANGCODE = hex
+ENC = UTF-8
TARGET_TYPE = WINCE
@@ -24,14 +27,15 @@ include ../Makefile.2to8
include ../Makefile.langcommon
# Pass in your own dict here by setting DICT
-DICT ?= $(XWDICTPATH)/English/SOWPODS_official.txt.gz
+DICT ?= $(XWDICTPATH)/English/CSW.dict.gz
-# Feel free to base this on whatever dictionary you have at hand. I'm
-# using CollegeEng for no particular reason.
+# tr 'AE' 'ÄË' doesn't work, so use sed.
$(XWLANG)Main.dict.gz: $(DICT)
@echo "building $@ from $<"
zcat $< | tr [a-f] [A-F] | grep -e '^[A-F]\{2,8\}$$' | \
echo CAFEBABE DEADBEEF $$(cat -) | \
+ sed 's/A/Ä/g' | \
+ sed 's/E/Ë/g' | \
tr ' ' '\n' | sort | gzip > $@
# Everything but creating of the Main.dict file is inherited from the
diff --git a/xwords4/dawg/Hex/info.txt b/xwords4/dawg/Hex/info.txt
index fcd4f6baf..526041cf1 100755
--- a/xwords4/dawg/Hex/info.txt
+++ b/xwords4/dawg/Hex/info.txt
@@ -1,4 +1,6 @@
-# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
+# -*- mode: conf; -*-
+# Copyright 2002-2009 by Eric House (xwords@eehouse.org). All rights
+# reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
@@ -16,14 +18,14 @@
LANGCODE:HEX
-
-
# uppercase all
LANGFILTER: tr [a-f] [A-F]
LANGFILTER: | grep '^[A-F]*$'
+LANGFILTER: | sed 's/A/Ä/'
+LANGFILTER: | sed 's/E/Ë/'
LANGFILTER: | sort -u
-D2DARGS: -nosort -term 10
+D2DARGS: -term 10
LANGINFO: The hex "language" is something of a programmers' joke.
LANGINFO: Hex is short for hexadecimal, a 16-base number system whose
@@ -41,8 +43,6 @@ LANGINFO: tiles and games play quickly. That's also why the Hex
LANGINFO: tile set has four blanks; that's the largest number
LANGINFO: Crosswords supports and I needed to test at the limit.
-
-
# High bit means "official". Next 7 bits are an enum where Hex==127
# (I just made that up; not sure what it was originally.) Low byte is
# padding
@@ -51,11 +51,11 @@ XLOC_HEADER:0xFF00
4 0 {"_"}
-9 1 'A'
+9 1 'Ä'
2 3 'B'
2 3 'C'
4 2 'D'
-12 1 'E'
+12 1 'Ë'
2 4 'F'
# should ignore all after the above
diff --git a/xwords4/dawg/Makefile.langcommon b/xwords4/dawg/Makefile.langcommon
index a0206cc72..e5b5b64d8 100644
--- a/xwords4/dawg/Makefile.langcommon
+++ b/xwords4/dawg/Makefile.langcommon
@@ -204,16 +204,6 @@ endif
frankspecials.bin: ../frank_mkspecials.pl $(BMPFILES)
$< $(BLANK_INFO) $(LANG_SPECIAL_INFO) > $@
-# a binary file (one byte) giving the number of tiles in the dict
-charcount.bin: table.bin
-ifdef NEWDAWG
- siz=$$(ls -l $< | awk '{print $$5}'); \
- perl -e "print pack(\"c\",$$siz/2)" > $@
-else
- siz=$$(wc -c $< | sed -e 's/$/'); \
- perl -e "print pack(\"c\",$$siz)" > $@
-endif
-
$(XWLANG)%.$(FRANK_EXT): dawg$(XWLANG)%.stamp $(XWLANG)%_flags.bin charcount.bin table.bin values.bin frankspecials.bin
cat $(XWLANG)$*_flags.bin charcount.bin table.bin values.bin \
frankspecials.bin $(XWLANG)StartLoc.bin \
@@ -233,9 +223,9 @@ $(XWLANG)%.$(FRANK_EXT): dawg$(XWLANG)%.stamp $(XWLANG)%_flags.bin charcount.bin
$(XWLANG)%_flags.bin: dawg$(XWLANG)%.stamp
ifdef NEWDAWG
if [ 3 = $$(cat $(XWLANG)$*_nodesize.bin) ] ; \
- then perl -e "print pack(\"n\",0x0002)" > $@; echo "flags=2"; \
+ then perl -e "print pack(\"n\",0x0004)" > $@; echo "flags=4"; \
elif [ 4 = $$(cat $(XWLANG)$*_nodesize.bin) ] ; \
- then perl -e "print pack(\"n\",0x0003)" > $@; echo "flags=3"; \
+ then perl -e "print pack(\"n\",0x0005)" > $@; echo "flags=5"; \
elif true; \
then echo "Unexpected node size"; exit 1; \
fi
@@ -272,8 +262,19 @@ else
perl -I../ ../xloc.pl -t -out $@
endif
-values.bin: ../xloc.pl
- perl -I../ ../xloc.pl -v -out $@ $(ENCP)
+values.bin: ../xloc.pl
+ perl -I../ ../xloc.pl -v -out $@
+
+# a binary file, two bytes, one giving the size of tiles data and the
+# other the number of tiles in the dict. Tiles data is utf-8 and so
+# number is not derivable from size.
+charcount.bin: table.bin ../xloc.pl
+ SIZ=$$(ls -l $< | awk '{print $$5}'); \
+ perl -e "print pack(\"c\",$$SIZ)" > $@
+ TMP=/tmp/tmp$$$$; \
+ perl -I../ ../xloc.pl -s -out $$TMP; \
+ cat $$TMP >> $@; \
+ rm -f $$TMP
%.dict: %.dict.gz
zcat $< > $@
diff --git a/xwords4/dawg/Polish/Makefile b/xwords4/dawg/Polish/Makefile
index bee788e10..3254bfefb 100644
--- a/xwords4/dawg/Polish/Makefile
+++ b/xwords4/dawg/Polish/Makefile
@@ -1,4 +1,4 @@
-# -*- coding: iso-8859-2; mode: Makefile; -*-
+# -*- mode: Makefile; -*-
# Copyright 2002 - 2009 by Eric House (xwords@eehouse.org). All
# rights reserved.
#
@@ -16,9 +16,9 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-XWLANG=Polish
-LANGCODE=pl_PL
-ENC = ISO-8859-2
+XWLANG = Polish
+LANGCODE = pl_PL
+ENC = UTF-8
# DICT2DAWGARGS = -lang $(LANGCODE)
# DICT2DAWGARGS = -debug
@@ -29,12 +29,12 @@ include ../Makefile.2to8
include ../Makefile.langcommon
-SOURCEDICT ?= $(XWDICTPATH)/Polish/iso-8859-2/slowa.txt.gz
+SOURCEDICT ?= $(XWDICTPATH)/Polish/slowa.txt.gz
$(XWLANG)Main.dict.gz: $(SOURCEDICT)
zcat $< | tr -d '\r' \
- | LANG=$(LANGCODE):$(ENC) tr [abcdefghijklmnoprstuwyz] [ABCDEFGHIJKLMNOPRSTUWYZ] \
- | LANG=$(LANGCODE):$(ENC) grep '^[ABCDEFGHIJKLMNOPRSTUWYZ]*$$' \
+ | tr [aąbcćdeęfghijklłmnńoóprsśtuwyzźż] [AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ] \
+ | grep '^[AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ]*$$' \
| gzip > $@
# Everything but creating of the Main.dict file is inherited from the
diff --git a/xwords4/dawg/Polish/info.txt b/xwords4/dawg/Polish/info.txt
index 657006233..35eed9ac0 100644
--- a/xwords4/dawg/Polish/info.txt
+++ b/xwords4/dawg/Polish/info.txt
@@ -1,4 +1,4 @@
-# -*- coding: iso-8859-2; mode: conf; -*-
+# -*- mode: conf; -*-
# Copyright 2002-2009 by Eric House (xwords@eehouse.org). All rights
# reserved.
#
@@ -17,12 +17,12 @@
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
LANGCODE:pl_PL
-CHARSET:iso-8859-2
+CHARSET:utf-8
# deal with DOS files
LANGFILTER: tr -d '\r'
-LANGFILTER: | tr [a-pr-uwyz] [A-PR-UWYZʣӦ]
-LANGFILTER: | grep '^[A-PR-UWYZʣӦ]*$'
+LANGFILTER: | tr [a-pr-uwyząćęłńóśźż] [A-PR-UWYZĄĆĘŁŃÓŚŹŻ]
+LANGFILTER: | grep '^[A-PR-UWYZĄĆĘŁŃÓŚŹŻ]*$'
LANGFILTER: | tr '\n' '\000'
D2DARGS: -r -term 0
@@ -42,8 +42,8 @@ LANGINFO: this working.
LANGINFO: Note that the blank is the last tile here, while with all
LANGINFO: other languages it's the first.
-LANGINFO: Also, please note that we currently require the files you
-LANGINFO: upload to use the iso-8859-2 character encoding.
+# LANGINFO: Also, please note that we currently require the files you
+# LANGINFO: upload to use the iso-8859-2 character encoding.
# High bit means "official". Next 7 bits are an enum where
# Polish==8. Low byte is padding
@@ -51,13 +51,13 @@ XLOC_HEADER:0x8800
9 1 'A'
-1 5 161 # ''
+1 5 'Ą'
2 3 'B'
3 2 'C'
-1 6 198 # ''
+1 6 'Ć'
3 2 'D'
7 1 'E'
-1 5 202 # ''
+1 5 'Ę'
1 5 'F'
2 3 'G'
2 3 'H'
@@ -65,23 +65,23 @@ XLOC_HEADER:0x8800
2 3 'J'
3 3 'K'
3 2 'L'
-2 3 163 # ''
+2 3 'Ł'
3 2 'M'
5 1 'N'
-1 7 209 # ''
+1 7 'Ń'
6 1 'O'
-1 5 211 # ''
+1 5 'Ó'
3 2 'P'
4 1 'R'
4 1 'S'
-1 5 166 # ''
+1 5 'Ś'
3 2 'T'
2 3 'U'
4 1 'W'
4 2 'Y'
5 1 'Z'
-1 9 172 # ''
-1 5 175 # ''
+1 9 'Ź'
+1 5 'Ż'
# the blank *must* be last here!!!
2 0 {"_"}
diff --git a/xwords4/dawg/Spanish/Makefile b/xwords4/dawg/Spanish/Makefile
index 591a150c1..455a8c825 100644
--- a/xwords4/dawg/Spanish/Makefile
+++ b/xwords4/dawg/Spanish/Makefile
@@ -1,4 +1,4 @@
-# -*-mode: Makefile; compile-command: "make all"; coding: iso-8859-1; -*-
+# -*-mode: Makefile; compile-command: "make all"; coding: utf-8; -*-
# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
#
# This program is free software; you can redistribute it and/or
@@ -18,7 +18,7 @@
XWLANG = SpanishFAA41
LANGCODE = es_ES
TARGET_TYPE ?= WINCE
-ENC = ISO-8859-1
+ENC = UTF-8
ifeq ($(TARGET_TYPE),PALM)
PBITMS = ./bmps/palm
@@ -44,14 +44,13 @@ include ../Makefile.langcommon
#$(LANG)Main.dict.gz: SpanishMain.dict.gz
# ln -s $< $@
-SOURCEDICT ?= $(XWDICTPATH)/Spanish/FAA_4.1.txt.gz
+SOURCEDICT ?= $(XWDICTPATH)/Spanish/FAA_4.1.utf8.gz
$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile
zcat $< \
| tr -d '\r' \
- | tr '\207\216\222\227\234\237\226' 'aeiouu\321' \
- | tr [a-z] [A-Z] \
- | LANG=$(LANGCODE):$(ENC) grep '^[A-JL-VX-Z]*$$' \
+ | tr [a-zñ] [A-ZÑ] \
+ | LANG=$(LANGCODE):$(ENC) grep '^[A-JL-VX-ZÑ]*$$' \
| sed 's/CH/1/g' \
| sed 's/LL/2/g' \
| sed 's/RR/3/g' \
diff --git a/xwords4/dawg/Spanish/info.txt b/xwords4/dawg/Spanish/info.txt
index 7a086e2b4..03a4ec1ff 100644
--- a/xwords4/dawg/Spanish/info.txt
+++ b/xwords4/dawg/Spanish/info.txt
@@ -1,4 +1,4 @@
-# -*- mode: conf; coding: iso-8859-1; -*-
+# -*- mode: conf; coding: utf-8; -*-
# Copyright 2002-2006 by Eric House (xwords@eehouse.org). All rights
# reserved.
#
@@ -20,6 +20,7 @@
# below
NEEDSSORT:true
+CHARSET: utf-8
# MSDos LF chars go bye-bye
LANGFILTER: tr -d '\r'
@@ -27,9 +28,9 @@ LANGFILTER: tr -d '\r'
# convert accented vowels
LANGFILTER: | tr '\207\216\222\227\234\237\226' 'aeiouu\321'
# uppercase
-LANGFILTER: | tr [a-z] [A-Z]
+LANGFILTER: | tr [a-zñ] [A-ZÑ]
# remove words with illegal letters
-LANGFILTER: | grep '^[[A-JL-VX-Z]*$'
+LANGFILTER: | grep '^[[A-JL-VX-ZÑ]*$'
# substitute pairs (can't figure out how to use octal values)
LANGFILTER: | sed 's/CH/1/g'
LANGFILTER: | sed 's/LL/2/g'
@@ -43,7 +44,7 @@ LANGFILTER: | sort -u -z
D2DARGS: -r -term 0
LANGINFO: Spanish words include all letters in the English alphabet
-LANGINFO: except "K" and "W", and with "" added. Since there are no
+LANGINFO: except "K" and "W", and with "Ñ" added. Since there are no
LANGINFO: tiles for accented vowels, these are replaced by the
LANGINFO: unaccented forms.
@@ -92,8 +93,7 @@ XLOC_HEADER:0x8600
1 8 {"LL", true, true}
2 3 'M'
5 1 'N'
-# /*'N~'*/
-1 8 209
+1 8 'Ñ'
9 1 'O'
2 3 'P'
1 5 'Q'
diff --git a/xwords4/dawg/Swedish/Makefile b/xwords4/dawg/Swedish/Makefile
index d830ba744..56f363240 100644
--- a/xwords4/dawg/Swedish/Makefile
+++ b/xwords4/dawg/Swedish/Makefile
@@ -1,4 +1,4 @@
-# -*-mode: Makefile; coding: iso-8859-1; -*-
+# -*-mode: Makefile; coding: utf-8; -*-
# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
#
# This program is free software; you can redistribute it and/or
@@ -17,7 +17,7 @@
XWLANG=Swedish
LANGCODE=sv_SE
-ENC = ISO-8859-1
+ENC = UTF-8
# Swedish has too many chars for the old format.
NEWDAWG=whatever
@@ -28,14 +28,14 @@ include ../Makefile.2to8
include ../Makefile.langcommon
-SOURCEDICT ?= $(XWDICTPATH)/Swedish/swedish15.dict.gz
+SOURCEDICT ?= $(XWDICTPATH)/Swedish/swedish15.utf8.gz
# Q and W are not available as tiles, but I'm told there's a custom in
# Swedish play of allowing blanks to stand for those letters as well.
# So we don't exclude words with those letters from the dictionary.
$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile
- zcat $< | tr [a-z] [A-Z] | \
- LANG=$(LANGCODE):$(ENC) grep '^[A-Z]\{2,15\}$$' | \
+ zcat $< | tr [a-zäåæöü] [A-ZÄÅÆÖÜ] | \
+ LANG=$(LANGCODE):$(ENC) grep '^[A-ZÄÅÆÖÜ]\{2,15\}$$' | \
gzip -c > $@
# Everything but creating of the Main.dict file is inherited from the
diff --git a/xwords4/dawg/Swedish/info.txt b/xwords4/dawg/Swedish/info.txt
index 40dde73ef..7f95ee3c6 100644
--- a/xwords4/dawg/Swedish/info.txt
+++ b/xwords4/dawg/Swedish/info.txt
@@ -1,4 +1,4 @@
-# -*- mode: conf; coding: iso-8859-1; -*-
+# -*- mode: conf; coding: utf-8; -*-
# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
#
# This program is free software; you can redistribute it and/or
@@ -15,16 +15,17 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+CHARSET: utf-8
LANGCODE:sv_SE
LANGFILTER: tr -d '\r'
-LANGFILTER: | tr [a-z] [A-Z]
-LANGFILTER: | grep '^[A-Z]*$'
+LANGFILTER: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ]
+LANGFILTER: | grep '^[A-ZÄÅÆÖÜ]*$'
D2DARGS: -r -term 10
LANGINFO: From an English-speaker's perspective, Swedish drops Q
-LANGINFO: and W, and adds , , , and .
+LANGINFO: and W, and adds Ä, Å, Æ, Ö and Ü.
# High bit means "official". Next 7 bits are an enum where
# Swedish==7. Low byte is padding
@@ -36,11 +37,11 @@ XLOC_HEADER:0x8700
2 0 {"_"}
8 1 'A'
# A with two dots
-2 3 ''
+2 3 'Ä'
# A with circle
-2 4 ''
-# tile only available for blanks
-0 1 ''
+2 4 'Å'
+# Æ tile only available for blanks
+0 1 'Æ'
2 4 'B'
1 8 'C'
5 1 'D'
@@ -56,7 +57,7 @@ XLOC_HEADER:0x8700
6 1 'N'
5 2 'O'
# O with two dots
-2 4 ''
+2 4 'Ö'
2 4 'P'
# Q tile only available for blanks
0 1 'Q'
@@ -64,13 +65,12 @@ XLOC_HEADER:0x8700
8 1 'S'
8 1 'T'
3 4 'U'
-# tile only available for blanks
-0 1 ''
+# Ü tile only available for blanks
+0 1 'Ü'
2 3 'V'
# W tile only available for blanks
0 1 'W'
1 8 'X'
1 7 'Y'
1 10 'Z'
-
diff --git a/xwords4/dawg/dawg2dict.pl b/xwords4/dawg/dawg2dict.pl
index b4565cd34..70d93343a 100755
--- a/xwords4/dawg/dawg2dict.pl
+++ b/xwords4/dawg/dawg2dict.pl
@@ -1,6 +1,6 @@
-#!/usr/bin/perl
+#!/usr/bin/perl -CS
#
-# Copyright 2004 by Eric House (xwords@eehouse.org)
+# Copyright 2004 - 2009 by Eric House (xwords@eehouse.org)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
@@ -22,9 +22,12 @@
use strict;
use Fcntl;
+use Encode 'from_to';
+use Encode;
my $gInFile;
my $gDoRaw = 0;
+my $gDoJSON = 0;
my $gFileType;
my $gNodeSize;
@@ -33,7 +36,7 @@ sub systell { sysseek($_[0], 0, SEEK_CUR) }
sub usage() {
print STDERR "USAGE: $0 "
- . "[-raw] "
+ . "[-raw | -json] "
. "-dict "
. "\n"
. "\t(Takes a .pdb or .xwd and prints its words to stdout)\n";
@@ -45,6 +48,8 @@ sub parseARGV() {
while ( my $parm = shift(@ARGV) ) {
if ( $parm eq "-raw" ) {
$gDoRaw = 1;
+ } elsif ( $parm eq "-json" ) {
+ $gDoJSON = 1;
} elsif ( $parm eq "-dict" ) {
$gInFile = shift(@ARGV);
} else {
@@ -72,18 +77,32 @@ sub countSpecials($) {
sub readXWDFaces($$$) {
my ( $fh, $facRef, $nSpecials ) = @_;
- my $buf;
- my $nRead = sysread( $fh, $buf, 1 );
- my $nChars = unpack( 'c', $buf );
+ my ( $buf, $nRead, $nChars, $nBytes );
+ $nRead = sysread( $fh, $buf, 1 );
+ $nBytes = unpack( 'c', $buf );
+ printf STDERR "nBytes of faces: %d\n", $nBytes;
+ $nRead = sysread( $fh, $buf, 1 );
+ $nChars = unpack( 'c', $buf );
+ printf STDERR "nChars of faces: %d\n", $nChars;
+ binmode( $fh, ":encoding(utf8)" ) or die "binmode(:utf-8) failed\n";
+ sysread( $fh, $buf, $nChars );
+ length($buf) == $nChars or die "didn't read expected number of bytes\n";
+ binmode( $fh ) or die "binmode failed\n";
+
+ print STDERR "string now: $buf\n";
my @faces;
- for ( my $i = 0; $i < $nChars; ++$i ) {
- my $nRead = sysread( $fh, $buf, 2 );
- push( @faces, chr(unpack( "n", $buf ) ) );
+ for ( my $ii = 0; $ii < $nChars; ++$ii ) {
+ my $chr = substr( $buf, $ii, 1 );
+ print STDERR "pushing $chr \n";
+ push( @faces, $chr );
}
+ printf STDERR "at 0x%x after reading faces\n", systell($fh);
+
${$nSpecials} = countSpecials( \@faces );
@{$facRef} = @faces;
+ printf STDERR "readXWDFaces=>%d\n", $nChars;
return $nChars;
} # readXWDFaces
@@ -99,6 +118,7 @@ sub skipBitmap($) {
sysread( $fh, $buf, $nBytes );
}
+ printf STDERR "skipBitmap\n";
} # skipBitmap
sub getSpecials($$$) {
@@ -138,9 +158,9 @@ sub readNodesToEnd($) {
sub nodeSizeFromFlags($) {
my ( $flags ) = @_;
- if ( $flags == 2 ) {
+ if ( $flags == 4 ) {
return 3;
- } elsif ( $flags == 3 ) {
+ } elsif ( $flags == 5 ) {
return 4;
} else {
die "invalid dict flags $flags";
@@ -161,6 +181,7 @@ sub mergeSpecials($$) {
sub prepXWD($$$$) {
my ( $fh, $facRef, $nodesRef, $startRef ) = @_;
+ printf STDERR "at 0x%x at start\n", systell($fh);
my $buf;
my $nRead = sysread( $fh, $buf, 2 );
my $flags = unpack( "n", $buf );
@@ -170,24 +191,30 @@ sub prepXWD($$$$) {
my $nSpecials;
my $faceCount = readXWDFaces( $fh, $facRef, \$nSpecials );
+ printf STDERR "at 0x%x before header read\n", systell($fh);
# skip xloc header
$nRead = sysread( $fh, $buf, 2 );
# skip values info.
+ printf STDERR "at 0x%x before reading %d values\n", systell($fh), $faceCount;
sysread( $fh, $buf, $faceCount * 2 );
+ printf STDERR "at 0x%x after values read\n", systell($fh);
+ printf STDERR "at 0x%x before specials read\n", systell($fh);
my @specials;
getSpecials( $fh, $nSpecials, \@specials );
mergeSpecials( $facRef, \@specials );
+ printf STDERR "at 0x%x after specials read\n", systell($fh);
-# printf STDERR "at 0x%x before offset read\n", systell($fh);
+ printf STDERR "at 0x%x before offset read\n", systell($fh);
sysread( $fh, $buf, 4 );
$$startRef = unpack( 'N', $buf );
-# print STDERR "startRef=$$startRef\n";
+ print STDERR "startRef=$$startRef\n";
my @nodes = readNodesToEnd( $fh );
@$nodesRef = @nodes;
+ print STDERR "prepXWD done\n";
} # prepXWD
sub readPDBSpecials($$$$$) {
@@ -342,10 +369,52 @@ sub printNodes($$) {
}
}
+sub printStartJson($) {
+ my ( $startIndex ) = @_;
+ printf( " start: 0x%.8x,\n", $startIndex );
+}
+
+sub printCharsJson($) {
+ my ( $fr ) = @_;
+ print " chars: [ ";
+ foreach my $char (@$fr) {
+ print "\"$char\", "
+ }
+ print "],\n"
+}
+
+sub printNodesJson($) {
+ my ( $nr ) = @_;
+ print " dawg: [\n";
+
+ my $len = @$nr;
+ my $newLine = 1;
+ for ( my $ii = 0; $ii < $len; ++$ii ) {
+ my $node = $$nr[$ii];
+
+ if ( $newLine ) {
+ printf( " /*%.6x*/ ", $ii );
+ $newLine = 0;
+ }
+
+ printf "0x%.8x, ", $node;
+
+ my ( $chrIndex, $nextEdge, $accepting, $lastEdge );
+ parseNode( $node, \$chrIndex, \$nextEdge, \$accepting, \$lastEdge );
+ if ( $lastEdge ) {
+ print "\n";
+ $newLine = 1;
+ }
+ }
+
+ print "\n ],\n"
+}
+
#################################################################
# main
#################################################################
+binmode( STDERR, ":encoding(utf8)" ) or die "binmode(:utf-8) failed\n";
parseARGV();
@@ -364,9 +433,17 @@ if ( $gFileType eq "xwd" ){
close INFILE;
die "no nodes!!!" if 0 == @nodes;
+
if ( $gDoRaw ) {
printNodes( \@nodes, \@faces );
+} elsif ( $gDoJSON ) {
+ print "dict = {\n";
+ printStartJson( $startIndex );
+ printCharsJson( \@faces );
+ printNodesJson( \@nodes );
+ print "}\n";
} else {
+ binmode( STDOUT, ":encoding(utf8)" ) or die "binmode(:utf-8) failed\n";
printDAWG( [], \@nodes, $startIndex, \@faces );
}
diff --git a/xwords4/dawg/dict2dawg.cpp b/xwords4/dawg/dict2dawg.cpp
index 873593397..c48f5655e 100644
--- a/xwords4/dawg/dict2dawg.cpp
+++ b/xwords4/dawg/dict2dawg.cpp
@@ -78,7 +78,7 @@ static void (*gReadWordProc)(void) = NULL;
static NodeList gNodes; // final array of nodes
static unsigned int gNBytesPerOutfile = 0xFFFFFFFF;
static char* gTableFile = NULL;
-static bool gIsMultibyte = false;
+static bool gIsMultibyte = true; // always true
static const char* gEncoding = NULL;
static char* gOutFileBase = NULL;
static char* gStartNodeOut = NULL;
@@ -91,9 +91,9 @@ static const char* gLang = NULL;
static char* gBytesPerNodeFile = NULL; // where to write whether node
// size 3 or 4
int gWordCount = 0;
-std::map gTableHash;
+std::map gTableHash;
int gBlankIndex;
-std::vector gRevMap;
+std::vector gRevMap;
#ifdef DEBUG
bool gDebug = false;
#endif
@@ -107,17 +107,19 @@ int gLimHigh = MAX_WORD_LEN;
// OWL is 1.7M
-#define MAX_POOL_SIZE (10 * 0x100000)
+#define MAX_POOL_SIZE (10 * 0x100000 * sizeof(wchar_t))
#define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ );
+#define VSIZE(a) (sizeof(a)/sizeof(a[0]))
static char* parseARGV( int argc, char** argv, const char** inFileName );
static void usage( const char* name );
static void error_exit( int line, const char* fmt, ... );
static void makeTableHash( void );
+static void printTableHash( void );
static WordList* parseAndSort( void );
static void printWords( WordList* strings );
static bool firstBeforeSecond( const Letter* lhs, const Letter* rhs );
-static char* tileToAscii( char* out, int outSize, const Letter* in );
+static wchar_t* tilesToText( wchar_t* out, int outLen, const Letter* in );
static int buildNode( int depth );
static void TrieNodeSetIsLastSibling( Node* nodeR, bool isLastSibling );
static int addNodes( NodeList& newedgesR );
@@ -178,6 +180,7 @@ main( int argc, char** argv )
}
makeTableHash();
+ printTableHash();
// Do I need this stupid thing? Better to move the first row to
// the front of the array and patch everything else. Or fix the
@@ -451,9 +454,9 @@ readFromSortedArray( void )
}
#ifdef DEBUG
if ( gDebug ) {
- char buf[T2ABUFLEN(MAX_WORD_LEN)];
- fprintf( stderr, "%s: got word: %s\n", __func__,
- tileToAscii( buf, sizeof(buf), word ) );
+ wchar_t buf[T2ABUFLEN(MAX_WORD_LEN)];
+ fprintf( stderr, "%s: got word: %ls\n", __func__,
+ tilesToText( buf, VSIZE(buf), word ) );
}
#endif
}
@@ -473,13 +476,13 @@ readFromSortedArray( void )
&& !firstBeforeSecond( gCurrentWord, word ) ) {
#ifdef DEBUG
if ( gDebug ) {
- char buf1[T2ABUFLEN(MAX_WORD_LEN)];
- char buf2[T2ABUFLEN(MAX_WORD_LEN)];
+ wchar_t buf1[T2ABUFLEN(MAX_WORD_LEN)];
+ wchar_t buf2[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr,
- "%s: words %s and %s are the same or out of order\n",
+ "%s: words %ls and %ls are the same or out of order\n",
__func__,
- tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
- tileToAscii( buf2, sizeof(buf2), word ) );
+ tilesToText( buf1, VSIZE(buf1), gCurrentWord ),
+ tilesToText( buf2, VSIZE(buf2), word ) );
}
#endif
continue;
@@ -492,9 +495,9 @@ readFromSortedArray( void )
#ifdef DEBUG
if ( gDebug ) {
- char buf[T2ABUFLEN(MAX_WORD_LEN)];
- fprintf( stderr, "gCurrentWord now %s\n",
- tileToAscii( buf, sizeof(buf), gCurrentWord) );
+ wchar_t buf[T2ABUFLEN(MAX_WORD_LEN)];
+ fprintf( stderr, "gCurrentWord now %ls\n",
+ tilesToText( buf, VSIZE(buf), gCurrentWord) );
}
#endif
} // readFromSortedArray
@@ -516,6 +519,9 @@ getWideChar( FILE* file )
assert( 0 == ii );
dest = byt;
break;
+ } else if ( byt < ' ' && 0 == ii ) {
+ dest = byt;
+ break;
}
assert( ii < 4 );
@@ -533,7 +539,7 @@ getWideChar( FILE* file )
} // getWideChar
static Letter*
-readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
+readOneWord( Letter* wordBuf, const int bufLen, int* lenp, bool* gotEOF )
{
Letter* result = NULL;
int count = 0;
@@ -545,7 +551,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
// return it. If no, start over ONLY IF the terminator was not
// EOF.
for ( ; ; ) {
- wchar_t byt = gIsMultibyte? getWideChar( gInFile ) : getc( gInFile );
+ wchar_t byt = getWideChar( gInFile );
// EOF is special: we don't try for another word even if
// dropWord is true; we must leave now.
@@ -557,6 +563,13 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
if ( !dropWord && (count >= gLimLow) && (count <= gLimHigh) ) {
assert( count < bufLen );
wordBuf[count] = '\0';
+#ifdef DEBUG
+ if ( gDebug ) {
+ wchar_t buf[T2ABUFLEN(count)];
+ fprintf( stderr, "%s: adding word: %ls\n",
+ __func__, tilesToText( buf, VSIZE(buf), wordBuf ) );
+ }
+#endif
result = wordBuf;
*lenp = count;
++gWordCount;
@@ -567,11 +580,12 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
}
#ifdef DEBUG
if ( gDebug ) {
- char buf[T2ABUFLEN(count)];
+ wchar_t buf[T2ABUFLEN(count)];
wordBuf[count] = '\0';
- fprintf( stderr, "%s: dropping word (len %d>=%d): %s\n",
- __func__, count, gLimHigh,
- tileToAscii( buf, sizeof(buf), wordBuf ) );
+ fprintf( stderr, "%s: dropping word (len %d >%d or <%d or "
+ "dropWord:%d): %ls\n", __func__, count, gLimHigh,
+ gLimLow, (int)dropWord,
+ tilesToText( buf, VSIZE(buf), wordBuf ) );
}
#endif
count = 0; // we'll start over
@@ -579,43 +593,43 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF )
} else if ( count >= bufLen ) {
// Just drop it...
+ assert(0); // Fix this -- but need to warn when out of
+ // memory!!!
dropWord = true;
// Don't call into the hashtable twice here!!
- } else if ( gTableHash.find(byt) != gTableHash.end() ) {
- assert( count < bufLen );
- wordBuf[count++] = gTableHash[byt];
- if ( count >= bufLen ) {
- dropWord = true;
- }
- } else if ( gKillIfMissing || !dropWord ) {
- char buf[T2ABUFLEN(count)];
- wordBuf[count] = '\0';
-
- tileToAscii( buf, sizeof(buf), wordBuf );
-
- if ( gKillIfMissing ) {
- ERROR_EXIT( "chr %lc (%d/0x%x) not in map file %s\n"
- "last word was %s\n",
- byt, (int)byt, (int)byt, gTableFile, buf );
- } else if ( !dropWord ) {
-#ifdef DEBUG
- if ( gDebug ) {
- fprintf( stderr, "%s: chr %c (%d) not in map file %s\n"
- "dropping partial word %s\n", __func__,
- (char)byt, (int)byt, gTableFile, buf );
+ } else {
+ std::map::iterator iter = gTableHash.find(byt);
+ if ( iter != gTableHash.end() ) {
+ assert( count < bufLen );
+ wordBuf[count++] = iter->second;
+ if ( count >= bufLen ) {
+ dropWord = true;
}
+ } else if ( gKillIfMissing || !dropWord ) {
+ wchar_t buf[T2ABUFLEN(count)];
+ wordBuf[count] = '\0';
+
+ tilesToText( buf, VSIZE(buf), wordBuf );
+
+ if ( gKillIfMissing ) {
+ ERROR_EXIT( "chr %lc (%d/0x%x) not in map file %s\n"
+ "last word was %ls\n",
+ byt, (int)byt, (int)byt, gTableFile, buf );
+ } else if ( !dropWord ) {
+#ifdef DEBUG
+ if ( gDebug ) {
+ fprintf( stderr, "%s: chr %lc (%d) not in map file %s\n"
+ "dropping partial word %ls\n", __func__,
+ byt, (int)byt, gTableFile, buf );
+ }
#endif
- dropWord = true;
+ dropWord = true;
+ }
}
}
- }
+ } // for
-// if ( NULL != result ) {
-// char buf[T2ABUFLEN(MAX_WORD_LEN)];
-// fprintf( stderr, "%s returning %s\n", __func__,
-// tileToAscii( buf, sizeof(buf), result ) );
-// }
return result;
} // readOneWord
@@ -635,7 +649,7 @@ readFromFile( void )
// during the sort. This seems easier.
for ( ; ; ) {
if ( !gDone ) {
- word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof );
+ word = readOneWord( wordBuf, VSIZE(wordBuf), &len, &s_eof );
gDone = NULL == word;
}
if ( gDone ) {
@@ -658,13 +672,13 @@ readFromFile( void )
&& !firstBeforeSecond( gCurrentWord, word ) ) {
#ifdef DEBUG
if ( gDebug ) {
- char buf1[T2ABUFLEN(MAX_WORD_LEN)];
- char buf2[T2ABUFLEN(MAX_WORD_LEN)];
+ wchar_t buf1[T2ABUFLEN(MAX_WORD_LEN)];
+ wchar_t buf2[T2ABUFLEN(MAX_WORD_LEN)];
fprintf( stderr,
- "%s: words %s and %s are the smae or out of order\n",
+ "%s: words %ls and %ls are the same or out of order\n",
__func__,
- tileToAscii( buf1, sizeof(buf1), gCurrentWord ),
- tileToAscii( buf2, sizeof(buf2), word ) );
+ tilesToText( buf1, VSIZE(buf1), gCurrentWord ),
+ tilesToText( buf2, VSIZE(buf2), word ) );
}
#endif
continue;
@@ -676,9 +690,9 @@ readFromFile( void )
#ifdef DEBUG
if ( gDebug ) {
- char buf[T2ABUFLEN(MAX_WORD_LEN)];
- fprintf( stderr, "gCurrentWord now %s\n",
- tileToAscii( buf, sizeof(buf), gCurrentWord) );
+ wchar_t buf[T2ABUFLEN(MAX_WORD_LEN)];
+ fprintf( stderr, "gCurrentWord now %ls\n",
+ tilesToText( buf, VSIZE(buf), gCurrentWord) );
}
#endif
} // readFromFile
@@ -690,14 +704,15 @@ firstBeforeSecond( const Letter* lhs, const Letter* rhs )
return gt;
}
-static char*
-tileToAscii( char* out, int outSize, const Letter* in )
+static wchar_t*
+tilesToText( wchar_t* out, int outSize, const Letter* in )
{
- char tiles[outSize];
- int tilesLen = 1;
- tiles[0] = '[';
+ wchar_t tiles[outSize];
+ wchar_t* orig = out;
+ int tilesLen = 0;
+
+ tiles[tilesLen++] = L'[';
- char* orig = out;
for ( ; ; ) {
Letter ch = *in++;
if ( '\0' == ch ) {
@@ -705,14 +720,15 @@ tileToAscii( char* out, int outSize, const Letter* in )
}
assert( ch < gRevMap.size() );
*out++ = gRevMap[ch];
- tilesLen += sprintf( &tiles[tilesLen], "%d,", ch );
+
+ tilesLen += swprintf( &tiles[tilesLen], outSize-tilesLen, L"%d,", ch );
assert( (out - orig) < outSize );
}
assert( tilesLen+1 < outSize );
- tiles[tilesLen] = ']';
- tiles[tilesLen+1] = '\0';
- strcpy( out, tiles );
+ tiles[tilesLen] = L']';
+ tiles[tilesLen+1] = L'\0';
+ wcscpy( out, tiles );
return orig;
}
@@ -777,9 +793,9 @@ printWords( WordList* strings )
{
std::vector::iterator iter = strings->begin();
while ( iter != strings->end() ) {
- char buf[T2ABUFLEN(MAX_WORD_LEN)];
- tileToAscii( buf, sizeof(buf), *iter );
- fprintf( stderr, "%s\n", buf );
+ wchar_t buf[T2ABUFLEN(MAX_WORD_LEN)];
+ tilesToText( buf, VSIZE(buf), *iter );
+ fprintf( stderr, "%ls\n", buf );
++iter;
}
}
@@ -906,18 +922,12 @@ makeTableHash( void )
gRevMap.push_back(0);
for ( ii = 0; ; ++ii ) {
- int ch = getc(TABLEFILE);
- if ( ch == EOF ) {
- break;
- }
-
- if ( gUseUnicode ) { // skip the first byte each time: tmp HACK!!!
- ch = getc(TABLEFILE);
- }
- if ( ch == EOF ) {
+ wchar_t ch = getWideChar( TABLEFILE );
+ if ( EOF == ch ) {
break;
}
+ fprintf( stderr, "adding %lc/%x\n", ch, ch );
gRevMap.push_back(ch);
if ( ch == 0 ) { // blank
@@ -940,6 +950,26 @@ makeTableHash( void )
fclose( TABLEFILE );
} // makeTableHash
+// Debug-only: dump each non-0 gRevMap entry and check gTableHash maps it back.
+static void
+printTableHash( void )
+{
+    for ( size_t ii = 0; gDebug && ii < gRevMap.size(); ++ii ) {
+        wchar_t ch = gRevMap[ii];
+        if ( 0 == ch ) {        // 0th entry is 0
+            continue;
+        }
+        fprintf( stderr, "%s: gRevMap[%d]: %lc\n", __func__, (int)ii, ch );
+        // count() first: operator[] would insert a bogus entry on a miss
+        bool found = 0 != gTableHash.count( ch );
+        assert( found );
+        if ( found ) {
+            fprintf( stderr, "%s: gTableHash[%lc]: %d\n", __func__, ch, gTableHash[ch] );
+            assert( gTableHash[ch] == (int)ii );
+        }
+    }
+}
+
// emitNodes. "input" is $gNodes. From it we write up to
// $nBytesPerOutfile to files named $outFileBase0..n, mapping the
// letter field down to 5 bits with a hash built from $tableFile. If
@@ -1065,6 +1095,9 @@ outputNode( Node node, int nBytes, FILE* outfile )
unsigned int fco = TrieNodeGetFirstChildOffset(node);
unsigned int fourthByte = 0;
+ assert( ((3 == nBytes) && (fco < (1<<17)))
+ || ((4 == nBytes) && (fco < (1<<24))) );
+
if ( nBytes == 4 ) {
fourthByte = fco >> 16;
if ( fourthByte > 0xFF ) {
@@ -1085,7 +1118,7 @@ outputNode( Node node, int nBytes, FILE* outfile )
// | | |
// accepting bit ---+ | |
// last edge bit ------+ |
- // ---- last bit (17th on next node addr)---------+
+ // ---- last bit (17th of next node addr)---------+
// The four-byte format adds a byte at the right end for
// addressing, but removes the extra bit (5) in order to let the
@@ -1247,13 +1280,13 @@ parseARGV( int argc, char** argv, const char** inFileName )
if ( !!enc ) {
if ( !strcasecmp( enc, "UTF-8" ) ) {
- gIsMultibyte = true;
+// gIsMultibyte = true;
} else if ( !strcasecmp( enc, "iso-8859-1" ) ) {
- gIsMultibyte = false;
+// gIsMultibyte = false;
} else if ( !strcasecmp( enc, "iso-latin-1" ) ) {
- gIsMultibyte = false;
+// gIsMultibyte = false;
} else if ( !strcasecmp( enc, "ISO-8859-2" ) ) {
- gIsMultibyte = false;
+// gIsMultibyte = false;
} else {
ERROR_EXIT( "%s: unknown encoding %s", __func__, enc );
}
diff --git a/xwords4/dawg/dictstats.pl b/xwords4/dawg/dictstats.pl
index 567a77f16..8465ca8df 100755
--- a/xwords4/dawg/dictstats.pl
+++ b/xwords4/dawg/dictstats.pl
@@ -15,9 +15,15 @@
use strict;
my @wordSizeCounts;
-my @letterCounts;
+my %letterCounts;
my $wordCount;
my $letterCount;
+my $enc = "utf8"; # this could be a cmdline arg....
+
+if ( $enc ) {
+ binmode( STDOUT, ":encoding($enc)" ) ;
+ binmode( STDIN, ":encoding($enc)" ) ;
+}
while (<>) {
@@ -27,10 +33,10 @@ while (<>) {
++$wordCount;
foreach my $letter (split( / */ ) ) {
- my $i = ord($letter);
+ my $ii = ord($letter);
# special-case the bogus chars we add for "specials"
- die "$0: this is a letter?: $i" if $i <= 32 && $i >= 4 && $i != 0;
- ++$letterCounts[$i];
+ die "$0: this is a letter?: $ii" if $ii <= 32 && $ii >= 4 && $ii != 0;
+ ++$letterCounts{$letter};
++$letterCount;
}
}
@@ -54,14 +60,12 @@ for ( my $i = 1 ; $i <= 99; ++$i ) {
print "\n\n**** Letter counts ****\n";
print " ASCII ORD HEX PCT (of $letterCount)\n";
my $lineNo = 1;
-for ( my $i = 0; $i < 255; ++$i ) {
- my $count = $letterCounts[$i];
- if ( $count > 0 ) {
- my $pct = (100.00 * $count) / $letterCount;
- printf( "%2d: %3s %3d %x %5.2f (%d)\n",
- $lineNo, chr($i), $i, $i, $pct, $count );
- ++$lineNo;
- }
+foreach my $key (sort keys %letterCounts) {
+ my $count = $letterCounts{$key};
+ my $pct = (100.00 * $count) / $letterCount;
+ printf( "%2d: %3s %3d %x %5.2f (%d)\n",
+ $lineNo, $key, ord($key), ord($key), $pct, $count );
+ ++$lineNo;
}
print "\n";
diff --git a/xwords4/dawg/xloc.pl b/xwords4/dawg/xloc.pl
index 20b72fcc9..23ef0ca43 100755
--- a/xwords4/dawg/xloc.pl
+++ b/xwords4/dawg/xloc.pl
@@ -23,6 +23,7 @@ use xloc;
my $unicode = -1;
my $doval = 0;
+my $dosize = 0;
my $enc;
my $outfile;
@@ -37,6 +38,8 @@ while ( $arg = $ARGV[0] ) {
$unicode = 0;
} elsif ( $arg eq "-v" ) {
$doval = 1;
+ } elsif ( $arg eq "-s" ) {
+ $dosize = 1;
} elsif ( $arg eq '-out' ) {
$outfile = $ARGV[1];
shift @ARGV;
@@ -52,12 +55,20 @@ die "info file $infoFile not found\n" if ! -s $infoFile;
my $xlocToken = xloc::ParseTileInfo($infoFile, $enc);
-open OUTFILE, "> $outfile";
+if ( $enc ) {
+ open OUTFILE, ">:encoding($enc)", "$outfile"
+ or die "couldn't open $outfile";
+} else {
+ open OUTFILE, ">$outfile" or die "couldn't open $outfile";
+}
# For f*cking windoze linefeeds
-binmode( OUTFILE );
+# binmode( OUTFILE );
if ( $unicode ne -1 ) {
xloc::WriteMapFile( $xlocToken, $unicode, \*OUTFILE );
+} elsif ( $dosize ) {
+ my $count = xloc::GetNTiles( $xlocToken );
+ print OUTFILE pack("c", $count );
} elsif ( $doval ) {
xloc::WriteValuesFile( $xlocToken, \*OUTFILE );
}
diff --git a/xwords4/dawg/xloc.pm b/xwords4/dawg/xloc.pm
index 741968e76..6e25fa138 100644
--- a/xwords4/dawg/xloc.pm
+++ b/xwords4/dawg/xloc.pm
@@ -103,13 +103,6 @@ sub GetValue($$) {
sub WriteMapFile($$$) {
my ( $hashR, $unicode, $fhr ) = @_;
- my $packStr;
- if ( $unicode ) {
- $packStr = "n";
- } else {
- $packStr = "C";
- }
-
my $count = GetNTiles($hashR);
my $specialCount = 0;
for ( my $i = 0; $i < $count; ++$i ) {
@@ -117,11 +110,12 @@ sub WriteMapFile($$$) {
my $str = ${$tileR}[2];
if ( $str =~ /\'(.)\'/ ) {
- print $fhr pack($packStr, ord($1) );
+ print $fhr pack( "U", ord($1) );
+# printf STDERR "ord: %x ($1)\n", ord($1);
} elsif ( $str =~ /\"(.+)\"/ ) {
- print $fhr pack($packStr, $specialCount++ );
+ print $fhr pack( "c", $specialCount++ );
} elsif ( $str =~ /(\d+)/ ) {
- print $fhr pack( $packStr, $1 );
+ print $fhr pack( "n", $1 );
} else {
die "WriteMapFile: unrecognized face format $str, elem $i";
}