From 79990bc7b108dd7e009d89eea9f914bdc717a313 Mon Sep 17 00:00:00 2001 From: Andy2 Date: Tue, 30 Nov 2010 18:35:11 -0800 Subject: [PATCH] first set of changes formed by applyinig diff of android_branch's dawg/ directory against unicode_branch's. The two branches seem to have to common ancestor -- probably didn't survive translation from svn -- so this is the best I can do. This checkin is all the files that were modified by the patch plus a couple of simple additions. Next I'll be adding directories that the patch created. It also reintroduced a bunch of .cvsignore files; I won't check those in. --- xwords4/dawg/Catalan/Makefile | 2 +- xwords4/dawg/Catalan/info.txt | 2 +- xwords4/dawg/English/Makefile | 2 +- xwords4/dawg/English/Makefile.CollegeEng | 36 ++++ xwords4/dawg/English/Makefile.OWL2 | 4 +- xwords4/dawg/English/Makefile.TWL06 | 37 ++++ xwords4/dawg/German/Makefile | 17 +- xwords4/dawg/German/info.txt | 24 +-- xwords4/dawg/Hex/Makefile | 12 +- xwords4/dawg/Hex/info.txt | 16 +- xwords4/dawg/Makefile.langcommon | 29 ++-- xwords4/dawg/Polish/Makefile | 14 +- xwords4/dawg/Polish/info.txt | 30 ++-- xwords4/dawg/Spanish/Makefile | 11 +- xwords4/dawg/Spanish/info.txt | 12 +- xwords4/dawg/Swedish/Makefile | 10 +- xwords4/dawg/Swedish/info.txt | 24 +-- xwords4/dawg/dawg2dict.pl | 103 +++++++++-- xwords4/dawg/dict2dawg.cpp | 207 +++++++++++++---------- xwords4/dawg/dictstats.pl | 28 +-- xwords4/dawg/xloc.pl | 15 +- xwords4/dawg/xloc.pm | 14 +- 22 files changed, 423 insertions(+), 226 deletions(-) create mode 100644 xwords4/dawg/English/Makefile.CollegeEng create mode 100644 xwords4/dawg/English/Makefile.TWL06 diff --git a/xwords4/dawg/Catalan/Makefile b/xwords4/dawg/Catalan/Makefile index 46815e993..da54778d7 100644 --- a/xwords4/dawg/Catalan/Makefile +++ b/xwords4/dawg/Catalan/Makefile @@ -33,7 +33,7 @@ endif endif LANG_SPECIAL_INFO = \ - "L-L" $(PBITMS)/large_ll.pbitm $(PBITMS)/small_ll.pbitm \ + "L·L" $(PBITMS)/large_ll.pbitm $(PBITMS)/small_ll.pbitm \ "NY" $(PBITMS)/large_ny.pbitm $(PBITMS)/small_ny.pbitm \ "QU" $(PBITMS)/large_qu.pbitm $(PBITMS)/small_qu.pbitm \ diff --git a/xwords4/dawg/Catalan/info.txt b/xwords4/dawg/Catalan/info.txt index 1cb56442d..9954db826 100644 --- a/xwords4/dawg/Catalan/info.txt +++ b/xwords4/dawg/Catalan/info.txt @@ -75,7 +75,7 @@ XLOC_HEADER:0x8C00 8 1 'I' 1 8 'J' 4 1 'L' -1 10 {"L-L"} +1 10 {"L·L"} 3 2 'M' 6 1 'N' 1 10 {"NY"} diff --git a/xwords4/dawg/English/Makefile b/xwords4/dawg/English/Makefile index 95b975643..21c4b89ac 100644 --- a/xwords4/dawg/English/Makefile +++ b/xwords4/dawg/English/Makefile @@ -15,7 +15,7 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -TARGET_TYPE ?= FRANK +TARGET_TYPE ?= WINCE include ../Makefile.langcommon diff --git a/xwords4/dawg/English/Makefile.CollegeEng b/xwords4/dawg/English/Makefile.CollegeEng new file mode 100644 index 000000000..80d4cbd24 --- /dev/null +++ b/xwords4/dawg/English/Makefile.CollegeEng @@ -0,0 +1,36 @@ +# -*- mode: makefile; compile-command: "make -f Makefile.COSD"; -*- +# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +XWLANG=CollegeEng +LANGCODE=en_US +TARGET_TYPE=WINCE + +include ../Makefile.2to8 + +include ../Makefile.langcommon + +SOURCEDICT ?= $(XWDICTPATH)/English/CollegeEng.dict.gz + +$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile + zcat $< | tr -d '\r' | tr [a-z] [A-Z] | grep -e "^[A-Z]\{2,15\}$$" | \ + gzip -c > $@ + +# Everything but creating of the Main.dict file is inherited from the +# "parent" Makefile.langcommon in the parent directory. + +clean: clean_common + rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb diff --git a/xwords4/dawg/English/Makefile.OWL2 b/xwords4/dawg/English/Makefile.OWL2 index 41479a843..ce7988a5a 100644 --- a/xwords4/dawg/English/Makefile.OWL2 +++ b/xwords4/dawg/English/Makefile.OWL2 @@ -15,9 +15,9 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -XWLANG=OWL2_ +XWLANG=OWL2 LANGCODE=en_US -TARGET_TYPE=PALM +TARGET_TYPE?=PALM include ../Makefile.2to8 diff --git a/xwords4/dawg/English/Makefile.TWL06 b/xwords4/dawg/English/Makefile.TWL06 new file mode 100644 index 000000000..28dd43ca1 --- /dev/null +++ b/xwords4/dawg/English/Makefile.TWL06 @@ -0,0 +1,37 @@ +# -*- mode: makefile; compile-command: "make -f Makefile.COSD"; -*- +# Copyright 2002-2010 by Eric House (xwords@eehouse.org). All rights +# reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +XWLANG=TWL06 +LANGCODE=en_US +TARGET_TYPE=WINCE + +include ../Makefile.2to8 + +include ../Makefile.langcommon + +# from http://www.3zsoftware.com/en/wordmagic/lists.php +SOURCEDICT ?= $(XWDICTPATH)/English/twl06.zip + +$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile + zcat $< | grep -e "^[A-Z]\{2,15\}$$" | gzip -c > $@ + +# Everything but creating of the Main.dict file is inherited from the +# "parent" Makefile.langcommon in the parent directory. + +clean: clean_common + rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb diff --git a/xwords4/dawg/German/Makefile b/xwords4/dawg/German/Makefile index 62ec0be1e..b5e453d5a 100644 --- a/xwords4/dawg/German/Makefile +++ b/xwords4/dawg/German/Makefile @@ -1,5 +1,6 @@ -# -*- mode: makefile; coding: iso-8859-1; -*- -# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. +# -*- mode: makefile; coding: utf-8; -*- +# Copyright 2002 - 2010 by Eric House (xwords@eehouse.org). All +# rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -17,7 +18,7 @@ XWLANG = German LANGCODE = de_DE -ENC = ISO-8859-1 +ENC = UTF-8 TARGET_TYPE ?= WINCE @@ -28,11 +29,11 @@ include ../Makefile.langcommon SOURCEDICT ?= $(XWDICTPATH)/German/HansGerman.dict.gz $(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile - zcat $< | tr [a-z] [A-Z] | \ - LANG=$(LANGCODE):$(ENC) sed -e 's//SS/g' | \ - LANG=$(LANGCODE):$(ENC) grep '[AEIOU]' | \ - LANG=$(LANGCODE):$(ENC) grep '^[A-Z]\+$$' | \ - gzip -c > $@ + zcat $< \ + | tr [a-zäöü] [A-ZÄÖÜ] \ + | sed -e 's/ß/SS/g' \ + | grep '^[A-ZÄÖÜ]*$$' \ + | gzip -c > $@ # Everything but creating of the Main.dict file is inherited from the # "parent" Makefile.langcommon in the parent directory. diff --git a/xwords4/dawg/German/info.txt b/xwords4/dawg/German/info.txt index f6321981d..7c3ee588f 100644 --- a/xwords4/dawg/German/info.txt +++ b/xwords4/dawg/German/info.txt @@ -1,4 +1,6 @@ -# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. +# -*- mode: conf; coding: utf-8; -*- +# Copyright 2002 - 2010 by Eric House (xwords@eehouse.org). All +# rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -15,17 +17,18 @@ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. LANGCODE:de_DE +CHARSET: utf-8 # deal with DOS files LANGFILTER: tr -d '\r' # substitute for sharfes-s -LANGFILTER: | sed -e 's//SS/g' +LANGFILTER: | sed -e 's/ß/SS/g' # uppercase all -LANGFILTER: | tr [a-z] [A-Z] +LANGFILTER: | tr [a-zäöü] [A-ZÄÖÜ] # no words not containing a vowel -LANGFILTER: | grep '[AEIOU]' +LANGFILTER: | grep '[AEIOUÄÖÜ]' # none with illegal chars -LANGFILTER: | grep '^[A-Z]\+$' +LANGFILTER: | grep '^[A-ZÄÖÜ]\+$' # Until I can figure out how to force sort to use a locale's collation # rules we can't trust sort in the filtering rules above and so must @@ -46,9 +49,8 @@ XLOC_HEADER:0x8300 2 0 {"_"} -5 1 'A' -# A mit umlaut -1 6 196 +5 1 'A' +1 6 'Ä' 2 3 'B' 2 4 'C' 4 1 'D' @@ -63,16 +65,14 @@ XLOC_HEADER:0x8300 4 3 'M' 9 1 'N' 3 2 'O' -# O mit umlaut -1 8 214 +1 8 'Ö' 1 4 'P' 1 10 'Q' 6 1 'R' 7 1 'S' 6 1 'T' 6 1 'U' -# U mit umlaut -1 6 220 +1 6 'Ü' 1 6 'V' 1 3 'W' 1 8 'X' diff --git a/xwords4/dawg/Hex/Makefile b/xwords4/dawg/Hex/Makefile index cf710fa59..96d54ff1b 100644 --- a/xwords4/dawg/Hex/Makefile +++ b/xwords4/dawg/Hex/Makefile @@ -1,4 +1,6 @@ -# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. +# -*- mode: makefile; -*- +# Copyright 2002-2009 by Eric House (xwords@eehouse.org). All rights +# reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -16,6 +18,7 @@ XWLANG = Hex LANGCODE = hex +ENC = UTF-8 TARGET_TYPE = WINCE @@ -24,14 +27,15 @@ include ../Makefile.2to8 include ../Makefile.langcommon # Pass in your own dict here by setting DICT -DICT ?= $(XWDICTPATH)/English/SOWPODS_official.txt.gz +DICT ?= $(XWDICTPATH)/English/CSW.dict.gz -# Feel free to base this on whatever dictionary you have at hand. I'm -# using CollegeEng for no particular reason. +# tr 'AE' 'ÄË' doesn't work, so use sed. $(XWLANG)Main.dict.gz: $(DICT) @echo "building $@ from $<" zcat $< | tr [a-f] [A-F] | grep -e '^[A-F]\{2,8\}$$' | \ echo CAFEBABE DEADBEEF $$(cat -) | \ + sed 's/A/Ä/g' | \ + sed 's/E/Ë/g' | \ tr ' ' '\n' | sort | gzip > $@ # Everything but creating of the Main.dict file is inherited from the diff --git a/xwords4/dawg/Hex/info.txt b/xwords4/dawg/Hex/info.txt index fcd4f6baf..526041cf1 100755 --- a/xwords4/dawg/Hex/info.txt +++ b/xwords4/dawg/Hex/info.txt @@ -1,4 +1,6 @@ -# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. +# -*- mode: conf; -*- +# Copyright 2002-2009 by Eric House (xwords@eehouse.org). All rights +# reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -16,14 +18,14 @@ LANGCODE:HEX - - # uppercase all LANGFILTER: tr [a-f] [A-F] LANGFILTER: | grep '^[A-F]*$' +LANGFILTER: | sed 's/A/Ä/' +LANGFILTER: | sed 's/E/Ë/' LANGFILTER: | sort -u -D2DARGS: -nosort -term 10 +D2DARGS: -term 10 LANGINFO:

The hex "language" is something of a programmers' joke. LANGINFO: Hex is short for hexadecimal, a 16-base number system whose @@ -41,8 +43,6 @@ LANGINFO: tiles and games play quickly. That's also why the Hex LANGINFO: tile set has four blanks; that's the largest number LANGINFO: Crosswords supports and I needed to test at the limit.

- - # High bit means "official". Next 7 bits are an enum where Hex==127 # (I just made that up; not sure what it was originally.) Low byte is # padding @@ -51,11 +51,11 @@ XLOC_HEADER:0xFF00 4 0 {"_"} -9 1 'A' +9 1 'Ä' 2 3 'B' 2 3 'C' 4 2 'D' -12 1 'E' +12 1 'Ë' 2 4 'F' # should ignore all after the above diff --git a/xwords4/dawg/Makefile.langcommon b/xwords4/dawg/Makefile.langcommon index a0206cc72..e5b5b64d8 100644 --- a/xwords4/dawg/Makefile.langcommon +++ b/xwords4/dawg/Makefile.langcommon @@ -204,16 +204,6 @@ endif frankspecials.bin: ../frank_mkspecials.pl $(BMPFILES) $< $(BLANK_INFO) $(LANG_SPECIAL_INFO) > $@ -# a binary file (one byte) giving the number of tiles in the dict -charcount.bin: table.bin -ifdef NEWDAWG - siz=$$(ls -l $< | awk '{print $$5}'); \ - perl -e "print pack(\"c\",$$siz/2)" > $@ -else - siz=$$(wc -c $< | sed -e 's/$ $@ -endif - $(XWLANG)%.$(FRANK_EXT): dawg$(XWLANG)%.stamp $(XWLANG)%_flags.bin charcount.bin table.bin values.bin frankspecials.bin cat $(XWLANG)$*_flags.bin charcount.bin table.bin values.bin \ frankspecials.bin $(XWLANG)StartLoc.bin \ @@ -233,9 +223,9 @@ $(XWLANG)%.$(FRANK_EXT): dawg$(XWLANG)%.stamp $(XWLANG)%_flags.bin charcount.bin $(XWLANG)%_flags.bin: dawg$(XWLANG)%.stamp ifdef NEWDAWG if [ 3 = $$(cat $(XWLANG)$*_nodesize.bin) ] ; \ - then perl -e "print pack(\"n\",0x0002)" > $@; echo "flags=2"; \ + then perl -e "print pack(\"n\",0x0004)" > $@; echo "flags=4"; \ elif [ 4 = $$(cat $(XWLANG)$*_nodesize.bin) ] ; \ - then perl -e "print pack(\"n\",0x0003)" > $@; echo "flags=3"; \ + then perl -e "print pack(\"n\",0x0005)" > $@; echo "flags=5"; \ elif true; \ then echo "Unexpected node size"; exit 1; \ fi @@ -272,8 +262,19 @@ else perl -I../ ../xloc.pl -t -out $@ endif -values.bin: ../xloc.pl - perl -I../ ../xloc.pl -v -out $@ $(ENCP) +values.bin: ../xloc.pl + perl -I../ ../xloc.pl -v -out $@ + +# a binary file, two bytes, one giving the size of tiles data and the +# other the number of tiles in the dict. Tiles data is utf-8 and so +# number is not derivable from size. +charcount.bin: table.bin ../xloc.pl + SIZ=$$(ls -l $< | awk '{print $$5}'); \ + perl -e "print pack(\"c\",$$SIZ)" > $@ + TMP=/tmp/tmp$$$$; \ + perl -I../ ../xloc.pl -s -out $$TMP; \ + cat $$TMP >> $@; \ + rm -f $$TMP %.dict: %.dict.gz zcat $< > $@ diff --git a/xwords4/dawg/Polish/Makefile b/xwords4/dawg/Polish/Makefile index bee788e10..3254bfefb 100644 --- a/xwords4/dawg/Polish/Makefile +++ b/xwords4/dawg/Polish/Makefile @@ -1,4 +1,4 @@ -# -*- coding: iso-8859-2; mode: Makefile; -*- +# -*- mode: Makefile; -*- # Copyright 2002 - 2009 by Eric House (xwords@eehouse.org). All # rights reserved. # @@ -16,9 +16,9 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -XWLANG=Polish -LANGCODE=pl_PL -ENC = ISO-8859-2 +XWLANG = Polish +LANGCODE = pl_PL +ENC = UTF-8 # DICT2DAWGARGS = -lang $(LANGCODE) # DICT2DAWGARGS = -debug @@ -29,12 +29,12 @@ include ../Makefile.2to8 include ../Makefile.langcommon -SOURCEDICT ?= $(XWDICTPATH)/Polish/iso-8859-2/slowa.txt.gz +SOURCEDICT ?= $(XWDICTPATH)/Polish/slowa.txt.gz $(XWLANG)Main.dict.gz: $(SOURCEDICT) zcat $< | tr -d '\r' \ - | LANG=$(LANGCODE):$(ENC) tr [abcdefghijklmnoprstuwyz] [ABCDEFGHIJKLMNOPRSTUWYZ] \ - | LANG=$(LANGCODE):$(ENC) grep '^[ABCDEFGHIJKLMNOPRSTUWYZ]*$$' \ + | tr [aąbcćdeęfghijklłmnńoóprsśtuwyzźż] [AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ] \ + | grep '^[AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ]*$$' \ | gzip > $@ # Everything but creating of the Main.dict file is inherited from the diff --git a/xwords4/dawg/Polish/info.txt b/xwords4/dawg/Polish/info.txt index 657006233..35eed9ac0 100644 --- a/xwords4/dawg/Polish/info.txt +++ b/xwords4/dawg/Polish/info.txt @@ -1,4 +1,4 @@ -# -*- coding: iso-8859-2; mode: conf; -*- +# -*- mode: conf; -*- # Copyright 2002-2009 by Eric House (xwords@eehouse.org). All rights # reserved. # @@ -17,12 +17,12 @@ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. LANGCODE:pl_PL -CHARSET:iso-8859-2 +CHARSET:utf-8 # deal with DOS files LANGFILTER: tr -d '\r' -LANGFILTER: | tr [a-pr-uwyz󶼿] [A-PR-UWYZʣӦ] -LANGFILTER: | grep '^[A-PR-UWYZʣӦ]*$' +LANGFILTER: | tr [a-pr-uwyząćęłńóśźż] [A-PR-UWYZĄĆĘŁŃÓŚŹŻ] +LANGFILTER: | grep '^[A-PR-UWYZĄĆĘŁŃÓŚŹŻ]*$' LANGFILTER: | tr '\n' '\000' D2DARGS: -r -term 0 @@ -42,8 +42,8 @@ LANGINFO: this working.

LANGINFO:

Note that the blank is the last tile here, while with all LANGINFO: other languages it's the first.

-LANGINFO:

Also, please note that we currently require the files you -LANGINFO: upload to use the iso-8859-2 character encoding.

+# LANGINFO:

Also, please note that we currently require the files you +# LANGINFO: upload to use the iso-8859-2 character encoding.

# High bit means "official". Next 7 bits are an enum where # Polish==8. Low byte is padding @@ -51,13 +51,13 @@ XLOC_HEADER:0x8800 9 1 'A' -1 5 161 # '' +1 5 'Ą' 2 3 'B' 3 2 'C' -1 6 198 # '' +1 6 'Ć' 3 2 'D' 7 1 'E' -1 5 202 # '' +1 5 'Ę' 1 5 'F' 2 3 'G' 2 3 'H' @@ -65,23 +65,23 @@ XLOC_HEADER:0x8800 2 3 'J' 3 3 'K' 3 2 'L' -2 3 163 # '' +2 3 'Ł' 3 2 'M' 5 1 'N' -1 7 209 # '' +1 7 'Ń' 6 1 'O' -1 5 211 # '' +1 5 'Ó' 3 2 'P' 4 1 'R' 4 1 'S' -1 5 166 # '' +1 5 'Ś' 3 2 'T' 2 3 'U' 4 1 'W' 4 2 'Y' 5 1 'Z' -1 9 172 # '' -1 5 175 # '' +1 9 'Ź' +1 5 'Ż' # the blank *must* be last here!!! 2 0 {"_"} diff --git a/xwords4/dawg/Spanish/Makefile b/xwords4/dawg/Spanish/Makefile index 591a150c1..455a8c825 100644 --- a/xwords4/dawg/Spanish/Makefile +++ b/xwords4/dawg/Spanish/Makefile @@ -1,4 +1,4 @@ -# -*-mode: Makefile; compile-command: "make all"; coding: iso-8859-1; -*- +# -*-mode: Makefile; compile-command: "make all"; coding: utf-8; -*- # Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. # # This program is free software; you can redistribute it and/or @@ -18,7 +18,7 @@ XWLANG = SpanishFAA41 LANGCODE = es_ES TARGET_TYPE ?= WINCE -ENC = ISO-8859-1 +ENC = UTF-8 ifeq ($(TARGET_TYPE),PALM) PBITMS = ./bmps/palm @@ -44,14 +44,13 @@ include ../Makefile.langcommon #$(LANG)Main.dict.gz: SpanishMain.dict.gz # ln -s $< $@ -SOURCEDICT ?= $(XWDICTPATH)/Spanish/FAA_4.1.txt.gz +SOURCEDICT ?= $(XWDICTPATH)/Spanish/FAA_4.1.utf8.gz $(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile zcat $< \ | tr -d '\r' \ - | tr '\207\216\222\227\234\237\226' 'aeiouu\321' \ - | tr [a-z] [A-Z] \ - | LANG=$(LANGCODE):$(ENC) grep '^[A-JL-VX-Z]*$$' \ + | tr [a-zñ] [A-ZÑ] \ + | LANG=$(LANGCODE):$(ENC) grep '^[A-JL-VX-ZÑ]*$$' \ | sed 's/CH/1/g' \ | sed 's/LL/2/g' \ | sed 's/RR/3/g' \ diff --git a/xwords4/dawg/Spanish/info.txt b/xwords4/dawg/Spanish/info.txt index 7a086e2b4..03a4ec1ff 100644 --- a/xwords4/dawg/Spanish/info.txt +++ b/xwords4/dawg/Spanish/info.txt @@ -1,4 +1,4 @@ -# -*- mode: conf; coding: iso-8859-1; -*- +# -*- mode: conf; coding: utf-8; -*- # Copyright 2002-2006 by Eric House (xwords@eehouse.org). All rights # reserved. # @@ -20,6 +20,7 @@ # below NEEDSSORT:true +CHARSET: utf-8 # MSDos LF chars go bye-bye LANGFILTER: tr -d '\r' @@ -27,9 +28,9 @@ LANGFILTER: tr -d '\r' # convert accented vowels LANGFILTER: | tr '\207\216\222\227\234\237\226' 'aeiouu\321' # uppercase -LANGFILTER: | tr [a-z] [A-Z] +LANGFILTER: | tr [a-zñ] [A-ZÑ] # remove words with illegal letters -LANGFILTER: | grep '^[[A-JL-VX-Z]*$' +LANGFILTER: | grep '^[[A-JL-VX-ZÑ]*$' # substitute pairs (can't figure out how to use octal values) LANGFILTER: | sed 's/CH/1/g' LANGFILTER: | sed 's/LL/2/g' @@ -43,7 +44,7 @@ LANGFILTER: | sort -u -z D2DARGS: -r -term 0 LANGINFO:

Spanish words include all letters in the English alphabet -LANGINFO: except "K" and "W", and with "" added. Since there are no +LANGINFO: except "K" and "W", and with "Ñ" added. Since there are no LANGINFO: tiles for accented vowels, these are replaced by the LANGINFO: unaccented forms.

@@ -92,8 +93,7 @@ XLOC_HEADER:0x8600 1 8 {"LL", true, true} 2 3 'M' 5 1 'N' -# /*'N~'*/ -1 8 209 +1 8 'Ñ' 9 1 'O' 2 3 'P' 1 5 'Q' diff --git a/xwords4/dawg/Swedish/Makefile b/xwords4/dawg/Swedish/Makefile index d830ba744..56f363240 100644 --- a/xwords4/dawg/Swedish/Makefile +++ b/xwords4/dawg/Swedish/Makefile @@ -1,4 +1,4 @@ -# -*-mode: Makefile; coding: iso-8859-1; -*- +# -*-mode: Makefile; coding: utf-8; -*- # Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. # # This program is free software; you can redistribute it and/or @@ -17,7 +17,7 @@ XWLANG=Swedish LANGCODE=sv_SE -ENC = ISO-8859-1 +ENC = UTF-8 # Swedish has too many chars for the old format. NEWDAWG=whatever @@ -28,14 +28,14 @@ include ../Makefile.2to8 include ../Makefile.langcommon -SOURCEDICT ?= $(XWDICTPATH)/Swedish/swedish15.dict.gz +SOURCEDICT ?= $(XWDICTPATH)/Swedish/swedish15.utf8.gz # Q and W are not available as tiles, but I'm told there's a custom in # Swedish play of allowing blanks to stand for those letters as well. # So we don't exclude words with those letters from the dictionary. $(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile - zcat $< | tr [a-z] [A-Z] | \ - LANG=$(LANGCODE):$(ENC) grep '^[A-Z]\{2,15\}$$' | \ + zcat $< | tr [a-zäåæöü] [A-ZÄÅÆÖÜ] | \ + LANG=$(LANGCODE):$(ENC) grep '^[A-ZÄÅÆÖÜ]\{2,15\}$$' | \ gzip -c > $@ # Everything but creating of the Main.dict file is inherited from the diff --git a/xwords4/dawg/Swedish/info.txt b/xwords4/dawg/Swedish/info.txt index 40dde73ef..7f95ee3c6 100644 --- a/xwords4/dawg/Swedish/info.txt +++ b/xwords4/dawg/Swedish/info.txt @@ -1,4 +1,4 @@ -# -*- mode: conf; coding: iso-8859-1; -*- +# -*- mode: conf; coding: utf-8; -*- # Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. # # This program is free software; you can redistribute it and/or @@ -15,16 +15,17 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +CHARSET: utf-8 LANGCODE:sv_SE LANGFILTER: tr -d '\r' -LANGFILTER: | tr [a-z] [A-Z] -LANGFILTER: | grep '^[A-Z]*$' +LANGFILTER: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ] +LANGFILTER: | grep '^[A-ZÄÅÆÖÜ]*$' D2DARGS: -r -term 10 LANGINFO:

From an English-speaker's perspective, Swedish drops Q -LANGINFO: and W, and adds , , , and .

+LANGINFO: and W, and adds Ä, Å, Æ, Ö and Ü.

# High bit means "official". Next 7 bits are an enum where # Swedish==7. Low byte is padding @@ -36,11 +37,11 @@ XLOC_HEADER:0x8700 2 0 {"_"} 8 1 'A' # A with two dots -2 3 '' +2 3 'Ä' # A with circle -2 4 '' -# tile only available for blanks -0 1 '' +2 4 'Å' +# Æ tile only available for blanks +0 1 'Æ' 2 4 'B' 1 8 'C' 5 1 'D' @@ -56,7 +57,7 @@ XLOC_HEADER:0x8700 6 1 'N' 5 2 'O' # O with two dots -2 4 '' +2 4 'Ö' 2 4 'P' # Q tile only available for blanks 0 1 'Q' @@ -64,13 +65,12 @@ XLOC_HEADER:0x8700 8 1 'S' 8 1 'T' 3 4 'U' -# tile only available for blanks -0 1 '' +# Ü tile only available for blanks +0 1 'Ü' 2 3 'V' # W tile only available for blanks 0 1 'W' 1 8 'X' 1 7 'Y' 1 10 'Z' - diff --git a/xwords4/dawg/dawg2dict.pl b/xwords4/dawg/dawg2dict.pl index b4565cd34..70d93343a 100755 --- a/xwords4/dawg/dawg2dict.pl +++ b/xwords4/dawg/dawg2dict.pl @@ -1,6 +1,6 @@ -#!/usr/bin/perl +#!/usr/bin/perl -CS # -# Copyright 2004 by Eric House (xwords@eehouse.org) +# Copyright 2004 - 2009 by Eric House (xwords@eehouse.org) # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -22,9 +22,12 @@ use strict; use Fcntl; +use Encode 'from_to'; +use Encode; my $gInFile; my $gDoRaw = 0; +my $gDoJSON = 0; my $gFileType; my $gNodeSize; @@ -33,7 +36,7 @@ sub systell { sysseek($_[0], 0, SEEK_CUR) } sub usage() { print STDERR "USAGE: $0 " - . "[-raw] " + . "[-raw | -json] " . "-dict " . "\n" . "\t(Takes a .pdb or .xwd and prints its words to stdout)\n"; @@ -45,6 +48,8 @@ sub parseARGV() { while ( my $parm = shift(@ARGV) ) { if ( $parm eq "-raw" ) { $gDoRaw = 1; + } elsif ( $parm eq "-json" ) { + $gDoJSON = 1; } elsif ( $parm eq "-dict" ) { $gInFile = shift(@ARGV); } else { @@ -72,18 +77,32 @@ sub countSpecials($) { sub readXWDFaces($$$) { my ( $fh, $facRef, $nSpecials ) = @_; - my $buf; - my $nRead = sysread( $fh, $buf, 1 ); - my $nChars = unpack( 'c', $buf ); + my ( $buf, $nRead, $nChars, $nBytes ); + $nRead = sysread( $fh, $buf, 1 ); + $nBytes = unpack( 'c', $buf ); + printf STDERR "nBytes of faces: %d\n", $nBytes; + $nRead = sysread( $fh, $buf, 1 ); + $nChars = unpack( 'c', $buf ); + printf STDERR "nChars of faces: %d\n", $nChars; + binmode( $fh, ":encoding(utf8)" ) or die "binmode(:utf-8) failed\n"; + sysread( $fh, $buf, $nChars ); + length($buf) == $nChars or die "didn't read expected number of bytes\n"; + binmode( $fh ) or die "binmode failed\n"; + + print STDERR "string now: $buf\n"; my @faces; - for ( my $i = 0; $i < $nChars; ++$i ) { - my $nRead = sysread( $fh, $buf, 2 ); - push( @faces, chr(unpack( "n", $buf ) ) ); + for ( my $ii = 0; $ii < $nChars; ++$ii ) { + my $chr = substr( $buf, $ii, 1 ); + print STDERR "pushing $chr \n"; + push( @faces, $chr ); } + printf STDERR "at 0x%x after reading faces\n", systell($fh); + ${$nSpecials} = countSpecials( \@faces ); @{$facRef} = @faces; + printf STDERR "readXWDFaces=>%d\n", $nChars; return $nChars; } # readXWDFaces @@ -99,6 +118,7 @@ sub skipBitmap($) { sysread( $fh, $buf, $nBytes ); } + printf STDERR "skipBitmap\n"; } # skipBitmap sub getSpecials($$$) { @@ -138,9 +158,9 @@ sub readNodesToEnd($) { sub nodeSizeFromFlags($) { my ( $flags ) = @_; - if ( $flags == 2 ) { + if ( $flags == 4 ) { return 3; - } elsif ( $flags == 3 ) { + } elsif ( $flags == 5 ) { return 4; } else { die "invalid dict flags $flags"; @@ -161,6 +181,7 @@ sub mergeSpecials($$) { sub prepXWD($$$$) { my ( $fh, $facRef, $nodesRef, $startRef ) = @_; + printf STDERR "at 0x%x at start\n", systell($fh); my $buf; my $nRead = sysread( $fh, $buf, 2 ); my $flags = unpack( "n", $buf ); @@ -170,24 +191,30 @@ sub prepXWD($$$$) { my $nSpecials; my $faceCount = readXWDFaces( $fh, $facRef, \$nSpecials ); + printf STDERR "at 0x%x before header read\n", systell($fh); # skip xloc header $nRead = sysread( $fh, $buf, 2 ); # skip values info. + printf STDERR "at 0x%x before reading %d values\n", systell($fh), $faceCount; sysread( $fh, $buf, $faceCount * 2 ); + printf STDERR "at 0x%x after values read\n", systell($fh); + printf STDERR "at 0x%x before specials read\n", systell($fh); my @specials; getSpecials( $fh, $nSpecials, \@specials ); mergeSpecials( $facRef, \@specials ); + printf STDERR "at 0x%x after specials read\n", systell($fh); -# printf STDERR "at 0x%x before offset read\n", systell($fh); + printf STDERR "at 0x%x before offset read\n", systell($fh); sysread( $fh, $buf, 4 ); $$startRef = unpack( 'N', $buf ); -# print STDERR "startRef=$$startRef\n"; + print STDERR "startRef=$$startRef\n"; my @nodes = readNodesToEnd( $fh ); @$nodesRef = @nodes; + print STDERR "prepXWD done\n"; } # prepXWD sub readPDBSpecials($$$$$) { @@ -342,10 +369,52 @@ sub printNodes($$) { } } +sub printStartJson($) { + my ( $startIndex ) = @_; + printf( " start: 0x%.8x,\n", $startIndex ); +} + +sub printCharsJson($) { + my ( $fr ) = @_; + print " chars: [ "; + foreach my $char (@$fr) { + print "\"$char\", " + } + print "],\n" +} + +sub printNodesJson($) { + my ( $nr ) = @_; + print " dawg: [\n"; + + my $len = @$nr; + my $newLine = 1; + for ( my $ii = 0; $ii < $len; ++$ii ) { + my $node = $$nr[$ii]; + + if ( $newLine ) { + printf( " /*%.6x*/ ", $ii ); + $newLine = 0; + } + + printf "0x%.8x, ", $node; + + my ( $chrIndex, $nextEdge, $accepting, $lastEdge ); + parseNode( $node, \$chrIndex, \$nextEdge, \$accepting, \$lastEdge ); + if ( $lastEdge ) { + print "\n"; + $newLine = 1; + } + } + + print "\n ],\n" +} + ################################################################# # main ################################################################# +binmode( STDERR, ":encoding(utf8)" ) or die "binmode(:utf-8) failed\n"; parseARGV(); @@ -364,9 +433,17 @@ if ( $gFileType eq "xwd" ){ close INFILE; die "no nodes!!!" if 0 == @nodes; + if ( $gDoRaw ) { printNodes( \@nodes, \@faces ); +} elsif ( $gDoJSON ) { + print "dict = {\n"; + printStartJson( $startIndex ); + printCharsJson( \@faces ); + printNodesJson( \@nodes ); + print "}\n"; } else { + binmode( STDOUT, ":encoding(utf8)" ) or die "binmode(:utf-8) failed\n"; printDAWG( [], \@nodes, $startIndex, \@faces ); } diff --git a/xwords4/dawg/dict2dawg.cpp b/xwords4/dawg/dict2dawg.cpp index 873593397..c48f5655e 100644 --- a/xwords4/dawg/dict2dawg.cpp +++ b/xwords4/dawg/dict2dawg.cpp @@ -78,7 +78,7 @@ static void (*gReadWordProc)(void) = NULL; static NodeList gNodes; // final array of nodes static unsigned int gNBytesPerOutfile = 0xFFFFFFFF; static char* gTableFile = NULL; -static bool gIsMultibyte = false; +static bool gIsMultibyte = true; // always true static const char* gEncoding = NULL; static char* gOutFileBase = NULL; static char* gStartNodeOut = NULL; @@ -91,9 +91,9 @@ static const char* gLang = NULL; static char* gBytesPerNodeFile = NULL; // where to write whether node // size 3 or 4 int gWordCount = 0; -std::map gTableHash; +std::map gTableHash; int gBlankIndex; -std::vector gRevMap; +std::vector gRevMap; #ifdef DEBUG bool gDebug = false; #endif @@ -107,17 +107,19 @@ int gLimHigh = MAX_WORD_LEN; // OWL is 1.7M -#define MAX_POOL_SIZE (10 * 0x100000) +#define MAX_POOL_SIZE (10 * 0x100000 * sizeof(wchar_t)) #define ERROR_EXIT(...) error_exit( __LINE__, __VA_ARGS__ ); +#define VSIZE(a) (sizeof(a)/sizeof(a[0])) static char* parseARGV( int argc, char** argv, const char** inFileName ); static void usage( const char* name ); static void error_exit( int line, const char* fmt, ... ); static void makeTableHash( void ); +static void printTableHash( void ); static WordList* parseAndSort( void ); static void printWords( WordList* strings ); static bool firstBeforeSecond( const Letter* lhs, const Letter* rhs ); -static char* tileToAscii( char* out, int outSize, const Letter* in ); +static wchar_t* tilesToText( wchar_t* out, int outLen, const Letter* in ); static int buildNode( int depth ); static void TrieNodeSetIsLastSibling( Node* nodeR, bool isLastSibling ); static int addNodes( NodeList& newedgesR ); @@ -178,6 +180,7 @@ main( int argc, char** argv ) } makeTableHash(); + printTableHash(); // Do I need this stupid thing? Better to move the first row to // the front of the array and patch everything else. Or fix the @@ -451,9 +454,9 @@ readFromSortedArray( void ) } #ifdef DEBUG if ( gDebug ) { - char buf[T2ABUFLEN(MAX_WORD_LEN)]; - fprintf( stderr, "%s: got word: %s\n", __func__, - tileToAscii( buf, sizeof(buf), word ) ); + wchar_t buf[T2ABUFLEN(MAX_WORD_LEN)]; + fprintf( stderr, "%s: got word: %ls\n", __func__, + tilesToText( buf, VSIZE(buf), word ) ); } #endif } @@ -473,13 +476,13 @@ readFromSortedArray( void ) && !firstBeforeSecond( gCurrentWord, word ) ) { #ifdef DEBUG if ( gDebug ) { - char buf1[T2ABUFLEN(MAX_WORD_LEN)]; - char buf2[T2ABUFLEN(MAX_WORD_LEN)]; + wchar_t buf1[T2ABUFLEN(MAX_WORD_LEN)]; + wchar_t buf2[T2ABUFLEN(MAX_WORD_LEN)]; fprintf( stderr, - "%s: words %s and %s are the same or out of order\n", + "%s: words %ls and %ls are the same or out of order\n", __func__, - tileToAscii( buf1, sizeof(buf1), gCurrentWord ), - tileToAscii( buf2, sizeof(buf2), word ) ); + tilesToText( buf1, VSIZE(buf1), gCurrentWord ), + tilesToText( buf2, VSIZE(buf2), word ) ); } #endif continue; @@ -492,9 +495,9 @@ readFromSortedArray( void ) #ifdef DEBUG if ( gDebug ) { - char buf[T2ABUFLEN(MAX_WORD_LEN)]; - fprintf( stderr, "gCurrentWord now %s\n", - tileToAscii( buf, sizeof(buf), gCurrentWord) ); + wchar_t buf[T2ABUFLEN(MAX_WORD_LEN)]; + fprintf( stderr, "gCurrentWord now %ls\n", + tilesToText( buf, VSIZE(buf), gCurrentWord) ); } #endif } // readFromSortedArray @@ -516,6 +519,9 @@ getWideChar( FILE* file ) assert( 0 == ii ); dest = byt; break; + } else if ( byt < ' ' && 0 == ii ) { + dest = byt; + break; } assert( ii < 4 ); @@ -533,7 +539,7 @@ getWideChar( FILE* file ) } // getWideChar static Letter* -readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF ) +readOneWord( Letter* wordBuf, const int bufLen, int* lenp, bool* gotEOF ) { Letter* result = NULL; int count = 0; @@ -545,7 +551,7 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF ) // return it. If no, start over ONLY IF the terminator was not // EOF. for ( ; ; ) { - wchar_t byt = gIsMultibyte? getWideChar( gInFile ) : getc( gInFile ); + wchar_t byt = getWideChar( gInFile ); // EOF is special: we don't try for another word even if // dropWord is true; we must leave now. @@ -557,6 +563,13 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF ) if ( !dropWord && (count >= gLimLow) && (count <= gLimHigh) ) { assert( count < bufLen ); wordBuf[count] = '\0'; +#ifdef DEBUG + if ( gDebug ) { + wchar_t buf[T2ABUFLEN(count)]; + fprintf( stderr, "%s: adding word: %ls\n", + __func__, tilesToText( buf, VSIZE(buf), wordBuf ) ); + } +#endif result = wordBuf; *lenp = count; ++gWordCount; @@ -567,11 +580,12 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF ) } #ifdef DEBUG if ( gDebug ) { - char buf[T2ABUFLEN(count)]; + wchar_t buf[T2ABUFLEN(count)]; wordBuf[count] = '\0'; - fprintf( stderr, "%s: dropping word (len %d>=%d): %s\n", - __func__, count, gLimHigh, - tileToAscii( buf, sizeof(buf), wordBuf ) ); + fprintf( stderr, "%s: dropping word (len %d >%d or <%d or " + "dropWord:%d): %ls\n", __func__, count, gLimHigh, + gLimLow, (int)dropWord, + tilesToText( buf, VSIZE(buf), wordBuf ) ); } #endif count = 0; // we'll start over @@ -579,43 +593,43 @@ readOneWord( Letter* wordBuf, int bufLen, int* lenp, bool* gotEOF ) } else if ( count >= bufLen ) { // Just drop it... + assert(0); // Fix this -- but need to warn when out of + // memory!!! dropWord = true; // Don't call into the hashtable twice here!! - } else if ( gTableHash.find(byt) != gTableHash.end() ) { - assert( count < bufLen ); - wordBuf[count++] = gTableHash[byt]; - if ( count >= bufLen ) { - dropWord = true; - } - } else if ( gKillIfMissing || !dropWord ) { - char buf[T2ABUFLEN(count)]; - wordBuf[count] = '\0'; - - tileToAscii( buf, sizeof(buf), wordBuf ); - - if ( gKillIfMissing ) { - ERROR_EXIT( "chr %lc (%d/0x%x) not in map file %s\n" - "last word was %s\n", - byt, (int)byt, (int)byt, gTableFile, buf ); - } else if ( !dropWord ) { -#ifdef DEBUG - if ( gDebug ) { - fprintf( stderr, "%s: chr %c (%d) not in map file %s\n" - "dropping partial word %s\n", __func__, - (char)byt, (int)byt, gTableFile, buf ); + } else { + std::map::iterator iter = gTableHash.find(byt); + if ( iter != gTableHash.end() ) { + assert( count < bufLen ); + wordBuf[count++] = iter->second; + if ( count >= bufLen ) { + dropWord = true; } + } else if ( gKillIfMissing || !dropWord ) { + wchar_t buf[T2ABUFLEN(count)]; + wordBuf[count] = '\0'; + + tilesToText( buf, VSIZE(buf), wordBuf ); + + if ( gKillIfMissing ) { + ERROR_EXIT( "chr %lc (%d/0x%x) not in map file %s\n" + "last word was %ls\n", + byt, (int)byt, (int)byt, gTableFile, buf ); + } else if ( !dropWord ) { +#ifdef DEBUG + if ( gDebug ) { + fprintf( stderr, "%s: chr %lc (%d) not in map file %s\n" + "dropping partial word %ls\n", __func__, + byt, (int)byt, gTableFile, buf ); + } #endif - dropWord = true; + dropWord = true; + } } } - } + } // for -// if ( NULL != result ) { -// char buf[T2ABUFLEN(MAX_WORD_LEN)]; -// fprintf( stderr, "%s returning %s\n", __func__, -// tileToAscii( buf, sizeof(buf), result ) ); -// } return result; } // readOneWord @@ -635,7 +649,7 @@ readFromFile( void ) // during the sort. This seems easier. for ( ; ; ) { if ( !gDone ) { - word = readOneWord( wordBuf, sizeof(wordBuf), &len, &s_eof ); + word = readOneWord( wordBuf, VSIZE(wordBuf), &len, &s_eof ); gDone = NULL == word; } if ( gDone ) { @@ -658,13 +672,13 @@ readFromFile( void ) && !firstBeforeSecond( gCurrentWord, word ) ) { #ifdef DEBUG if ( gDebug ) { - char buf1[T2ABUFLEN(MAX_WORD_LEN)]; - char buf2[T2ABUFLEN(MAX_WORD_LEN)]; + wchar_t buf1[T2ABUFLEN(MAX_WORD_LEN)]; + wchar_t buf2[T2ABUFLEN(MAX_WORD_LEN)]; fprintf( stderr, - "%s: words %s and %s are the smae or out of order\n", + "%s: words %ls and %ls are the smae or out of order\n", __func__, - tileToAscii( buf1, sizeof(buf1), gCurrentWord ), - tileToAscii( buf2, sizeof(buf2), word ) ); + tilesToText( buf1, VSIZE(buf1), gCurrentWord ), + tilesToText( buf2, VSIZE(buf2), word ) ); } #endif continue; @@ -676,9 +690,9 @@ readFromFile( void ) #ifdef DEBUG if ( gDebug ) { - char buf[T2ABUFLEN(MAX_WORD_LEN)]; - fprintf( stderr, "gCurrentWord now %s\n", - tileToAscii( buf, sizeof(buf), gCurrentWord) ); + wchar_t buf[T2ABUFLEN(MAX_WORD_LEN)]; + fprintf( stderr, "gCurrentWord now %ls\n", + tilesToText( buf, VSIZE(buf), gCurrentWord) ); } #endif } // readFromFile @@ -690,14 +704,15 @@ firstBeforeSecond( const Letter* lhs, const Letter* rhs ) return gt; } -static char* -tileToAscii( char* out, int outSize, const Letter* in ) +static wchar_t* +tilesToText( wchar_t* out, int outSize, const Letter* in ) { - char tiles[outSize]; - int tilesLen = 1; - tiles[0] = '['; + wchar_t tiles[outSize]; + wchar_t* orig = out; + int tilesLen = 0; + + tiles[tilesLen++] = L'['; - char* orig = out; for ( ; ; ) { Letter ch = *in++; if ( '\0' == ch ) { @@ -705,14 +720,15 @@ tileToAscii( char* out, int outSize, const Letter* in ) } assert( ch < gRevMap.size() ); *out++ = gRevMap[ch]; - tilesLen += sprintf( &tiles[tilesLen], "%d,", ch ); + + tilesLen += swprintf( &tiles[tilesLen], outSize-tilesLen, L"%d,", ch ); assert( (out - orig) < outSize ); } assert( tilesLen+1 < outSize ); - tiles[tilesLen] = ']'; - tiles[tilesLen+1] = '\0'; - strcpy( out, tiles ); + tiles[tilesLen] = L']'; + tiles[tilesLen+1] = L'\0'; + wcscpy( out, tiles ); return orig; } @@ -777,9 +793,9 @@ printWords( WordList* strings ) { std::vector::iterator iter = strings->begin(); while ( iter != strings->end() ) { - char buf[T2ABUFLEN(MAX_WORD_LEN)]; - tileToAscii( buf, sizeof(buf), *iter ); - fprintf( stderr, "%s\n", buf ); + wchar_t buf[T2ABUFLEN(MAX_WORD_LEN)]; + tilesToText( buf, VSIZE(buf), *iter ); + fprintf( stderr, "%ls\n", buf ); ++iter; } } @@ -906,18 +922,12 @@ makeTableHash( void ) gRevMap.push_back(0); for ( ii = 0; ; ++ii ) { - int ch = getc(TABLEFILE); - if ( ch == EOF ) { - break; - } - - if ( gUseUnicode ) { // skip the first byte each time: tmp HACK!!! - ch = getc(TABLEFILE); - } - if ( ch == EOF ) { + wchar_t ch = getWideChar( TABLEFILE ); + if ( EOF == ch ) { break; } + fprintf( stderr, "adding %lc/%x\n", ch, ch ); gRevMap.push_back(ch); if ( ch == 0 ) { // blank @@ -940,6 +950,26 @@ makeTableHash( void ) fclose( TABLEFILE ); } // makeTableHash +static void +printTableHash( void ) +{ + if ( gDebug ) { + std::vector::iterator iter = gRevMap.begin(); + int count = 0; // 0th entry is 0 + while ( iter != gRevMap.end() ) { + wchar_t ch = *iter; + if ( 0 != ch ) { + fprintf( stderr, "%s: gRevMap[%d]: %lc\n", __func__, count, ch ); + fprintf( stderr, "%s: gTableHash[%lc]: %d\n", __func__, ch, + gTableHash[ch] ); + assert( gTableHash[ch] == count ); + } + ++iter; + ++count; + } + } +} + // emitNodes. "input" is $gNodes. From it we write up to // $nBytesPerOutfile to files named $outFileBase0..n, mapping the // letter field down to 5 bits with a hash built from $tableFile. If @@ -1065,6 +1095,9 @@ outputNode( Node node, int nBytes, FILE* outfile ) unsigned int fco = TrieNodeGetFirstChildOffset(node); unsigned int fourthByte = 0; + assert( ((3 == nBytes) && (fco < (1<<17))) + || ((4 == nBytes) && (fco < (1<<24))) ); + if ( nBytes == 4 ) { fourthByte = fco >> 16; if ( fourthByte > 0xFF ) { @@ -1085,7 +1118,7 @@ outputNode( Node node, int nBytes, FILE* outfile ) // | | | // accepting bit ---+ | | // last edge bit ------+ | - // ---- last bit (17th on next node addr)---------+ + // ---- last bit (17th of next node addr)---------+ // The four-byte format adds a byte at the right end for // addressing, but removes the extra bit (5) in order to let the @@ -1247,13 +1280,13 @@ parseARGV( int argc, char** argv, const char** inFileName ) if ( !!enc ) { if ( !strcasecmp( enc, "UTF-8" ) ) { - gIsMultibyte = true; +// gIsMultibyte = true; } else if ( !strcasecmp( enc, "iso-8859-1" ) ) { - gIsMultibyte = false; +// gIsMultibyte = false; } else if ( !strcasecmp( enc, "iso-latin-1" ) ) { - gIsMultibyte = false; +// gIsMultibyte = false; } else if ( !strcasecmp( enc, "ISO-8859-2" ) ) { - gIsMultibyte = false; +// gIsMultibyte = false; } else { ERROR_EXIT( "%s: unknown encoding %s", __func__, enc ); } diff --git a/xwords4/dawg/dictstats.pl b/xwords4/dawg/dictstats.pl index 567a77f16..8465ca8df 100755 --- a/xwords4/dawg/dictstats.pl +++ b/xwords4/dawg/dictstats.pl @@ -15,9 +15,15 @@ use strict; my @wordSizeCounts; -my @letterCounts; +my %letterCounts; my $wordCount; my $letterCount; +my $enc = "utf8"; # this could be a cmdline arg.... + +if ( $enc ) { + binmode( STDOUT, ":encoding($enc)" ) ; + binmode( STDIN, ":encoding($enc)" ) ; +} while (<>) { @@ -27,10 +33,10 @@ while (<>) { ++$wordCount; foreach my $letter (split( / */ ) ) { - my $i = ord($letter); + my $ii = ord($letter); # special-case the bogus chars we add for "specials" - die "$0: this is a letter?: $i" if $i <= 32 && $i >= 4 && $i != 0; - ++$letterCounts[$i]; + die "$0: this is a letter?: $ii" if $ii <= 32 && $ii >= 4 && $ii != 0; + ++$letterCounts{$letter}; ++$letterCount; } } @@ -54,14 +60,12 @@ for ( my $i = 1 ; $i <= 99; ++$i ) { print "\n\n**** Letter counts ****\n"; print " ASCII ORD HEX PCT (of $letterCount)\n"; my $lineNo = 1; -for ( my $i = 0; $i < 255; ++$i ) { - my $count = $letterCounts[$i]; - if ( $count > 0 ) { - my $pct = (100.00 * $count) / $letterCount; - printf( "%2d: %3s %3d %x %5.2f (%d)\n", - $lineNo, chr($i), $i, $i, $pct, $count ); - ++$lineNo; - } +foreach my $key (sort keys %letterCounts) { + my $count = $letterCounts{$key}; + my $pct = (100.00 * $count) / $letterCount; + printf( "%2d: %3s %3d %x %5.2f (%d)\n", + $lineNo, $key, ord($key), ord($key), $pct, $count ); + ++$lineNo; } print "\n"; diff --git a/xwords4/dawg/xloc.pl b/xwords4/dawg/xloc.pl index 20b72fcc9..23ef0ca43 100755 --- a/xwords4/dawg/xloc.pl +++ b/xwords4/dawg/xloc.pl @@ -23,6 +23,7 @@ use xloc; my $unicode = -1; my $doval = 0; +my $dosize = 0; my $enc; my $outfile; @@ -37,6 +38,8 @@ while ( $arg = $ARGV[0] ) { $unicode = 0; } elsif ( $arg eq "-v" ) { $doval = 1; + } elsif ( $arg eq "-s" ) { + $dosize = 1; } elsif ( $arg eq '-out' ) { $outfile = $ARGV[1]; shift @ARGV; @@ -52,12 +55,20 @@ die "info file $infoFile not found\n" if ! -s $infoFile; my $xlocToken = xloc::ParseTileInfo($infoFile, $enc); -open OUTFILE, "> $outfile"; +if ( $enc ) { + open OUTFILE, ">:encoding($enc)", "$outfile" + or die "couldn't open $outfile"; +} else { + open OUTFILE, ">$outfile" or die "couldn't open $outfile"; +} # For f*cking windoze linefeeds -binmode( OUTFILE ); +# binmode( OUTFILE ); if ( $unicode ne -1 ) { xloc::WriteMapFile( $xlocToken, $unicode, \*OUTFILE ); +} elsif ( $dosize ) { + my $count = xloc::GetNTiles( $xlocToken ); + print OUTFILE pack("c", $count ); } elsif ( $doval ) { xloc::WriteValuesFile( $xlocToken, \*OUTFILE ); } diff --git a/xwords4/dawg/xloc.pm b/xwords4/dawg/xloc.pm index 741968e76..6e25fa138 100644 --- a/xwords4/dawg/xloc.pm +++ b/xwords4/dawg/xloc.pm @@ -103,13 +103,6 @@ sub GetValue($$) { sub WriteMapFile($$$) { my ( $hashR, $unicode, $fhr ) = @_; - my $packStr; - if ( $unicode ) { - $packStr = "n"; - } else { - $packStr = "C"; - } - my $count = GetNTiles($hashR); my $specialCount = 0; for ( my $i = 0; $i < $count; ++$i ) { @@ -117,11 +110,12 @@ sub WriteMapFile($$$) { my $str = ${$tileR}[2]; if ( $str =~ /\'(.)\'/ ) { - print $fhr pack($packStr, ord($1) ); + print $fhr pack( "U", ord($1) ); +# printf STDERR "ord: %x ($1)\n", ord($1); } elsif ( $str =~ /\"(.+)\"/ ) { - print $fhr pack($packStr, $specialCount++ ); + print $fhr pack( "c", $specialCount++ ); } elsif ( $str =~ /(\d+)/ ) { - print $fhr pack( $packStr, $1 ); + print $fhr pack( "n", $1 ); } else { die "WriteMapFile: unrecognized face format $str, elem $i"; }