Switch to UTF-8, adding an iconv call to translate the wordlists.

2024-12-30 10:26:58 +01:00 · 2010-12-17 17:37:57 -08:00 · 2010-12-17 17:37:57 -08:00 · 18f8b0d4e4
commit 18f8b0d4e4
parent 32fccca995
3 changed files with 18 additions and 13 deletions
--- a/xwords4/dawg/Portuguese/Makefile.BrOffice
+++ b/xwords4/dawg/Portuguese/Makefile.BrOffice
@@ -1,4 +1,4 @@
-# -*- mode: makefile; coding: iso-8859-1 -*-
+# -*- mode: Makefile; coding: utf-8; -*-
 # Copyright 2002, 2006 by Eric House (xwords@eehouse.org).  All rights
 # reserved.
 #
@@ -18,7 +18,7 @@
 XWLANG = PortugueseBR
 LANGCODE = pt_PT
-ENC = ISO-8859-1
+ENC = UTF-8
 TARGET_TYPE ?= WINCE
@@ -29,8 +29,10 @@ include ../Makefile.langcommon
 SOURCEDICT ?= $(XWDICTPATH)/Portuguese/portugueseBR.txt.gz
 $(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile.BrOffice
-	zcat $< | tr [a-zç] [A-ZÇ] | \
+	zcat $< | \
-		LANG=$(LANGCODE):$(ENC) grep '^[A-JL-VXZÇ]\+$$' | \
+		iconv -f iso88591 -t utf8 | \
 		sed 's,.,\U\0,g' | \
 		grep '^[ABCÇDEFGHIJLMNOPQRSTUVXZ]\+$$' | \
 		gzip -c > $@
 # Everything but creating of the Main.dict file is inherited from the
--- a/xwords4/dawg/Portuguese/Makefile.Minho
+++ b/xwords4/dawg/Portuguese/Makefile.Minho
@@ -1,4 +1,4 @@
-# -*- mode: makefile; coding: iso-8859-1 -*-
+# -*- mode: makefile; coding: utf-8 -*-
 #
 # Copyright 2002, 2006 by Eric House (xwords@eehouse.org).  All rights
 # reserved.
@@ -19,7 +19,7 @@
 XWLANG=PortuguesePT
 LANGCODE = pt_PT
-ENC = ISO-8859-1
+ENC = UTF-8
 TARGET_TYPE ?= WINCE
@@ -30,8 +30,10 @@ include ../Makefile.langcommon
 SOURCEDICT ?= $(XWDICTPATH)/Portuguese/portuguese_pt.bz2
 $(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile.Minho
-	bzcat $< | tr [a-zç] [A-ZÇ] | \
+	bzcat $< | \
-		LANG=$(LANGCODE):$(ENC) grep '^[A-JL-VXZÇ]\+$$' | \
+		iconv -f iso88591 -t utf8 | \
 		sed 's,.,\U\0,g' | \
 		grep '^[ABCÇDEFGHIJLMNOPQRSTUVXZ]\+$$' | \
 		gzip -c > $@
 # Everything but creating of the Main.dict file is inherited from the
--- a/xwords4/dawg/Portuguese/info.txt
+++ b/xwords4/dawg/Portuguese/info.txt
@@ -15,15 +15,16 @@
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 LANGCODE:pt_PT
 CHARSET: utf-8
 # deal with DOS files
 LANGFILTER: tr -d '\r'
 # uppercase all
-LANGFILTER: | tr [a-zç] [A-ZÇ]
+LANGFILTER: | tr [a-zç] [A-ZÇ]
 # no words not containing a vowel
 LANGFILTER: | grep '[AEIOU]' 
 # none with illegal chars
-LANGFILTER: | grep '^[A-JL-VXZÇ]\+$'
+LANGFILTER: | grep '^[A-JL-VXZÇ]\+$'
 # Until I can figure out how to force sort to use a locale's collation
 # rules we can't trust sort in the filtering rules above and so must
@@ -31,8 +32,8 @@ LANGFILTER: | grep '^[A-JL-VXZÇ]\+$'
 D2DARGS: -r -term 10
-LANGINFO: <p>Portugese uses the letter A-Z, excluding K, W and Y, and adds
+LANGINFO: <p>Portuguese uses the letter A-Z, excluding K, W and Y, and adds
-LANGINFO: Ç.  Words containing any other letters are dropped. </p>
+LANGINFO: Ç.  Words containing any other letters are dropped. </p>
 # High bit means "official".  Next 7 bits are an enum where
 # Portuguese==D.  Low byte is padding
@@ -44,7 +45,7 @@ XLOC_HEADER:0x8D00
 14          1       'A'
 3           3       'B'
 4           2       'C'
-2           3       'Ç'
+2           3       'Ç'
 5           2       'D'
 11          1       'E'
 2           4       'F'