switch to utf-8, adding an iconv call to translate the wordlists.

This commit is contained in:
Andy2 2010-12-17 17:37:57 -08:00
parent 32fccca995
commit 18f8b0d4e4
3 changed files with 18 additions and 13 deletions

View file

@ -1,4 +1,4 @@
# -*- mode: makefile; coding: iso-8859-1 -*-
# -*- mode: Makefile; coding: utf-8; -*-
# Copyright 2002, 2006 by Eric House (xwords@eehouse.org). All rights
# reserved.
#
@ -18,7 +18,7 @@
XWLANG = PortugueseBR
LANGCODE = pt_PT
ENC = ISO-8859-1
ENC = UTF-8
TARGET_TYPE ?= WINCE
@ -29,8 +29,10 @@ include ../Makefile.langcommon
SOURCEDICT ?= $(XWDICTPATH)/Portuguese/portugueseBR.txt.gz
# Build the main word list for this language: decompress the source
# dictionary ($<) with zcat, uppercase every word, keep only the lines made
# up entirely of the letters A-J, L-V, X, Z and Ç, and gzip the survivors
# into $@. The iconv stage re-encodes the ISO-8859-1 source as UTF-8 before
# the uppercasing and filtering stages run.
# Makefile.BrOffice is listed as a prerequisite, so editing this makefile
# also triggers a rebuild of the word list.
# `$$` is Make's escape for a literal `$`, i.e. the regex end-of-line anchor.
# NOTE(review): `\U` in the sed replacement is a GNU sed extension, and its
# handling of ç presumably depends on a UTF-8 locale being active — confirm.
$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile.BrOffice
zcat $< | tr [a-zç] [A-ZÇ] | \
LANG=$(LANGCODE):$(ENC) grep '^[A-JL-VXZÇ]\+$$' | \
zcat $< | \
iconv -f iso88591 -t utf8 | \
sed 's,.,\U\0,g' | \
grep '^[ABCÇDEFGHIJLMNOPQRSTUVXZ]\+$$' | \
gzip -c > $@
# Everything but the creation of the Main.dict file is inherited from the

View file

@ -1,4 +1,4 @@
# -*- mode: makefile; coding: iso-8859-1 -*-
# -*- mode: makefile; coding: utf-8 -*-
#
# Copyright 2002, 2006 by Eric House (xwords@eehouse.org). All rights
# reserved.
@ -19,7 +19,7 @@
XWLANG=PortuguesePT
LANGCODE = pt_PT
ENC = ISO-8859-1
ENC = UTF-8
TARGET_TYPE ?= WINCE
@ -30,8 +30,10 @@ include ../Makefile.langcommon
SOURCEDICT ?= $(XWDICTPATH)/Portuguese/portuguese_pt.bz2
# Build the main word list for this language: decompress the source
# dictionary ($<) with bzcat, uppercase every word, keep only the lines made
# up entirely of the letters A-J, L-V, X, Z and Ç, and gzip the survivors
# into $@. The iconv stage re-encodes the ISO-8859-1 source as UTF-8 before
# the uppercasing and filtering stages run.
# Makefile.Minho is listed as a prerequisite, so editing this makefile also
# triggers a rebuild of the word list.
# `$$` is Make's escape for a literal `$`, i.e. the regex end-of-line anchor.
# NOTE(review): `\U` in the sed replacement is a GNU sed extension, and its
# handling of ç presumably depends on a UTF-8 locale being active — confirm.
$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile.Minho
bzcat $< | tr [a-zç] [A-ZÇ] | \
LANG=$(LANGCODE):$(ENC) grep '^[A-JL-VXZÇ]\+$$' | \
bzcat $< | \
iconv -f iso88591 -t utf8 | \
sed 's,.,\U\0,g' | \
grep '^[ABCÇDEFGHIJLMNOPQRSTUVXZ]\+$$' | \
gzip -c > $@
# Everything but the creation of the Main.dict file is inherited from the

View file

@ -15,15 +15,16 @@
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
LANGCODE:pt_PT
CHARSET: utf-8
# deal with DOS files
LANGFILTER: tr -d '\r'
# uppercase all
LANGFILTER: | tr [a-zç] [A-ZÇ]
LANGFILTER: | tr [a-zç] [A-ZÇ]
# no words not containing a vowel
LANGFILTER: | grep '[AEIOU]'
# none with illegal chars
LANGFILTER: | grep '^[A-JL-VXZÇ]\+$'
LANGFILTER: | grep '^[A-JL-VXZÇ]\+$'
# Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must
@ -31,8 +32,8 @@ LANGFILTER: | grep '^[A-JL-VXZ
D2DARGS: -r -term 10
LANGINFO: <p>Portugese uses the letter A-Z, excluding K, W and Y, and adds
LANGINFO: Ç. Words containing any other letters are dropped. </p>
LANGINFO: <p>Portuguese uses the letters A-Z, excluding K, W and Y, and adds
LANGINFO: Ç. Words containing any other letters are dropped. </p>
# High bit means "official". Next 7 bits are an enum where
# Portuguese==0x0D. Low byte is padding.
@ -44,7 +45,7 @@ XLOC_HEADER:0x8D00
14 1 'A'
3 3 'B'
4 2 'C'
2 3 'Ç'
2 3 'Ç'
5 2 'D'
11 1 'E'
2 4 'F'