switch to utf-8, adding an iconv call to translate the wordlists.

This commit is contained in:
Andy2 2010-12-17 17:37:57 -08:00
parent 32fccca995
commit 18f8b0d4e4
3 changed files with 18 additions and 13 deletions

View file

@@ -1,4 +1,4 @@
# -*- mode: makefile; coding: iso-8859-1 -*- # -*- mode: Makefile; coding: utf-8; -*-
# Copyright 2002, 2006 by Eric House (xwords@eehouse.org). All rights # Copyright 2002, 2006 by Eric House (xwords@eehouse.org). All rights
# reserved. # reserved.
# #
@@ -18,7 +18,7 @@
XWLANG = PortugueseBR XWLANG = PortugueseBR
LANGCODE = pt_PT LANGCODE = pt_PT
ENC = ISO-8859-1 ENC = UTF-8
TARGET_TYPE ?= WINCE TARGET_TYPE ?= WINCE
@@ -29,8 +29,10 @@ include ../Makefile.langcommon
SOURCEDICT ?= $(XWDICTPATH)/Portuguese/portugueseBR.txt.gz SOURCEDICT ?= $(XWDICTPATH)/Portuguese/portugueseBR.txt.gz
$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile.BrOffice $(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile.BrOffice
zcat $< | tr [a-zç] [A-ZÇ] | \ zcat $< | \
LANG=$(LANGCODE):$(ENC) grep '^[A-JL-VXZÇ]\+$$' | \ iconv -f iso88591 -t utf8 | \
sed 's,.,\U\0,g' | \
grep '^[ABCÇDEFGHIJLMNOPQRSTUVXZ]\+$$' | \
gzip -c > $@ gzip -c > $@
# Everything but creating of the Main.dict file is inherited from the # Everything but creating of the Main.dict file is inherited from the

View file

@@ -1,4 +1,4 @@
# -*- mode: makefile; coding: iso-8859-1 -*- # -*- mode: makefile; coding: utf-8 -*-
# #
# Copyright 2002, 2006 by Eric House (xwords@eehouse.org). All rights # Copyright 2002, 2006 by Eric House (xwords@eehouse.org). All rights
# reserved. # reserved.
@@ -19,7 +19,7 @@
XWLANG=PortuguesePT XWLANG=PortuguesePT
LANGCODE = pt_PT LANGCODE = pt_PT
ENC = ISO-8859-1 ENC = UTF-8
TARGET_TYPE ?= WINCE TARGET_TYPE ?= WINCE
@@ -30,8 +30,10 @@ include ../Makefile.langcommon
SOURCEDICT ?= $(XWDICTPATH)/Portuguese/portuguese_pt.bz2 SOURCEDICT ?= $(XWDICTPATH)/Portuguese/portuguese_pt.bz2
$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile.Minho $(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile.Minho
bzcat $< | tr [a-zç] [A-ZÇ] | \ bzcat $< | \
LANG=$(LANGCODE):$(ENC) grep '^[A-JL-VXZÇ]\+$$' | \ iconv -f iso88591 -t utf8 | \
sed 's,.,\U\0,g' | \
grep '^[ABCÇDEFGHIJLMNOPQRSTUVXZ]\+$$' | \
gzip -c > $@ gzip -c > $@
# Everything but creating of the Main.dict file is inherited from the # Everything but creating of the Main.dict file is inherited from the

View file

@@ -15,15 +15,16 @@
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
LANGCODE:pt_PT LANGCODE:pt_PT
CHARSET: utf-8
# deal with DOS files # deal with DOS files
LANGFILTER: tr -d '\r' LANGFILTER: tr -d '\r'
# uppercase all # uppercase all
LANGFILTER: | tr [a-zç] [A-ZÇ] LANGFILTER: | tr [a-zç] [A-ZÇ]
# no words not containing a vowel # no words not containing a vowel
LANGFILTER: | grep '[AEIOU]' LANGFILTER: | grep '[AEIOU]'
# none with illegal chars # none with illegal chars
LANGFILTER: | grep '^[A-JL-VXZÇ]\+$' LANGFILTER: | grep '^[A-JL-VXZÇ]\+$'
# Until I can figure out how to force sort to use a locale's collation # Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must # rules we can't trust sort in the filtering rules above and so must
@@ -31,8 +32,8 @@ LANGFILTER: | grep '^[A-JL-VXZ
D2DARGS: -r -term 10 D2DARGS: -r -term 10
LANGINFO: <p>Portugese uses the letter A-Z, excluding K, W and Y, and adds LANGINFO: <p>Portuguese uses the letter A-Z, excluding K, W and Y, and adds
LANGINFO: Ç. Words containing any other letters are dropped. </p> LANGINFO: Ç. Words containing any other letters are dropped. </p>
# High bit means "official". Next 7 bits are an enum where # High bit means "official". Next 7 bits are an enum where
# Portuguese==D. Low byte is padding # Portuguese==D. Low byte is padding
@@ -44,7 +45,7 @@ XLOC_HEADER:0x8D00
14 1 'A' 14 1 'A'
3 3 'B' 3 3 'B'
4 2 'C' 4 2 'C'
2 3 'Ç' 2 3 'Ç'
5 2 'D' 5 2 'D'
11 1 'E' 11 1 'E'
2 4 'F' 2 4 'F'