iso-8859-1 -> utf8 for files and the dicts they build

This commit is contained in:
Eric House 2010-10-14 05:58:49 -07:00
parent 73bd9be80a
commit 6f9ba42e21
4 changed files with 28 additions and 29 deletions

View file

@ -1,4 +1,4 @@
# -*-mode: Makefile; compile-command: "make all"; coding: iso-8859-1; -*-
# -*-mode: Makefile; compile-command: "make all"; coding: utf-8; -*-
# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
#
# This program is free software; you can redistribute it and/or
@ -18,7 +18,7 @@
XWLANG = SpanishFAA41
LANGCODE = es_ES
TARGET_TYPE ?= WINCE
ENC = ISO-8859-1
ENC = UTF-8
ifeq ($(TARGET_TYPE),PALM)
PBITMS = ./bmps/palm
@ -44,14 +44,13 @@ include ../Makefile.langcommon
#$(LANG)Main.dict.gz: SpanishMain.dict.gz
# ln -s $< $@
SOURCEDICT ?= $(XWDICTPATH)/Spanish/FAA_4.1.txt.gz
SOURCEDICT ?= $(XWDICTPATH)/Spanish/FAA_4.1.utf8.gz
$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile
zcat $< \
| tr -d '\r' \
| tr '\207\216\222\227\234\237\226' 'aeiouu\321' \
| tr [a-zñ] [A-ZÑ] \
| LANG=$(LANGCODE):$(ENC) grep '^[A-JL-VX-ZÑ]*$$' \
| tr [a-zñ] [A-ZÑ] \
| LANG=$(LANGCODE):$(ENC) grep '^[A-JL-VX-ZÑ]*$$' \
| sed 's/CH/1/g' \
| sed 's/LL/2/g' \
| sed 's/RR/3/g' \

View file

@ -1,4 +1,4 @@
# -*- mode: conf; coding: iso-8859-1; -*-
# -*- mode: conf; coding: utf-8; -*-
# Copyright 2002-2006 by Eric House (xwords@eehouse.org). All rights
# reserved.
#
@ -20,6 +20,7 @@
# below
NEEDSSORT:true
CHARSET: utf-8
# MSDos CR chars go bye-bye
LANGFILTER: tr -d '\r'
@ -27,9 +28,9 @@ LANGFILTER: tr -d '\r'
# convert accented vowels
LANGFILTER: | tr '\207\216\222\227\234\237\226' 'aeiouu\321'
# uppercase
LANGFILTER: | tr [a-zñ] [A-ZÑ]
LANGFILTER: | tr [a-zñ] [A-ZÑ]
# remove words with illegal letters
LANGFILTER: | grep '^[[A-JL-VX-ZÑ]*$'
LANGFILTER: | grep '^[[A-JL-VX-ZÑ]*$'
# substitute pairs (can't figure out how to use octal values)
LANGFILTER: | sed 's/CH/1/g'
LANGFILTER: | sed 's/LL/2/g'
@ -43,7 +44,7 @@ LANGFILTER: | sort -u -z
D2DARGS: -r -term 0
LANGINFO: <p>Spanish words include all letters in the English alphabet
LANGINFO: except "K" and "W", and with "Ñ" added. Since there are no
LANGINFO: except "K" and "W", and with "Ñ" added. Since there are no
LANGINFO: tiles for accented vowels, these are replaced by the
LANGINFO: unaccented forms.</p>
@ -92,8 +93,7 @@ XLOC_HEADER:0x8600
1 8 {"LL", true, true}
2 3 'M'
5 1 'N'
# /*'N~'*/
1 8 209
1 8 'Ñ'
9 1 'O'
2 3 'P'
1 5 'Q'

View file

@ -1,4 +1,4 @@
# -*-mode: Makefile; coding: iso-8859-1; -*-
# -*-mode: Makefile; coding: utf-8; -*-
# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
#
# This program is free software; you can redistribute it and/or
@ -17,7 +17,7 @@
XWLANG=Swedish
LANGCODE=sv_SE
ENC = ISO-8859-1
ENC = UTF-8
# Swedish has too many chars for the old format.
NEWDAWG=whatever
@ -28,14 +28,14 @@ include ../Makefile.2to8
include ../Makefile.langcommon
SOURCEDICT ?= $(XWDICTPATH)/Swedish/swedish15.dict.gz
SOURCEDICT ?= $(XWDICTPATH)/Swedish/swedish15.utf8.gz
# Q and W are not available as tiles, but I'm told there's a custom in
# Swedish play of allowing blanks to stand for those letters as well.
# So we don't exclude words with those letters from the dictionary.
$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile
zcat $< | tr [a-zäåæöü] [A-ZÄÅÆÖÜ] | \
LANG=$(LANGCODE):$(ENC) grep '^[A-ZÄÅÆÖÜ]\{2,15\}$$' | \
zcat $< | tr [a-zäåæöü] [A-ZÄÅÆÖÜ] | \
LANG=$(LANGCODE):$(ENC) grep '^[A-ZÄÅÆÖÜ]\{2,15\}$$' | \
gzip -c > $@
# Everything but creating of the Main.dict file is inherited from the

View file

@ -1,4 +1,4 @@
# -*- mode: conf; coding: iso-8859-1; -*-
# -*- mode: conf; coding: utf-8; -*-
# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
#
# This program is free software; you can redistribute it and/or
@ -15,16 +15,17 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
CHARSET: utf-8
LANGCODE:sv_SE
LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ]
LANGFILTER: | grep '^[A-ZÄÅÆÖÜ]*$'
LANGFILTER: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ]
LANGFILTER: | grep '^[A-ZÄÅÆÖÜ]*$'
D2DARGS: -r -term 10
LANGINFO: <p>From an English-speaker&apos;s perspective, Swedish drops Q
LANGINFO: and W, and adds Ä, Å, Æ, Ö and Ü.</p>
LANGINFO: and W, and adds Ä, Å, Æ, Ö and Ü.</p>
# High bit means "official". Next 7 bits are an enum where
# Swedish==7. Low byte is padding
@ -36,11 +37,11 @@ XLOC_HEADER:0x8700
2 0 {"_"}
8 1 'A'
# A with two dots
2 3 'Ä'
2 3 'Ä'
# A with circle
2 4 'Å'
# Æ tile only available for blanks
0 1 'Æ'
2 4 'Å'
# Æ tile only available for blanks
0 1 'Æ'
2 4 'B'
1 8 'C'
5 1 'D'
@ -56,7 +57,7 @@ XLOC_HEADER:0x8700
6 1 'N'
5 2 'O'
# O with two dots
2 4 'Ö'
2 4 'Ö'
2 4 'P'
# Q tile only available for blanks
0 1 'Q'
@ -64,13 +65,12 @@ XLOC_HEADER:0x8700
8 1 'S'
8 1 'T'
3 4 'U'
# Ü tile only available for blanks
0 1 'Ü'
# Ü tile only available for blanks
0 1 'Ü'
2 3 'V'
# W tile only available for blanks
0 1 'W'
1 8 'X'
1 7 'Y'
1 10 'Z'
<END_TILES>