iso-8859-1 -> utf8 for files and the dicts they build

2025-01-30 08:34:16 +01:00 · 2010-10-14 05:58:49 -07:00 · 2010-10-14 05:58:49 -07:00 · 6f9ba42e21
commit 6f9ba42e21
parent 73bd9be80a
4 changed files with 28 additions and 29 deletions
--- a/dawg/Spanish/Makefile
+++ b/dawg/Spanish/Makefile
@@ -1,4 +1,4 @@
-# -*-mode: Makefile; compile-command: "make all"; coding: iso-8859-1; -*-
+# -*-mode: Makefile; compile-command: "make all"; coding: utf-8; -*-
 # Copyright 2002 by Eric House (xwords@eehouse.org).  All rights reserved.
 #
 # This program is free software; you can redistribute it and/or
@@ -18,7 +18,7 @@
 XWLANG = SpanishFAA41
 LANGCODE = es_ES
 TARGET_TYPE ?= WINCE
-ENC = ISO-8859-1
+ENC = UTF-8

 ifeq ($(TARGET_TYPE),PALM)
 PBITMS = ./bmps/palm
@@ -44,14 +44,13 @@ include ../Makefile.langcommon
 #$(LANG)Main.dict.gz: SpanishMain.dict.gz
 #	ln -s $< $@

-SOURCEDICT ?= $(XWDICTPATH)/Spanish/FAA_4.1.txt.gz
+SOURCEDICT ?= $(XWDICTPATH)/Spanish/FAA_4.1.utf8.gz

 $(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile
 	zcat $< \
 	| tr -d '\r' \
-	| tr '\207\216\222\227\234\237\226' 'aeiouu\321' \
-	| tr [a-zñ] [A-ZÑ] \
-	| LANG=$(LANGCODE):$(ENC) grep '^[A-JL-VX-ZÑ]*$$' \
+	| tr [a-zñ] [A-ZÑ] \
+	| LANG=$(LANGCODE):$(ENC) grep '^[A-JL-VX-ZÑ]*$$' \
 	| sed 's/CH/1/g' \
 	| sed 's/LL/2/g' \
 	| sed 's/RR/3/g' \
--- a/dawg/Spanish/info.txt
+++ b/dawg/Spanish/info.txt
@@ -1,4 +1,4 @@
-# -*- mode: conf; coding: iso-8859-1; -*-
+# -*- mode: conf; coding: utf-8; -*-
 # Copyright 2002-2006 by Eric House (xwords@eehouse.org).  All rights
 # reserved.
 #
@@ -20,6 +20,7 @@
 # below

 NEEDSSORT:true
+CHARSET: utf-8

 # MSDos LF chars go bye-bye
 LANGFILTER: tr -d '\r'
@@ -27,9 +28,9 @@ LANGFILTER: tr -d '\r'
 # convert accented vowels
 LANGFILTER: | tr '\207\216\222\227\234\237\226' 'aeiouu\321'
 # uppercase
-LANGFILTER: | tr [a-zñ] [A-ZÑ]
+LANGFILTER: | tr [a-zñ] [A-ZÑ]
 # remove words with illegal letters
-LANGFILTER: | grep '^[[A-JL-VX-ZÑ]*$'
+LANGFILTER: | grep '^[[A-JL-VX-ZÑ]*$'
 # substitute pairs (can't figure out how to use octal values)
 LANGFILTER: | sed 's/CH/1/g'
 LANGFILTER: | sed 's/LL/2/g'
@@ -43,7 +44,7 @@ LANGFILTER: | sort -u -z
 D2DARGS: -r -term 0

 LANGINFO: <p>Spanish words include all letters in the English alphabet
-LANGINFO: except "K" and "W", and with "Ñ" added. Since there are no
+LANGINFO: except "K" and "W", and with "Ñ" added. Since there are no
 LANGINFO: tiles for accented vowels, these are replaced by the
 LANGINFO: unaccented forms.</p>

@@ -92,8 +93,7 @@ XLOC_HEADER:0x8600
 1			8	{"LL", true, true}
 2			3	'M'
 5			1	'N'
-#	/*'N~'*/
-1			8	 209
+1			8	'Ñ'
 9			1	'O'
 2			3	'P'
 1			5	'Q'
--- a/dawg/Swedish/Makefile
+++ b/dawg/Swedish/Makefile
@@ -1,4 +1,4 @@
-# -*-mode: Makefile; coding: iso-8859-1; -*-
+# -*-mode: Makefile; coding: utf-8; -*-
 # Copyright 2002 by Eric House (xwords@eehouse.org).  All rights reserved.
 #
 # This program is free software; you can redistribute it and/or
@@ -17,7 +17,7 @@

 XWLANG=Swedish
 LANGCODE=sv_SE
-ENC = ISO-8859-1
+ENC = UTF-8

 # Swedish has too many chars for the old format.
 NEWDAWG=whatever
@@ -28,14 +28,14 @@ include ../Makefile.2to8

 include ../Makefile.langcommon

-SOURCEDICT ?= $(XWDICTPATH)/Swedish/swedish15.dict.gz
+SOURCEDICT ?= $(XWDICTPATH)/Swedish/swedish15.utf8.gz

 # Q and W are not available as tiles, but I'm told there's a custom in
 # Swedish play of allowing blanks to stand for those letters as well.
 # So we don't exclude words with those letters from the dictionary.
 $(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile
-	zcat $< | tr [a-zäåæöü] [A-ZÄÅÆÖÜ] | \
-		LANG=$(LANGCODE):$(ENC) grep '^[A-ZÄÅÆÖÜ]\{2,15\}$$' | \
+	zcat $< | tr [a-zäåæöü] [A-ZÄÅÆÖÜ] | \
+		LANG=$(LANGCODE):$(ENC) grep '^[A-ZÄÅÆÖÜ]\{2,15\}$$' | \
 		gzip -c > $@

 # Everything but creating of the Main.dict file is inherited from the
--- a/dawg/Swedish/info.txt
+++ b/dawg/Swedish/info.txt
@@ -1,4 +1,4 @@
-# -*- mode: conf; coding: iso-8859-1; -*-
+# -*- mode: conf; coding: utf-8; -*-
 # Copyright 2002 by Eric House (xwords@eehouse.org).  All rights reserved.
 #
 # This program is free software; you can redistribute it and/or
@@ -15,16 +15,17 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

+CHARSET: utf-8
 LANGCODE:sv_SE

 LANGFILTER: tr -d '\r'
-LANGFILTER: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ]
-LANGFILTER: | grep '^[A-ZÄÅÆÖÜ]*$'
+LANGFILTER: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ]
+LANGFILTER: | grep '^[A-ZÄÅÆÖÜ]*$'

 D2DARGS: -r -term 10

 LANGINFO: <p>From an English-speaker&apos;s perspective, Swedish drops Q
-LANGINFO: and W, and adds Ä, Å, Æ, Ö and Ü.</p>
+LANGINFO: and W, and adds Ä, Å, Æ, Ö and Ü.</p>

 # High bit means "official".  Next 7 bits are an enum where
 # Swedish==7.  Low byte is padding
@@ -36,11 +37,11 @@ XLOC_HEADER:0x8700
 2			0		{"_"}
 8			1		'A'
 # A with two dots
-2			3		'Ä'
+2			3		'Ä'
 # A with circle
-2			4		'Å'
-# Æ tile only available for blanks
-0			1		'Æ'
+2			4		'Å'
+# Æ tile only available for blanks
+0			1		'Æ'
 2			4		'B'
 1			8		'C'
 5			1		'D'
@@ -56,7 +57,7 @@ XLOC_HEADER:0x8700
 6			1		'N'
 5			2		'O'
 # O with two dots
-2			4		'Ö'
+2			4		'Ö'
 2			4		'P'
 # Q tile only available for blanks
 0			1		'Q'
@@ -64,13 +65,12 @@ XLOC_HEADER:0x8700
 8			1		'S'
 8			1		'T'
 3			4		'U'
-# Ü tile only available for blanks
-0			1		'Ü'
+# Ü tile only available for blanks
+0			1		'Ü'
 2			3		'V'
 # W tile only available for blanks
 0			1		'W'
 1			8		'X'
 1			7		'Y'
 1			10		'Z'
-
 <END_TILES>