From 90f8a276e1d5f9f859f367b8aef001a67abef3c0 Mon Sep 17 00:00:00 2001 From: ehouse Date: Sun, 25 Jan 2009 18:57:05 +0000 Subject: [PATCH] Cleanup to run on a machine that's utf8: specify iso-8859-1 when needed. --- xwords4/dawg/Catalan/info.txt | 40 +++++++++++++---------- xwords4/dawg/Danish/Makefile | 17 ++++++---- xwords4/dawg/Dutch/Makefile | 19 +++++++---- xwords4/dawg/Dutch/info.txt | 2 +- xwords4/dawg/English/info.txt | 1 + xwords4/dawg/French/Makefile.ODS4 | 2 +- xwords4/dawg/German/Makefile | 16 +++++---- xwords4/dawg/Italian/Makefile | 2 +- xwords4/dawg/Polish/info.txt | 1 + xwords4/dawg/Portuguese/Makefile.BrOffice | 13 ++++---- xwords4/dawg/Portuguese/Makefile.Minho | 10 +++--- xwords4/dawg/Spanish/Makefile | 13 ++++---- xwords4/dawg/Spanish/info.txt | 2 +- xwords4/dawg/Swedish/Makefile | 9 ++--- xwords4/dawg/Swedish/info.txt | 3 +- 15 files changed, 88 insertions(+), 62 deletions(-) diff --git a/xwords4/dawg/Catalan/info.txt b/xwords4/dawg/Catalan/info.txt index 16e3ed5ae..1cb56442d 100644 --- a/xwords4/dawg/Catalan/info.txt +++ b/xwords4/dawg/Catalan/info.txt @@ -17,38 +17,44 @@ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. LANGCODE:ca_ES +CHARSET: utf-8 NEEDSSORT:true -LANGINFO:

Catalan includes several special tiles, "L.L", "NY" and +LANGINFO:

Catalan includes several special tiles, "L·L", "NY" and LANGINFO: "QU" in addition to Ç. There are no "Y" or "Q" tiles, LANGINFO: and all words containing either of these letters not in LANGINFO: combination with a "N" or "U" will be excluded from the LANGINFO: dictionary.

LANGINFO:

"L" is legal by itself, as are words in which two "L"s -LANGINFO: appear side-by-side. If you want your dictionary to include -LANGINFO: the "L.L" tile you will need to make sure that the exact -LANGINFO: string "L.L" (or "l.l") appears in the wordlist you -LANGINFO: upload.

+LANGINFO: appear side-by-side. The "L·L" tile is used whenever any of +LANGINFO: these three strings appears in the wordlist you upload: +LANGINFO: "L-L", "L.L" or "L·L". (And of course "l-l", "l.l" or +LANGINFO: "l·l".)

+ +LANGINFO:

In addition to the special multi-letter tiles discussed +LANGINFO: above, the following letters are allowed: A-J, L-V, X, Z and +LANGINFO: Ç. Lowercase letters will be converted to uppercase, then +LANGINFO: words containing letters not listed here will be excluded.

+ +LANGINFO:

The file you upload should be encoded in UTF-8.

-LANGFILTER_PRECLIP: tr 'ça-z' 'ÇA-Z' | -LANGFILTER_PRECLIP: grep -v 'Q[^U]' | -LANGFILTER_PRECLIP: grep -v '[^N]Y' | -LANGFILTER_PRECLIP: grep -v '^Y' | -LANGFILTER_PRECLIP: grep '^[ÇA-JL-VXYZ\.]*$' | -LANGFILTER_PRECLIP: sed -e 's/L\.L/1/g' -e 's/NY/2/g' -e 's/QU/3/g' | +# MSDos CR chars go bye-bye +LANGFILTER: tr -d '\r' -LANGFILTER_POSTCLIP: | tr -d '\r' -LANGFILTER_POSTCLIP: | sort -u -LANGFILTER_POSTCLIP: | tr -s '\n' '\000' +LANGFILTER: | tr 'a-zç' 'A-ZÇ' +LANGFILTER: | sed -e 's/L·L/1/g' -e 's/L\.L/1/g' -e 's/L-L/1/g' +LANGFILTER: | sed -e 's/NY/2/g' -e 's/QU/3/g' +LANGFILTER: | grep '^[Ç1-3A-JL-VXZ\.]*$' -#LANGFILTER_PRECLIP: sed 's/NY/2/g' | -#LANGFILTER_PRECLIP: sed 's/QU/3/g' | +# substitute in the octal control character values +LANGFILTER: | tr '123' '\001\002\003' +LANGFILTER: | tr -s '\n' '\000' +D2DARGS: -r -term 0 -enc UTF-8 -LANGFILTER_POSTCLIP: | tr '123' '\001\002\003' # High bit means "official". Next 7 bits are an enum where # Catalan==c. Low byte is padding diff --git a/xwords4/dawg/Danish/Makefile b/xwords4/dawg/Danish/Makefile index 0e289f130..283690239 100644 --- a/xwords4/dawg/Danish/Makefile +++ b/xwords4/dawg/Danish/Makefile @@ -1,4 +1,4 @@ -# -*- mode: makefile -*- +# -*- mode: Makefile; coding: iso-8859-1; -*- # Copyright 2002-2005 by Eric House (xwords@eehouse.org). All rights reserved. # # This program is free software; you can redistribute it and/or @@ -15,8 +15,9 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-XWLANG=Danish -LANGCODE=da_DK +XWLANG = Danish +LANGCODE = da_DK +ENC = ISO-8859-1 TARGET_TYPE ?= PALM @@ -24,12 +25,14 @@ include ../Makefile.2to8 include ../Makefile.langcommon -SOURCEDICT ?= $(XWDICTPATH)/$(XWLANG)/LarsDanish.dict.gz +SOURCEDICT ?= $(XWDICTPATH)/Danish/LarsDanish.dict.gz $(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile - zcat $< | tr -d '\r' | tr [a-zåæø] [A-ZÅÆØ] | \ - grep '[AEIOUÅÆØ]' | \ - grep '^[A-PR-VX-ZÅÆØ]\+$$' | sort -u | \ + zcat $< | tr -d '\r' | \ + LC_ALL=C tr [a-zåæø] [A-ZÅÆØ] | \ + LC_ALL=C grep '[AEIOUÅÆØ]' | \ + LC_ALL=C grep '^[A-PR-VX-ZÅÆØ]\+$$' | \ + LC_ALL=C sort -u | \ gzip -c > $@ # Everything but creating of the Main.dict file is inherited from the diff --git a/xwords4/dawg/Dutch/Makefile b/xwords4/dawg/Dutch/Makefile index 0e87c082e..85d7eb1b1 100644 --- a/xwords4/dawg/Dutch/Makefile +++ b/xwords4/dawg/Dutch/Makefile @@ -1,5 +1,5 @@ -# -*- mode: makefile -*- -# Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. +# -*- mode: makefile; coding: iso-8859-1 -*- +# Copyright 2002-2009 by Eric House (xwords@eehouse.org). All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -17,18 +17,25 @@ XWLANG=Dutch LANGCODE=nl_NL +ENC = ISO-8859-1 -TARGET_TYPE ?= PALM +TARGET_TYPE ?= WINCE include ../Makefile.2to8 include ../Makefile.langcommon -SOURCEDICT ?= $(XWDICTPATH)/$(XWLANG)/Dutch__unofficial_alphabetical.dict.gz +SOURCEDICT ?= $(XWDICTPATH)/Dutch/Dutch__unofficial_alphabetical.dict.gz + +# This is weird. We're keeping umlaut letters even though they're not +# on tiles. Do they get translated to non-umlaut equivalents or are +# they allowed to drop this way? Need to confirm the tile set and conversion.
$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile - zcat $< | tr -d '\r' | tr [a-zäöü] [A-ZÄÖÜ] | \ - grep '^[A-Z]\+$$' | sort -u | \ + zcat $< | tr -d '\r' | \ + tr [a-zäöü] [A-ZÄÖÜ] | \ + LC_ALL=C grep '^[A-Z]\+$$' | \ + LC_ALL=C sort -u | \ gzip -c > $@ # Everything but creating of the Main.dict file is inherited from the diff --git a/xwords4/dawg/Dutch/info.txt b/xwords4/dawg/Dutch/info.txt index 58413d59d..20ce776f1 100644 --- a/xwords4/dawg/Dutch/info.txt +++ b/xwords4/dawg/Dutch/info.txt @@ -1,4 +1,4 @@ -# -*- mode:conf; -*- +# -*- mode:conf; coding: iso-8859-1; -*- # Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. # # This program is free software; you can redistribute it and/or diff --git a/xwords4/dawg/English/info.txt b/xwords4/dawg/English/info.txt index 70f8d8a49..a6de296ca 100644 --- a/xwords4/dawg/English/info.txt +++ b/xwords4/dawg/English/info.txt @@ -1,3 +1,4 @@ +# -*- mode: conf; -*- # Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. # # This program is free software; you can redistribute it and/or diff --git a/xwords4/dawg/French/Makefile.ODS4 b/xwords4/dawg/French/Makefile.ODS4 index 2851350bf..1ba915040 100644 --- a/xwords4/dawg/French/Makefile.ODS4 +++ b/xwords4/dawg/French/Makefile.ODS4 @@ -24,7 +24,7 @@ include ../Makefile.2to8 include ../Makefile.langcommon -$(XWLANG)Main.dict.gz: $(XWDICTPATH)/$(XWLANG)/ods4c.txt.gz +$(XWLANG)Main.dict.gz: $(XWDICTPATH)/French/ods4c.txt.gz zcat $< | tr a-z A-Z | tr -d '\r' | gzip >$@ # Everything but creating of the Main.dict file is inherited from the diff --git a/xwords4/dawg/German/Makefile b/xwords4/dawg/German/Makefile index 6452bcfbd..62ec0be1e 100644 --- a/xwords4/dawg/German/Makefile +++ b/xwords4/dawg/German/Makefile @@ -1,4 +1,4 @@ -# -*- mode: makefile -*- +# -*- mode: makefile; coding: iso-8859-1; -*- # Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
# # This program is free software; you can redistribute it and/or @@ -15,21 +15,23 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -XWLANG=German -LANGCODE=de_DE +XWLANG = German +LANGCODE = de_DE +ENC = ISO-8859-1 -TARGET_TYPE ?= PALM +TARGET_TYPE ?= WINCE include ../Makefile.2to8 include ../Makefile.langcommon -SOURCEDICT ?= $(XWDICTPATH)/$(XWLANG)/HansGerman.dict.gz +SOURCEDICT ?= $(XWDICTPATH)/German/HansGerman.dict.gz $(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile zcat $< | tr [a-zäöü] [A-ZÄÖÜ] | \ - sed -e 's/ß/SS/g' | \ - grep '[AEIOUÄÖÜ]' | grep '^[A-ZÄÖÜ]\+$$' | \ + LC_ALL=C sed -e 's/ß/SS/g' | \ + LC_ALL=C grep '[AEIOUÄÖÜ]' | \ + LC_ALL=C grep '^[A-ZÄÖÜ]\+$$' | \ gzip -c > $@ # Everything but creating of the Main.dict file is inherited from the diff --git a/xwords4/dawg/Italian/Makefile b/xwords4/dawg/Italian/Makefile index d1ff35cb2..a0bbd1ad6 100644 --- a/xwords4/dawg/Italian/Makefile +++ b/xwords4/dawg/Italian/Makefile @@ -24,7 +24,7 @@ include ../Makefile.2to8 include ../Makefile.langcommon -$(XWLANG)Main.dict.gz: $(XWDICTPATH)/$(XWLANG)/ITALIANO.txt.gz +$(XWLANG)Main.dict.gz: $(XWDICTPATH)/Italian/ITALIANO.txt.gz zcat $< | tr a-z A-Z | grep '^[A-IL-VZ]*$$' | gzip >$@ # Everything but creating of the Main.dict file is inherited from the diff --git a/xwords4/dawg/Polish/info.txt b/xwords4/dawg/Polish/info.txt index ceb88c569..4a92d128c 100644 --- a/xwords4/dawg/Polish/info.txt +++ b/xwords4/dawg/Polish/info.txt @@ -1,3 +1,4 @@ +# -*- mode: conf; -*- # Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved.
# # This program is free software; you can redistribute it and/or diff --git a/xwords4/dawg/Portuguese/Makefile.BrOffice b/xwords4/dawg/Portuguese/Makefile.BrOffice index b68361de4..13b3a8316 100644 --- a/xwords4/dawg/Portuguese/Makefile.BrOffice +++ b/xwords4/dawg/Portuguese/Makefile.BrOffice @@ -1,4 +1,4 @@ -# -*- mode: makefile -*- +# -*- mode: makefile; coding: iso-8859-1 -*- # Copyright 2002, 2006 by Eric House (xwords@eehouse.org). All rights # reserved. # @@ -16,10 +16,11 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -LANG=PortugueseBR -LANGCODE=pt_PT +XWLANG = PortugueseBR +LANGCODE = pt_PT +ENC = ISO-8859-1 -TARGET_TYPE ?= PALM +TARGET_TYPE ?= WINCE include ../Makefile.2to8 @@ -27,9 +28,9 @@ include ../Makefile.langcommon SOURCEDICT ?= $(XWDICTPATH)/Portuguese/portugueseBR.txt.gz -$(LANG)Main.dict.gz: $(SOURCEDICT) Makefile.BrOffice +$(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile.BrOffice zcat $< | tr [a-zç] [A-ZÇ] | \ - grep '^[A-JL-VXZÇ]\+$$' | \ + LC_ALL=C grep '^[A-JL-VXZÇ]\+$$' | \ gzip -c > $@ # Everything but creating of the Main.dict file is inherited from the diff --git a/xwords4/dawg/Portuguese/Makefile.Minho b/xwords4/dawg/Portuguese/Makefile.Minho index f4aef74f8..76542e4ad 100644 --- a/xwords4/dawg/Portuguese/Makefile.Minho +++ b/xwords4/dawg/Portuguese/Makefile.Minho @@ -1,4 +1,5 @@ -# -*- mode: makefile -*- +# -*- mode: makefile; coding: iso-8859-1 -*- +# # Copyright 2002, 2006 by Eric House (xwords@eehouse.org). All rights # reserved. # @@ -17,9 +18,10 @@ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
XWLANG=PortuguesePT -LANGCODE=pt_PT +LANGCODE = pt_PT +ENC = ISO-8859-1 -TARGET_TYPE ?= PALM +TARGET_TYPE ?= WINCE include ../Makefile.2to8 @@ -29,7 +31,7 @@ SOURCEDICT ?= $(XWDICTPATH)/Portuguese/portuguese_pt.bz2 $(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile.Minho bzcat $< | tr [a-zç] [A-ZÇ] | \ - grep '^[A-JL-VXZÇ]\+$$' | \ + LC_ALL=C grep '^[A-JL-VXZÇ]\+$$' | \ gzip -c > $@ # Everything but creating of the Main.dict file is inherited from the diff --git a/xwords4/dawg/Spanish/Makefile b/xwords4/dawg/Spanish/Makefile index f3f170fad..591a150c1 100644 --- a/xwords4/dawg/Spanish/Makefile +++ b/xwords4/dawg/Spanish/Makefile @@ -1,4 +1,4 @@ -# -*-mode: Makefile; compile-command: "make all"; -*- +# -*-mode: Makefile; compile-command: "make all"; coding: iso-8859-1; -*- # Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. # # This program is free software; you can redistribute it and/or @@ -15,9 +15,10 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -XWLANG=SpanishFAA41 -LANGCODE=es_ES -TARGET_TYPE ?= PALM +XWLANG = SpanishFAA41 +LANGCODE = es_ES +TARGET_TYPE ?= WINCE +ENC = ISO-8859-1 ifeq ($(TARGET_TYPE),PALM) PBITMS = ./bmps/palm @@ -49,8 +50,8 @@ $(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile zcat $< \ | tr -d '\r' \ | tr '\207\216\222\227\234\237\226' 'aeiouu\321' \ - | tr [a-zñ] [A-ZÑ] \ - | grep '^[[A-JL-VX-ZÑ]*$$' \ + | tr [a-zñ] [A-ZÑ] \ + | LC_ALL=C grep '^[A-JL-VX-ZÑ]*$$' \ | sed 's/CH/1/g' \ | sed 's/LL/2/g' \ | sed 's/RR/3/g' \ diff --git a/xwords4/dawg/Spanish/info.txt b/xwords4/dawg/Spanish/info.txt index f11f81318..7a086e2b4 100644 --- a/xwords4/dawg/Spanish/info.txt +++ b/xwords4/dawg/Spanish/info.txt @@ -1,4 +1,4 @@ -# -*- mode: conf; -*- +# -*- mode: conf; coding: iso-8859-1; -*- # Copyright 2002-2006 by Eric House (xwords@eehouse.org). All rights # reserved.
# diff --git a/xwords4/dawg/Swedish/Makefile b/xwords4/dawg/Swedish/Makefile index fadbfdc68..d830ba744 100644 --- a/xwords4/dawg/Swedish/Makefile +++ b/xwords4/dawg/Swedish/Makefile @@ -1,4 +1,4 @@ -# -*-mode: Makefile -*- +# -*-mode: Makefile; coding: iso-8859-1; -*- # Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. # # This program is free software; you can redistribute it and/or @@ -17,24 +17,25 @@ XWLANG=Swedish LANGCODE=sv_SE +ENC = ISO-8859-1 # Swedish has too many chars for the old format. NEWDAWG=whatever -TARGET_TYPE ?= FRANK +TARGET_TYPE ?= WINCE include ../Makefile.2to8 include ../Makefile.langcommon -SOURCEDICT ?= $(XWDICTPATH)/$(XWLANG)/swedish15.dict.gz +SOURCEDICT ?= $(XWDICTPATH)/Swedish/swedish15.dict.gz # Q and W are not available as tiles, but I'm told there's a custom in # Swedish play of allowing blanks to stand for those letters as well. # So we don't exclude words with those letters from the dictionary. $(XWLANG)Main.dict.gz: $(SOURCEDICT) Makefile zcat $< | tr [a-zäåæöü] [A-ZÄÅÆÖÜ] | \ - grep '^[A-ZÄÅÆÖÜ]\{2,15\}$$' | \ + LC_ALL=C grep '^[A-ZÄÅÆÖÜ]\{2,15\}$$' | \ gzip -c > $@ # Everything but creating of the Main.dict file is inherited from the diff --git a/xwords4/dawg/Swedish/info.txt b/xwords4/dawg/Swedish/info.txt index 161cbc060..40dde73ef 100644 --- a/xwords4/dawg/Swedish/info.txt +++ b/xwords4/dawg/Swedish/info.txt @@ -1,3 +1,4 @@ +# -*- mode: conf; coding: iso-8859-1; -*- # Copyright 2002 by Eric House (xwords@eehouse.org). All rights reserved. # # This program is free software; you can redistribute it and/or @@ -22,7 +23,7 @@ LANGFILTER: | grep '^[A-Z D2DARGS: -r -term 10 -LANGINFO:

From an English-speaker's perspective, Swedish drops Q +LANGINFO:

From an English-speaker's perspective, Swedish drops Q LANGINFO: and W, and adds Ä, Å, Æ, Ö and Ü.

# High bit means "official". Next 7 bits are an enum where