From cc4776d29da5bf69dc662d1f1eb5696b3a5c40e8 Mon Sep 17 00:00:00 2001
From: Eric House
Date: Fri, 24 Apr 2020 13:44:55 -0700
Subject: [PATCH] Populate an actual wordlist for Hungarian

Add Makefile filters to create a wordlist with about 42K words derived
from a github project (thanks to pointers from an informant. :-) Per
him, and contrary to how Catalan does it, double-letter-tile words also
appear in single-letter variants if the tiles allow.
---
 xwords4/dawg/Hungarian/Makefile  | 27 ++++++++++++++++++++++++---
 xwords4/dawg/Hungarian/mklist.sh | 27 +++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 3 deletions(-)
 create mode 100755 xwords4/dawg/Hungarian/mklist.sh

diff --git a/xwords4/dawg/Hungarian/Makefile b/xwords4/dawg/Hungarian/Makefile
index 03547fbb4..e07355735 100644
--- a/xwords4/dawg/Hungarian/Makefile
+++ b/xwords4/dawg/Hungarian/Makefile
@@ -21,7 +21,7 @@ LANGCODE=hu_HU
 ENC = UTF-8
 
 TARGET_TYPE ?= WINCE
-DICTNOTE = "This wordlist contains the tile information for Hungarian but no words."
+DICTNOTE = "Derived from szotar/alap/ in https://github.com/laszlonemeth/magyarispell.git"
 
 LANG_SPECIAL_INFO = \
 	"CS Cs cS cs" /dev/null /dev/null \
@@ -34,9 +34,30 @@ LANG_SPECIAL_INFO = \
 
 include ../Makefile.langcommon
 
-# Empty dict
+# Filtering wordlist: When a word can be spelled with either a
+# double-letter tile or two single-letter tiles, it's in the list with
+# both spellings. That's what the longer sed expressions are doing
+# (emitting two words). GY, LY, NY and TY are always replaced by their
+# tile; no second spelling is emitted for them.
+#
+# The digits 1-7 are placeholders for the tiles CS, GY, LY, NY, SZ, TY
+# and ZS; the final tr turns them into the bytes \001-\007. Input lines
+# that already contain one of those digits are dropped up front.
 $(XWLANG)Main.dict.gz:
-	echo -n "" | gzip -c > $@
+	cat hungarian_wordlist.txt \
+		| tr -d '\r' \
+		| tr '[aábcdeéfghiíjklmnnyoóöőprtuúüűvzs]' '[AÁBCDEÉFGHIÍJKLMNNYOÓÖŐPRTUÚÜŰVZS]' \
+		| grep -v '[1-7]' \
+		| sed -e 's,^\(.*\)CS\(.*\)$$,\11\2\n\1CS\2,g' \
+		| sed -e 's,GY,2,g' \
+		| sed -e 's,LY,3,g' \
+		| sed -e 's,NY,4,g' \
+		| sed -e 's,^\(.*\)SZ\(.*\)$$,\15\2\n\1SZ\2,g' \
+		| sed -e 's,TY,6,g' \
+		| sed -e 's,^\(.*\)ZS\(.*\)$$,\17\2\n\1ZS\2,g' \
+		| grep '^[1-7AÁBCDEÉFGHIÍJKLMNOÓÖŐPRSTUÚÜŰVZ]*$$' \
+		| tr '1234567' '\001\002\003\004\005\006\007' \
+		| gzip -c > $@
 
 # Everything but creating of the Main.dict file is inherited from the
 # "parent" Makefile.langcommon in the parent directory.
diff --git a/xwords4/dawg/Hungarian/mklist.sh b/xwords4/dawg/Hungarian/mklist.sh
new file mode 100755
index 000000000..7b207a3de
--- /dev/null
+++ b/xwords4/dawg/Hungarian/mklist.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+#
+# Build hungarian_wordlist.txt (in the current directory) from the
+# magyarispell word lists.
+#
+# Usage: [MAGYARISPELL_DIR=<path to magyarispell/szotar/alap>] ./mklist.sh
+
+set -e -u
+
+# from: https://github.com/laszlonemeth/magyarispell.git
+# Set MAGYARISPELL_DIR to point at a checkout somewhere else.
+DIR=${MAGYARISPELL_DIR:-/home/eehouse/dev/git/magyarispell/szotar/alap}
+
+# Without pipefail a missing input would only make cat complain while
+# sort still wrote a (short or empty) list, so check up front.
+if [ ! -d "${DIR}" ]; then
+	printf '%s: word list directory not found: %s\n' "$0" "${DIR}" >&2
+	exit 1
+fi
+
+# Per line: strip from the first '#', then from the first '[', then
+# from the first space; drop lines left empty; sort and de-duplicate.
+cat "${DIR}/fonev.1" "${DIR}/melleknev.1" "${DIR}/ige_alanyi.1" \
+	"${DIR}/ige_targy.1" "${DIR}/ragozatlan.2" |\
+	sed -e 's/#.*$//' -e 's/\[.*$//' -e 's/ .*$//' |\
+	grep -v '^$' |\
+	sort -u > hungarian_wordlist.txt