Populate an actual wordlist for Hungarian

Add Makefile filters to create a wordlist of about 42K words derived
from a GitHub project (thanks to pointers from an informant :-)). Per
him, and contrary to how Catalan does it, words containing a
double-letter tile also appear in single-letter variants if the tiles
allow.
This commit is contained in:
Eric House 2020-04-24 13:44:55 -07:00
parent ebc6c4629d
commit cc4776d29d
2 changed files with 30 additions and 3 deletions

View file

@ -21,7 +21,7 @@ LANGCODE=hu_HU
ENC = UTF-8
TARGET_TYPE ?= WINCE
# Provenance of the word data. Only one DICTNOTE assignment is kept: with
# `=`, a later assignment replaces an earlier one, so the former "no words"
# text was never used and no longer described this wordlist anyway.
# NOTE(review): presumably consumed by ../Makefile.langcommon -- confirm.
DICTNOTE = "Derived from szotar/alap/ in https://github.com/laszlonemeth/magyarispell.git"
LANG_SPECIAL_INFO = \
"CS Cs cS cs" /dev/null /dev/null \
@ -34,9 +34,25 @@ LANG_SPECIAL_INFO = \
include ../Makefile.langcommon
# Filtering wordlist: build the gzipped word stream from
# hungarian_wordlist.txt.
#
# Pipeline stages, in order:
#   1. strip carriage returns and upper-case the listed letters (q, w and x
#      are not in the tr sets, so words containing them stay lower-case and
#      are rejected by the final grep);
#   2. drop any input word that already contains one of the digits 1-7,
#      because those digits are used below as placeholders for the
#      double-letter tiles: 1=CS 2=GY 3=LY 4=NY 5=SZ 6=TY 7=ZS;
#   3. substitute the placeholders. GY, LY, NY and TY are always replaced.
#      For CS, SZ and ZS -- which can be spelled with either a
#      double-letter tile or two single-letter tiles -- the longer sed
#      expressions emit two words, one with each spelling. Each of those
#      runs in its own sed process so that the next stage sees the two
#      spellings as separate lines;
#   4. keep only words made up entirely of placeholders and the listed
#      upper-case letters;
#   5. turn the placeholders 1-7 into the bytes \001-\007 and gzip.
#
# The wordlist is a declared prerequisite so that the dict is rebuilt when
# it changes and make stops when it is missing; otherwise the failing cat
# would be masked by the rest of the pipeline and an empty dict written.
# The tr sets are quoted so the shell cannot glob-expand the brackets.
#
# NOTE(review): the anchored, greedy patterns only split on the LAST
# CS/SZ/ZS in a word; earlier occurrences keep their two-letter spelling.
# NOTE(review): "\n" in a sed replacement and byte-wise tr on UTF-8 text
# rely on GNU sed / GNU tr behaviour -- confirm on the build host.
$(XWLANG)Main.dict.gz: hungarian_wordlist.txt
	cat hungarian_wordlist.txt \
		| tr -d '\r' \
		| tr '[aábcdeéfghiíjklmnnyoóöőprtuúüűvzs]' '[AÁBCDEÉFGHIÍJKLMNNYOÓÖŐPRTUÚÜŰVZS]' \
		| grep -v '[1-7]' \
		| sed -e 's,^\(.*\)CS\(.*\)$$,\11\2\n\1CS\2,g' \
		| sed -e 's,GY,2,g' -e 's,LY,3,g' -e 's,NY,4,g' \
		| sed -e 's,^\(.*\)SZ\(.*\)$$,\15\2\n\1SZ\2,g' \
		| sed -e 's,TY,6,g' \
		| sed -e 's,^\(.*\)ZS\(.*\)$$,\17\2\n\1ZS\2,g' \
		| grep '^[1-7AÁBCDEÉFGHIÍJKLMNOÓÖŐPRSTUÚÜŰVZ]*$$' \
		| tr '1234567' '\001\002\003\004\005\006\007' \
		| gzip -c > $@
# Everything except the creation of the Main.dict file is inherited from
# the "parent" Makefile.langcommon in the parent directory.

View file

@ -0,0 +1,11 @@
#!/bin/sh
# Regenerate hungarian_wordlist.txt (written to the current directory) from
# the magyarispell dictionary sources.
#
# Usage: sh <this-script> [ALAP_DIR]
#   ALAP_DIR  directory holding fonev.1, melleknev.1, ige_alanyi.1,
#             ige_targy.1 and ragozatlan.2; defaults to the original
#             author's checkout location.
set -e -u

# from: https://github.com/laszlonemeth/magyarispell.git
DIR=${1:-/home/eehouse/dev/git/magyarispell/szotar/alap}

# Fail up front if an input is missing: cat is not the last stage of the
# pipeline below, so its failure would not stop the script and a partial
# wordlist would be written instead.
for name in fonev.1 melleknev.1 ige_alanyi.1 ige_targy.1 ragozatlan.2; do
    if [ ! -r "${DIR}/${name}" ]; then
        echo "$0: cannot read ${DIR}/${name}" >&2
        exit 1
    fi
done

# On every line cut everything from the first '#', the first '[' and the
# first space onwards; then drop the lines left empty, sort and de-duplicate.
cat "${DIR}/fonev.1" "${DIR}/melleknev.1" "${DIR}/ige_alanyi.1" \
    "${DIR}/ige_targy.1" "${DIR}/ragozatlan.2" |
    sed -e 's/#.*$//' -e 's/\[.*$//' -e 's/ .*$//' |
    grep -v '^$' |
    sort -u > hungarian_wordlist.txt