tmp fix for Hungarian: remove duplicate words

The find-prefix feature in the current code crashes on Hungarian because
the Hungarian wordlist allows duplicates (words spelled with the same
letters but different tile combinations). Modify the Makefile to exclude
those, as is done for all other multi-letter-tile languages, and to pull
the git source of the wordlist on demand.
This commit is contained in:
Eric House 2020-04-29 12:29:26 -07:00
parent 7efd084d35
commit fb2fcf15cc
3 changed files with 34 additions and 17 deletions

1
xwords4/dawg/Hungarian/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
magyarispell

View file

@ -34,22 +34,49 @@ LANG_SPECIAL_INFO = \
include ../Makefile.langcommon include ../Makefile.langcommon
# Directory holding the local magyarispell clone, and the commit that is
# checked out inside it.
GIT_DIR = ./magyarispell
SRC_COMMIT = 39ee7f3f8631b953d44ed6f12cfe8ae7193fbf13

# Dictionary source files, all located under szotar/alap in the clone.
SRC = $(addprefix $(GIT_DIR)/szotar/alap/, \
        fonev.1 melleknev.1 ige_alanyi.1 ige_targy.1 ragozatlan.2)
# GIT_TREE names an action (clone if needed, then check out SRC_COMMIT),
# not a file, so it must be declared with the special target .PHONY.
.PHONY: GIT_TREE

# Order-only prerequisite: the checkout has to run before the sources are
# read, but running it must not by itself make the sources look newer than
# the files built from them.
$(SRC): | GIT_TREE

# Clone the upstream repository when no clone is present yet, then move the
# working tree to the pinned commit. Testing for .git (rather than for the
# directory) lets a failed or interrupted clone be retried on the next run.
GIT_TREE:
	if [ ! -d $(GIT_DIR)/.git ]; then \
		git clone https://github.com/laszlonemeth/magyarispell.git $(GIT_DIR); \
	fi
	cd $(GIT_DIR) && git checkout $(SRC_COMMIT)
# Build the raw word list from the dictionary sources: on every line drop
# everything from the first '#', the first '[' and the first space onward,
# discard lines that end up empty, then sort and de-duplicate.
hungarian_wordlist.txt: $(SRC)
	cat $^ \
		| sed -e 's/#.*$$//' -e 's/\[.*$$//' -e 's/ .*$$//' \
		| grep -v '^$$' \
		| sort -u > $@
# NOTE(review): this is a side-by-side diff view; on each line below the
# pre-commit text comes first, followed by the post-commit text. The
# "longer sed expressions" mentioned in the comment exist only in the
# pre-commit text (the CS, SZ and ZS lines), where they emit both
# spellings of a word. The post-commit text replaces every
# CS/GY/LY/NY/SZ/TY/ZS with a single digit 1-7, which the final tr turns
# into the bytes \001-\007, so each word is emitted once.
# Filtering wordlist: When a word can be spelled with either a # Filtering wordlist: When a word can be spelled with either a
# double-letter tile or two single-letter tiles, it's in the list with # double-letter tile or two single-letter tiles, it's in the list with
# both spellings. That's what the longer sed expressions are doing # both spellings. That's what the longer sed expressions are doing
# (emitting two words) # (emitting two words)
$(XWLANG)Main.dict.gz: $(XWLANG)Main.dict.gz: hungarian_wordlist.txt
cat hungarian_wordlist.txt \ cat $< \
| tr -d '\r' \ | tr -d '\r' \
| tr [aábcdeéfghiíjklmnnyoóöőprtuúüűvzs] [AÁBCDEÉFGHIÍJKLMNNYOÓÖŐPRTUÚÜŰVZS] \ | tr [aábcdeéfghiíjklmnnyoóöőprtuúüűvzs] [AÁBCDEÉFGHIÍJKLMNNYOÓÖŐPRTUÚÜŰVZS] \
| grep -v '1\|2\|3\|4\|5\|6\|7' \ | grep -v '1\|2\|3\|4\|5\|6\|7' \
| sed -e 's,^\(.*\)CS\(.*\)$$,\11\2\n\1CS\2,g' \ | sed -e 's,CS,1,g' \
| sed -e 's,GY,2,g' \ | sed -e 's,GY,2,g' \
| sed -e 's,LY,3,g' \ | sed -e 's,LY,3,g' \
| sed -e 's,NY,4,g' \ | sed -e 's,NY,4,g' \
| sed -e 's,^\(.*\)SZ\(.*\)$$,\15\2\n\1SZ\2,g' \ | sed -e 's,SZ,5,g' \
| sed -e 's,TY,6,g' \ | sed -e 's,TY,6,g' \
| sed -e 's,^\(.*\)ZS\(.*\)$$,\17\2\n\1ZS\2,g' \ | sed -e 's,ZS,7,g' \
| grep '^[1-7AÁBCDEÉFGHIÍJKLMNOÓÖŐPRSTUÚÜŰVZ]*$$' \ | grep '^[1-7AÁBCDEÉFGHIÍJKLMNOÓÖŐPRSTUÚÜŰVZ]*$$' \
| tr '1234567' '\001\002\003\004\005\006\007' \ | tr '1234567' '\001\002\003\004\005\006\007' \
| gzip -c > $@ | gzip -c > $@
@ -58,4 +85,4 @@ $(XWLANG)Main.dict.gz:
# "parent" Makefile.langcommon in the parent directory. # "parent" Makefile.langcommon in the parent directory.
# NOTE(review): side-by-side diff view (pre-commit text, then post-commit
# text). Both versions run clean_common first and delete the built
# dictionary and related outputs; the post-commit recipe additionally
# deletes hungarian_wordlist.txt.
clean: clean_common clean: clean_common
rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb hungarian_wordlist.txt

View file

@ -1,11 +0,0 @@
#!/bin/sh
# Build hungarian_wordlist.txt in the current directory from the
# magyarispell dictionary sources.
#
# Usage: <this script> [SOURCE_DIR]
#   SOURCE_DIR  directory containing fonev.1, melleknev.1, ige_alanyi.1,
#               ige_targy.1 and ragozatlan.2. Defaults to the path that
#               was previously hard-coded, so argument-less calls behave
#               as before.

set -e -u

# from: https://github.com/laszlonemeth/magyarispell.git
DIR=${1:-/home/eehouse/dev/git/magyarispell/szotar/alap}

# On every line drop everything from the first '#', the first '[' and the
# first space onward, discard lines that end up empty, then sort and
# de-duplicate.
cat "${DIR}/fonev.1" "${DIR}/melleknev.1" "${DIR}/ige_alanyi.1" \
    "${DIR}/ige_targy.1" "${DIR}/ragozatlan.2" |\
    sed -e 's/#.*$//' -e 's/\[.*$//' -e 's/ .*$//' |\
    grep -v '^$' |\
    sort -u > hungarian_wordlist.txt