tmp fix for Hungarian: remove duplicate words

The find-prefix feature in the current code crashes on Hungarian because the Hungarian wordlist allows duplicates (words spelled with the same letters but different tile combinations). Modify the Makefile to exclude those (as is done for all other multi-letter-tile languages), and to pull the git source of the wordlist on demand.
2025-01-08 05:24:39 +01:00 · 2020-04-29 12:29:26 -07:00 · 2020-04-29 12:29:26 -07:00 · fb2fcf15cc
commit fb2fcf15cc
parent 7efd084d35
3 changed files with 34 additions and 17 deletions
--- a/xwords4/dawg/Hungarian/.gitignore
+++ b/xwords4/dawg/Hungarian/.gitignore
@ -0,0 +1 @@
 magyarispell
--- a/xwords4/dawg/Hungarian/Makefile
+++ b/xwords4/dawg/Hungarian/Makefile
@ -34,22 +34,49 @@ LANG_SPECIAL_INFO = \
 include ../Makefile.langcommon
 # Local directory that holds the clone of the magyarispell repository.
 GIT_DIR = ./magyarispell
 # Commit of that repository that is checked out before the word list
 # is built.
 SRC_COMMIT = 39ee7f3f8631b953d44ed6f12cfe8ae7193fbf13
 # Dictionary source files inside the clone; they are concatenated to
 # produce hungarian_wordlist.txt.
 SRC = \
 	${GIT_DIR}/szotar/alap/fonev.1 \
 	${GIT_DIR}/szotar/alap/melleknev.1 \
 	${GIT_DIR}/szotar/alap/ige_alanyi.1 \
 	${GIT_DIR}/szotar/alap/ige_targy.1 \
 	${GIT_DIR}/szotar/alap/ragozatlan.2
 # GIT_TREE names an action, not a file.  It must be declared with the
 # special target .PHONY (leading dot); a plain "PHONY:" line would only
 # define an ordinary target called PHONY and leave GIT_TREE unprotected
 # against a stray file of the same name.
 .PHONY: GIT_TREE
 # The word-list sources live inside the checkout, so they depend on it.
 $(SRC) : GIT_TREE
 # Clone magyarispell into $(GIT_DIR) if that directory does not exist
 # yet, then check out the pinned commit $(SRC_COMMIT).
 GIT_TREE:
 	if [ ! -d $(GIT_DIR) ]; then \
 		mkdir -p $(GIT_DIR); \
 		git clone https://github.com/laszlonemeth/magyarispell.git $(GIT_DIR); \
 	fi
 	(cd $(GIT_DIR) && git checkout $(SRC_COMMIT))
 # Build the raw word list: for every line of the source files, cut off
 # everything from the first '#', '[' or space onward, discard lines that
 # end up empty, and store the sorted, de-duplicated result in the target.
 hungarian_wordlist.txt: $(SRC)
 	cat $^ | \
 		sed -e 's/#.*$$//' -e 's/\[.*$$//' -e 's/ .*$$//' |\
 		grep -v '^$$' |\
 		sort -u > $@
 # Filtering wordlist: strip carriage returns, upper-case the words and
 # drop any that already contain one of the digits 1-7.  Each digraph
 # (CS, GY, LY, NY, SZ, TY, ZS) is then replaced by a placeholder digit,
 # words containing anything outside the tile alphabet are discarded, and
 # the final tr turns the placeholders into the tile codes \001-\007.
 # Every word is emitted once, always using the double-letter tile.  The
 # older, longer sed expressions (the removed '-' lines) emitted a second
 # spelling built from two single-letter tiles; those duplicates crash
 # the find-prefix feature, so they are no longer generated.
 # NOTE(review): the tr sets are unquoted, so the shell may glob-expand
 # the brackets if a matching one-character file name exists -- consider
 # quoting them.
-$(XWLANG)Main.dict.gz:
+$(XWLANG)Main.dict.gz: hungarian_wordlist.txt
-	cat hungarian_wordlist.txt \
+	cat $< \
 	| tr -d '\r' \
 	| tr [aábcdeéfghiíjklmnnyoóöőprtuúüűvzs] [AÁBCDEÉFGHIÍJKLMNNYOÓÖŐPRTUÚÜŰVZS] \
 	| grep -v '1\|2\|3\|4\|5\|6\|7' \
-	| sed -e 's,^\(.*\)CS\(.*\)$$,\11\2\n\1CS\2,g' \
+	| sed -e 's,CS,1,g' \
 	| sed -e 's,GY,2,g' \
 	| sed -e 's,LY,3,g' \
 	| sed -e 's,NY,4,g' \
-	| sed -e 's,^\(.*\)SZ\(.*\)$$,\15\2\n\1SZ\2,g' \
+	| sed -e 's,SZ,5,g' \
 	| sed -e 's,TY,6,g' \
-	| sed -e 's,^\(.*\)ZS\(.*\)$$,\17\2\n\1ZS\2,g' \
+	| sed -e 's,ZS,7,g' \
 	| grep '^[1-7AÁBCDEÉFGHIÍJKLMNOÓÖŐPRSTUÚÜŰVZ]*$$' \
 	| tr '1234567' '\001\002\003\004\005\006\007' \
 	| gzip -c > $@
@ -58,4 +85,4 @@ $(XWLANG)Main.dict.gz:
 # "parent" Makefile.langcommon in the parent directory.
 # Remove the files generated in this directory.  The added ('+') recipe
 # line also deletes the intermediate hungarian_wordlist.txt.
 # clean_common is not defined in this hunk -- presumably it comes from
 # the included ../Makefile.langcommon; confirm.
 clean: clean_common
-	rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb
+	rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb hungarian_wordlist.txt
--- a/xwords4/dawg/Hungarian/mklist.sh
+++ b/xwords4/dawg/Hungarian/mklist.sh
@ -1,11 +0,0 @@
 #!/bin/sh
 # Build hungarian_wordlist.txt from the magyarispell dictionary sources.
 # -e: exit when a command fails; -u: treat unset variables as errors.
 set -e -u
 # from: https://github.com/laszlonemeth/magyarispell.git
 # NOTE(review): hard-coded absolute path to one developer's checkout;
 # the script only works where this directory exists.
 DIR=/home/eehouse/dev/git/magyarispell/szotar/alap
 # Concatenate the five source lists, cut each line at the first '#',
 # '[' or space, drop lines left empty, and write the sorted, unique
 # words to hungarian_wordlist.txt.
 cat ${DIR}/fonev.1 ${DIR}/melleknev.1 ${DIR}/ige_alanyi.1 ${DIR}/ige_targy.1 ${DIR}/ragozatlan.2 |\
 	sed -e 's/#.*$//' -e 's/\[.*$//' -e 's/ .*$//' |\
 	grep -v '^$' |\
 	sort -u > hungarian_wordlist.txt