tmp fix for Hungarian: remove duplicate words

The find-prefix feature in the current code crashes on Hungarian because the Hungarian wordlist allows duplicates (words that are spelled with the same letters but different tile combinations). Modify the Makefile to exclude those, as it already does for all other multi-letter-tile languages, and to pull the git source of the wordlist on demand.
2025-02-04 20:46:28 +01:00 · 2020-04-29 12:29:26 -07:00 · 2020-04-29 12:29:26 -07:00 · fb2fcf15cc
commit fb2fcf15cc
parent 7efd084d35
3 changed files with 34 additions and 17 deletions
--- a/xwords4/dawg/Hungarian/.gitignore
+++ b/xwords4/dawg/Hungarian/.gitignore
@ -0,0 +1 @@
+magyarispell
--- a/xwords4/dawg/Hungarian/Makefile
+++ b/xwords4/dawg/Hungarian/Makefile
@ -34,22 +34,49 @@ LANG_SPECIAL_INFO = \

 include ../Makefile.langcommon

+# Local clone of the upstream word source, and the commit it is pinned to.
+GIT_DIR = ./magyarispell
+SRC_COMMIT = 39ee7f3f8631b953d44ed6f12cfe8ae7193fbf13
+# Upstream files whose entries are merged into hungarian_wordlist.txt.
+SRC = $(GIT_DIR)/szotar/alap/fonev.1 \
+	$(GIT_DIR)/szotar/alap/melleknev.1 \
+	$(GIT_DIR)/szotar/alap/ige_alanyi.1 \
+	$(GIT_DIR)/szotar/alap/ige_targy.1 \
+	$(GIT_DIR)/szotar/alap/ragozatlan.2
+
+# Fetch the upstream sources on demand. GIT_TREE is order-only so that its
+# always-run recipe does not force the word list to be rebuilt every time.
+.PHONY: GIT_TREE
+$(SRC) : | GIT_TREE
+# Test for .git rather than the directory so a failed clone is retried.
+GIT_TREE:
+	if [ ! -e $(GIT_DIR)/.git ]; then \
+		git clone https://github.com/laszlonemeth/magyarispell.git $(GIT_DIR); \
+	fi
+	(cd $(GIT_DIR) && git checkout $(SRC_COMMIT))
+
+# Raw word list: text before the first '#', '[' or space of each line.
+hungarian_wordlist.txt: $(SRC)
+	cat $^ \
+	| sed -e 's/#.*$$//' -e 's/\[.*$$//' -e 's/ .*$$//' \
+	| grep -v '^$$' | sort -u > $@
+
 # Filtering wordlist: When a word can be spelled with either a
 # double-letter tile or two single-letter tiles, only the double-letter
 # spelling is kept: each sed expression below maps a digraph to its tile
 # code, so no word appears as two different tile sequences.
-$(XWLANG)Main.dict.gz:
-	cat hungarian_wordlist.txt \
+$(XWLANG)Main.dict.gz: hungarian_wordlist.txt
+	cat $< \
 	| tr -d '\r' \
 	| tr [aábcdeéfghiíjklmnnyoóöőprtuúüűvzs] [AÁBCDEÉFGHIÍJKLMNNYOÓÖŐPRTUÚÜŰVZS] \
 	| grep -v '1\|2\|3\|4\|5\|6\|7' \
-	| sed -e 's,^\(.*\)CS\(.*\)$$,\11\2\n\1CS\2,g' \
+	| sed -e 's,CS,1,g' \
 	| sed -e 's,GY,2,g' \
 	| sed -e 's,LY,3,g' \
 	| sed -e 's,NY,4,g' \
-	| sed -e 's,^\(.*\)SZ\(.*\)$$,\15\2\n\1SZ\2,g' \
+	| sed -e 's,SZ,5,g' \
 	| sed -e 's,TY,6,g' \
-	| sed -e 's,^\(.*\)ZS\(.*\)$$,\17\2\n\1ZS\2,g' \
+	| sed -e 's,ZS,7,g' \
 	| grep '^[1-7AÁBCDEÉFGHIÍJKLMNOÓÖŐPRSTUÚÜŰVZ]*$$' \
 	| tr '1234567' '\001\002\003\004\005\006\007' \
 	| gzip -c > $@
@ -58,4 +85,4 @@ $(XWLANG)Main.dict.gz:
 # "parent" Makefile.langcommon in the parent directory.

 clean: clean_common
-	rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb
+	rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb hungarian_wordlist.txt
--- a/xwords4/dawg/Hungarian/mklist.sh
+++ b/xwords4/dawg/Hungarian/mklist.sh
@ -1,11 +0,0 @@
-#!/bin/sh
-
-set -e -u
-
-# from: https://github.com/laszlonemeth/magyarispell.git
-DIR=/home/eehouse/dev/git/magyarispell/szotar/alap
-
-cat ${DIR}/fonev.1 ${DIR}/melleknev.1 ${DIR}/ige_alanyi.1 ${DIR}/ige_targy.1 ${DIR}/ragozatlan.2 |\
-	sed -e 's/#.*$//' -e 's/\[.*$//' -e 's/ .*$//' |\
-	grep -v '^$' |\
-	sort -u > hungarian_wordlist.txt