From fb2fcf15ccdf83ab9031ab929324b479651acc75 Mon Sep 17 00:00:00 2001
From: Eric House
Date: Wed, 29 Apr 2020 12:29:26 -0700
Subject: [PATCH] tmp fix for Hungarian: remove duplicate words

Find-prefix feature in current code crashes on Hungarian because it
allows duplicates (words that occur spelled with the same letters but
different tile combinations.) Modify Makefile to exclude those (as it
does for all other multi-letter-tile languages). And to pull the git
source of the wordlist on demand.
---
Review notes (text between the "---" line and the first "diff --git"
line is not part of the commit message):
  * The phony declaration is spelled ".PHONY: GIT_TREE". A rule written
    as "PHONY: GIT_TREE" only defines an ordinary target named PHONY and
    leaves GIT_TREE non-phony.
  * Whitespace inside the hunks (hard tabs on recipe lines, blank lines)
    was reconstructed from a flattened copy of this patch; run
    "git apply --check" against the target tree before relying on it.

 xwords4/dawg/Hungarian/.gitignore |  1 +
 xwords4/dawg/Hungarian/Makefile   | 39 ++++++++++++++++++++++++++-----
 xwords4/dawg/Hungarian/mklist.sh  | 11 ---------
 3 files changed, 34 insertions(+), 17 deletions(-)
 create mode 100644 xwords4/dawg/Hungarian/.gitignore
 delete mode 100755 xwords4/dawg/Hungarian/mklist.sh

diff --git a/xwords4/dawg/Hungarian/.gitignore b/xwords4/dawg/Hungarian/.gitignore
new file mode 100644
index 000000000..d0b2fc328
--- /dev/null
+++ b/xwords4/dawg/Hungarian/.gitignore
@@ -0,0 +1 @@
+magyarispell
diff --git a/xwords4/dawg/Hungarian/Makefile b/xwords4/dawg/Hungarian/Makefile
index e07355735..69d5f8744 100644
--- a/xwords4/dawg/Hungarian/Makefile
+++ b/xwords4/dawg/Hungarian/Makefile
@@ -34,22 +34,49 @@ LANG_SPECIAL_INFO = \
 
 include ../Makefile.langcommon
 
+GIT_DIR = ./magyarispell
+SRC_COMMIT = 39ee7f3f8631b953d44ed6f12cfe8ae7193fbf13
+
+SRC = \
+	${GIT_DIR}/szotar/alap/fonev.1 \
+	${GIT_DIR}/szotar/alap/melleknev.1 \
+	${GIT_DIR}/szotar/alap/ige_alanyi.1 \
+	${GIT_DIR}/szotar/alap/ige_targy.1 \
+	${GIT_DIR}/szotar/alap/ragozatlan.2
+
+.PHONY: GIT_TREE
+
+$(SRC) : GIT_TREE
+
+GIT_TREE:
+	if [ ! -d $(GIT_DIR) ]; then \
+		mkdir -p $(GIT_DIR); \
+		git clone https://github.com/laszlonemeth/magyarispell.git $(GIT_DIR); \
+	fi
+	(cd $(GIT_DIR) && git checkout $(SRC_COMMIT))
+
+hungarian_wordlist.txt: $(SRC)
+	cat $^ | \
+		sed -e 's/#.*$$//' -e 's/\[.*$$//' -e 's/ .*$$//' |\
+		grep -v '^$$' |\
+		sort -u > $@
+
 # Filtering wordlist: When a word can be spelled with either a
 # double-letter tile or two single-letter tiles, it's in the list with
 # both spellings. That's what the longer sed expressions are doing
 # (emitting two words)
-$(XWLANG)Main.dict.gz:
-	cat hungarian_wordlist.txt \
+$(XWLANG)Main.dict.gz: hungarian_wordlist.txt
+	cat $< \
 		| tr -d '\r' \
 		| tr [aábcdeéfghiíjklmnnyoóöőprtuúüűvzs] [AÁBCDEÉFGHIÍJKLMNNYOÓÖŐPRTUÚÜŰVZS] \
 		| grep -v '1\|2\|3\|4\|5\|6\|7' \
-		| sed -e 's,^\(.*\)CS\(.*\)$$,\11\2\n\1CS\2,g' \
+		| sed -e 's,CS,1,g' \
 		| sed -e 's,GY,2,g' \
 		| sed -e 's,LY,3,g' \
 		| sed -e 's,NY,4,g' \
-		| sed -e 's,^\(.*\)SZ\(.*\)$$,\15\2\n\1SZ\2,g' \
+		| sed -e 's,SZ,5,g' \
 		| sed -e 's,TY,6,g' \
-		| sed -e 's,^\(.*\)ZS\(.*\)$$,\17\2\n\1ZS\2,g' \
+		| sed -e 's,ZS,7,g' \
 		| grep '^[1-7AÁBCDEÉFGHIÍJKLMNOÓÖŐPRSTUÚÜŰVZ]*$$' \
 		| tr '1234567' '\001\002\003\004\005\006\007' \
 		| gzip -c > $@
@@ -58,4 +85,4 @@ $(XWLANG)Main.dict.gz:
 # "parent" Makefile.langcommon in the parent directory.
 
 clean: clean_common
-	rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb
+	rm -f $(XWLANG)Main.dict.gz *.bin $(XWLANG)*.pdb $(XWLANG)*.seb hungarian_wordlist.txt
diff --git a/xwords4/dawg/Hungarian/mklist.sh b/xwords4/dawg/Hungarian/mklist.sh
deleted file mode 100755
index 7b207a3de..000000000
--- a/xwords4/dawg/Hungarian/mklist.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/sh
-
-set -e -u
-
-# from: https://github.com/laszlonemeth/magyarispell.git
-DIR=/home/eehouse/dev/git/magyarispell/szotar/alap
-
-cat ${DIR}/fonev.1 ${DIR}/melleknev.1 ${DIR}/ige_alanyi.1 ${DIR}/ige_targy.1 ${DIR}/ragozatlan.2 |\
-    sed -e 's/#.*$//' -e 's/\[.*$//' -e 's/ .*$//' |\
-    grep -v '^$' |\
-    sort -u > hungarian_wordlist.txt