From 3912a60ee970cf646fab230fcc2e8c9f2fdabdab Mon Sep 17 00:00:00 2001 From: Eric House Date: Sun, 23 Jan 2022 17:46:52 -0800 Subject: [PATCH] limit word lengths to 2-15 dict2dawg crashes when given a 1-letter word. Easier to fix in the filtering that has to be there anyway. --- xwords4/dawg/Catalan/info.txt | 2 +- xwords4/dawg/Danish/info.txt | 2 +- xwords4/dawg/Dutch/info.txt | 2 +- xwords4/dawg/English/info.txt | 2 +- xwords4/dawg/French/info.txt | 2 +- xwords4/dawg/German/info.txt | 2 +- xwords4/dawg/Greek/info.txt | 2 +- xwords4/dawg/Hex/info.txt | 2 +- xwords4/dawg/Hungarian/info.txt | 2 +- xwords4/dawg/Hëx/info.txt | 2 +- xwords4/dawg/Italian/info.txt | 2 +- xwords4/dawg/Polish/info.txt | 2 +- xwords4/dawg/Portuguese/info.txt | 2 +- xwords4/dawg/Romanian/info.txt | 2 +- xwords4/dawg/Spanish/info.txt | 2 +- xwords4/dawg/Swedish/info.txt | 2 +- 16 files changed, 16 insertions(+), 16 deletions(-) diff --git a/xwords4/dawg/Catalan/info.txt b/xwords4/dawg/Catalan/info.txt index d08052e72..92757ecd9 100644 --- a/xwords4/dawg/Catalan/info.txt +++ b/xwords4/dawg/Catalan/info.txt @@ -47,7 +47,7 @@ LANGFILTER: tr -d '\r' LANGFILTER: | tr 'a-zç' 'A-ZÇ' LANGFILTER: | sed -e 's/L·L/1/g' -e 's/L\.L/1/g' -e 's/L-L/1/g' LANGFILTER: | sed -e 's/NY/2/g' -e 's/QU/3/g' -LANGFILTER: | grep '^[Ç1-3A-JL-VXZ\.]*$' +LANGFILTER: | grep -x '[Ç1-3A-JL-VXZ\.]\{2,15\}' # substitute in the octal control character values LANGFILTER: | tr '123' '\001\002\003' diff --git a/xwords4/dawg/Danish/info.txt b/xwords4/dawg/Danish/info.txt index 4d08dba23..e79c94ced 100644 --- a/xwords4/dawg/Danish/info.txt +++ b/xwords4/dawg/Danish/info.txt @@ -24,7 +24,7 @@ LANGFILTER: | tr [a-zåæø] [A-ZÅÆØ] # no words not containing a vowel LANGFILTER: | grep '[AEIOUYÅÆØ]' # none with illegal chars -LANGFILTER: | grep '^[A-PR-VX-ZÅÆØ]\+$' +LANGFILTER: | grep -x '[A-PR-VX-ZÅÆØ]\{2,15\}' # remove duplicates LANGFILTER: | sort -u diff --git a/xwords4/dawg/Dutch/info.txt b/xwords4/dawg/Dutch/info.txt index 
e0add8a5b..d1dc42262 100644 --- a/xwords4/dawg/Dutch/info.txt +++ b/xwords4/dawg/Dutch/info.txt @@ -23,7 +23,7 @@ LANGFILTER: tr -d '\r' # uppercase all LANGFILTER: | tr [a-z] [A-Z] # none with illegal chars -LANGFILTER: | grep '^[A-Z]\+$' +LANGFILTER: | grep -x '[A-Z]\{2,15\}' LANGFILTER: | sort -u # Until I can figure out how to force sort to use a locale's collation diff --git a/xwords4/dawg/English/info.txt b/xwords4/dawg/English/info.txt index e3aabfeb0..e4c1abf6c 100644 --- a/xwords4/dawg/English/info.txt +++ b/xwords4/dawg/English/info.txt @@ -20,7 +20,7 @@ LANGCODE:en_US # deal with DOS files LANGFILTER: tr -d '\r' LANGFILTER: | tr [a-z] [A-Z] -LANGFILTER: | grep '^[A-Z]*$' +LANGFILTER: | grep -x '[A-Z]\{2,15\}' LANGFILTER: | sort -u # We can trust sort (above) to do the right thing since there's no diff --git a/xwords4/dawg/French/info.txt b/xwords4/dawg/French/info.txt index b80f61211..b48611b91 100755 --- a/xwords4/dawg/French/info.txt +++ b/xwords4/dawg/French/info.txt @@ -20,7 +20,7 @@ LANGCODE:fr_FR LANGFILTER: tr -d '\r' LANGFILTER: | tr [a-z] [A-Z] -LANGFILTER: | grep '^[A-Z]*$' +LANGFILTER: | grep -x '[A-Z]\{2,15\}' LANGFILTER: | tr '\n' '\000' LANGFILTER: | sort -u -z diff --git a/xwords4/dawg/German/info.txt b/xwords4/dawg/German/info.txt index 603fb9392..e69e3eda7 100644 --- a/xwords4/dawg/German/info.txt +++ b/xwords4/dawg/German/info.txt @@ -28,7 +28,7 @@ LANGFILTER: | tr [a-zäöü] [A-ZÄÖÜ] # no words not containing a vowel # LANGFILTER: | grep '[AEIOUÄÖÜ]' # none with illegal chars -LANGFILTER: | grep '^[A-ZÄÖÜ]\+$' +LANGFILTER: | grep -x '[A-ZÄÖÜ]\{2,15\}' # Until I can figure out how to force sort to use a locale's collation # rules we can't trust sort in the filtering rules above and so must diff --git a/xwords4/dawg/Greek/info.txt b/xwords4/dawg/Greek/info.txt index a3c231075..38eedaeb0 100644 --- a/xwords4/dawg/Greek/info.txt +++ b/xwords4/dawg/Greek/info.txt @@ -21,7 +21,7 @@ CHARSET: utf-8 LANGFILTER: tr -d '\r' LANGFILTER: | tr 
'αβγδεζηθικλμνξοπρστυφχψω' 'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ' -LANGFILTER: | grep '^[ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]*$' +LANGFILTER: | grep -x '[ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]\{2,15\}' LANGINFO:

Greek. Uploaded wordlist must be in utf-8 format. LANGINFO:

diff --git a/xwords4/dawg/Hex/info.txt b/xwords4/dawg/Hex/info.txt index 7f357cd10..98b39716d 100755 --- a/xwords4/dawg/Hex/info.txt +++ b/xwords4/dawg/Hex/info.txt @@ -20,7 +20,7 @@ LANGCODE:HEX # uppercase all LANGFILTER: tr [a-f] [A-F] -LANGFILTER: | grep '^[A-F]*$' +LANGFILTER: | grep -x '[A-F]\{2,15\}' LANGFILTER: | sed 's/A/Ä/' LANGFILTER: | sed 's/E/Ë/' LANGFILTER: | sort -u diff --git a/xwords4/dawg/Hungarian/info.txt b/xwords4/dawg/Hungarian/info.txt index eab5f0148..846c998c9 100644 --- a/xwords4/dawg/Hungarian/info.txt +++ b/xwords4/dawg/Hungarian/info.txt @@ -14,7 +14,7 @@ LANGFILTER: | sed -e 's,NY,4,g' LANGFILTER: | sed -e 's,^\(.*\)SZ\(.*\)$,\15\2\n\1SZ\2,g' LANGFILTER: | sed -e 's,TY,6,g' LANGFILTER: | sed -e 's,^\(.*\)ZS\(.*\)$,\17\2\n\1ZS\2,g' -LANGFILTER: | grep '^[1-7AÁBCDEÉFGHIÍJKLMNOÓÖŐPRSTUÚÜŰVZ]\{2,15\}$' +LANGFILTER: | grep -x '[1-7AÁBCDEÉFGHIÍJKLMNOÓÖŐPRSTUÚÜŰVZ]\{2,15\}' LANGFILTER: | tr '1234567' '\001\002\003\004\005\006\007' # High bit means "official". Next 7 bits are an enum where diff --git a/xwords4/dawg/Hëx/info.txt b/xwords4/dawg/Hëx/info.txt index 526041cf1..e6bc1c2a2 100644 --- a/xwords4/dawg/Hëx/info.txt +++ b/xwords4/dawg/Hëx/info.txt @@ -20,7 +20,7 @@ LANGCODE:HEX # uppercase all LANGFILTER: tr [a-f] [A-F] -LANGFILTER: | grep '^[A-F]*$' +LANGFILTER: | grep -x '[A-F]\{2,15\}' LANGFILTER: | sed 's/A/Ä/' LANGFILTER: | sed 's/E/Ë/' LANGFILTER: | sort -u diff --git a/xwords4/dawg/Italian/info.txt b/xwords4/dawg/Italian/info.txt index 813a41323..434fc814f 100755 --- a/xwords4/dawg/Italian/info.txt +++ b/xwords4/dawg/Italian/info.txt @@ -20,7 +20,7 @@ LANGCODE:it_IT # deal with DOS files LANGFILTER: tr -d '\r' LANGFILTER: | tr [a-z] [A-Z] -LANGFILTER: | grep '^[A-IL-VZ]*$' +LANGFILTER: | grep -x '[A-IL-VZ]\{2,15\}' LANGFILTER: | sort -u D2DARGS: -r -term 10 -nosort diff --git a/xwords4/dawg/Polish/info.txt b/xwords4/dawg/Polish/info.txt index 9b4f3d7be..6efb4f8a4 100644 --- a/xwords4/dawg/Polish/info.txt +++ 
b/xwords4/dawg/Polish/info.txt @@ -22,7 +22,7 @@ CHARSET:utf-8 # deal with DOS files LANGFILTER: tr -d '\r' LANGFILTER: | tr [a-pr-uwyząćęłńóśźż] [A-PR-UWYZĄĆĘŁŃÓŚŹŻ] -LANGFILTER: | grep '^[A-PR-UWYZĄĆĘŁŃÓŚŹŻ]*$' +LANGFILTER: | grep -x '[A-PR-UWYZĄĆĘŁŃÓŚŹŻ]\{2,15\}' LANGFILTER: | tr '\n' '\000' D2DARGS: -r -term 0 diff --git a/xwords4/dawg/Portuguese/info.txt b/xwords4/dawg/Portuguese/info.txt index 85d05df14..075c1f20b 100644 --- a/xwords4/dawg/Portuguese/info.txt +++ b/xwords4/dawg/Portuguese/info.txt @@ -24,7 +24,7 @@ LANGFILTER: | tr [a-zç] [A-ZÇ] # no words not containing a vowel LANGFILTER: | grep '[AEIOU]' # none with illegal chars -LANGFILTER: | grep '^[A-JL-VXZÇ]\+$' +LANGFILTER: | grep -x '[A-JL-VXZÇ]\{2,15\}' # Until I can figure out how to force sort to use a locale's collation # rules we can't trust sort in the filtering rules above and so must diff --git a/xwords4/dawg/Romanian/info.txt b/xwords4/dawg/Romanian/info.txt index a08e2ab5c..238501084 100644 --- a/xwords4/dawg/Romanian/info.txt +++ b/xwords4/dawg/Romanian/info.txt @@ -23,7 +23,7 @@ LANGCODE:ro LANGFILTER: tr -d '\r' LANGFILTER: | tr [:lower:] [:upper:] LANGFILTER: | tr 'ĂÂÎȘȚ' 'AAIST' -LANGFILTER: | grep '^[A-JL-PR-VXZ]*$' +LANGFILTER: | grep -x '[A-JL-PR-VXZ]\{2,15\}' LANGFILTER: | sort -u # We can trust sort (above) to do the right thing since there's no diff --git a/xwords4/dawg/Spanish/info.txt b/xwords4/dawg/Spanish/info.txt index cf4252d82..8dba7a719 100644 --- a/xwords4/dawg/Spanish/info.txt +++ b/xwords4/dawg/Spanish/info.txt @@ -30,7 +30,7 @@ LANGFILTER: | tr '\207\216\222\227\234\237\226' 'aeiouu\321' # uppercase LANGFILTER: | tr [a-zñ] [A-ZÑ] # remove words with illegal letters -LANGFILTER: | grep '^[[A-JL-VX-ZÑ]*$' +LANGFILTER: | grep -x '[A-JL-VX-ZÑ]\{2,15\}' # substitute pairs (can't figure out how to use octal values) LANGFILTER: | sed 's/CH/1/g' LANGFILTER: | sed 's/LL/2/g' diff --git a/xwords4/dawg/Swedish/info.txt b/xwords4/dawg/Swedish/info.txt index 2692a0812..d21521c43 
100644 --- a/xwords4/dawg/Swedish/info.txt +++ b/xwords4/dawg/Swedish/info.txt @@ -20,7 +20,7 @@ LANGCODE:sv_SE LANGFILTER: tr -d '\r' LANGFILTER: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ] -LANGFILTER: | grep '^[A-ZÄÅÆÖÜ]*$' +LANGFILTER: | grep -x '[A-ZÄÅÆÖÜ]\{2,15\}' D2DARGS: -r -term 10