limit word lengths to 2-15

dict2dawg crashes when given a 1-letter word. It is easier to fix this in
the filtering step, which has to be there anyway.
This commit is contained in:
Eric House 2022-01-23 17:46:52 -08:00
parent 2d6ce6cfda
commit 3912a60ee9
16 changed files with 16 additions and 16 deletions

View file

@ -47,7 +47,7 @@ LANGFILTER: tr -d '\r'
LANGFILTER: | tr 'a-zç' 'A-ZÇ' LANGFILTER: | tr 'a-zç' 'A-ZÇ'
LANGFILTER: | sed -e 's/L·L/1/g' -e 's/L\.L/1/g' -e 's/L-L/1/g' LANGFILTER: | sed -e 's/L·L/1/g' -e 's/L\.L/1/g' -e 's/L-L/1/g'
LANGFILTER: | sed -e 's/NY/2/g' -e 's/QU/3/g' LANGFILTER: | sed -e 's/NY/2/g' -e 's/QU/3/g'
LANGFILTER: | grep '^[Ç1-3A-JL-VXZ\.]*$' LANGFILTER: | grep -x '[Ç1-3A-JL-VXZ\.]\{2,15\}'
# substitute in the octal control character values # substitute in the octal control character values
LANGFILTER: | tr '123' '\001\002\003' LANGFILTER: | tr '123' '\001\002\003'

View file

@ -24,7 +24,7 @@ LANGFILTER: | tr [a-zåæø] [A-ZÅÆØ]
# no words not containing a vowel # no words not containing a vowel
LANGFILTER: | grep '[AEIOUYÅÆØ]' LANGFILTER: | grep '[AEIOUYÅÆØ]'
# none with illegal chars # none with illegal chars
LANGFILTER: | grep '^[A-PR-VX-ZÅÆØ]\+$' LANGFILTER: | grep -x '[A-PR-VX-ZÅÆØ]\{2,15\}'
# remove duplicates # remove duplicates
LANGFILTER: | sort -u LANGFILTER: | sort -u

View file

@ -23,7 +23,7 @@ LANGFILTER: tr -d '\r'
# uppercase all # uppercase all
LANGFILTER: | tr [a-z] [A-Z] LANGFILTER: | tr [a-z] [A-Z]
# none with illegal chars # none with illegal chars
LANGFILTER: | grep '^[A-Z]\+$' LANGFILTER: | grep -x '[A-Z]\{2,15\}'
LANGFILTER: | sort -u LANGFILTER: | sort -u
# Until I can figure out how to force sort to use a locale's collation # Until I can figure out how to force sort to use a locale's collation

View file

@ -20,7 +20,7 @@ LANGCODE:en_US
# deal with DOS files # deal with DOS files
LANGFILTER: tr -d '\r' LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-z] [A-Z] LANGFILTER: | tr [a-z] [A-Z]
LANGFILTER: | grep '^[A-Z]*$' LANGFILTER: | grep -x '[A-Z]\{2,15\}'
LANGFILTER: | sort -u LANGFILTER: | sort -u
# We can trust sort (above) to do the right thing since there's no # We can trust sort (above) to do the right thing since there's no

View file

@ -20,7 +20,7 @@ LANGCODE:fr_FR
LANGFILTER: tr -d '\r' LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-z] [A-Z] LANGFILTER: | tr [a-z] [A-Z]
LANGFILTER: | grep '^[A-Z]*$' LANGFILTER: | grep -x '[A-Z]\{2,15\}'
LANGFILTER: | tr '\n' '\000' LANGFILTER: | tr '\n' '\000'
LANGFILTER: | sort -u -z LANGFILTER: | sort -u -z

View file

@ -28,7 +28,7 @@ LANGFILTER: | tr [a-zäöü] [A-ZÄÖÜ]
# no words not containing a vowel # no words not containing a vowel
# LANGFILTER: | grep '[AEIOUÄÖÜ]' # LANGFILTER: | grep '[AEIOUÄÖÜ]'
# none with illegal chars # none with illegal chars
LANGFILTER: | grep '^[A-ZÄÖÜ]\+$' LANGFILTER: | grep -x '[A-ZÄÖÜ]\{2,15\}'
# Until I can figure out how to force sort to use a locale's collation # Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must # rules we can't trust sort in the filtering rules above and so must

View file

@ -21,7 +21,7 @@ CHARSET: utf-8
LANGFILTER: tr -d '\r' LANGFILTER: tr -d '\r'
LANGFILTER: | tr 'αβγδεζηθικλμνξοπρστυφχψω' 'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ' LANGFILTER: | tr 'αβγδεζηθικλμνξοπρστυφχψω' 'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ'
LANGFILTER: | grep '^[ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]*$' LANGFILTER: | grep -x '[ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ]\{2,15\}'
LANGINFO: <p>Greek. Uploaded wordlist must be in utf-8 format. LANGINFO: <p>Greek. Uploaded wordlist must be in utf-8 format.
LANGINFO: </p> LANGINFO: </p>

View file

@ -20,7 +20,7 @@ LANGCODE:HEX
# uppercase all # uppercase all
LANGFILTER: tr [a-f] [A-F] LANGFILTER: tr [a-f] [A-F]
LANGFILTER: | grep '^[A-F]*$' LANGFILTER: | grep -x '[A-F]\{2,15\}'
LANGFILTER: | sed 's/A/Ä/' LANGFILTER: | sed 's/A/Ä/'
LANGFILTER: | sed 's/E/Ë/' LANGFILTER: | sed 's/E/Ë/'
LANGFILTER: | sort -u LANGFILTER: | sort -u

View file

@ -14,7 +14,7 @@ LANGFILTER: | sed -e 's,NY,4,g'
LANGFILTER: | sed -e 's,^\(.*\)SZ\(.*\)$,\15\2\n\1SZ\2,g' LANGFILTER: | sed -e 's,^\(.*\)SZ\(.*\)$,\15\2\n\1SZ\2,g'
LANGFILTER: | sed -e 's,TY,6,g' LANGFILTER: | sed -e 's,TY,6,g'
LANGFILTER: | sed -e 's,^\(.*\)ZS\(.*\)$,\17\2\n\1ZS\2,g' LANGFILTER: | sed -e 's,^\(.*\)ZS\(.*\)$,\17\2\n\1ZS\2,g'
LANGFILTER: | grep '^[1-7AÁBCDEÉFGHIÍJKLMNOÓÖŐPRSTUÚÜŰVZ]\{2,15\}$' LANGFILTER: | grep -x '[1-7AÁBCDEÉFGHIÍJKLMNOÓÖŐPRSTUÚÜŰVZ]\{2,15\}'
LANGFILTER: | tr '1234567' '\001\002\003\004\005\006\007' LANGFILTER: | tr '1234567' '\001\002\003\004\005\006\007'
# High bit means "official". Next 7 bits are an enum where # High bit means "official". Next 7 bits are an enum where

View file

@ -20,7 +20,7 @@ LANGCODE:HEX
# uppercase all # uppercase all
LANGFILTER: tr [a-f] [A-F] LANGFILTER: tr [a-f] [A-F]
LANGFILTER: | grep '^[A-F]*$' LANGFILTER: | grep -x '[A-F]\{2,15\}'
LANGFILTER: | sed 's/A/Ä/' LANGFILTER: | sed 's/A/Ä/'
LANGFILTER: | sed 's/E/Ë/' LANGFILTER: | sed 's/E/Ë/'
LANGFILTER: | sort -u LANGFILTER: | sort -u

View file

@ -20,7 +20,7 @@ LANGCODE:it_IT
# deal with DOS files # deal with DOS files
LANGFILTER: tr -d '\r' LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-z] [A-Z] LANGFILTER: | tr [a-z] [A-Z]
LANGFILTER: | grep '^[A-IL-VZ]*$' LANGFILTER: | grep -x '[A-IL-VZ]\{2,15\}'
LANGFILTER: | sort -u LANGFILTER: | sort -u
D2DARGS: -r -term 10 -nosort D2DARGS: -r -term 10 -nosort

View file

@ -22,7 +22,7 @@ CHARSET:utf-8
# deal with DOS files # deal with DOS files
LANGFILTER: tr -d '\r' LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-pr-uwyząćęłńóśźż] [A-PR-UWYZĄĆĘŁŃÓŚŹŻ] LANGFILTER: | tr [a-pr-uwyząćęłńóśźż] [A-PR-UWYZĄĆĘŁŃÓŚŹŻ]
LANGFILTER: | grep '^[A-PR-UWYZĄĆĘŁŃÓŚŹŻ]*$' LANGFILTER: | grep -x '[A-PR-UWYZĄĆĘŁŃÓŚŹŻ]\{2,15\}'
LANGFILTER: | tr '\n' '\000' LANGFILTER: | tr '\n' '\000'
D2DARGS: -r -term 0 D2DARGS: -r -term 0

View file

@ -24,7 +24,7 @@ LANGFILTER: | tr [a-zç] [A-ZÇ]
# no words not containing a vowel # no words not containing a vowel
LANGFILTER: | grep '[AEIOU]' LANGFILTER: | grep '[AEIOU]'
# none with illegal chars # none with illegal chars
LANGFILTER: | grep '^[A-JL-VXZÇ]\+$' LANGFILTER: | grep -x '[A-JL-VXZÇ]\{2,15\}'
# Until I can figure out how to force sort to use a locale's collation # Until I can figure out how to force sort to use a locale's collation
# rules we can't trust sort in the filtering rules above and so must # rules we can't trust sort in the filtering rules above and so must

View file

@ -23,7 +23,7 @@ LANGCODE:ro
LANGFILTER: tr -d '\r' LANGFILTER: tr -d '\r'
LANGFILTER: | tr [:lower:] [:upper:] LANGFILTER: | tr [:lower:] [:upper:]
LANGFILTER: | tr 'ĂÂÎȘȚ' 'AAIST' LANGFILTER: | tr 'ĂÂÎȘȚ' 'AAIST'
LANGFILTER: | grep '^[A-JL-PR-VXZ]*$' LANGFILTER: | grep -x '[A-JL-PR-VXZ]\{2,15\}'
LANGFILTER: | sort -u LANGFILTER: | sort -u
# We can trust sort (above) to do the right thing since there's no # We can trust sort (above) to do the right thing since there's no

View file

@ -30,7 +30,7 @@ LANGFILTER: | tr '\207\216\222\227\234\237\226' 'aeiouu\321'
# uppercase # uppercase
LANGFILTER: | tr [a-zñ] [A-ZÑ] LANGFILTER: | tr [a-zñ] [A-ZÑ]
# remove words with illegal letters # remove words with illegal letters
LANGFILTER: | grep '^[[A-JL-VX-ZÑ]*$' LANGFILTER: | grep -x '[[A-JL-VX-ZÑ]\{2,15\}'
# substitute pairs (can't figure out how to use octal values) # substitute pairs (can't figure out how to use octal values)
LANGFILTER: | sed 's/CH/1/g' LANGFILTER: | sed 's/CH/1/g'
LANGFILTER: | sed 's/LL/2/g' LANGFILTER: | sed 's/LL/2/g'

View file

@ -20,7 +20,7 @@ LANGCODE:sv_SE
LANGFILTER: tr -d '\r' LANGFILTER: tr -d '\r'
LANGFILTER: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ] LANGFILTER: | tr [a-zäåæöü] [A-ZÄÅÆÖÜ]
LANGFILTER: | grep '^[A-ZÄÅÆÖÜ]*$' LANGFILTER: | grep -x '[A-ZÄÅÆÖÜ]\{2,15\}'
D2DARGS: -r -term 10 D2DARGS: -r -term 10