mirror of
git://xwords.git.sourceforge.net/gitroot/xwords/xwords
synced 2025-01-04 23:02:02 +01:00
add ability to filter out "dirty" words
If a Makefile defines a dirty word list, a new Python script is invoked to filter out those words as the dict is being built. So far I have a list for English only, which makes sense because only English wordlists are built in on Android and Google's rating system cares only about what's built in.
This commit is contained in:
parent
ed781b36b5
commit
8752432de3
6 changed files with 292 additions and 4 deletions
20
xwords4/dawg/English/Makefile.BasEnglishBowd
Normal file
20
xwords4/dawg/English/Makefile.BasEnglishBowd
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
# -*- mode: makefile; compile-command: "make -f Makefile.BasEnglishBowd"; -*-
|
||||||
|
# Copyright 2017 by Eric House (xwords@eehouse.org). All rights
|
||||||
|
# reserved.
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU General Public License
|
||||||
|
# as published by the Free Software Foundation; either version 2
|
||||||
|
# of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
|
||||||
|
# Name the word list whose entries should be filtered out, then reuse the
# ordinary BasEnglish build.  DIRTY_LIST is assigned before the include so
# it is already defined while the included makefile is being parsed.
DIRTY_LIST=dirtywords.txt
|
||||||
|
include Makefile.BasEnglish
|
19
xwords4/dawg/English/Makefile.CollegeEngBowd
Normal file
19
xwords4/dawg/English/Makefile.CollegeEngBowd
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
# -*- mode: makefile; compile-command: "make -f Makefile.CollegeEngBowd"; -*-
|
||||||
|
# Copyright 2017 by Eric House (xwords@eehouse.org). All rights reserved.
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU General Public License
|
||||||
|
# as published by the Free Software Foundation; either version 2
|
||||||
|
# of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
|
||||||
|
# Name the word list whose entries should be filtered out, then reuse the
# ordinary CollegeEng build.  DIRTY_LIST is assigned before the include so
# it is already defined while the included makefile is being parsed.
DIRTY_LIST=dirtywords.txt
|
||||||
|
include Makefile.CollegeEng
|
20
xwords4/dawg/English/Makefile.top5000Bowd
Normal file
20
xwords4/dawg/English/Makefile.top5000Bowd
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
# -*- mode: makefile; compile-command: "make -f Makefile.top5000Bowd"; -*-
|
||||||
|
# Copyright 2017 by Eric House (xwords@eehouse.org). All rights
|
||||||
|
# reserved.
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU General Public License
|
||||||
|
# as published by the Free Software Foundation; either version 2
|
||||||
|
# of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
|
||||||
|
# Name the word list whose entries should be filtered out, then reuse the
# ordinary top5000 build.  DIRTY_LIST is assigned before the include so
# it is already defined while the included makefile is being parsed.
DIRTY_LIST=dirtywords.txt
|
||||||
|
include Makefile.top5000
|
206
xwords4/dawg/English/dirtywords.txt
Normal file
206
xwords4/dawg/English/dirtywords.txt
Normal file
|
@ -0,0 +1,206 @@
|
||||||
|
ABO
|
||||||
|
ABOS
|
||||||
|
ARSE
|
||||||
|
ASSHOLE
|
||||||
|
ASSHOLES
|
||||||
|
BADASS
|
||||||
|
BADASSED
|
||||||
|
BADASSES
|
||||||
|
BALLSIER
|
||||||
|
BALLSIEST
|
||||||
|
BALLSY
|
||||||
|
BAZOOMS
|
||||||
|
BLOWJOB
|
||||||
|
BLOWJOBS
|
||||||
|
BOCHE
|
||||||
|
BOCHES
|
||||||
|
BOOBIE
|
||||||
|
BUBBIES
|
||||||
|
BUBBY
|
||||||
|
BUCKRA
|
||||||
|
BUCKRAS
|
||||||
|
BULLSHIT
|
||||||
|
BULLSHITS
|
||||||
|
BULLSHITTED
|
||||||
|
BULLSHITTING
|
||||||
|
CLIT
|
||||||
|
CLITORAL
|
||||||
|
CLITORIS
|
||||||
|
COLOREDS
|
||||||
|
COMSYMP
|
||||||
|
COMSYMPS
|
||||||
|
CRAPPER
|
||||||
|
CRAPPERS
|
||||||
|
CUNT
|
||||||
|
CUNTS
|
||||||
|
DAGO
|
||||||
|
DAGOES
|
||||||
|
DAGOS
|
||||||
|
DARKEY
|
||||||
|
DARKEYS
|
||||||
|
DARKIE
|
||||||
|
DARKIES
|
||||||
|
DARKY
|
||||||
|
DICKED
|
||||||
|
DICKING
|
||||||
|
DIKEY
|
||||||
|
DYKEY
|
||||||
|
FAGGOTRIES
|
||||||
|
FAGGOTRY
|
||||||
|
FAGGOTY
|
||||||
|
FAGGY
|
||||||
|
FART
|
||||||
|
FARTED
|
||||||
|
FARTING
|
||||||
|
FARTS
|
||||||
|
FATSO
|
||||||
|
FATSOES
|
||||||
|
FATSOS
|
||||||
|
FRIG
|
||||||
|
FRIGGED
|
||||||
|
FRIGGING
|
||||||
|
FRIGS
|
||||||
|
FUCK
|
||||||
|
FUCKED
|
||||||
|
FUCKER
|
||||||
|
FUCKERS
|
||||||
|
FUCKING
|
||||||
|
FUCKS
|
||||||
|
FUCKUP
|
||||||
|
FUCKUPS
|
||||||
|
GANGBANG
|
||||||
|
GANGBANGS
|
||||||
|
GOY
|
||||||
|
GOYIM
|
||||||
|
GOYISH
|
||||||
|
GOYS
|
||||||
|
GRINGO
|
||||||
|
GRINGOS
|
||||||
|
HAOLE
|
||||||
|
HAOLES
|
||||||
|
HEBE
|
||||||
|
HEBES
|
||||||
|
HONKEY
|
||||||
|
HONKEYS
|
||||||
|
HONKIE
|
||||||
|
HONKIES
|
||||||
|
HONKY
|
||||||
|
HUNKIES
|
||||||
|
JESUIT
|
||||||
|
JESUITIC
|
||||||
|
JESUITRIES
|
||||||
|
JESUITRY
|
||||||
|
JESUITS
|
||||||
|
JEW
|
||||||
|
JEWED
|
||||||
|
JEWING
|
||||||
|
JEWS
|
||||||
|
JIGABOO
|
||||||
|
JIGABOOS
|
||||||
|
JISM
|
||||||
|
JISMS
|
||||||
|
KIKE
|
||||||
|
KIKES
|
||||||
|
LEZ
|
||||||
|
LEZES
|
||||||
|
LEZZIE
|
||||||
|
LEZZIES
|
||||||
|
LEZZY
|
||||||
|
LIBBER
|
||||||
|
LIBBERS
|
||||||
|
MERDE
|
||||||
|
MERDES
|
||||||
|
MICK
|
||||||
|
MICKS
|
||||||
|
NANCE
|
||||||
|
NANCES
|
||||||
|
NANCIES
|
||||||
|
NANCY
|
||||||
|
NIGGER
|
||||||
|
NIGGERS
|
||||||
|
NITCHIE
|
||||||
|
NITCHIES
|
||||||
|
NOOKIES
|
||||||
|
NOOKY
|
||||||
|
OFAY
|
||||||
|
OFAYS
|
||||||
|
PAPIST
|
||||||
|
PAPISTIC
|
||||||
|
PAPISTRIES
|
||||||
|
PAPISTRY
|
||||||
|
PAPISTS
|
||||||
|
PEED
|
||||||
|
PEEING
|
||||||
|
PISS
|
||||||
|
PISSED
|
||||||
|
PISSER
|
||||||
|
PISSERS
|
||||||
|
PISSES
|
||||||
|
PISSING
|
||||||
|
POM
|
||||||
|
POMMIE
|
||||||
|
POMMIES
|
||||||
|
POMMY
|
||||||
|
POMS
|
||||||
|
POOFS
|
||||||
|
POOFTAH
|
||||||
|
POOFTAHS
|
||||||
|
POOFTER
|
||||||
|
POOFTERS
|
||||||
|
POOFY
|
||||||
|
POOVE
|
||||||
|
POOVES
|
||||||
|
POPERIES
|
||||||
|
POPERY
|
||||||
|
POPISH
|
||||||
|
POPISHLY
|
||||||
|
REDNECK
|
||||||
|
REDNECKS
|
||||||
|
REDSKIN
|
||||||
|
REDSKINS
|
||||||
|
SHAT
|
||||||
|
SHEENEY
|
||||||
|
SHEENEYS
|
||||||
|
SHEENIE
|
||||||
|
SHEENIES
|
||||||
|
SHEGETZ
|
||||||
|
SHICKSA
|
||||||
|
SHICKSAS
|
||||||
|
SHIKSA
|
||||||
|
SHIKSAS
|
||||||
|
SHIKSE
|
||||||
|
SHIKSES
|
||||||
|
SHIT
|
||||||
|
SHITHEAD
|
||||||
|
SHITHEADS
|
||||||
|
SHITS
|
||||||
|
SHITTED
|
||||||
|
SHITTIER
|
||||||
|
SHITTIEST
|
||||||
|
SHITTING
|
||||||
|
SHITTY
|
||||||
|
SHKOTZIM
|
||||||
|
SKIMO
|
||||||
|
SKIMOS
|
||||||
|
SPIC
|
||||||
|
SPICK
|
||||||
|
SPICKS
|
||||||
|
SPICS
|
||||||
|
SPIK
|
||||||
|
SPIKS
|
||||||
|
TOMMED
|
||||||
|
TOMMING
|
||||||
|
TURD
|
||||||
|
TURDS
|
||||||
|
TWAT
|
||||||
|
TWATS
|
||||||
|
WETBACK
|
||||||
|
WETBACKS
|
||||||
|
WHITEYS
|
||||||
|
WHITIES
|
||||||
|
WOG
|
||||||
|
WOGS
|
||||||
|
WOP
|
||||||
|
WOPS
|
||||||
|
YID
|
||||||
|
YIDS
|
|
@ -16,6 +16,14 @@
|
||||||
# along with this program; if not, write to the Free Software
|
# along with this program; if not, write to the Free Software
|
||||||
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
|
||||||
|
# Optional dirty-word filtering, enabled when DIRTY_LIST is non-empty at the
# time this file is parsed:
#   - BOWDLERIZER runs ../remove-dirty.py with the list as its argument;
#   - XWLANG gains a _BOWD suffix;
#   - DICTNOTE has " (Bowdlerized)" appended and is wrapped in double quotes.
# Otherwise BOWDLERIZER is plain `cat`, so the stage in the dawg-building
# pipeline (zcat ... | $(BOWDLERIZER) | $(DICT2DAWG) ...) passes words through
# unchanged.
ifneq ($(DIRTY_LIST),)
|
||||||
|
BOWDLERIZER = ../remove-dirty.py $(DIRTY_LIST)
|
||||||
|
XWLANG := $(XWLANG)_BOWD
|
||||||
|
DICTNOTE := "$(DICTNOTE) (Bowdlerized)"
|
||||||
|
else
|
||||||
|
BOWDLERIZER = cat
|
||||||
|
endif
|
||||||
|
|
||||||
XWLANG := $(XWLANG)_
|
XWLANG := $(XWLANG)_
|
||||||
|
|
||||||
FRANK_EXT = xwd
|
FRANK_EXT = xwd
|
||||||
|
@ -242,7 +250,7 @@ dawg$(XWLANG)%.stamp: $(XWLANG)Main.dict.gz $(DICT2DAWG) table.bin ../Makefile.l
|
||||||
start=$$(echo $@ | sed -e 's/dawg$(XWLANG)\([0-9]*\)to[0-9]*.stamp/\1/'); \
|
start=$$(echo $@ | sed -e 's/dawg$(XWLANG)\([0-9]*\)to[0-9]*.stamp/\1/'); \
|
||||||
end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \
|
end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \
|
||||||
echo $${start} and $${end}; \
|
echo $${start} and $${end}; \
|
||||||
zcat $< | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \
|
zcat $< | $(BOWDLERIZER) | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \
|
||||||
-ob dawg$(XWLANG)$* $(ENCP) \
|
-ob dawg$(XWLANG)$* $(ENCP) \
|
||||||
-sn $(XWLANG)StartLoc.bin -min $${start} -max $${end} \
|
-sn $(XWLANG)StartLoc.bin -min $${start} -max $${end} \
|
||||||
-wc $(XWLANG)$*_wordcount.bin $(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
|
-wc $(XWLANG)$*_wordcount.bin $(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
|
||||||
|
@ -295,9 +303,6 @@ $(XWLANG)%_newheader.bin: $(XWLANG)%_wordcount.bin $(XWLANG)%_note.bin $(XWLANG)
|
||||||
perl -e "print pack(\"n\",$$SIZ)" > $@
|
perl -e "print pack(\"n\",$$SIZ)" > $@
|
||||||
cat $+ >> $@
|
cat $+ >> $@
|
||||||
|
|
||||||
%.dict: %.dict.gz
|
|
||||||
zcat $< > $@
|
|
||||||
|
|
||||||
# clean this up....
|
# clean this up....
|
||||||
../dict2dawg: ../dict2dawg.cpp
|
../dict2dawg: ../dict2dawg.cpp
|
||||||
g++ -DDEBUG -O0 -g -Wall -o $@ $<
|
g++ -DDEBUG -O0 -g -Wall -o $@ $<
|
||||||
|
|
18
xwords4/dawg/remove-dirty.py
Executable file
18
xwords4/dawg/remove-dirty.py
Executable file
|
@ -0,0 +1,18 @@
|
||||||
|
#!/usr/bin/python

# Invoked with the path to the bad-words list as its single parameter and
# with a stream of words (one per line) on stdin.  Loads the bad words
# into a set and echoes every stdin word to stdout IFF it is not in that
# set.  Each dropped word is reported on stderr.

import sys


def load_dirty_words(path):
    """Return the set of words listed one per line in the file at `path`.

    Surrounding whitespace, including the line terminator, is stripped so
    that a missing final newline or CRLF line endings in the list cannot
    prevent a word from matching.  Blank lines are ignored.
    """
    with open(path) as src:
        stripped = (line.strip() for line in src)
        return set(entry for entry in stripped if entry)


def filter_words(words, dirty, out, err, prog):
    """Copy each line of `words` to `out` unless it names a dirty word.

    words -- iterable of lines (normally sys.stdin), terminators included
    dirty -- set of words to drop, as returned by load_dirty_words()
    out   -- writable stream receiving the surviving lines unchanged
    err   -- writable stream receiving one "dropping" notice per removed word
    prog  -- program name used as the prefix of each notice
    """
    for line in words:
        word = line.strip()
        if word in dirty:
            err.write(prog + ": dropping: " + word + "\n")
        else:
            # Write the original line so terminators pass through untouched.
            out.write(line)


def main(argv):
    """Filter stdin against the list named by argv[1]; return an exit code."""
    if len(argv) < 2:
        sys.stderr.write("usage: " + argv[0] + " <dirty-word-list>\n")
        return 1
    dirty = load_dirty_words(argv[1])
    filter_words(sys.stdin, dirty, sys.stdout, sys.stderr, argv[0])
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
|
Loading…
Reference in a new issue