add ability to filter out "dirty" words

If a Makefile defines a dirty word list then a new python script is
invoked to filter for and remove those words as the dict is being
built. So far I have a list for English only, which makes sense because only
English wordlists are built-in on Android and Google's rating system
cares only about what's built in.
This commit is contained in:
Eric House 2017-05-04 22:45:27 -07:00
parent ed781b36b5
commit 8752432de3
6 changed files with 292 additions and 4 deletions

View file

@ -0,0 +1,20 @@
# -*- mode: makefile; compile-command: "make -f Makefile.BasEnglishBowd"; -*-
# Copyright 2017 by Eric House (xwords@eehouse.org). All rights
# reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# Bowdlerized variant of the Basic English dictionary: name the list of
# words to be filtered out, then reuse the unfiltered dictionary's build.
DIRTY_LIST := dirtywords.txt
include Makefile.BasEnglish

View file

@ -0,0 +1,19 @@
# -*- mode: makefile; compile-command: "make -f Makefile.CollegeEngBowd"; -*-
# Copyright 2017 by Eric House (xwords@eehouse.org). All rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# Bowdlerized variant of the College English dictionary: name the list of
# words to be filtered out, then reuse the unfiltered dictionary's build.
DIRTY_LIST := dirtywords.txt
include Makefile.CollegeEng

View file

@ -0,0 +1,20 @@
# -*- mode: makefile; compile-command: "make -f Makefile.top5000Bowd"; -*-
# Copyright 2017 by Eric House (xwords@eehouse.org). All rights
# reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# Bowdlerized variant of the top-5000 dictionary: name the list of words
# to be filtered out, then reuse the unfiltered dictionary's build.
DIRTY_LIST := dirtywords.txt
include Makefile.top5000

View file

@ -0,0 +1,206 @@
ABO
ABOS
ARSE
ASSHOLE
ASSHOLES
BADASS
BADASSED
BADASSES
BALLSIER
BALLSIEST
BALLSY
BAZOOMS
BLOWJOB
BLOWJOBS
BOCHE
BOCHES
BOOBIE
BUBBIES
BUBBY
BUCKRA
BUCKRAS
BULLSHIT
BULLSHITS
BULLSHITTED
BULLSHITTING
CLIT
CLITORAL
CLITORIS
COLOREDS
COMSYMP
COMSYMPS
CRAPPER
CRAPPERS
CUNT
CUNTS
DAGO
DAGOES
DAGOS
DARKEY
DARKEYS
DARKIE
DARKIES
DARKY
DICKED
DICKING
DIKEY
DYKEY
FAGGOTRIES
FAGGOTRY
FAGGOTY
FAGGY
FART
FARTED
FARTING
FARTS
FATSO
FATSOES
FATSOS
FRIG
FRIGGED
FRIGGING
FRIGS
FUCK
FUCKED
FUCKER
FUCKERS
FUCKING
FUCKS
FUCKUP
FUCKUPS
GANGBANG
GANGBANGS
GOY
GOYIM
GOYISH
GOYS
GRINGO
GRINGOS
HAOLE
HAOLES
HEBE
HEBES
HONKEY
HONKEYS
HONKIE
HONKIES
HONKY
HUNKIES
JESUIT
JESUITIC
JESUITRIES
JESUITRY
JESUITS
JEW
JEWED
JEWING
JEWS
JIGABOO
JIGABOOS
JISM
JISMS
KIKE
KIKES
LEZ
LEZES
LEZZIE
LEZZIES
LEZZY
LIBBER
LIBBERS
MERDE
MERDES
MICK
MICKS
NANCE
NANCES
NANCIES
NANCY
NIGGER
NIGGERS
NITCHIE
NITCHIES
NOOKIES
NOOKY
OFAY
OFAYS
PAPIST
PAPISTIC
PAPISTRIES
PAPISTRY
PAPISTS
PEED
PEEING
PISS
PISSED
PISSER
PISSERS
PISSES
PISSING
POM
POMMIE
POMMIES
POMMY
POMS
POOFS
POOFTAH
POOFTAHS
POOFTER
POOFTERS
POOFY
POOVE
POOVES
POPERIES
POPERY
POPISH
POPISHLY
REDNECK
REDNECKS
REDSKIN
REDSKINS
SHAT
SHEENEY
SHEENEYS
SHEENIE
SHEENIES
SHEGETZ
SHICKSA
SHICKSAS
SHIKSA
SHIKSAS
SHIKSE
SHIKSES
SHIT
SHITHEAD
SHITHEADS
SHITS
SHITTED
SHITTIER
SHITTIEST
SHITTING
SHITTY
SHKOTZIM
SKIMO
SKIMOS
SPIC
SPICK
SPICKS
SPICS
SPIK
SPIKS
TOMMED
TOMMING
TURD
TURDS
TWAT
TWATS
WETBACK
WETBACKS
WHITEYS
WHITIES
WOG
WOGS
WOP
WOPS
YID
YIDS

View file

@ -16,6 +16,14 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# BOWDLERIZER is the command the word stream is piped through before it
# reaches dict2dawg.  A non-empty DIRTY_LIST selects the filtering script,
# which is handed that list as its argument.
ifneq ($(DIRTY_LIST),)
BOWDLERIZER = ../remove-dirty.py $(DIRTY_LIST)
# Mark the filtered build: suffix the language name and extend the note.
XWLANG := $(XWLANG)_BOWD
DICTNOTE := "$(DICTNOTE) (Bowdlerized)"
else
# No list given: cat passes the word stream through unchanged.
BOWDLERIZER = cat
endif
XWLANG := $(XWLANG)_
FRANK_EXT = xwd
@ -242,7 +250,7 @@ dawg$(XWLANG)%.stamp: $(XWLANG)Main.dict.gz $(DICT2DAWG) table.bin ../Makefile.l
start=$$(echo $@ | sed -e 's/dawg$(XWLANG)\([0-9]*\)to[0-9]*.stamp/\1/'); \
end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \
echo $${start} and $${end}; \
zcat $< | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \
zcat $< | $(BOWDLERIZER) | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \
-ob dawg$(XWLANG)$* $(ENCP) \
-sn $(XWLANG)StartLoc.bin -min $${start} -max $${end} \
-wc $(XWLANG)$*_wordcount.bin $(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
@ -295,9 +303,6 @@ $(XWLANG)%_newheader.bin: $(XWLANG)%_wordcount.bin $(XWLANG)%_note.bin $(XWLANG)
perl -e "print pack(\"n\",$$SIZ)" > $@
cat $+ >> $@
%.dict: %.dict.gz
zcat $< > $@
# clean this up....
# Compiles the dict2dawg tool from its single C++ source as an
# unoptimized debug binary (-DDEBUG -O0 -g) with -Wall warnings.
../dict2dawg: ../dict2dawg.cpp
g++ -DDEBUG -O0 -g -Wall -o $@ $<

18
xwords4/dawg/remove-dirty.py Executable file
View file

@ -0,0 +1,18 @@
#!/usr/bin/python
# Invoked with path to bad words list as single parameter, and with a
# stream of words via stdin, loads the bad words into a set and for
# every word in stdin echoes it to stdout IFF it's not in the set.
#
# Words are compared with their line endings removed, so a list whose
# last entry has no trailing newline, or a list saved with CRLF line
# endings, still filters correctly.  Lines that are kept are written
# to stdout exactly as they were read.

import sys


def load_dirty_words(path):
    """Return the set of words listed one per line in the file at `path`.

    Line endings (LF or CRLF) are removed from each entry.  The file is
    closed before returning.  Raises IOError/OSError if it cannot be read.
    """
    with open(path) as handle:
        return set(line.rstrip("\r\n") for line in handle)


def filter_words(lines, dirty, out, err, prog):
    """Copy every line of `lines` whose word is not in `dirty` to `out`.

    lines -- iterable of input lines, one word per line
    dirty -- set of words to drop, without line endings
    out   -- writable stream that receives the kept lines, unmodified
    err   -- writable stream that receives one message per dropped word
    prog  -- program name used as the prefix of each message
    """
    for line in lines:
        word = line.rstrip("\r\n")
        if word in dirty:
            err.write(prog + ": dropping: " + word + "\n")
        else:
            out.write(line)


def main(argv):
    """Filter stdin to stdout using the word list named by argv[1].

    Returns the process exit status: 0 on success, 2 when the word list
    argument is missing.
    """
    if len(argv) < 2:
        sys.stderr.write("usage: " + argv[0] + " <dirty-word-list>\n")
        return 2
    dirty = load_dirty_words(argv[1])
    filter_words(sys.stdin, dirty, sys.stdout, sys.stderr, argv[0])
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))