From 8752432de3a7d2d5aeef1a16b570d08e3b12d70a Mon Sep 17 00:00:00 2001 From: Eric House Date: Thu, 4 May 2017 22:45:27 -0700 Subject: [PATCH] add ability to filter out "dirty" words If a Makefile defines a dirty word list then a new python script is invoked to filter for and remove those words as the dict is being built. So far I have one for English only, which makes sense because only English wordlists are built-in on Android and Google's rating system cares only about what's built in. --- xwords4/dawg/English/Makefile.BasEnglishBowd | 20 ++ xwords4/dawg/English/Makefile.CollegeEngBowd | 19 ++ xwords4/dawg/English/Makefile.top5000Bowd | 20 ++ xwords4/dawg/English/dirtywords.txt | 206 +++++++++++++++++++ xwords4/dawg/Makefile.langcommon | 13 +- xwords4/dawg/remove-dirty.py | 18 ++ 6 files changed, 292 insertions(+), 4 deletions(-) create mode 100644 xwords4/dawg/English/Makefile.BasEnglishBowd create mode 100644 xwords4/dawg/English/Makefile.CollegeEngBowd create mode 100644 xwords4/dawg/English/Makefile.top5000Bowd create mode 100644 xwords4/dawg/English/dirtywords.txt create mode 100755 xwords4/dawg/remove-dirty.py diff --git a/xwords4/dawg/English/Makefile.BasEnglishBowd b/xwords4/dawg/English/Makefile.BasEnglishBowd new file mode 100644 index 000000000..9f169ab8a --- /dev/null +++ b/xwords4/dawg/English/Makefile.BasEnglishBowd @@ -0,0 +1,20 @@ +# -*- mode: makefile; compile-command: "make -f Makefile.BasEnglishBowd"; -*- +# Copyright 2017 by Eric House (xwords@eehouse.org). All rights +# reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +DIRTY_LIST=dirtywords.txt +include Makefile.BasEnglish diff --git a/xwords4/dawg/English/Makefile.CollegeEngBowd b/xwords4/dawg/English/Makefile.CollegeEngBowd new file mode 100644 index 000000000..ef8f17065 --- /dev/null +++ b/xwords4/dawg/English/Makefile.CollegeEngBowd @@ -0,0 +1,19 @@ +# -*- mode: makefile; compile-command: "make -f Makefile.CollegeEngBowd"; -*- +# Copyright 2017 by Eric House (xwords@eehouse.org). All rights reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +DIRTY_LIST=dirtywords.txt +include Makefile.CollegeEng diff --git a/xwords4/dawg/English/Makefile.top5000Bowd b/xwords4/dawg/English/Makefile.top5000Bowd new file mode 100644 index 000000000..a9a4101a5 --- /dev/null +++ b/xwords4/dawg/English/Makefile.top5000Bowd @@ -0,0 +1,20 @@ +# -*- mode: makefile; compile-command: "make -f Makefile.top5000Bowd"; -*- +# Copyright 2017 by Eric House (xwords@eehouse.org). All rights +# reserved. 
+# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +DIRTY_LIST=dirtywords.txt +include Makefile.top5000 diff --git a/xwords4/dawg/English/dirtywords.txt b/xwords4/dawg/English/dirtywords.txt new file mode 100644 index 000000000..217a40246 --- /dev/null +++ b/xwords4/dawg/English/dirtywords.txt @@ -0,0 +1,206 @@ +ABO +ABOS +ARSE +ASSHOLE +ASSHOLES +BADASS +BADASSED +BADASSES +BALLSIER +BALLSIEST +BALLSY +BAZOOMS +BLOWJOB +BLOWJOBS +BOCHE +BOCHES +BOOBIE +BUBBIES +BUBBY +BUCKRA +BUCKRAS +BULLSHIT +BULLSHITS +BULLSHITTED +BULLSHITTING +CLIT +CLITORAL +CLITORIS +COLOREDS +COMSYMP +COMSYMPS +CRAPPER +CRAPPERS +CUNT +CUNTS +DAGO +DAGOES +DAGOS +DARKEY +DARKEYS +DARKIE +DARKIES +DARKY +DICKED +DICKING +DIKEY +DYKEY +FAGGOTRIES +FAGGOTRY +FAGGOTY +FAGGY +FART +FARTED +FARTING +FARTS +FATSO +FATSOES +FATSOS +FRIG +FRIGGED +FRIGGING +FRIGS +FUCK +FUCKED +FUCKER +FUCKERS +FUCKING +FUCKS +FUCKUP +FUCKUPS +GANGBANG +GANGBANGS +GOY +GOYIM +GOYISH +GOYS +GRINGO +GRINGOS +HAOLE +HAOLES +HEBE +HEBES +HONKEY +HONKEYS +HONKIE +HONKIES +HONKY +HUNKIES +JESUIT +JESUITIC +JESUITRIES +JESUITRY +JESUITS +JEW +JEWED +JEWING +JEWS +JIGABOO +JIGABOOS +JISM +JISMS +KIKE +KIKES +LEZ +LEZES +LEZZIE +LEZZIES +LEZZY +LIBBER +LIBBERS +MERDE +MERDES +MICK +MICKS +NANCE +NANCES +NANCIES +NANCY +NIGGER +NIGGERS +NITCHIE +NITCHIES +NOOKIES +NOOKY 
+OFAY +OFAYS +PAPIST +PAPISTIC +PAPISTRIES +PAPISTRY +PAPISTS +PEED +PEEING +PISS +PISSED +PISSER +PISSERS +PISSES +PISSING +POM +POMMIE +POMMIES +POMMY +POMS +POOFS +POOFTAH +POOFTAHS +POOFTER +POOFTERS +POOFY +POOVE +POOVES +POPERIES +POPERY +POPISH +POPISHLY +REDNECK +REDNECKS +REDSKIN +REDSKINS +SHAT +SHEENEY +SHEENEYS +SHEENIE +SHEENIES +SHEGETZ +SHICKSA +SHICKSAS +SHIKSA +SHIKSAS +SHIKSE +SHIKSES +SHIT +SHITHEAD +SHITHEADS +SHITS +SHITTED +SHITTIER +SHITTIEST +SHITTING +SHITTY +SHKOTZIM +SKIMO +SKIMOS +SPIC +SPICK +SPICKS +SPICS +SPIK +SPIKS +TOMMED +TOMMING +TURD +TURDS +TWAT +TWATS +WETBACK +WETBACKS +WHITEYS +WHITIES +WOG +WOGS +WOP +WOPS +YID +YIDS diff --git a/xwords4/dawg/Makefile.langcommon b/xwords4/dawg/Makefile.langcommon index 500789a91..670481051 100644 --- a/xwords4/dawg/Makefile.langcommon +++ b/xwords4/dawg/Makefile.langcommon @@ -16,6 +16,14 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ifneq ($(DIRTY_LIST),)
+  BOWDLERIZER = ../remove-dirty.py $(DIRTY_LIST)
+  XWLANG := $(XWLANG)_BOWD
+  DICTNOTE := "$(DICTNOTE) (Bowdlerized)"
+else
+  BOWDLERIZER = cat
+endif
+
 XWLANG := $(XWLANG)_
 
 FRANK_EXT = xwd
@@ -242,7 +250,7 @@ dawg$(XWLANG)%.stamp: $(XWLANG)Main.dict.gz $(DICT2DAWG) table.bin ../Makefile.l
 	start=$$(echo $@ | sed -e 's/dawg$(XWLANG)\([0-9]*\)to[0-9]*.stamp/\1/'); \
 	end=$$(echo $@ | sed -e 's/dawg$(XWLANG)[0-9]*to\([0-9]*\).stamp/\1/'); \
 	echo $${start} and $${end}; \
-	zcat $< | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \
+	zcat $< | $(BOWDLERIZER) | $(DICT2DAWG) $(DICT2DAWGARGS) $(TABLE_ARG) table.bin -b 28000 \
 		-ob dawg$(XWLANG)$* $(ENCP) \
 		-sn $(XWLANG)StartLoc.bin -min $${start} -max $${end} \
 		-wc $(XWLANG)$*_wordcount.bin $(FORCE_4) -ns $(XWLANG)$*_nodesize.bin
@@ -295,9 +303,6 @@ $(XWLANG)%_newheader.bin: $(XWLANG)%_wordcount.bin $(XWLANG)%_note.bin $(XWLANG)
 	perl -e "print pack(\"n\",$$SIZ)" > $@
 	cat $+ >> $@
 
-%.dict: %.dict.gz
-	zcat $< > $@
-
 # clean this up....
 ../dict2dawg: ../dict2dawg.cpp
 	g++ -DDEBUG -O0 -g -Wall -o $@ $<
diff --git a/xwords4/dawg/remove-dirty.py b/xwords4/dawg/remove-dirty.py
new file mode 100755
index 000000000..2a1d4ced7
--- /dev/null
+++ b/xwords4/dawg/remove-dirty.py
@@ -0,0 +1,18 @@
+#!/usr/bin/python
+
+# Invoked with path to bad words list as single parameter, and with a
+# stream of words via stdin, loads the bad words into a set and for
+# every word in stdin echos it to stdout IFF it's not in the set.
+# Words are compared stripped, so line endings never affect matching.
+
+import sys
+
+with open(sys.argv[1]) as dirtyFile:
+    dirtyWords = set(line.strip() for line in dirtyFile)
+dirtyWords.discard('')  # blank lines in the list must not filter anything
+
+for word in sys.stdin:
+    if word.strip() in dirtyWords:
+        sys.stderr.write( sys.argv[0] + ": dropping: " + word.strip() + "\n" )
+    else:
+        sys.stdout.write( word )