In [1]:
# The bad-words list used below comes from:
# https://github.com/turalus/encycloDB/tree/b1d16cca3957ae1ce72daa0c0c0f4982b0005dfd/Dirty%20Words

In [2]:
# Set up paths: remember the notebook directory, switch to ../data, and put the notebook directory on sys.path
import os
import sys

# Capture the starting directory before leaving it.
this_path = os.getcwd()

# Work from the sibling data directory from here on.
os.chdir("../data")

# Put the starting directory first on the import path
# (presumably so modules beside the notebook stay importable - confirm).
sys.path[:0] = [this_path]

In [3]:
import pandas as pd

# Read the list of dirty words (one entry per line) from the local text file.
# The list originates from:
# https://github.com/turalus/encycloDB/tree/b1d16cca3957ae1ce72daa0c0c0f4982b0005dfd/Dirty%20Words
#
# errors='ignore' silently drops bytes that are not valid UTF-8.
# The `with` block guarantees the file is closed even if reading raises.
with open('bad-words-banned-by-google.txt', "r", encoding='utf-8', errors='ignore') as f:
    # str.strip() already removes the trailing newline together with any
    # surrounding whitespace, so no separate newline replacement is needed.
    DirtyWords = [line.strip() for line in f]

In [4]:
# Load the posts table; the first CSV column becomes the index.
infile = "articles-n-forums-posts.csv"
df = pd.read_csv(infile, index_col=0)

# Print the row count, then show a one-row preview as the cell output.
print(df.shape[0])
df.head(1)


3776
Out[4]:
category href source text title user id tokens text_short
post id
0 ['category-applied-behavior-analysis-aba'] https://www.autismparentingmagazine.com/autism... https://www.autismparentingmagazine.com/ For children with autism spectrum disorder (AS... Autism, Head Banging and other Self Harming Be... NaN ['autism', 'head', 'banging', 'and', 'other', ... For children with autism spectrum disorder (AS...

In [5]:
import nltk
# Tokens are maximal runs of word characters, so punctuation is discarded
# and contractions split apart (e.g. "you've" -> "you", "ve").
tokenizer = nltk.RegexpTokenizer(r'\w+')


def _lowercase_tokens(text):
    """Return the word tokens of `text` after lowercasing it."""
    return tokenizer.tokenize(text.lower())


# Replaces any existing 'tokens' column with freshly computed token lists.
df['tokens'] = df['text'].map(_lowercase_tokens)
df['tokens'].head(5)


Out[5]:
post id
0    [for, children, with, autism, spectrum, disord...
1    [dr, stephen, shore, once, said, if, you, ve, ...
2    [help, i, am, going, to, be, starting, applied...
3    [how, do, you, handle, high, anxiety, of, a, c...
4    [a, grandfather, from, singapore, asks, my, el...
Name: tokens, dtype: object

In [6]:
# Find the posts that contain at least one offending word.

# Build a set once: membership tests are O(1) per token, whereas scanning
# the DirtyWords list for every token of every post is needlessly slow.
dirty_word_set = set(DirtyWords)

tokens_list = df['tokens'].values
post_ids = df.index

list_of_bad_posts = []
for post_id, tokens in zip(post_ids, tokens_list):
    # isdisjoint() is False as soon as one token is found in the set.
    if not dirty_word_set.isdisjoint(tokens):
        list_of_bad_posts.append(post_id)

# Number of posts flagged as containing at least one dirty word.
print(len(list_of_bad_posts))


219

In [7]:
# Build the result table: one row per post, holding the post's tokens and a
# 0/1 flag telling whether the post was found to contain a dirty word.
# rename_axis returns a new frame, so df itself is left untouched.
df2 = df[['tokens']].rename_axis('post id')

# Vectorised membership test on the index replaces a per-row `.loc` write
# combined with a linear scan of list_of_bad_posts for every row; the flag
# column is stored as integers rather than generic objects.
df2['use dirty words'] = df2.index.isin(list_of_bad_posts).astype(int)
#df2.head(1)

In [8]:
# Keep only the flagged posts as a 2-D object array: every row is
# [token list, flag], so row[0] holds that post's tokens.
is_flagged = df2['use dirty words'] == 1
tokens = df2.loc[is_flagged].values

# Show the first flagged row.
tokens[0]


Out[8]:
array([ ['my', 'nephew', 'who', 'is', 'four', 'years', 'has', 'very', 'bad', 'mood', 'swings', 'and', 'stops', 'what', 'he', 'is', 'doing', 'and', 'starts', 'opening', 'and', 'closing', 'of', 'his', 'hands', 'he', 'tells', 'his', 'mother', 'that', 'he', 'wants', 'to', 'break', 'her', 'bones', 'and', 'break', 'the', 'house', 'he', 'goes', 'to', 'the', 'bathroom', 'on', 'the', 'porch', 'and', 'steps', 'in', 'ti', 'and', 'thinks', 'it', 's', 'funny', 'he', 'is', 'in', 'time', 'out', 'at', 'school', '4', 'out', 'of', '5', 'days', 'and', 'the', 'daycare', 'provider', 'said', 'he', 'needs', 'to', 'be', 'seen', 'by', 'a', 'doctor', 'my', 'sister', 'is', 'so', 'frustrated', 'she', 'doesn', 't', 'know', 'what', 'to', 'do', 'other', 'things', 'he', 'does', 'is', 'spit', 'at', 'people', 'hit', 'others', 'punches', 'his', 'mom', 'yells', 'a', 'lot', 'calls', 'eveyone', 'stupid', 'says', 'he', 'wants', 'to', 'hurt', 'animals', 'and', 'people', 'his', 'speech', 'is', 'hard', 'to', 'understand', 'he', 'starts', 'a', 'conversation', 'and', 'be', 'fore', 'he', 'can', 'finish', 'he', 'll', 'stop', 'and', 'talk', 'about', 'something', 'completly', 'different', 'my', 'sister', 'doesn', 't', 'know', 'what', 'to', 'do', 'she', 'thinks', 'possible', 'autism', 'what', 'do', 'you', 'think'],
       1], dtype=object)

In [9]:
# Walk every flagged row (row[0] is its token list) and print each token
# that appears in DirtyWords, one per line, in order of occurrence.
matched_words = (
    word
    for row in tokens
    for word in row[0]
    if word in DirtyWords
)
for word in matched_words:
    print(word)


stupid
stupid
stupid
asses
cocks
bloody
omg
hell
nazi
hell
stupid
bloody
stupid
stupid
omg
penis
penis
omg
xxx
sex
sex
sex
sex
sex
pissed
stupid
stupid
stupid
stupid
stupid
stupid
hell
bloody
hell
hell
stupid
sex
ass
hell
stupid
stupid
stupid
hell
bum
butt
sex
stupid
ejaculation
orgasm
stupid
stupid
stupid
omg
jerk
bloody
omg
penis
breasts
stupid
hell
hell
lmao
sex
sex
stupid
stupid
xxx
chink
stupid
stupid
hell
hell
stupid
stupid
stupid
stupid
stupid
stupid
stupid
vagina
hell
butt
hell
jerk
asshole
asshole
bitches
fucking
fuck
ass
bloody
shitty
damn
fucking
hell
ass
sex
ass
screwing
bloody
pawn
shit
damn
fucking
fucked
bitch
asshole
butt
stupid
porn
jerk
sex
hell
pissed
pissing
fucking
fucking
fucking
breasts
hell
bitch
ass
ass
hell
wtf
sex
ass
boob
fucking
fucking
asshole
fucking
fucking
hell
fucking
shit
shitty
rectum
fuck
shit
fucking
fucking
shit
ass
butt
whore
shits
hell
stupid
fuck
hell
butt
hell
fuck
sex
sex
fucking
shitty
shitty
fucking
fucks
jerk
stupid
stupid
fuck
bitch
wtf
fag
shitty
shit
damn
fucking
hell
nazi
stupid
hell
vagina
ass
penis
penis
fuck
bitch
fuck
hell
sex
shitty
hell
sex
bitch
ass
sex
sex
hell
stupid
homo
homo
butt
asshole
hell
pissed
fuck
bloody
shit
fucking
dick
wtf
pissed
stupid
hell
fuck
stupid
damn
stupid
fuck
sex
fuck
fucked
pissing
stupid
stupid
stupid
stupid
stupid
stupid
sex
sex
damn
fuck
bum
damn
damn
fuck
fucks
shit
fucking
asshole
shit
fucking
stupid
dick
fucking
shit
pissed
stupid
stupid
hell
pissed
fuck
stupid
stupid
hell
hell
hell
homo
asshole
fuck
stupid
fucked
stupid
stupid
pissed
damn
asses
shitty
fucking
fucking
fucked
asshole
shitty
omg
fuck
feck
hell
shitty
goddamned
fucked
sex
sex
hell
butt
stupid
shitty
stupid
shit
pissed
stupid
stupid
fucking
bitch
stupid
pornography
damn
sex
fucked
shit
stupid
stupid
shit
shit
stupid
hell
wtf
faggot
knob
sex
shit
assholes
asshole
assholes
damn
hell
hell
erotic
fuck

In [10]:
# Persist the per-post result table, including the header row, to a CSV
# file in the current working directory.
df2.to_csv(path_or_buf="db-bad-words.csv", header=True)

In [ ]: