In [14]:
import gensim
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import cPickle as pickle
from scipy.spatial.distance import cosine
from nltk.tokenize import PunktSentenceTokenizer
In [2]:
def seperatePunct(incomingString):
    # pad each punctuation character with spaces so it splits into its own token
    newstring = incomingString
    for char in '!@#$%^&*()+=?\'"{}[]<>~`:;|\\/':
        newstring = newstring.replace(char, ' ' + char + ' ')
    return newstring
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)
def text_cleaner(wordList):
    '''
    INPUT: list of words to be tokenized
    OUTPUT: list of cleaned tokens
    '''
    tokenizedList = []
    for word in wordList:
        # remove these substrings from the word
        word = word.replace('[deleted]', '')
        word = word.replace('>', '')
        # if link, replace with link tag
        if 'http://' in word:
            tokenizedList.append('LINK_TAG')
            continue
        # if reference to a subreddit, replace with subreddit tag
        if '/r/' in word:
            tokenizedList.append('SUBREDDIT_TAG')
            continue
        # if reference to a reddit user, replace with user tag
        if '/u/' in word:
            tokenizedList.append('USER_TAG')
            continue
        # if number, replace with number tag
        # 'm8' is a word; 5'10", 54-59, and 56:48 are numbers
        if hasNumbers(word) and not any(char.isalpha() for char in word):
            tokenizedList.append('NUM_TAG')
            continue
        # separate punctuation and add the pieces to tokenizedList
        newwords = seperatePunct(word).split(" ")
        tokenizedList.extend(newwords)
    return tokenizedList
def mytokenizer(comment):
    '''
    Input: a reddit comment as a str or unicode
    Output: a list of cleaned tokens
    '''
    tokenizer = PunktSentenceTokenizer()
    sentenceList = tokenizer.tokenize(comment)
    wordList = []
    for sentence in sentenceList:
        wordList.extend(sentence.split(" "))
    return text_cleaner(wordList)
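A quick sanity check of the tokenizer on a made-up comment (the comment below is illustrative, not taken from the data):
In [ ]:
# illustrative example only: verify that tags and punctuation come out as separate tokens
print mytokenizer("Check /r/history and http://example.com, it's 100% worth it!")
# expect 'SUBREDDIT_TAG', 'LINK_TAG', 'NUM_TAG', and punctuation such as '!' split out
# (empty strings may appear where punctuation bordered a space)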
In [3]:
path1 = '../../data/labeledRedditComments.p'
path2 = '../../data/twitter-hate-speech-classifier.csv'
path3 = '../../data/RedditMay2015Comments.sqlite'
In [4]:
df = pickle.load(open(path1, 'rb'))
In [39]:
# List of not hateful subreddits
final_nothate_srs = ['politics', 'worldnews', 'history', 'blackladies', 'lgbt',
                     'TransSpace', 'women', 'TwoXChromosomes', 'DebateReligion',
                     'religion', 'islam', 'Judaism', 'BodyAcceptance', 'fatlogic',
                     'gaybros', 'AskMen', 'AskWomen']
# List of hateful subreddits
final_hateful_srs = ['CoonTown', 'WhiteRights', 'Trans_fags', 'SlutJustice',
                     'TheRedPill', 'KotakuInAction', 'IslamUnveiled', 'GasTheKikes',
                     'AntiPOZi', 'fatpeoplehate', 'TalesofFatHate', 'hamplanethatred',
                     'shitniggerssay', 'neofag', 'altright']
In [40]:
final_hateful_srs
Out[40]:
In [42]:
df['subreddit'].value_counts()
Out[42]:
In [43]:
df['label'].value_counts()
Out[43]:
In [5]:
dfhate = pd.read_csv(path2)
In [26]:
dfhate.head()
Out[26]:
In [27]:
dfhate['does_this_tweet_contain_hate_speech_gold'].value_counts()
Out[27]:
In [29]:
dfhate['orig_does_this_tweet_contain_hate_speech'].value_counts()
Out[29]:
In [30]:
def myfunc(x):
    '''Map the crowdsourced answer to a binary label: 0 = not offensive, 1 = otherwise.'''
    if x == 'The tweet is not offensive':
        return 0
    else:
        return 1
In [31]:
dfhate['label'] = dfhate['does_this_tweet_contain_hate_speech'].map(myfunc)
In [32]:
dfhate['label'].value_counts()
Out[32]:
In [37]:
pre = np.array([1,1,1,0,0,0])  # predicted labels
tru = np.array([1,0,1,0,1,1])  # true labels
print sum(pre+tru == 2)   # true positives
print sum(pre+tru == 0)   # true negatives
print sum(pre-tru == 1)   # false positives
print sum(pre-tru == -1)  # false negatives
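These four counts are the cells of a 2x2 confusion matrix (assuming `pre` holds predictions and `tru` the true labels); a minimal sketch turning them into accuracy, precision, and recall:
In [ ]:
# a sketch: confusion-matrix counts -> standard metrics (tp/tn/fp/fn are my own names)
tp = sum(pre + tru == 2)   # predicted 1, truth 1
tn = sum(pre + tru == 0)   # predicted 0, truth 0
fp = sum(pre - tru == 1)   # predicted 1, truth 0
fn = sum(pre - tru == -1)  # predicted 0, truth 1
print 'accuracy ', (tp + tn) / float(tp + tn + fp + fn)
print 'precision', tp / float(tp + fp)
print 'recall   ', tp / float(tp + fn)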
In [33]:
dfhate.to_csv(path2)
In [25]:
dfhate['does_this_tweet_contain_hate_speech'].value_counts()
Out[25]:
In [6]:
model = gensim.models.Doc2Vec.load('base_model_original_tokenizer.doc2vec')
In [7]:
docvecs = model.docvecs
In [71]:
docvecs[0]
Out[71]:
In [20]:
doctags = docvecs.doctags
In [22]:
doctags.items()
Out[22]:
In [23]:
doctags['GasTheKikes']
Out[23]:
In [33]:
len(doctags['GasTheKikes'])
Out[33]:
In [34]:
doctags['GasTheKikes'][0]
Out[34]:
In [25]:
docvecs.count
Out[25]:
In [26]:
type(docvecs)
Out[26]:
In [28]:
type(docvecs[0])
Out[28]:
In [31]:
len(docvecs[0])
Out[31]:
In [29]:
len(docvecs)
Out[29]:
In [30]:
docvecs.index_to_doctag(0)
Out[30]:
In [38]:
docvecs.most_similar(14)
Out[38]:
In [40]:
for i in xrange(len(docvecs)):
    print docvecs.index_to_doctag(i)
    print docvecs.most_similar(i)[0]
    print ""
In [44]:
vocab = model.vocab
In [48]:
len(vocab.keys())
Out[48]:
In [49]:
vocab.keys()[:10]
Out[49]:
In [50]:
vocab['sowell']
Out[50]:
In [51]:
word = vocab['sowell']
In [55]:
word.index
Out[55]:
In [59]:
comment = 'hello world'.split(" ")
myvect = model.infer_vector(comment)
print myvect
In [62]:
comment = ':)'.split(" ")
myvect = model.infer_vector(comment)
print myvect
In [8]:
def mostSimilarDoc(model, comment):
    '''
    Input: doc2vec model; comment is a str
    Output: the doctag of the doc vector most similar to the comment, and its cosine similarity
    '''
    docvecs = model.docvecs
    wordTokens = mytokenizer(comment)
    # wordTokens = comment.split(" ")
    commentVec = model.infer_vector(wordTokens)
    mostSimVec = None
    bestSimVal = -np.inf  # cosine similarity lies in [-1, 1]
    for vec_ind in xrange(len(docvecs)):
        simVal = 1 - cosine(commentVec, docvecs[vec_ind])
        if simVal > bestSimVal:
            mostSimVec = vec_ind
            bestSimVal = simVal
    return docvecs.index_to_doctag(mostSimVec), bestSimVal
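For reference, the same nearest-doctag search can be vectorized with numpy; a sketch, assuming `docvecs[i]` returns a 1-D float array (`mostSimilarDocFast` is my own name, not part of the project):
In [ ]:
# a sketch: vectorized version of the loop above
def mostSimilarDocFast(model, comment):
    docvecs = model.docvecs
    commentVec = model.infer_vector(mytokenizer(comment))
    # stack all doc vectors into one matrix and compute all cosine similarities at once
    mat = np.array([docvecs[i] for i in xrange(len(docvecs))])
    sims = mat.dot(commentVec) / (np.linalg.norm(mat, axis=1) * np.linalg.norm(commentVec))
    best = int(np.argmax(sims))
    return docvecs.index_to_doctag(best), sims[best]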
In [20]:
random.sample(xrange(10), 11)  # raises ValueError: sample larger than population
In [23]:
numsamps = 1000
randrows = random.sample(xrange(len(df.index)), numsamps)
comments = df.ix[randrows, 'body'].values
subreddits = df.ix[randrows, 'subreddit'].values
count = 0
for row, comment in enumerate(comments):
    predictedSub, simVal = mostSimilarDoc(model, comment)
    if predictedSub == subreddits[row]:
        count += 1
print count / float(len(comments))
In [23]:
predictedSub, simVal = mostSimilarDoc(model,'')
print predictedSub
print simVal
In [10]:
wordTokens = "hi"
commentVec1 = model.infer_vector(wordTokens)
commentVec2 = model.infer_vector(wordTokens)
np.array_equal(commentVec1, commentVec2)
Out[10]:
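If more stable inferred vectors are needed, one option is to average several inference runs and/or raise the number of passes; a sketch, assuming this gensim generation's `infer_vector` accepts a `steps` argument (newer releases rename it), with `stable_infer` as a made-up helper name:
In [ ]:
# a sketch: average several stochastic inferences to damp run-to-run variance
def stable_infer(model, tokens, n=10, steps=50):
    vecs = [model.infer_vector(tokens, steps=steps) for _ in xrange(n)]
    return np.mean(vecs, axis=0)

v1 = stable_infer(model, mytokenizer('hello world'))
v2 = stable_infer(model, mytokenizer('hello world'))
print 1 - cosine(v1, v2)  # should now be close to 1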
In [ ]: