In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
import cPickle as pickle
from string import punctuation
from nltk import word_tokenize
from nltk.stem import snowball
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import gensim
from nltk.tokenize import PunktSentenceTokenizer
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D
from scipy.spatial.distance import cosine  # used for cosine similarity in mostSimilarDoc below
In [2]:
###########################################################################
# tokenization code
def seperatePunct(incomingString):
    '''
    Input: str
    Output: str with all punctuation characters separated by spaces
    '''
    outstr = ''
    characters = set(['!','@','#','$',"%","^","&","*",":","\\",
                      "(",")","+","=","?","\'","\"",";","/",
                      "{","}","[","]","<",">","~","`","|"])
    for char in incomingString:
        if char in characters:
            outstr = outstr + ' ' + char + ' '
        else:
            outstr = outstr + char
    return outstr
def hasNumbers(inputString):
    '''
    Input: str
    Output: True if the string contains a digit, False otherwise
    '''
    return any(char.isdigit() for char in inputString)
def text_cleaner(wordList):
    '''
    Input: list of words to be tokenized
    Output: list of tokenized words
    '''
    tokenizedList = []
    for word in wordList:
        #remove these substrings from the word
        word = word.replace('[deleted]','')
        word = word.replace('>','')
        #if link, replace with link tag
        if 'http' in word:
            tokenizedList.append('LINK_TAG')
            continue
        #if reference to a subreddit, replace with subreddit tag
        if '/r/' in word:
            tokenizedList.append('SUBREDDIT_TAG')
            continue
        #if reference to a reddit user, replace with user tag
        if '/u/' in word:
            tokenizedList.append('USER_TAG')
            continue
        #if reference to a twitter user, replace with user tag
        if '@' in word:
            tokenizedList.append('USER_TAG')
            continue
        #if number, replace with number tag
        #m8 is a word; 5'10", 54-59, and 56:48 are numbers
        if hasNumbers(word) and not any(char.isalpha() for char in word):
            tokenizedList.append('NUM_TAG')
            continue
        #separate punctuation and add the pieces to tokenizedList
        newwords = seperatePunct(word).split(" ")
        tokenizedList.extend(newwords)
    return tokenizedList
def mytokenizer(comment):
    '''
    Input: a reddit comment as a str or unicode
    Output: a list of tokens
    '''
    tokenizer = PunktSentenceTokenizer()
    sentenceList = tokenizer.tokenize(comment)
    wordList = []
    for sentence in sentenceList:
        wordList.extend(sentence.split(" "))
    return text_cleaner(wordList)
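A quick check of the tokenizer on a made-up comment (a sketch; the example string and the expected tokens in the comment are illustrative, not from the original run):
In [ ]:
mytokenizer("Check out /r/python and http://example.com, it's 100% great @someone")
# roughly: ['Check', 'out', 'SUBREDDIT_TAG', 'and', 'LINK_TAG', 'it', "'", 's', 'NUM_TAG', 'great', 'USER_TAG']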
In [3]:
def mostSimilarDoc(model,comment,k,threshold):
    '''
    Input: doc2vec model, comment as a str, k = number of similar doc vecs,
           threshold = fraction of the k neighbors that must be hateful
    Output: an int indicating hate (1) or not hate (0), most similar subreddit
    '''
    docvecs = model.docvecs
    numdocvec = len(docvecs)
    simVals = np.zeros((numdocvec, ))
    #tokenize comment
    wordTokens = mytokenizer(comment)
    #create vector of tokenized comment
    #inference is stochastic, so average over 100 inferred vectors
    finalVec = np.zeros((300, ))
    for i in xrange(100):
        finalVec = finalVec + model.infer_vector(wordTokens)
    commentVec = finalVec/100.0
    #compute cosine similarity of comment to each subreddit vector
    for vec_ind in xrange(numdocvec):
        simVals[vec_ind] = 1 - cosine(commentVec,docvecs[vec_ind])
    mostSimVecInd = np.argsort(simVals)[-k:]
    hatecount = 0
    #count how many of the k nearest subreddits are hateful
    for index in mostSimVecInd:
        hatecount += ishateful(docvecs.index_to_doctag(index))
    #thresholded vote to determine hateful/not hateful
    if hatecount >= threshold*len(mostSimVecInd):
        prediction = 1
    else:
        prediction = 0
    #most similar subreddit is the last index, since argsort is ascending
    mostSimSubreddit = docvecs.index_to_doctag(mostSimVecInd[-1])
    return prediction, mostSimSubreddit
##############################################################################
#hate/NotHate code
def ishateful(subreddit):
    '''
    Input: str subreddit
    Output: int 1 if hateful subreddit, 0 otherwise
    '''
    # List of not hateful subreddits
    final_nothate_srs = ['politics', 'worldnews', 'history', 'blackladies', 'lgbt',
                         'TransSpace', 'women', 'TwoXChromosomes', 'DebateReligion',
                         'religion', 'islam', 'Judaism', 'BodyAcceptance', 'fatlogic',
                         'gaybros', 'AskMen', 'AskWomen']
    # List of hateful subreddits
    final_hateful_srs = ['CoonTown', 'WhiteRights', 'Trans_fags', 'SlutJustice',
                         'TheRedPill', 'KotakuInAction', 'IslamUnveiled', 'GasTheKikes',
                         'AntiPOZi', 'fatpeoplehate', 'TalesofFatHate', 'hamplanethatred',
                         'shitniggerssay', 'neofag', 'altright']
    if subreddit in final_hateful_srs:
        return 1
    else:
        return 0
In [4]:
modelPath = '../../doc2vec_models/basemodel2/basemodel2.doc2vec'
model = gensim.models.Doc2Vec.load(modelPath)
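With the model loaded, a quick smoke test of the classifier defined above (a sketch; the comment string, k, and threshold are illustrative values, not the project's tuned settings):
In [ ]:
prediction, closest = mostSimilarDoc(model, "I can't believe people still post this garbage", 5, 0.5)
print prediction, closest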
In [11]:
df = pd.read_csv('../../data/twitter_cross_val.csv')
In [10]:
docvecs = model.docvecs
In [13]:
len(docvecs)
Out[13]:
In [19]:
docvecs.index_to_doctag(4)
Out[19]:
In [22]:
subredditvecs = np.zeros((26,300))
colors = []
In [23]:
np.shape(docvecs[0])
Out[23]:
In [24]:
for i in xrange(26):
    subredditvecs[i,:] = docvecs[i]
In [59]:
ts = TSNE(2)
reduced_vecs = ts.fit_transform(subredditvecs)
In [60]:
for i in xrange(26):
    if ishateful(docvecs.index_to_doctag(i)):
        color = 'r'
    else:
        color = 'b'
    plt.plot(reduced_vecs[i,0], reduced_vecs[i,1], marker='o', color=color, markersize=8)
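The same 2-D scatter can be made easier to read by annotating each point with its subreddit name and adding a simple legend (a sketch reusing reduced_vecs from above; styling choices are arbitrary):
In [ ]:
for i in xrange(26):
    name = docvecs.index_to_doctag(i)
    color = 'r' if ishateful(name) else 'b'
    plt.plot(reduced_vecs[i,0], reduced_vecs[i,1], marker='o', color=color, markersize=8)
    plt.annotate(name, (reduced_vecs[i,0], reduced_vecs[i,1]), fontsize=8)
plt.plot([], [], 'ro', label='hateful')
plt.plot([], [], 'bo', label='not hateful')
plt.legend()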
In [61]:
ts3 = TSNE(3)
reduced_vecs3 = ts3.fit_transform(subredditvecs)
In [62]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for i in xrange(26):
    if ishateful(docvecs.index_to_doctag(i)):
        color = 'r'
    else:
        color = 'b'
    ax.scatter(reduced_vecs3[i,0], reduced_vecs3[i,1], reduced_vecs3[i,2], color=color)
In [53]:
reduced_vecs3.shape
Out[53]:
In [65]:
print "You're a ginger, a Jew, and from Jersey! Three strikes, Kyle! YOU´RE OUT!"
In [66]:
unicode("You're a ginger, a Jew, and from Jersey! Three strikes, Kyle! YOU´RE OUT!", 'utf-8')
In [70]:
np.linalg.norm(docvecs,axis=1)
Out[70]:
In [ ]: