In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
import cPickle as pickle
from string import punctuation
from nltk import word_tokenize
from nltk.stem import snowball
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import gensim
from nltk.tokenize import PunktSentenceTokenizer
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D
from scipy.spatial.distance import cosine  # used for cosine similarity in mostSimilarDoc below
In [2]:
###########################################################################
# tokenization code
def seperatePunct(incomingString):
    '''
    Input: str
    Output: str with all punctuation characters separated by spaces
    '''
    outstr = ''
    characters = set(['!','@','#','$',"%","^","&","*",":","\\",
                      "(",")","+","=","?","\'","\"",";","/",
                      "{","}","[","]","<",">","~","`","|"])
    for char in incomingString:
        if char in characters:
            outstr = outstr + ' ' + char + ' '
        else:
            outstr = outstr + char
    return outstr
def hasNumbers(inputString):
    '''
    Input: str
    Output: True if the string contains a digit, False otherwise
    '''
    return any(char.isdigit() for char in inputString)
def text_cleaner(wordList):
    '''
    Input: list of words to be tokenized
    Output: list of tokenized words
    '''
    tokenizedList = []
    for word in wordList:
        #remove these substrings from the word
        word = word.replace('[deleted]','')
        word = word.replace('>','')
        #if link, replace with link tag
        if 'http' in word:
            tokenizedList.append('LINK_TAG')
            continue
        #if reference to a subreddit, replace with subreddit tag
        if '/r/' in word:
            tokenizedList.append('SUBREDDIT_TAG')
            continue
        #if reference to a reddit user, replace with user tag
        if '/u/' in word:
            tokenizedList.append('USER_TAG')
            continue
        #if reference to a twitter user, replace with user tag
        if '@' in word:
            tokenizedList.append('USER_TAG')
            continue
        #if number, replace with number tag
        #m8 is a word; 5'10", 54-59, and 56:48 are numbers
        if hasNumbers(word) and not any(char.isalpha() for char in word):
            tokenizedList.append('NUM_TAG')
            continue
        #separate punctuation and add the pieces to tokenizedList
        newwords = seperatePunct(word).split(" ")
        tokenizedList.extend(newwords)
    return tokenizedList
def mytokenizer(comment):
    '''
    Input: a reddit comment as a str or unicode
    Output: a list of tokens
    '''
    tokenizer = PunktSentenceTokenizer()
    sentenceList = tokenizer.tokenize(comment)
    wordList = []
    for sentence in sentenceList:
        wordList.extend(sentence.split(" "))
    return text_cleaner(wordList)
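A quick check of the tokenizer on a made-up comment (a sketch; the example string and the expected tokens in the comment are illustrative, not from the original run):
In [ ]:
mytokenizer("Check out /r/python and http://example.com, it's 100% great @someone")
# roughly: ['Check', 'out', 'SUBREDDIT_TAG', 'and', 'LINK_TAG', 'it', "'", 's', 'NUM_TAG', 'great', 'USER_TAG']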
In [3]:
def mostSimilarDoc(model,comment,k,threshold):
    '''
    Input: doc2vec model, comment as a str, k = number of similar doc vecs,
           threshold = fraction of the k neighbors that must be hateful
    Output: an int indicating hate (1) or not hate (0), most similar subreddit
    '''
    docvecs = model.docvecs
    numdocvec = len(docvecs)
    simVals = np.zeros((numdocvec, ))
    #tokenize comment
    wordTokens = mytokenizer(comment)
    #create vector of tokenized comment
    #inference is stochastic, so average over 100 inferred vectors
    finalVec = np.zeros((300, ))
    for i in xrange(100):
        finalVec = finalVec + model.infer_vector(wordTokens)
    commentVec = finalVec/100.0
    #compute cosine similarity of comment to each subreddit vector
    for vec_ind in xrange(numdocvec):
        simVals[vec_ind] = 1 - cosine(commentVec,docvecs[vec_ind])
    mostSimVecInd = np.argsort(simVals)[-k:]
    hatecount = 0
    #count how many of the k nearest subreddits are hateful
    for index in mostSimVecInd:
        hatecount += ishateful(docvecs.index_to_doctag(index))
    #thresholded vote to determine hateful/not hateful
    if hatecount >= threshold*len(mostSimVecInd):
        prediction = 1
    else:
        prediction = 0
    #most similar subreddit is the last index, since argsort is ascending
    mostSimSubreddit = docvecs.index_to_doctag(mostSimVecInd[-1])
    return prediction, mostSimSubreddit
##############################################################################
#hate/NotHate code
def ishateful(subreddit):
    '''
    Input: str subreddit
    Output: int 1 if hateful subreddit, 0 otherwise
    '''
    # List of not hateful subreddits
    final_nothate_srs = ['politics', 'worldnews', 'history', 'blackladies', 'lgbt',
                         'TransSpace', 'women', 'TwoXChromosomes', 'DebateReligion',
                         'religion', 'islam', 'Judaism', 'BodyAcceptance', 'fatlogic',
                         'gaybros', 'AskMen', 'AskWomen']
    # List of hateful subreddits
    final_hateful_srs = ['CoonTown', 'WhiteRights', 'Trans_fags', 'SlutJustice',
                         'TheRedPill', 'KotakuInAction', 'IslamUnveiled', 'GasTheKikes',
                         'AntiPOZi', 'fatpeoplehate', 'TalesofFatHate', 'hamplanethatred',
                         'shitniggerssay', 'neofag', 'altright']
    if subreddit in final_hateful_srs:
        return 1
    else:
        return 0
In [4]:
modelPath = '../../doc2vec_models/basemodel2/basemodel2.doc2vec'
model = gensim.models.Doc2Vec.load(modelPath)
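With the model loaded, a quick smoke test of the classifier defined above (a sketch; the comment string, k, and threshold are illustrative values, not the project's tuned settings):
In [ ]:
prediction, closest = mostSimilarDoc(model, "I can't believe people still post this garbage", 5, 0.5)
print prediction, closest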
In [11]:
df = pd.read_csv('../../data/twitter_cross_val.csv')
In [10]:
docvecs = model.docvecs
In [13]:
len(docvecs)
Out[13]:
In [19]:
docvecs.index_to_doctag(4)
Out[19]:
In [22]:
subredditvecs = np.zeros((26,300))
colors = []
In [23]:
np.shape(docvecs[0])
Out[23]:
In [24]:
for i in xrange(26):
    subredditvecs[i,:] = docvecs[i]
In [59]:
ts = TSNE(2)
reduced_vecs = ts.fit_transform(subredditvecs)
In [60]:
for i in xrange(26):
    if ishateful(docvecs.index_to_doctag(i)):
        color = 'r'
    else:
        color = 'b'
    plt.plot(reduced_vecs[i,0], reduced_vecs[i,1], marker='o', color=color, markersize=8)
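The same 2-D scatter can be made easier to read by annotating each point with its subreddit name and adding a simple legend (a sketch reusing reduced_vecs from above; styling choices are arbitrary):
In [ ]:
for i in xrange(26):
    name = docvecs.index_to_doctag(i)
    color = 'r' if ishateful(name) else 'b'
    plt.plot(reduced_vecs[i,0], reduced_vecs[i,1], marker='o', color=color, markersize=8)
    plt.annotate(name, (reduced_vecs[i,0], reduced_vecs[i,1]), fontsize=8)
plt.plot([], [], 'ro', label='hateful')
plt.plot([], [], 'bo', label='not hateful')
plt.legend()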
In [61]:
ts3 = TSNE(3)
reduced_vecs3 = ts3.fit_transform(subredditvecs)
In [62]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for i in xrange(26):
    if ishateful(docvecs.index_to_doctag(i)):
        color = 'r'
    else:
        color = 'b'
    ax.scatter(reduced_vecs3[i,0], reduced_vecs3[i,1], reduced_vecs3[i,2], color=color)
In [53]:
reduced_vecs3.shape
Out[53]:
In [65]:
print "You're a ginger, a Jew, and from Jersey! Three strikes, Kyle! YOU´RE OUT!"
In [66]:
unicode("You're a ginger, a Jew, and from Jersey! Three strikes, Kyle! YOU´RE OUT!", 'utf-8')
In [70]:
np.linalg.norm(docvecs,axis=1)
Out[70]:
In [ ]: