In [14]:
import gensim
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import cPickle as pickle
from scipy.spatial.distance import cosine
from nltk.tokenize import PunktSentenceTokenizer
In [2]:
def seperatePunct(incomingString):
    # pad each punctuation character with spaces so it splits into its own token
    newstring = incomingString
    for char in '!@#$%^&*()+=?\'"{}[]<>~`:;|\\/':
        newstring = newstring.replace(char, ' ' + char + ' ')
    return newstring
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)
def text_cleaner(wordList):
    '''
    INPUT: list of words to be tokenized
    OUTPUT: list of cleaned tokens
    '''
    tokenizedList = []
    for word in wordList:
        # remove these substrings from the word
        word = word.replace('[deleted]', '')
        word = word.replace('>', '')
        # if link, replace with link tag
        if 'http://' in word:
            tokenizedList.append('LINK_TAG')
            continue
        # if reference to a subreddit, replace with subreddit tag
        if '/r/' in word:
            tokenizedList.append('SUBREDDIT_TAG')
            continue
        # if reference to a reddit user, replace with user tag
        if '/u/' in word:
            tokenizedList.append('USER_TAG')
            continue
        # if number, replace with number tag
        # 'm8' is a word; 5'10", 54-59, and 56:48 are numbers
        if hasNumbers(word) and not any(char.isalpha() for char in word):
            tokenizedList.append('NUM_TAG')
            continue
        # separate punctuation and add the pieces to tokenizedList
        newwords = seperatePunct(word).split(" ")
        tokenizedList.extend(newwords)
    return tokenizedList
def mytokenizer(comment):
    '''
    Input: a reddit comment as a str or unicode
    Output: a list of cleaned tokens
    '''
    tokenizer = PunktSentenceTokenizer()
    sentenceList = tokenizer.tokenize(comment)
    wordList = []
    for sentence in sentenceList:
        wordList.extend(sentence.split(" "))
    return text_cleaner(wordList)
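A quick sanity check of the tokenizer on a made-up comment (the comment below is illustrative, not taken from the data):
In [ ]:
# illustrative example only: verify that tags and punctuation come out as separate tokens
print mytokenizer("Check /r/history and http://example.com, it's 100% worth it!")
# expect 'SUBREDDIT_TAG', 'LINK_TAG', 'NUM_TAG', and punctuation such as '!' split out
# (empty strings may appear where punctuation bordered a space)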
In [3]:
path1 = '../../data/labeledRedditComments.p'
path2 = '../../data/twitter-hate-speech-classifier.csv'
path3 = '../../data/RedditMay2015Comments.sqlite'
In [4]:
df = pickle.load(open(path1, 'rb'))
In [39]:
# List of not hateful subreddits
final_nothate_srs = ['politics', 'worldnews', 'history', 'blackladies', 'lgbt',
                     'TransSpace', 'women', 'TwoXChromosomes', 'DebateReligion',
                     'religion', 'islam', 'Judaism', 'BodyAcceptance', 'fatlogic',
                     'gaybros', 'AskMen', 'AskWomen']
# List of hateful subreddits
final_hateful_srs = ['CoonTown', 'WhiteRights', 'Trans_fags', 'SlutJustice',
                     'TheRedPill', 'KotakuInAction', 'IslamUnveiled', 'GasTheKikes',
                     'AntiPOZi', 'fatpeoplehate', 'TalesofFatHate', 'hamplanethatred',
                     'shitniggerssay', 'neofag', 'altright']
In [40]:
final_hateful_srs
Out[40]:
In [42]:
df['subreddit'].value_counts()
Out[42]:
In [43]:
df['label'].value_counts()
Out[43]:
In [5]:
dfhate = pd.read_csv(path2)
In [26]:
dfhate.head()
Out[26]:
In [27]:
dfhate['does_this_tweet_contain_hate_speech_gold'].value_counts()
Out[27]:
In [29]:
dfhate['orig_does_this_tweet_contain_hate_speech'].value_counts()
Out[29]:
In [30]:
def myfunc(x):
    '''Map the crowdsourced answer to a binary label: 0 = not offensive, 1 = otherwise.'''
    if x == 'The tweet is not offensive':
        return 0
    else:
        return 1
In [31]:
dfhate['label'] = dfhate['does_this_tweet_contain_hate_speech'].map(myfunc)
In [32]:
dfhate['label'].value_counts()
Out[32]:
In [37]:
pre = np.array([1,1,1,0,0,0])  # predicted labels
tru = np.array([1,0,1,0,1,1])  # true labels
print sum(pre+tru == 2)   # true positives
print sum(pre+tru == 0)   # true negatives
print sum(pre-tru == 1)   # false positives
print sum(pre-tru == -1)  # false negatives
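These four counts are the cells of a 2x2 confusion matrix (assuming `pre` holds predictions and `tru` the true labels); a minimal sketch turning them into accuracy, precision, and recall:
In [ ]:
# a sketch: confusion-matrix counts -> standard metrics (tp/tn/fp/fn are my own names)
tp = sum(pre + tru == 2)   # predicted 1, truth 1
tn = sum(pre + tru == 0)   # predicted 0, truth 0
fp = sum(pre - tru == 1)   # predicted 1, truth 0
fn = sum(pre - tru == -1)  # predicted 0, truth 1
print 'accuracy ', (tp + tn) / float(tp + tn + fp + fn)
print 'precision', tp / float(tp + fp)
print 'recall   ', tp / float(tp + fn)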
In [33]:
dfhate.to_csv(path2)
In [25]:
dfhate['does_this_tweet_contain_hate_speech'].value_counts()
Out[25]:
In [6]:
model = gensim.models.Doc2Vec.load('base_model_original_tokenizer.doc2vec')
In [7]:
docvecs = model.docvecs
In [71]:
docvecs[0]
Out[71]:
In [20]:
doctags = docvecs.doctags
In [22]:
doctags.items()
Out[22]:
In [23]:
doctags['GasTheKikes']
Out[23]:
In [33]:
len(doctags['GasTheKikes'])
Out[33]:
In [34]:
doctags['GasTheKikes'][0]
Out[34]:
In [25]:
docvecs.count
Out[25]:
In [26]:
type(docvecs)
Out[26]:
In [28]:
type(docvecs[0])
Out[28]:
In [31]:
len(docvecs[0])
Out[31]:
In [29]:
len(docvecs)
Out[29]:
In [30]:
docvecs.index_to_doctag(0)
Out[30]:
In [38]:
docvecs.most_similar(14)
Out[38]:
In [40]:
for i in xrange(len(docvecs)):
    print docvecs.index_to_doctag(i)
    print docvecs.most_similar(i)[0]
    print ""
In [44]:
vocab = model.vocab
In [48]:
len(vocab.keys())
Out[48]:
In [49]:
vocab.keys()[:10]
Out[49]:
In [50]:
vocab['sowell']
Out[50]:
In [51]:
word = vocab['sowell']
In [55]:
word.index
Out[55]:
In [59]:
comment = 'hello world'.split(" ")
myvect = model.infer_vector(comment)
print myvect
In [62]:
comment = ':)'.split(" ")
myvect = model.infer_vector(comment)
print myvect
In [8]:
def mostSimilarDoc(model, comment):
    '''
    Input: doc2vec model; comment is a str
    Output: the doctag of the doc vector most similar to the comment, and its cosine similarity
    '''
    docvecs = model.docvecs
    wordTokens = mytokenizer(comment)
    # wordTokens = comment.split(" ")
    commentVec = model.infer_vector(wordTokens)
    mostSimVec = None
    bestSimVal = -np.inf  # cosine similarity lies in [-1, 1]
    for vec_ind in xrange(len(docvecs)):
        simVal = 1 - cosine(commentVec, docvecs[vec_ind])
        if simVal > bestSimVal:
            mostSimVec = vec_ind
            bestSimVal = simVal
    return docvecs.index_to_doctag(mostSimVec), bestSimVal
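For reference, the same nearest-doctag search can be vectorized with numpy; a sketch, assuming `docvecs[i]` returns a 1-D float array (`mostSimilarDocFast` is my own name, not part of the project):
In [ ]:
# a sketch: vectorized version of the loop above
def mostSimilarDocFast(model, comment):
    docvecs = model.docvecs
    commentVec = model.infer_vector(mytokenizer(comment))
    # stack all doc vectors into one matrix and compute all cosine similarities at once
    mat = np.array([docvecs[i] for i in xrange(len(docvecs))])
    sims = mat.dot(commentVec) / (np.linalg.norm(mat, axis=1) * np.linalg.norm(commentVec))
    best = int(np.argmax(sims))
    return docvecs.index_to_doctag(best), sims[best]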
In [20]:
random.sample(xrange(10), 11)  # raises ValueError: sample larger than population
In [23]:
numsamps = 1000
randrows = random.sample(xrange(len(df.index)), numsamps)
comments = df.ix[randrows, 'body'].values
subreddits = df.ix[randrows, 'subreddit'].values
count = 0
for row, comment in enumerate(comments):
    predictedSub, simVal = mostSimilarDoc(model, comment)
    if predictedSub == subreddits[row]:
        count += 1
print count / float(len(comments))
In [23]:
predictedSub, simVal = mostSimilarDoc(model,'')
print predictedSub
print simVal
In [10]:
wordTokens = "hi"
commentVec1 = model.infer_vector(wordTokens)
commentVec2 = model.infer_vector(wordTokens)
np.array_equal(commentVec1, commentVec2)
Out[10]:
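If more stable inferred vectors are needed, one option is to average several inference runs and/or raise the number of passes; a sketch, assuming this gensim generation's `infer_vector` accepts a `steps` argument (newer releases rename it), with `stable_infer` as a made-up helper name:
In [ ]:
# a sketch: average several stochastic inferences to damp run-to-run variance
def stable_infer(model, tokens, n=10, steps=50):
    vecs = [model.infer_vector(tokens, steps=steps) for _ in xrange(n)]
    return np.mean(vecs, axis=0)

v1 = stable_infer(model, mytokenizer('hello world'))
v2 = stable_infer(model, mytokenizer('hello world'))
print 1 - cosine(v1, v2)  # should now be close to 1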
In [ ]: