In [14]:
import gensim
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import cPickle as pickle
from scipy.spatial.distance import cosine
from nltk.tokenize import PunktSentenceTokenizer

In [2]:
def seperatePunct(incomingString):
    '''
    INPUT: a string
    OUTPUT: the same string with each punctuation character padded by spaces
            so it becomes its own token when split on whitespace
    '''
    newstring = incomingString
    for char in '!@#$%^&*()+=?\'"{}[]<>~`:;|\\/':
        newstring = newstring.replace(char, " " + char + " ")
    return newstring

def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

def text_cleaner(wordList):
    '''
    INPUT: list of words to be tokenized
    OUTPUT: list of tokenized words
    '''

    tokenizedList = []

    for word in wordList:

        # remove these substrings from the word
        word = word.replace('[deleted]', '')
        word = word.replace('&gt', '')

        # if link, replace with link tag
        if 'http://' in word:
            tokenizedList.append('LINK_TAG')
            continue

        # if reference to a subreddit, replace with subreddit tag
        if '/r/' in word:
            tokenizedList.append('SUBREDDIT_TAG')
            continue

        # if reference to a reddit user, replace with user tag
        if '/u/' in word:
            tokenizedList.append('USER_TAG')
            continue

        # if number, replace with number tag
        # e.g. 'm8' stays a word; 5'10", 54-59, and 56:48 count as numbers
        if hasNumbers(word) and not any(char.isalpha() for char in word):
            tokenizedList.append('NUM_TAG')
            continue

        # separate punctuation and add the pieces to tokenizedList
        newwords = seperatePunct(word).split(" ")
        tokenizedList.extend(newwords)

    return tokenizedList

def mytokenizer(comment):
    '''
    INPUT: a reddit comment as str or unicode
    OUTPUT: a list of tokens
    '''
    tokenizer = PunktSentenceTokenizer()
    sentenceList = tokenizer.tokenize(comment)
    wordList = []
    for sentence in sentenceList:
        wordList.extend(sentence.split(" "))

    return text_cleaner(wordList)
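
A quick illustration (added here; not part of the original run) of what this tokenization pipeline produces, assuming the three functions above are defined, applied to a made-up comment.

In [ ]:
sample = "Check /r/politics, it's 99% spam! http://example.com"
print mytokenizer(sample)
# links become LINK_TAG, /r/... mentions become SUBREDDIT_TAG, bare numbers
# like 99% become NUM_TAG, and punctuation such as the apostrophe and "!"
# is split into its own token (the space padding also leaves a few
# empty-string tokens behind)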

In [3]:
path1 = '../../data/labeledRedditComments.p'
path2 = '../../data/twitter-hate-speech-classifier.csv'
path3 = '../../data/RedditMay2015Comments.sqlite'

In [4]:
df = pickle.load(open(path1, 'rb'))

In [39]:
# List of not hateful subreddits
final_nothate_srs = ['politics', 'worldnews', 'history', 'blackladies', 'lgbt',
                     'TransSpace', 'women', 'TwoXChromosomes', 'DebateReligion',
                     'religion', 'islam', 'Judaism', 'BodyAcceptance', 'fatlogic',
                     'gaybros', 'AskMen', 'AskWomen']
# List of hateful subreddits
final_hateful_srs = ['CoonTown', 'WhiteRights', 'Trans_fags', 'SlutJustice',
                     'TheRedPill', 'KotakuInAction', 'IslamUnveiled', 'GasTheKikes',
                     'AntiPOZi', 'fatpeoplehate', 'TalesofFatHate', 'hamplanethatred',
                     'shitniggerssay', 'neofag', 'altright']

In [40]:
final_hateful_srs


Out[40]:
['CoonTown',
 'WhiteRights',
 'Trans_fags',
 'SlutJustice',
 'TheRedPill',
 'KotakuInAction',
 'IslamUnveiled',
 'GasTheKikes',
 'AntiPOZi',
 'fatpeoplehate',
 'TalesofFatHate',
 'hamplanethatred',
 'shitniggerssay',
 'neofag',
 'altright']

In [42]:
df['subreddit'].value_counts()


Out[42]:
worldnews          490354
fatpeoplehate      311183
politics           244927
AskMen             138839
AskWomen           137889
KotakuInAction     128156
TwoXChromosomes    105130
TheRedPill          59145
CoonTown            51979
DebateReligion      41015
islam               25443
history             25242
Judaism              9103
lgbt                 8253
TalesofFatHate       5239
AntiPOZi             4740
blackladies          4396
religion             2623
Trans_fags           2362
WhiteRights          1352
GasTheKikes           919
BodyAcceptance        579
women                 529
TransSpace            472
SlutJustice           309
IslamUnveiled         110
Name: subreddit, dtype: int64

In [43]:
df['label'].value_counts()


Out[43]:
NotHate    1234794
Hate        565494
Name: label, dtype: int64

In [5]:
dfhate = pd.read_csv(path2)

In [26]:
dfhate.head()


Out[26]:
_unit_id _golden _unit_state _trusted_judgments _last_judgment_at does_this_tweet_contain_hate_speech does_this_tweet_contain_hate_speech:confidence _created_at orig__golden orig__last_judgment_at orig__trusted_judgments orig__unit_id orig__unit_state _updated_at orig_does_this_tweet_contain_hate_speech does_this_tweet_contain_hate_speech_gold does_this_tweet_contain_hate_speech_gold_reason does_this_tweet_contain_hate_speechconfidence tweet_id tweet_text
0 853718217 True golden 86 NaN The tweet uses offensive language but not hate... 0.6013 NaN True NaN 0.0 615561535.0 golden NaN The tweet contains hate speech The tweet contains hate speech\nThe tweet uses... NaN 1.0 1.666196e+09 Warning: penny boards will make you a faggot
1 853718218 True golden 92 NaN The tweet contains hate speech 0.7227 NaN True NaN 0.0 615561723.0 golden NaN The tweet contains hate speech The tweet contains hate speech\nThe tweet uses... NaN 1.0 4.295121e+08 Fuck dykes
2 853718219 True golden 86 NaN The tweet contains hate speech 0.5229 NaN True NaN 0.0 615562039.0 golden NaN The tweet contains hate speech The tweet contains hate speech\nThe tweet uses... NaN 1.0 3.956238e+08 @sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...
3 853718220 True golden 98 NaN The tweet contains hate speech 0.5184 NaN True NaN 0.0 615562068.0 golden NaN The tweet contains hate speech The tweet contains hate speech\nThe tweet uses... NaN 1.0 4.975147e+08 "@jayswaggkillah: "@JacklynAnnn: @jayswaggkill...
4 853718221 True golden 88 NaN The tweet uses offensive language but not hate... 0.5185 NaN True NaN 0.0 615562488.0 golden NaN The tweet contains hate speech The tweet contains hate speech\nThe tweet uses... NaN 1.0 5.889236e+08 @Zhugstubble You heard me bitch but any way I'...

In [27]:
dfhate['does_this_tweet_contain_hate_speech_gold'].value_counts()


Out[27]:
The tweet contains hate speech\nThe tweet uses offensive language but not hate speech    25
The tweet is not offensive                                                               16
The tweet uses offensive language but not hate speech                                    13
The tweet uses offensive language but not hate speech\nThe tweet is not offensive         9
The tweet contains hate speech                                                            4
Name: does_this_tweet_contain_hate_speech_gold, dtype: int64

In [29]:
dfhate['orig_does_this_tweet_contain_hate_speech'].value_counts()


Out[29]:
The tweet uses offensive language but not hate speech    24
The tweet contains hate speech                           22
The tweet is not offensive                               21
Name: orig_does_this_tweet_contain_hate_speech, dtype: int64

In [30]:
def myfunc(x):
    # 1 = offensive or hate speech, 0 = not offensive
    if x == 'The tweet is not offensive':
        return 0
    else:
        return 1

In [31]:
dfhate['label'] = dfhate['does_this_tweet_contain_hate_speech'].map(myfunc)

In [32]:
dfhate['label'].value_counts()


Out[32]:
0    7274
1    7235
Name: label, dtype: int64

In [37]:
pre = np.array([1,1,1,0,0,0])   # predicted labels
tru = np.array([1,0,1,0,1,1])   # true labels

print sum(pre+tru == 2)    # true positives
print sum(pre+tru == 0)    # true negatives
print sum(pre-tru == 1)    # false positives
print sum(pre-tru == -1)   # false negatives


2
1
1
2
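
The four sums above count true positives, true negatives, false positives, and false negatives for the toy pre/tru arrays. A small follow-on sketch (added for clarity; not in the original run) turns such counts into precision and recall:

In [ ]:
tp = np.sum((pre == 1) & (tru == 1))   # true positives
tn = np.sum((pre == 0) & (tru == 0))   # true negatives
fp = np.sum((pre == 1) & (tru == 0))   # false positives
fn = np.sum((pre == 0) & (tru == 1))   # false negatives

print tp / float(tp + fp)   # precision = TP/(TP+FP), here 2/3
print tp / float(tp + fn)   # recall = TP/(TP+FN), here 0.5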

In [33]:
dfhate.to_csv(path2)

In [25]:
dfhate['does_this_tweet_contain_hate_speech'].value_counts()


Out[25]:
The tweet is not offensive                               7274
The tweet uses offensive language but not hate speech    4836
The tweet contains hate speech                           2399
Name: does_this_tweet_contain_hate_speech, dtype: int64

In [6]:
model = gensim.models.Doc2Vec.load('base_model_original_tokenizer.doc2vec')

In [7]:
docvecs = model.docvecs

In [71]:
docvecs[0]


Out[71]:
array([-0.12589121, -0.72398579, -0.13750482,  0.01074985, -0.07752338,
        0.13886024, -0.28057936, -0.11339198, -0.02804809,  0.08569955,
       -0.82156771, -0.2160503 ,  0.43471047,  0.19683866, -0.42664996,
        0.04931739, -0.33742675, -0.73014867, -0.12762018, -0.106265  ,
        0.39234036, -0.39817062, -0.14888453, -0.69462979, -0.30694923,
       -0.07339818, -0.18055308, -0.64332151,  0.57735366,  1.02309513,
       -0.34679177, -0.11108843, -0.05537489,  0.10253237,  0.31550717,
        0.52577186,  0.99049211,  0.05952468, -0.47226873, -0.05540276,
       -0.26681474, -0.08775025, -0.92837352, -0.13519953, -0.70518291,
       -0.20343725, -0.11048806,  0.15157135, -0.44448465,  0.43652859,
       -0.14788841, -0.7766152 , -0.3568432 , -0.55862421, -0.61907762,
       -0.49959502, -0.00634548, -0.02462127, -0.35062504, -0.07442575,
       -0.06620318, -0.2105757 ,  0.03848599,  0.07465713, -0.01480301,
        0.0909562 , -0.48590082, -0.48677263, -0.01644286,  0.53316718,
        0.19098762, -0.00537689,  0.60375124,  0.61171699,  0.5195058 ,
       -0.81575453, -0.17606765, -0.05278037,  0.50832444,  0.81706733,
        0.1719389 , -0.13501805, -0.32697934, -0.19407187,  0.50223619,
        0.35213116, -0.22106965, -0.04424353,  0.02781259, -0.06127879,
       -0.3261008 ,  0.57645082,  0.05099476, -0.12902723, -0.24004243,
       -0.17637867, -0.5424065 , -0.7066685 , -0.51928079, -0.39453816,
        0.46386182, -0.30387247, -0.29021129, -0.15439314,  0.04698645,
        0.54548693, -0.08751345,  0.15773326, -0.4897477 ,  0.00445471,
       -0.22462903,  0.27953592,  0.3809576 ,  0.25621074,  0.02975994,
       -0.45700687, -0.08916998, -0.32495815, -0.31760514, -0.38044152,
       -0.15788171,  0.2868689 , -0.67996204, -0.04327663,  0.44221863,
       -0.20513661, -0.15236355,  0.34446013,  0.26793769,  0.15069135,
       -0.22732359, -0.56328028,  0.01038246, -0.01748611, -0.14974129,
       -0.26700798,  0.13132939, -0.11334003,  0.52558941,  0.20994405,
       -0.11825897, -0.38272485,  0.1874278 , -0.13379002,  0.1304699 ,
        0.01476961, -0.61739039, -0.25798291, -0.16011603, -0.39652464,
       -0.59220809, -0.30506358,  0.14669114, -0.2763429 , -0.51747042,
       -0.16309269,  0.03950921,  0.204364  ,  0.55781382, -0.38286638,
       -0.54690766, -0.54346794, -0.02820516,  0.0100578 , -0.28775266,
        0.13238353,  0.93503314,  0.37181729,  0.05437614,  0.23740833,
        0.39891708,  0.94723952, -0.10406715, -0.18395999, -0.00317672,
        0.45014599, -0.29661059, -0.33467513,  0.01850334, -0.11797646,
        0.08393001,  0.30656683, -0.39040384, -0.50537956, -1.003093  ,
       -0.35080022, -0.04855656, -0.75325412,  0.09916819, -0.03518015,
        0.11805237,  0.25506198,  0.10420178,  0.11222638, -0.43056545,
       -0.30389291,  0.16090877,  0.51357472,  0.32703143, -0.05134602,
        0.53110862,  0.61642408,  0.3364757 , -0.04929738,  0.30545914,
        0.45064425, -0.02958472,  0.01003539,  0.25594985, -0.12772371,
        0.05378566,  0.15690044, -0.19670148, -0.17284095, -0.02730625,
       -0.18926087,  0.19283143, -0.31659678,  0.18789057, -0.14049083,
        0.16017769, -0.1022848 ,  0.20286475,  0.53479397, -0.02167618,
       -0.11878969, -0.07998863,  0.12697545,  0.90858006,  0.31139112,
        0.50222713, -0.24131031,  0.3370797 ,  0.39017102, -0.66540205,
       -0.47660181, -0.3390246 ,  0.76906717, -0.02450531, -0.22576049,
       -0.34447014, -0.5409779 ,  0.24217558, -0.05197965,  0.0607123 ,
        0.25163266, -0.48003322,  0.79306233,  0.51902694,  0.42135218,
       -0.21480867, -0.23312834, -0.11083728,  0.24889012,  0.40282148,
        0.47559056, -0.02321037, -0.49362049, -0.11209472, -0.33152193,
        0.19776765, -0.02687917,  0.10523993, -0.41892961, -0.14945576,
        0.43507275, -0.33522612, -0.67242497,  0.16731207,  0.24476747,
        0.47927377,  0.17116192,  0.45978621,  0.69689423,  0.49695787,
       -0.21593295, -0.44263107,  0.16870342,  0.44734085,  0.0122469 ,
        0.21210839, -0.64636171,  0.57238775, -0.72735292, -0.00281904,
       -0.25434625, -0.15268794,  0.13881858, -0.60550427, -0.10906816,
        0.46427441, -0.43058404,  0.1811271 ,  0.22259298,  0.38453677,
        0.53926337,  0.1520562 ,  0.0299741 , -0.19964291, -0.73852772], dtype=float32)

In [20]:
doctags = docvecs.doctags

In [22]:
doctags.items()


Out[22]:
[('SlutJustice', Doctag(offset=3, word_count=27446, doc_count=309)),
 ('AskWomen', Doctag(offset=25, word_count=6480218, doc_count=137889)),
 ('KotakuInAction', Doctag(offset=5, word_count=6416371, doc_count=128156)),
 ('blackladies', Doctag(offset=14, word_count=217549, doc_count=4396)),
 ('TwoXChromosomes', Doctag(offset=18, word_count=5929809, doc_count=105130)),
 ('politics', Doctag(offset=11, word_count=12084911, doc_count=244927)),
 ('AntiPOZi', Doctag(offset=8, word_count=221689, doc_count=4740)),
 ('islam', Doctag(offset=21, word_count=1520313, doc_count=25443)),
 ('GasTheKikes', Doctag(offset=7, word_count=47158, doc_count=919)),
 ('BodyAcceptance', Doctag(offset=23, word_count=39993, doc_count=579)),
 ('DebateReligion', Doctag(offset=19, word_count=3667567, doc_count=41015)),
 ('religion', Doctag(offset=20, word_count=197943, doc_count=2623)),
 ('worldnews', Doctag(offset=12, word_count=19291877, doc_count=490354)),
 ('fatpeoplehate', Doctag(offset=9, word_count=8193219, doc_count=311183)),
 ('Trans_fags', Doctag(offset=2, word_count=102110, doc_count=2362)),
 ('Judaism', Doctag(offset=22, word_count=507969, doc_count=9103)),
 ('IslamUnveiled', Doctag(offset=6, word_count=7542, doc_count=110)),
 ('lgbt', Doctag(offset=15, word_count=464559, doc_count=8253)),
 ('WhiteRights', Doctag(offset=1, word_count=59447, doc_count=1352)),
 ('AskMen', Doctag(offset=24, word_count=6168171, doc_count=138839)),
 ('women', Doctag(offset=17, word_count=33116, doc_count=529)),
 ('CoonTown', Doctag(offset=0, word_count=2033220, doc_count=51979)),
 ('TransSpace', Doctag(offset=16, word_count=18327, doc_count=472)),
 ('TheRedPill', Doctag(offset=4, word_count=3728901, doc_count=59145)),
 ('TalesofFatHate', Doctag(offset=10, word_count=178528, doc_count=5239)),
 ('history', Doctag(offset=13, word_count=1097160, doc_count=25242))]

In [23]:
doctags['GasTheKikes']


Out[23]:
Doctag(offset=7, word_count=47158, doc_count=919)

In [33]:
len(doctags['GasTheKikes'])


Out[33]:
3

In [34]:
doctags['GasTheKikes'][0]


Out[34]:
7

In [25]:
docvecs.count


Out[25]:
26

In [26]:
type(docvecs)


Out[26]:
gensim.models.doc2vec.DocvecsArray

In [28]:
type(docvecs[0])


Out[28]:
numpy.ndarray

In [31]:
len(docvecs[0])


Out[31]:
300

In [29]:
len(docvecs)


Out[29]:
26

In [30]:
docvecs.index_to_doctag(0)


Out[30]:
'CoonTown'

In [38]:
docvecs.most_similar(14)


Out[38]:
[('BodyAcceptance', 0.21258679032325745),
 ('WhiteRights', 0.21146908402442932),
 ('TransSpace', 0.18464726209640503),
 ('AskMen', 0.16595458984375),
 ('islam', 0.1570722907781601),
 ('Judaism', 0.15705253183841705),
 ('TheRedPill', 0.15415698289871216),
 ('politics', 0.14485692977905273),
 ('KotakuInAction', 0.14440138638019562),
 ('religion', 0.14105652272701263)]

In [40]:
for i in xrange(len(docvecs)):
    print docvecs.index_to_doctag(i)
    print docvecs.most_similar(i)[0]
    print ""


CoonTown
('IslamUnveiled', 0.2738789916038513)

WhiteRights
('GasTheKikes', 0.38529878854751587)

Trans_fags
('SlutJustice', 0.41643446683883667)

SlutJustice
('Trans_fags', 0.41643446683883667)

TheRedPill
('WhiteRights', 0.22930307686328888)

KotakuInAction
('SlutJustice', 0.19015194475650787)

IslamUnveiled
('religion', 0.32894909381866455)

GasTheKikes
('AntiPOZi', 0.39256757497787476)

AntiPOZi
('GasTheKikes', 0.39256757497787476)

fatpeoplehate
('BodyAcceptance', 0.22982779145240784)

TalesofFatHate
('SlutJustice', 0.3041166663169861)

politics
('TransSpace', 0.26541823148727417)

worldnews
('IslamUnveiled', 0.22024142742156982)

history
('IslamUnveiled', 0.25044021010398865)

blackladies
('BodyAcceptance', 0.21258679032325745)

lgbt
('TransSpace', 0.33727318048477173)

TransSpace
('Trans_fags', 0.3732454180717468)

women
('IslamUnveiled', 0.3218139111995697)

TwoXChromosomes
('AskWomen', 0.368299663066864)

DebateReligion
('politics', 0.2580014169216156)

religion
('IslamUnveiled', 0.32894909381866455)

islam
('IslamUnveiled', 0.29662537574768066)

Judaism
('TransSpace', 0.2351522445678711)

BodyAcceptance
('lgbt', 0.3149873614311218)

AskMen
('AskWomen', 0.29158464074134827)

AskWomen
('TwoXChromosomes', 0.3682996928691864)


In [44]:
vocab = model.vocab

In [48]:
len(vocab.keys())


Out[48]:
215221

In [49]:
vocab.keys()[:10]


Out[49]:
[u"cake'",
 u'Craziness',
 u'nunnery',
 u'EXPLAIN',
 u'transend',
 u'sowell',
 u'Panzergroup',
 u'fleeces',
 u'woods',
 u'clotted']

In [50]:
vocab['sowell']


Out[50]:
<gensim.models.word2vec.Vocab at 0x7f35109b4090>

In [51]:
word = vocab['sowell']

In [55]:
word.index


Out[55]:
107432

In [59]:
comment = 'hello world'.split(" ")
myvect = model.infer_vector(comment)
print myvect


[-0.04111374 -0.02500165  0.00240249 -0.05194312  0.01736004  0.03923237
 -0.02321895 -0.00029489  0.01171858 -0.00763664 -0.00423318 -0.01935696
  0.02693051  0.03966122 -0.0059855   0.01473564 -0.03234283 -0.01823245
  0.00217189 -0.00654919 -0.0153838  -0.01858403 -0.01150477  0.02978476
  0.0007985  -0.00109669  0.02316413  0.00655458  0.00364505 -0.01039089
 -0.04070766 -0.00415223  0.00397358 -0.03976255  0.00087709  0.01718757
  0.01112527 -0.01105386 -0.01857648  0.03114931  0.0043707   0.01010265
 -0.00273892 -0.0237     -0.01758535  0.00644217 -0.00706726  0.00386565
 -0.00940999 -0.01003488  0.00647073 -0.00198868  0.01122584 -0.00729731
 -0.01108328 -0.00288173 -0.03437344 -0.01104882 -0.00978341  0.00688382
 -0.02149723  0.02091461  0.01071058 -0.02032779 -0.03376953 -0.01208208
 -0.00408209 -0.05274599  0.00043536  0.00033211  0.01783239 -0.02652001
  0.00933295 -0.00230759  0.01432832  0.00012344 -0.00400857  0.01431063
  0.01445851  0.02728147  0.01825923  0.01869242  0.01830155 -0.00825256
  0.045708    0.03971025  0.00137035 -0.01573875  0.0172827  -0.01429381
 -0.0038722   0.01460732  0.0089898  -0.01145308  0.00755344 -0.00210035
  0.01141434  0.02177677  0.00615427  0.01965106  0.02330759 -0.03340856
 -0.04588184  0.00261034  0.03905431  0.005461   -0.0309504  -0.00192739
 -0.00713987  0.0111334  -0.00216861  0.01159589  0.01985407 -0.02078279
 -0.02032344 -0.03079031  0.00209977 -0.01274467  0.02649581 -0.00062849
 -0.01215605 -0.05419955  0.01790452 -0.01735547  0.00837917 -0.00153469
  0.03456865 -0.05325317  0.01586758 -0.03847298  0.01485192  0.01143985
  0.00046204  0.05696256 -0.01798709 -0.00318238  0.01850352 -0.04155617
  0.04122665  0.00567383 -0.0074503  -0.03310946  0.04023556 -0.01783714
  0.02726265 -0.0141836   0.02438884  0.01868067  0.02759301 -0.02397838
  0.00075403  0.0204922   0.02197541  0.00397953  0.00990431 -0.00941213
 -0.01188141  0.0141815  -0.00814373  0.0063068  -0.00220684  0.01542826
 -0.0127088  -0.02779719  0.00744291  0.0013038  -0.01480073  0.006438
  0.01416349 -0.00457946 -0.01541557 -0.01298807 -0.01882389 -0.0168133
  0.00142719  0.02837362  0.00350059 -0.01504229 -0.00446812  0.01438555
  0.02853093 -0.00883114 -0.01737806 -0.0025397  -0.03711834 -0.00841873
  0.02722701 -0.0054467   0.00467661  0.01786073 -0.00658744  0.03463225
  0.01566959  0.00348195 -0.01589103 -0.0164353   0.00094127 -0.01806031
 -0.04044517  0.02790125  0.04024201  0.02701897 -0.02881585 -0.00378977
 -0.00488664  0.01974203  0.00728836 -0.00369063 -0.01337916 -0.01104925
  0.00736075  0.00108332  0.00720279 -0.00605611  0.00031024 -0.00624926
 -0.02142286  0.01413467 -0.00153332 -0.02454466  0.01693262 -0.01152521
  0.02843513 -0.00839742  0.00444771 -0.00746116  0.01148958 -0.00943075
 -0.00844916  0.004551   -0.00449355 -0.0079079   0.01043462 -0.02941891
 -0.0231054  -0.01493952  0.01705624  0.00976374 -0.02708174  0.00410496
 -0.00873282  0.01186389 -0.00686416 -0.0059383  -0.00071539  0.01431673
  0.00555304 -0.01986418  0.02321977 -0.01541338 -0.02609937 -0.01995953
 -0.00048183  0.02400667  0.00318615  0.0232248  -0.00973626 -0.02275565
  0.0136182   0.01457288 -0.00870228  0.01823884  0.0293435  -0.03017141
  0.01055535 -0.00944739 -0.00471609  0.00337423  0.01817558  0.01331917
 -0.00462779  0.00374128  0.020611    0.01693613  0.01753112 -0.00468499
  0.01089525  0.026834    0.01085776 -0.02172902 -0.01086674 -0.02038584
  0.0041761  -0.02284047 -0.01594419  0.00255587 -0.00970417  0.04408061
  0.01091218 -0.01157123 -0.03443874 -0.03418404 -0.03463555  0.0125404
  0.00753943  0.01536337  0.02890608  0.02568626 -0.01141054 -0.00389267]

In [62]:
comment = ':)'.split(" ")
myvect = model.infer_vector(comment)
print myvect


[  1.24285102e-03  -1.29657832e-03   5.14305313e-04   1.31310441e-03
   3.30964016e-04   8.09322170e-04  -5.70763252e-04  -4.74735134e-04
   2.04503740e-04  -1.64516823e-04   1.21135334e-03   1.21228769e-03
  -1.41432253e-03  -3.14957608e-04   1.15435303e-03  -2.24736941e-04
  -1.38484733e-03  -1.29070238e-03   3.52830946e-04   5.44279872e-04
   1.10735476e-03  -1.04843464e-03  -3.41511914e-04   1.30341633e-03
  -6.44849555e-04  -1.29601581e-03  -1.07744410e-04   1.10998552e-03
   6.85571518e-04  -3.35332734e-04  -4.42845077e-04  -1.35943049e-03
  -1.56357617e-03  -1.17807882e-03   1.13402412e-03  -1.34440212e-04
  -3.37475707e-04  -1.51634391e-03   1.17436284e-03   6.81731792e-04
   5.00002818e-04  -1.15380972e-03   1.15671172e-03  -1.11719721e-03
   9.72568931e-04  -3.75860633e-04   3.36922676e-04  -6.96208794e-04
   6.73536677e-04   2.27944853e-04   1.40047050e-04  -2.13312713e-04
  -5.23253053e-04   3.54013377e-04   1.42755732e-03   1.69788735e-04
   8.25587544e-04  -1.22071803e-03  -1.71599633e-04  -4.77056252e-04
  -9.93321999e-04   1.30426663e-04  -1.04855886e-03   1.56164030e-03
   1.31402141e-03  -5.99207415e-04  -9.61448131e-06  -7.38294184e-05
  -1.47946912e-03  -1.19769515e-03  -3.30931129e-04  -1.26279623e-03
   1.34205341e-03   8.70183692e-04  -3.78213190e-05   7.94355466e-04
   8.70299875e-04  -1.08907861e-03   3.88209621e-04   1.10034214e-03
  -1.11678219e-03  -8.24136485e-04   6.01475826e-04   1.36912300e-03
   9.96101182e-04  -5.94648664e-05  -9.95314447e-04  -9.50073649e-04
  -4.93328029e-04  -5.28351578e-04   6.92845089e-04   5.29156881e-04
   8.52441823e-04   1.18986028e-03  -5.54851198e-04  -1.35061692e-03
   2.07954763e-05  -4.72145446e-04   7.25438760e-04  -1.18317793e-03
   1.55764096e-03  -1.38493336e-03   1.06755167e-03   1.28428568e-03
  -5.17147942e-04   1.11647719e-03  -9.62227583e-04  -1.64729625e-03
   7.21607474e-04   1.83258817e-05   4.24271653e-04   1.60843704e-03
   8.12684069e-04  -1.19957817e-03  -3.60963808e-04  -6.83156977e-05
   7.07257190e-04  -4.73036271e-05   1.50513230e-03   2.16111421e-05
   6.84346887e-04  -6.68324763e-04   7.96574808e-04  -1.50033343e-03
  -1.24968914e-03  -1.66360417e-03  -6.89976616e-04  -1.51505973e-03
  -8.24688032e-05   1.41149387e-04  -1.26537855e-03   1.42640504e-03
  -2.86668539e-04   6.38869184e-04   1.39660726e-03   1.47814257e-03
  -9.46782704e-04   3.02362983e-04  -7.41624855e-04   3.25924135e-04
  -6.55866403e-04  -1.48525997e-03   1.66444143e-03   1.61672244e-03
  -5.78635954e-04   1.16024923e-03  -6.02324610e-04  -1.63115491e-03
  -1.64665107e-04   5.37466491e-04  -1.00073568e-03  -1.43053476e-03
  -1.44765410e-03  -1.46549672e-03  -6.71054935e-04  -1.24123471e-03
  -5.24255156e-04  -3.92476126e-04   1.79225517e-05   4.86500125e-04
   1.77196664e-04   1.32546131e-03   1.06850278e-03  -9.34778829e-04
   1.83726414e-04   1.38497574e-03  -2.12250321e-04  -1.10455346e-03
   5.05930162e-04   6.14404154e-04   1.62774813e-04  -1.29845680e-03
   2.99543521e-04   9.79323173e-04  -5.55292470e-04  -8.33292797e-05
  -1.70688843e-04   2.60640110e-04   1.05562608e-03   1.58454452e-04
  -1.50604406e-03  -2.53608196e-05  -6.06746064e-04   8.01507558e-05
   1.06298132e-03  -1.04816316e-03   8.30344274e-04   8.65144772e-04
   9.22594278e-04  -5.42452559e-04   1.08180637e-03   7.23092409e-04
  -8.70686199e-04   1.38719415e-03  -1.10167079e-03  -7.12447392e-04
   1.55073975e-03   1.66658021e-03  -1.57412561e-03   5.64243353e-04
  -8.45753937e-04   1.24583638e-03   1.33532030e-03   1.11449650e-03
  -6.49038004e-04  -6.50598842e-04   7.51515792e-04   8.22467628e-05
   6.41385035e-04  -5.99347637e-04   1.66343793e-03   1.73441818e-04
  -3.69484158e-04  -8.57451058e-04   2.22361676e-04   6.33481774e-04
   1.25034444e-03   8.02298542e-04  -1.13411923e-03  -6.59665675e-04
  -1.21318270e-03   9.29615111e-04  -7.57992239e-05  -9.80554381e-04
  -3.69939720e-04  -1.49466319e-03   8.78220308e-04  -7.76818080e-04
   8.95755424e-04   1.64227199e-03  -1.06735170e-05  -1.12913003e-04
  -1.54282781e-03  -1.55227914e-04  -1.26978150e-04  -4.68331964e-05
  -6.40298298e-04  -5.02409530e-04  -5.40491128e-05   4.74969624e-04
   5.93840552e-04  -1.23400800e-03  -1.20597321e-03  -7.10080785e-04
   8.24188872e-04  -1.05032593e-03   6.90798159e-04   1.23007951e-04
  -8.01436137e-04   2.84738082e-04   1.51505973e-03   1.31770247e-03
   1.41019153e-03  -2.45564384e-04   1.48093968e-03  -1.48178835e-03
  -1.38716887e-05   5.46180876e-04   1.20621594e-03  -1.58285862e-03
   1.39503030e-03  -1.64992199e-03   2.38894994e-04  -1.34671965e-04
   2.49832286e-04   4.86037345e-04  -1.36212143e-03  -1.43044235e-04
   1.66535517e-03   1.46464934e-03   1.34212803e-03  -1.12829485e-03
  -3.35586228e-04   6.69820933e-04  -4.58008813e-04  -1.28884870e-03
  -1.17663387e-03  -1.12720788e-03   1.40631699e-03   4.25208738e-04
   1.54787826e-03   7.37967435e-04   1.40252023e-03  -1.52976729e-03
  -1.52215175e-03  -7.47605576e-04  -1.01363461e-03  -1.33286684e-03
  -1.01464230e-03   8.91762495e-04  -9.40994301e-04   2.39472298e-04
   5.12518105e-04   4.15431045e-04  -4.70137049e-04   1.02431828e-03
  -1.60394666e-05  -4.44189238e-04   1.28376065e-03   3.23429442e-04]

In [8]:
def mostSimilarDoc(model, comment):
    '''
    INPUT: doc2vec model, comment as a str
    OUTPUT: the doctag of the document vector most similar to the comment,
            and the cosine similarity to it
    '''

    docvecs = model.docvecs

    wordTokens = mytokenizer(comment)
    # wordTokens = comment.split(" ")  # alternative: plain whitespace tokenization
    commentVec = model.infer_vector(wordTokens)

    mostSimVec = None
    bestSimVal = float('-inf')

    for vec_ind in xrange(len(docvecs)):
        simVal = 1 - cosine(commentVec, docvecs[vec_ind])

        if simVal > bestSimVal:
            mostSimVec = vec_ind
            bestSimVal = simVal

    return docvecs.index_to_doctag(mostSimVec), bestSimVal
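
For reference, a vectorized sketch of the same brute-force lookup (an added illustration, not part of the original notebook): stacking the 26 document vectors into a matrix lets numpy compute all cosine similarities at once. On typical inputs it returns the same answer as mostSimilarDoc above.

In [ ]:
def mostSimilarDoc_vectorized(model, comment):
    # hypothetical alternative to the loop version above
    docvecs = model.docvecs
    commentVec = model.infer_vector(mytokenizer(comment))
    docmat = np.array([docvecs[i] for i in xrange(len(docvecs))])
    # cosine similarity of the comment vector against every doc vector
    sims = docmat.dot(commentVec) / (np.linalg.norm(docmat, axis=1) * np.linalg.norm(commentVec))
    best = int(np.argmax(sims))
    return docvecs.index_to_doctag(best), sims[best]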

In [20]:
random.sample(xrange(10), 11)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-20-de476a830e9e> in <module>()
----> 1 random.sample(xrange(10), 11)

/home/mgupta/anaconda2/lib/python2.7/random.pyc in sample(self, population, k)
    321         n = len(population)
    322         if not 0 <= k <= n:
--> 323             raise ValueError("sample larger than population")
    324         random = self.random
    325         _int = int

ValueError: sample larger than population

In [23]:
numsamps = 1000
randrows = random.sample(xrange(len(df.index)), numsamps)
comments = df.ix[randrows,'body'].values
subreddits = df.ix[randrows,'subreddit'].values
count = 0
for row,comment in enumerate(comments):
    predictedSub, simVal = mostSimilarDoc(model,comment)
    if predictedSub == subreddits[row]:
        count+=1
        
print count/float(len(comments))


0.187

In [23]:
predictedSub, simVal = mostSimilarDoc(model,'')
print predictedSub
print simVal


TwoXChromosomes
0.119427390737

In [10]:
wordTokens = "hi"

commentVec1 = model.infer_vector(wordTokens)
commentVec2 = model.infer_vector(wordTokens)

np.array_equal(commentVec1, commentVec2)


Out[10]:
False
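
Because inference starts from a random initialization, repeated calls differ slightly. One way to damp that variance (a suggestion, not something done in this notebook) is to run more inference steps and average several inferred vectors; this assumes the installed gensim's infer_vector accepts a steps argument, otherwise drop it.

In [ ]:
def stable_infer(model, tokens, n=10, steps=50):
    # average several inference passes to reduce run-to-run randomness;
    # the steps keyword is assumed to exist in this gensim version
    vecs = [model.infer_vector(tokens, steps=steps) for _ in xrange(n)]
    return np.mean(vecs, axis=0)

v1 = stable_infer(model, ["hi"])
v2 = stable_infer(model, ["hi"])
print 1 - cosine(v1, v2)   # should be close to, but generally not exactly, 1.0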

In [ ]: