In [1]:
from gensim import utils
import gensim.models.doc2vec
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import gensim
import sys
import numpy as np
from gensim import corpora, models
import csv
import _pickle as cPickle
from sklearn.externals import joblib
import bz2
from random import shuffle
import ast
from sklearn.linear_model import LogisticRegression

In [3]:
# Load to memory
corpus = gensim.corpora.MmCorpus('data/eos/processed_corpus_EOS.mm')
    
File = open('data/eos/doc2vec_data.txt', 'w') 
for doc in doc2vec_data:
    File.write(' '.join(token for token in doc))

In [ ]:
%%time


doc_filepath = 'data/eos/eos_text_tokenized.gz'
doc2vec_model_path = 'data/eos/doc2vec_model.d2v'

if 1 == 1:
    sentences=gensim.models.doc2vec.TaggedLineDocument(doc_filepath)
    model = gensim.models.doc2vec.Doc2Vec(sentences, size=200, window=10, min_count=5, iter=20, workers=8)
    model.save(doc2vec_model_path)
else:
    model = Doc2Vec.load(doc2vec_model_path)

In [5]:


In [6]:
#The full model was run on a server. The first output below is a sample model run on 1000 lines
#of text. The markdown cell below the output is the output on the server for 30 million lines.
sims = model.docvecs.most_similar(99)

print(sims)


[(5377, 0.5258454084396362), (5599, 0.37886685132980347), (5590, 0.36026638746261597), (7184, 0.3549143075942993), (9802, 0.34451237320899963), (4488, 0.34104377031326294), (3348, 0.34001803398132324), (814, 0.3253442645072937), (4050, 0.32484573125839233), (8534, 0.3236147165298462)]

Below is the same sims = model.docvecs.most_similar(99); print(sims); command but run on the full model.

[(98, 0.8011916875839233), (15987504, 0.5732544660568237), (9361116, 0.56634122133255), (15987503, 0.5599271655082703), (9361115, 0.5523300170898438), (15987502, 0.5513883829116821), (16801862, 0.5123482942581177), (16801861, 0.5077716112136841), (11492322, 0.5020641088485718), (755432, 0.48910319805145264)


In [7]:
print(model.doesnt_match("iran diplomat scarf".split()))
print(model.doesnt_match("black blue yellow shirt navy black green orange".split()))
print(model.doesnt_match("summer winter fall t-shirt spring hot cold".split()))
print(model.doesnt_match("straight slim fit custom regular winter".split()))


print(model.most_similar(positive=['saudi', 'king'], negative=['girl']))
print(model.most_similar(positive=['blue', 'shirt'], negative=['blue']))
print(model.most_similar(positive=['calvin', 'klein'], negative=['tommy']))
print(model.most_similar(positive=['cotton', 'material'], negative=['polyester']))
print(model.most_similar(positive=['nike', 'run'], negative=['express']))



print(model.most_similar_cosmul(positive=['calvin', 'klein'], negative=['tommy']) )
print(model.most_similar_cosmul(positive=['skinny', 'jean'], negative=['large']) )
print(model.most_similar_cosmul(positive=['black', 'dress'], negative=['navy']) )
print(model.most_similar_cosmul(positive=['blue', 'coat'], negative=['yellow']) )


diplomat
yellow
fall
winter
[('sheikh', 0.29759371280670166), ('abdullah', 0.2927790582180023), ('gcc', 0.2861786484718323), ('shaikh', 0.27961617708206177), ('bin', 0.2788739800453186), ('abdulaziz', 0.27637121081352234), ('nayef', 0.2659643888473511), ('renew', 0.2647008001804352), ('indian', 0.26459500193595886), ('hrh', 0.26205959916114807)]
[('couldn', 0.5273951292037964), ('iba', 0.5182523727416992), ('voorheen', 0.4993104636669159), ('aren', 0.499233216047287), ('kearney', 0.4980449676513672), ('hadn', 0.4979991316795349), ('actueel', 0.49152493476867676), ('wasn', 0.4747185707092285), ('elijah', 0.4691956639289856), ('weren', 0.4674947261810303)]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-7-741cfaac0023> in <module>()
      7 print(model.most_similar(positive=['saudi', 'king'], negative=['girl']))
      8 print(model.most_similar(positive=['blue', 'shirt'], negative=['blue']))
----> 9 print(model.most_similar(positive=['calvin', 'klein'], negative=['tommy']))
     10 print(model.most_similar(positive=['cotton', 'material'], negative=['polyester']))
     11 print(model.most_similar(positive=['nike', 'run'], negative=['express']))

/usr/local/lib/python3.5/dist-packages/gensim/models/word2vec.py in most_similar(self, positive, negative, topn, restrict_vocab, indexer)
   1202         """
   1203 
-> 1204         return self.wv.most_similar(positive, negative, topn, restrict_vocab, indexer)
   1205 
   1206     def wmdistance(self, document1, document2):

/usr/local/lib/python3.5/dist-packages/gensim/models/keyedvectors.py in most_similar(self, positive, negative, topn, restrict_vocab, indexer)
    320                 mean.append(weight * word)
    321             else:
--> 322                 mean.append(weight * self.word_vec(word, use_norm=True))
    323                 if word in self.vocab:
    324                     all_words.add(self.vocab[word].index)

/usr/local/lib/python3.5/dist-packages/gensim/models/keyedvectors.py in word_vec(self, word, use_norm)
    273                 return self.syn0[self.vocab[word].index]
    274         else:
--> 275             raise KeyError("word '%s' not in vocabulary" % word)
    276 
    277     def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None):

KeyError: "word 'tommy' not in vocabulary"

The commands below were run on a server. Below are the ouputs from the server. View the youtube video tutorial for a more in depth explanation.

scarf

shirt

t-shirt

winter

[('national', 0.7366822957992554), ('deluxe', 0.7351764440536499), ('gangster', 0.7151539921760559), ('redhead', 0.706986665725708), ('jedi', 0.7043461203575134), ('hurley', 0.7041280269622803), ('police', 0.6921283602714539), ('vampire', 0.6920264959335327), ('elvis', 0.6828189492225647), ('man', 0.6826508045196533)]

[('sport-shirt', 0.7597200870513916), ('shirt-shirt', 0.6976513862609863), ('blazer', 0.6822149753570557), ('jacket', 0.6756230592727661), ('jogger', 0.6722015142440796), ('t-shirt', 0.6683749556541443), ('vest', 0.6655105352401733), ('fever', 0.6630758047103882), ('mango', 0.6513298749923706), ('sweatshirt', 0.6464142799377441)]

[('hilfiger', 0.7638254165649414), ('bahama', 0.7248090505599976), ('dkny', 0.7199745178222656), ('cabela', 0.6731507182121277), ('aura', 0.6693034172058105), ('research', 0.6658661365509033), ('average', 0.6650013327598572), ('panache', 0.6622791886329651), ('jezebel', 0.6556996703147888), ('anita', 0.6532233357429504)]

[('lightweight', 0.7339928150177002), ('fabric', 0.714056670665741), ('soft', 0.6978321671485901), ('microfiber', 0.6879997253417969), ('premium', 0.6641561985015869), ('craft', 0.6537737846374512), ('stretchy', 0.6503570079803467), ('jersey', 0.6496517062187195), ('silken', 0.6456028819084167), ('durable', 0.6450238823890686)]

[('training', 0.6947599649429321), ('armour', 0.661718487739563), ('active', 0.6468547582626343), ('circuit', 0.6453198194503784), ('pace', 0.6296043395996094), ('speedo', 0.6284787654876709), ('reebok', 0.6192759275436401), ('tyr', 0.613123893737793), ('triathlon', 0.6078574657440186), ('running', 0.6044196486473083)]

[('hilfiger', 3.4731686115264893), ('bahama', 3.238680362701416), ('cabela', 1.4618180990219116), ('average', 1.4044843912124634), ('hilfger', 1.403147578239441), ('vanderbilt', 1.390093445777893), ('island', 1.3801980018615723), ('vassarette', 1.3763628005981445), ('voi', 1.3734792470932007), ('mavi', 1.363916277885437)]

[('ankle', 2.0474910736083984), ('bootcut', 2.024974822998047), ('tapered', 2.005871534347534), ('straight', 1.9957698583602905), ('slouch', 1.9716737270355225), ('boot', 1.915338397026062), ('boyfriend', 1.8978936672210693), ('pant', 1.7995022535324097), ('trouser', 1.7738025188446045), ('duty', 1.728303074836731)]

[('blouse', 0.9562901258468628), ('skirt', 0.9473488926887512), ('gown', 0.9371359944343567), ('jumpsuit', 0.9135350584983826), ('milly', 0.8995895385742188), ('romper', 0.8857595920562744), ('dvf', 0.8833042979240417), ('caftan', 0.8828949928283691), ('nordstromrack', 0.882025957107544), ('top', 0.8750998973846436)]

[('gilet', 0.9358639121055603), ('df', 0.9223589897155762), ('jumper', 0.912643551826477), ('ba', 0.9119251370429993), ('bd', 0.9087516665458679), ('parka', 0.906380295753479), ('jacket', 0.9058188796043396), ('azure', 0.905199408531189), ('blazer', 0.8978071212768555), ('bf', 0.8966451287269592)]