In [1]:
from gensim import utils
import gensim.models.doc2vec
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import gensim
import sys
import numpy as np
from gensim import corpora, models
import csv
import _pickle as cPickle
from sklearn.externals import joblib
import bz2
from random import shuffle
import ast
from sklearn.linear_model import LogisticRegression
In [3]:
# Open the preprocessed EOS corpus with gensim's MmCorpus reader.
corpus = gensim.corpora.MmCorpus('data/eos/processed_corpus_EOS.mm')

# Dump the tokenized documents to a plain-text file, one document per line.
# NOTE(review): `doc2vec_data` is not defined in any visible cell, so this cell
# depends on state already present in the kernel. It is presumably an iterable
# of token lists (one list of str per document) -- confirm and define it above.
# The `with` block guarantees the handle is flushed and closed even if the
# loop raises part-way through.
with open('data/eos/doc2vec_data.txt', 'w') as out_file:
    for doc in doc2vec_data:
        # Tokens are space-separated; the trailing newline keeps consecutive
        # documents from running together on a single line.
        out_file.write(' '.join(doc) + '\n')
In [ ]:
%%time
doc_filepath = 'data/eos/eos_text_tokenized.gz'
doc2vec_model_path = 'data/eos/doc2vec_model.d2v'
if 1 == 1:
sentences=gensim.models.doc2vec.TaggedLineDocument(doc_filepath)
model = gensim.models.doc2vec.Doc2Vec(sentences, size=200, window=10, min_count=5, iter=20, workers=8)
model.save(doc2vec_model_path)
else:
model = Doc2Vec.load(doc2vec_model_path)
In [5]:
In [6]:
# The full model was run on a server. The first output below comes from a
# sample model run on 1000 lines of text; the markdown cell below that output
# shows the server output for 30 million lines.
query_doc_id = 99
sims = model.docvecs.most_similar(query_doc_id)
print(sims)
Below is the output of the same command, `sims = model.docvecs.most_similar(99); print(sims)`, run on the full model.
[(98, 0.8011916875839233), (15987504, 0.5732544660568237), (9361116, 0.56634122133255), (15987503, 0.5599271655082703), (9361115, 0.5523300170898438), (15987502, 0.5513883829116821), (16801862, 0.5123482942581177), (16801861, 0.5077716112136841), (11492322, 0.5020641088485718), (755432, 0.48910319805145264)]
In [7]:
# Odd-one-out queries: each string is split on whitespace and the resulting
# word list is passed to `doesnt_match`.
odd_one_out_queries = [
    "iran diplomat scarf",
    "black blue yellow shirt navy black green orange",
    "summer winter fall t-shirt spring hot cold",
    "straight slim fit custom regular winter",
]
for query in odd_one_out_queries:
    print(model.doesnt_match(query.split()))

# Analogy-style queries as (positive words, negative words) pairs, scored
# with `most_similar`.
additive_queries = [
    (['saudi', 'king'], ['girl']),
    (['blue', 'shirt'], ['blue']),
    (['calvin', 'klein'], ['tommy']),
    (['cotton', 'material'], ['polyester']),
    (['nike', 'run'], ['express']),
]
for positive, negative in additive_queries:
    print(model.most_similar(positive=positive, negative=negative))

# Same (positive, negative) layout, scored with `most_similar_cosmul`.
cosmul_queries = [
    (['calvin', 'klein'], ['tommy']),
    (['skinny', 'jean'], ['large']),
    (['black', 'dress'], ['navy']),
    (['blue', 'coat'], ['yellow']),
]
for positive, negative in cosmul_queries:
    print(model.most_similar_cosmul(positive=positive, negative=negative))
The commands above were run on a server; below are the outputs from the server. View the YouTube video tutorial for a more in-depth explanation.
scarf
shirt
t-shirt
winter
[('national', 0.7366822957992554), ('deluxe', 0.7351764440536499), ('gangster', 0.7151539921760559), ('redhead', 0.706986665725708), ('jedi', 0.7043461203575134), ('hurley', 0.7041280269622803), ('police', 0.6921283602714539), ('vampire', 0.6920264959335327), ('elvis', 0.6828189492225647), ('man', 0.6826508045196533)]
[('sport-shirt', 0.7597200870513916), ('shirt-shirt', 0.6976513862609863), ('blazer', 0.6822149753570557), ('jacket', 0.6756230592727661), ('jogger', 0.6722015142440796), ('t-shirt', 0.6683749556541443), ('vest', 0.6655105352401733), ('fever', 0.6630758047103882), ('mango', 0.6513298749923706), ('sweatshirt', 0.6464142799377441)]
[('hilfiger', 0.7638254165649414), ('bahama', 0.7248090505599976), ('dkny', 0.7199745178222656), ('cabela', 0.6731507182121277), ('aura', 0.6693034172058105), ('research', 0.6658661365509033), ('average', 0.6650013327598572), ('panache', 0.6622791886329651), ('jezebel', 0.6556996703147888), ('anita', 0.6532233357429504)]
[('lightweight', 0.7339928150177002), ('fabric', 0.714056670665741), ('soft', 0.6978321671485901), ('microfiber', 0.6879997253417969), ('premium', 0.6641561985015869), ('craft', 0.6537737846374512), ('stretchy', 0.6503570079803467), ('jersey', 0.6496517062187195), ('silken', 0.6456028819084167), ('durable', 0.6450238823890686)]
[('training', 0.6947599649429321), ('armour', 0.661718487739563), ('active', 0.6468547582626343), ('circuit', 0.6453198194503784), ('pace', 0.6296043395996094), ('speedo', 0.6284787654876709), ('reebok', 0.6192759275436401), ('tyr', 0.613123893737793), ('triathlon', 0.6078574657440186), ('running', 0.6044196486473083)]
[('hilfiger', 3.4731686115264893), ('bahama', 3.238680362701416), ('cabela', 1.4618180990219116), ('average', 1.4044843912124634), ('hilfger', 1.403147578239441), ('vanderbilt', 1.390093445777893), ('island', 1.3801980018615723), ('vassarette', 1.3763628005981445), ('voi', 1.3734792470932007), ('mavi', 1.363916277885437)]
[('ankle', 2.0474910736083984), ('bootcut', 2.024974822998047), ('tapered', 2.005871534347534), ('straight', 1.9957698583602905), ('slouch', 1.9716737270355225), ('boot', 1.915338397026062), ('boyfriend', 1.8978936672210693), ('pant', 1.7995022535324097), ('trouser', 1.7738025188446045), ('duty', 1.728303074836731)]
[('blouse', 0.9562901258468628), ('skirt', 0.9473488926887512), ('gown', 0.9371359944343567), ('jumpsuit', 0.9135350584983826), ('milly', 0.8995895385742188), ('romper', 0.8857595920562744), ('dvf', 0.8833042979240417), ('caftan', 0.8828949928283691), ('nordstromrack', 0.882025957107544), ('top', 0.8750998973846436)]
[('gilet', 0.9358639121055603), ('df', 0.9223589897155762), ('jumper', 0.912643551826477), ('ba', 0.9119251370429993), ('bd', 0.9087516665458679), ('parka', 0.906380295753479), ('jacket', 0.9058188796043396), ('azure', 0.905199408531189), ('blazer', 0.8978071212768555), ('bf', 0.8966451287269592)]