Objectives
Train a Doc2Vec model on recipe instructions and query it for similar recipes, both by document id and by inferring vectors for new text.
Based on https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb
In [ ]:
import re # Regular Expressions
import os.path # File Operations
import pandas as pd # DataFrames & Manipulation
from gensim.models.doc2vec import TaggedDocument, Doc2Vec # model training (TaggedDocument replaces the deprecated LabeledSentence)
In [ ]:
train_input = "../data/recipes.tsv.bz2"
# preserve empty strings (http://pandas-docs.github.io/pandas-docs-travis/io.html#na-values)
train = pd.read_csv(train_input, delimiter="\t", quoting=3, encoding="utf-8", keep_default_na=False)
print "loaded %d documents." % len(train)
In [ ]:
def normalize(text):
    norm_text = text.lower()
    # pad punctuation with spaces so it survives splitting as separate tokens
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
        norm_text = norm_text.replace(char, ' ' + char + ' ')
    return norm_text
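As a quick sanity check (the input string below is made up, not taken from the corpus), punctuation ends up as separate tokens:
In [ ]:
# hypothetical input, not from the corpus
print(normalize(u'Cook the pasta, then drain.').split())
# expected: ['cook', 'the', 'pasta', ',', 'then', 'drain', '.']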
In [ ]:
sentences = [TaggedDocument(words=normalize(text).split(), tags=[i]) for i, text in enumerate(train['instructions'])]
print("%d sentences in corpus" % len(sentences))
see http://radimrehurek.com/gensim/models/doc2vec.html
class gensim.models.doc2vec.Doc2Vec(
    documents=None,       # list of TaggedDocument elements
    dm=1,                 # training algorithm. dm=1: 'distributed memory' (PV-DM); otherwise 'distributed bag of words' (PV-DBOW)
    dbow_words=0,         # 0 (default); if 1, trains word-vectors simultaneously with the DBOW doc-vectors
    dm_mean=None,         # 0 (default); if 1, use the mean of context word vectors instead of the sum
    dm_concat=0,          # 0 (default); if 1, use concatenation of (all) context vectors (slow)
    dm_tag_count=1,       # 1 (default); expected number of document tags per document when using dm_concat mode
    docvecs=None,
    docvecs_mapfile=None,
    comment=None,
    trim_rule=None,
    **kwargs
)
In [ ]:
dist_memory = 1 # distributed memory model
vector_mean = 1 # compute mean of input word vectors
num_features = 300 # word vector dimensionality
min_word_count = 2 # minimum word count
num_workers = 4 # number of threads to run in parallel
context = 10 # context window size
downsampling = 1e-3 # downsample setting for frequent words
model_name = "model-d2v_dm_%dfeatures_%dminwords_%dcontext" % (num_features, min_word_count, context)
In [ ]:
# load model or create new one
if os.path.isfile(model_name):
    model = Doc2Vec.load(model_name)
    do_train = False
else:
    model = Doc2Vec(dm=dist_memory, dm_mean=vector_mean, size=num_features, min_count=min_word_count,
                    window=context, sample=downsampling, workers=num_workers)
    model.build_vocab(sentences)
    do_train = True
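If a fresh model was built, a quick look at the vocabulary shows how many distinct tokens survived the min_count filter (model.wv.vocab is the gensim 3.x attribute, assumed here):
In [ ]:
if do_train:
    print("vocabulary size: %d" % len(model.wv.vocab))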
In [ ]:
import logging
from random import shuffle
from datetime import datetime
# configure useful logging messages
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [ ]:
def train_model(model, sentences, passes=10, alpha=0.025, min_alpha=0.001):
    alpha_delta = (alpha - min_alpha) / passes
    print("START %s" % datetime.now())
    for epoch in range(passes):
        shuffle(sentences) # shuffling the corpus between passes gets the best results
        # fix the learning rate for this pass; it is decayed manually below
        model.alpha, model.min_alpha = alpha, alpha
        # gensim >= 1.0 requires explicit total_examples and epochs
        model.train(sentences, total_examples=len(sentences), epochs=1)
        print("finished epoch %d (alpha: %f) - %s" % (epoch + 1, alpha, datetime.now()))
        alpha -= alpha_delta
    print("END %s" % str(datetime.now()))
In [ ]:
if do_train:
    train_model(model, sentences, passes=30)
    # finalize model to save memory
    #model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    # save model
    model.save(model_name)
In [ ]:
model.wv.most_similar(["pasta"], topn=20)
In [ ]:
model.docvecs[1]
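Each document vector has num_features dimensions:
In [ ]:
len(model.docvecs[1]) # -> 300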
In [ ]:
recipe_no = 42
ids = model.docvecs.most_similar(recipe_no, topn=20)
ids
In [ ]:
train.loc[[recipe_no] + [doc_id for doc_id, score in ids]][['title','instructions']]
Infer the vector for a known document and use it as a positive example (see also https://groups.google.com/forum/#!msg/gensim/IH_u8HYVbpg/w9TX4yh2DgAJ)
In [ ]:
doc = train['instructions'][recipe_no]
wordvec = model.infer_vector(normalize(doc).split())
ids = model.docvecs.most_similar(positive=[wordvec], topn=20)
ids
In [ ]:
train.loc[[doc_id for doc_id, score in ids]][['title','instructions']]
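A common sanity check (also used in the IMDB notebook this is based on): a vector re-inferred from a training document should rank that document at or near the top of its own similarity list.
In [ ]:
# where does recipe_no rank among its own nearest neighbours?
ranked = [doc_id for doc_id, score in ids]
print("rank of recipe %d: %s" % (recipe_no, ranked.index(recipe_no) if recipe_no in ranked else "not in top 20"))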
In [ ]:
doc = u"Wodka, Cointreau, Limettensaft, Cranberrysaft und Eis."
wordvec = model.infer_vector(normalize(doc).split())
ids = model.docvecs.most_similar(positive=[wordvec], topn=20)
ids
In [ ]:
train.loc[[doc_id for doc_id, score in ids]][['title','instructions']]