Doc2Vec trained on recipe instructions

Objectives

  • Create word embeddings for recipes.
  • Use word vectors for (traditional) segmentation, classification, and retrieval of recipes.

Based on https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

Data Preparation


In [ ]:
import re                                                    # Regular Expressions
import os.path                                               # File Operations
import pandas as pd                                          # DataFrames & Manipulation
from gensim.models.doc2vec import LabeledSentence, Doc2Vec   # Model training

In [ ]:
train_input = "../data/recipes.tsv.bz2"

# preserve empty strings (http://pandas-docs.github.io/pandas-docs-travis/io.html#na-values)
train = pd.read_csv(train_input, delimiter="\t", quoting=3, encoding="utf-8", keep_default_na=False)

print "loaded %d documents." % len(train)

Text Normalization


In [ ]:
def normalize(text):
    norm_text = text.lower()
    
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
        norm_text = norm_text.replace(char, ' ' + char + ' ')
    
    return norm_text
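
A quick illustration (the sample instruction below is made up, not taken from the corpus): normalize only lowercases the text and pads punctuation with spaces, so a plain split() then treats punctuation marks as separate tokens.

In [ ]:
# illustrative only: a made-up instruction, not taken from the corpus
print(normalize(u"Bring water to a boil, add the pasta and cook al dente!").split())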

In [ ]:
sentences = [LabeledSentence(normalize(text).split(), [i]) for i, text in enumerate(train['instructions'])]

print "%d sentences in corpus" % len(sentences)

Doc2Vec Model

see http://radimrehurek.com/gensim/models/doc2vec.html

class gensim.models.doc2vec.Doc2Vec(
    documents=None,          # list of TaggedDocument elements
    dm=1,                    # training algorithm. dm=1: 'distributed memory' (PV-DM);
                             #   otherwise: 'distributed bag of words' (PV-DBOW).
    dbow_words=0,            # 0 (default), if 1, trains word vectors simultaneously with the DBOW doc-vectors
    dm_mean=None,            # 0 (default), if 1, use the mean of context word vectors instead of sum.
    dm_concat=0,             # 0 (default), if 1, use concatenation of (all) context vectors (slow).
    dm_tag_count=1,          # 1 (default), expected number of document tags per document when using dm_concat mode
    docvecs=None,
    docvecs_mapfile=None,
    comment=None,
    trim_rule=None,
    **kwargs
)
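
Note that the corpus built above consists of LabeledSentence objects; in the gensim versions this notebook targets, LabeledSentence is an older alias for TaggedDocument, and both simply bundle a list of word tokens with a list of tags. A minimal sketch (the tokens are made up) of what one corpus element looks like:

In [ ]:
# illustrative only: one corpus element; sentences[0] above has the same structure
from gensim.models.doc2vec import TaggedDocument
TaggedDocument(words=[u'bring', u'water', u'to', u'a', u'boil', u'.'], tags=[0])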

Model Setup


In [ ]:
dist_memory    =   1   # distributed memory model
vector_mean    =   1   # compute mean of input word vectors
num_features   = 300   # word vector dimensionality
min_word_count =   2   # minimum word count
num_workers    =   4   # number of threads to run in parallel
context        =  10   # context window size
downsampling   = 1e-3  # downsample setting for frequent words

model_name = "model-d2v_dm_%dfeatures_%dminwords_%dcontext" % (num_features, min_word_count, context)

In [ ]:
# load model or create new one
if os.path.isfile(model_name):
    model = Doc2Vec.load(model_name)
    do_train = False
else:
    model = Doc2Vec(dm=dist_memory, dm_mean=vector_mean, size=num_features, min_count=min_word_count,
                    window=context, sample=downsampling, workers=num_workers)
    model.build_vocab(sentences)
    do_train = True

Model Training

Train for multiple epochs with a decreasing learning rate.


In [ ]:
import logging
from random import shuffle
from datetime import datetime

# configure useful logging messages
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [ ]:
def train_model(model, sentences, passes=10, alpha=0.025, min_alpha=0.001):
    alpha_delta = (alpha - min_alpha) / passes

    print("START %s" % datetime.now())

    for epoch in range(passes):
        shuffle(sentences)  # shuffling gets best results

        model.alpha, model.min_alpha = alpha, alpha  # fix the learning rate for this pass
        # newer gensim releases require explicit total_examples and epochs here
        model.train(sentences, total_examples=len(sentences), epochs=1)

        print("finished epoch %d (alpha: %f) - %s" % (epoch + 1, alpha, datetime.now()))

        alpha -= alpha_delta

    print("END %s" % str(datetime.now()))

In [ ]:
if do_train:
    train_model(model, sentences, passes=30)

    # finalize model to save memory
    #model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

    # save model
    model.save(model_name)

Model Results

Word Embeddings


In [ ]:
model.wv.most_similar(["pasta"], topn=20)
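
The word-vector part of the model behaves like a regular word2vec KeyedVectors object, so a few further sanity checks are possible (this assumes "pasta" actually occurs in the trained vocabulary, as in the query above):

In [ ]:
# illustrative only: basic checks on the word vectors
print("vocabulary size: %d" % len(model.wv.vocab))
print("pasta" in model.wv.vocab)   # the word queried above
print(model.wv["pasta"].shape)     # should be (num_features,)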

Document Representation


In [ ]:
model.docvecs[1]
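
The document vectors are indexed by the tags assigned above (here the plain enumeration index i), and each has the same dimensionality as configured for the word vectors. An illustrative check:

In [ ]:
# illustrative only: one vector per recipe tag, num_features dimensions each
print(model.docvecs[1].shape)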

Similar Documents


In [ ]:
recipe_no = 42
ids = model.docvecs.most_similar(recipe_no, topn=20)
ids

In [ ]:
train.loc[[recipe_no]+[id for id, score in ids]][['title','instructions']]

Compute vector for existing document

Infer a vector for the existing document and use it as a positive example (see also https://groups.google.com/forum/#!msg/gensim/IH_u8HYVbpg/w9TX4yh2DgAJ).


In [ ]:
doc = train['instructions'][recipe_no]
wordvec = model.infer_vector(normalize(doc).split())
ids = model.docvecs.most_similar(positive=[wordvec], topn=20)
ids

In [ ]:
train.loc[[id for id, score in ids]][['title','instructions']]
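
Since the vector is inferred for a document that is already part of the training set, the stored recipe itself should normally rank among the nearest neighbours. A quick, illustrative check:

In [ ]:
# illustrative only: does the original recipe appear among its own neighbours?
print(recipe_no in [id for id, score in ids])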

Compute vector for new document


In [ ]:
doc = u"Wodka, Cointreau, Limettensaft, Cranberrysaft und Eis."
wordvec = model.infer_vector(normalize(doc).split())
ids = model.docvecs.most_similar(positive=[wordvec], topn=20)
ids

In [ ]:
train.loc[[id for id, score in ids]][['title','instructions']]