Objectives
Train a Doc2Vec model on recipe instructions and query it for similar recipes, both by document id and by inferring vectors for new text.
Based on https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb
In [ ]:
import re # Regular Expressions
import os.path # File Operations
import pandas as pd # DataFrames & Manipulation
from gensim.models.doc2vec import TaggedDocument, Doc2Vec # model training (TaggedDocument replaces the deprecated LabeledSentence)
In [ ]:
train_input = "../data/recipes.tsv.bz2"
# preserve empty strings (http://pandas-docs.github.io/pandas-docs-travis/io.html#na-values)
train = pd.read_csv(train_input, delimiter="\t", quoting=3, encoding="utf-8", keep_default_na=False)
print "loaded %d documents." % len(train)
In [ ]:
def normalize(text):
    norm_text = text.lower()
    # pad punctuation with spaces so it survives splitting as separate tokens
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
        norm_text = norm_text.replace(char, ' ' + char + ' ')
    return norm_text
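As a quick sanity check (the input string below is made up, not taken from the corpus), punctuation ends up as separate tokens:
In [ ]:
# hypothetical input, not from the corpus
print(normalize(u'Cook the pasta, then drain.').split())
# expected: ['cook', 'the', 'pasta', ',', 'then', 'drain', '.']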
In [ ]:
sentences = [TaggedDocument(words=normalize(text).split(), tags=[i]) for i, text in enumerate(train['instructions'])]
print("%d sentences in corpus" % len(sentences))
see http://radimrehurek.com/gensim/models/doc2vec.html
class gensim.models.doc2vec.Doc2Vec(
    documents=None,       # list of TaggedDocument elements
    dm=1,                 # training algorithm. dm=1: 'distributed memory' (PV-DM); otherwise 'distributed bag of words' (PV-DBOW)
    dbow_words=0,         # 0 (default); if 1, trains word-vectors simultaneously with the DBOW doc-vectors
    dm_mean=None,         # 0 (default); if 1, use the mean of context word vectors instead of the sum
    dm_concat=0,          # 0 (default); if 1, use concatenation of (all) context vectors (slow)
    dm_tag_count=1,       # 1 (default); expected number of document tags per document when using dm_concat mode
    docvecs=None,
    docvecs_mapfile=None,
    comment=None,
    trim_rule=None,
    **kwargs
)
In [ ]:
dist_memory = 1 # distributed memory model
vector_mean = 1 # compute mean of input word vectors
num_features = 300 # word vector dimensionality
min_word_count = 2 # minimum word count
num_workers = 4 # number of threads to run in parallel
context = 10 # context window size
downsampling = 1e-3 # downsample setting for frequent words
model_name = "model-d2v_dm_%dfeatures_%dminwords_%dcontext" % (num_features, min_word_count, context)
In [ ]:
# load model or create new one
if os.path.isfile(model_name):
    model = Doc2Vec.load(model_name)
    do_train = False
else:
    model = Doc2Vec(dm=dist_memory, dm_mean=vector_mean, size=num_features, min_count=min_word_count,
                    window=context, sample=downsampling, workers=num_workers)
    model.build_vocab(sentences)
    do_train = True
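If a fresh model was built, a quick look at the vocabulary shows how many distinct tokens survived the min_count filter (model.wv.vocab is the gensim 3.x attribute, assumed here):
In [ ]:
if do_train:
    print("vocabulary size: %d" % len(model.wv.vocab))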
In [ ]:
import logging
from random import shuffle
from datetime import datetime
# configure useful logging messages
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [ ]:
def train_model(model, sentences, passes=10, alpha=0.025, min_alpha=0.001):
    alpha_delta = (alpha - min_alpha) / passes
    print("START %s" % datetime.now())
    for epoch in range(passes):
        shuffle(sentences) # shuffling the corpus between passes gets the best results
        # fix the learning rate for this pass; it is decayed manually below
        model.alpha, model.min_alpha = alpha, alpha
        # gensim >= 1.0 requires explicit total_examples and epochs
        model.train(sentences, total_examples=len(sentences), epochs=1)
        print("finished epoch %d (alpha: %f) - %s" % (epoch + 1, alpha, datetime.now()))
        alpha -= alpha_delta
    print("END %s" % str(datetime.now()))
In [ ]:
if do_train:
    train_model(model, sentences, passes=30)
    # finalize model to save memory
    #model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    # save model
    model.save(model_name)
In [ ]:
model.wv.most_similar(["pasta"], topn=20)
In [ ]:
model.docvecs[1]
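Each document vector has num_features dimensions:
In [ ]:
len(model.docvecs[1]) # -> 300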
In [ ]:
recipe_no = 42
ids = model.docvecs.most_similar(recipe_no, topn=20)
ids
In [ ]:
train.loc[[recipe_no] + [doc_id for doc_id, score in ids]][['title','instructions']]
Infer the vector for a known document and use it as a positive example (see also https://groups.google.com/forum/#!msg/gensim/IH_u8HYVbpg/w9TX4yh2DgAJ)
In [ ]:
doc = train['instructions'][recipe_no]
wordvec = model.infer_vector(normalize(doc).split())
ids = model.docvecs.most_similar(positive=[wordvec], topn=20)
ids
In [ ]:
train.loc[[doc_id for doc_id, score in ids]][['title','instructions']]
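A common sanity check (also used in the IMDB notebook this is based on): a vector re-inferred from a training document should rank that document at or near the top of its own similarity list.
In [ ]:
# where does recipe_no rank among its own nearest neighbours?
ranked = [doc_id for doc_id, score in ids]
print("rank of recipe %d: %s" % (recipe_no, ranked.index(recipe_no) if recipe_no in ranked else "not in top 20"))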
In [ ]:
doc = u"Wodka, Cointreau, Limettensaft, Cranberrysaft und Eis."
wordvec = model.infer_vector(normalize(doc).split())
ids = model.docvecs.most_similar(positive=[wordvec], topn=20)
ids
In [ ]:
train.loc[[doc_id for doc_id, score in ids]][['title','instructions']]