Word2Vec trained on recipe instructions

Objectives

  • Create word embeddings from recipe instructions.
  • Use word vectors for (traditional) segmentation, classification, and retrieval of recipes.

Data Preparation


In [ ]:
import re                                    # Regular Expressions
import pandas as pd                          # DataFrames & Manipulation
from gensim.models.word2vec import Word2Vec

In [ ]:
train_input = "../data/recipes.tsv.bz2"

# preserve empty strings (http://pandas-docs.github.io/pandas-docs-travis/io.html#na-values)
train = pd.read_csv(train_input, delimiter="\t", quoting=3, encoding="utf-8", keep_default_na=False)

print "loaded %d documents." % len(train)

In [ ]:
train[['title', 'instructions']].head()

Input Normalization

The instruction texts do not need specific filtering of special characters, stop words, etc.; it is enough to lowercase them and pad punctuation with spaces so that splitting on whitespace yields tokens.


In [ ]:
def normalize(text):
    norm_text = text.lower()
    
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
        norm_text = norm_text.replace(char, ' ' + char + ' ')
    
    return norm_text
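
A quick sanity check of the normalization on a made-up instruction (the example sentence below is ours, not taken from the data):

In [ ]:
# hypothetical example sentence, not from the corpus
normalize("Cook the pasta al dente, then drain (reserve 1 cup of water).")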

In [ ]:
sentences = [normalize(text).split() for text in train['instructions']]
print "%d documents in corpus" % len(sentences)

Word2Vec Model

see http://radimrehurek.com/gensim/models/word2vec.html

class gensim.models.word2vec.Word2Vec(
->  sentences=None,                        # iterable of sentences (list of words)
->  size=100,                              # feature vector dimension
    alpha=0.025,                           # initial learning rate (drops to min_alpha during training)
->  window=5,                              # maximum distance between current and predicted word
->  min_count=5,                           # ignore words with lower total frequency
    max_vocab_size=None,                   # limit RAM to most frequent words (1M words ~ 1GB)
    sample=0.001,                          # threshold for random downsampling of high frequency words
    seed=1,                                # for random number generator
->  workers=3,                             # number of worker threads
    min_alpha=0.0001,                      # used for linear learning-rate decay
->  sg=0,                                  # training algorithm - (sg=0) CBOW, (sg=1) skip-gram
    hs=0,                                  # use hierarchical softmax (if 1), or negative sampling (default)
    negative=5,                            # number of noise words used for negative sampling
    cbow_mean=1,                           # use sum (0) or mean (1, default) of the context word vectors
    hashfxn=<built-in function hash>,
    iter=5,                                # number of iterations (epochs) over the corpus
    null_word=0,
    trim_rule=None,                        # custom vocabulary filtering
    sorted_vocab=1,                        # sort vocab by descending word frequency
    batch_words=10000                      # size of batches (in words) passed to worker threads
)
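
When `sentences` is passed to the constructor, vocabulary building and training happen in one step. As a rough sketch (assuming a gensim 3.x-style API), the same can also be done in two explicit steps, which is handy when the corpus is streamed or training is repeated:

In [ ]:
# sketch: build the vocabulary first, then train explicitly
w2v = Word2Vec(size=100, window=5, min_count=5, workers=3)        # no sentences yet
w2v.build_vocab(sentences)                                        # scan corpus, collect vocabulary
w2v.train(sentences, total_examples=w2v.corpus_count, epochs=5)   # then train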

Define model training parameters


In [ ]:
num_features   = 100   # Word vector dimensionality
min_word_count =  10   # Minimum word count
num_workers    =   4   # Number of threads to run in parallel
context        =  10   # Context window size
downsampling   = 1e-3  # Downsample setting for frequent words

In [ ]:
# Import the built-in logging module and configure it so that Word2Vec creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Training CBOW model

Training takes about 3 minutes on the example data.


In [ ]:
print "Training CBOW model..."
model = Word2Vec(
    sentences, 
    workers=num_workers,
    size=num_features,
    min_count = min_word_count,
    window = context,
    sample = downsampling)

In [ ]:
# precompute L2-normalized vectors in place; saves memory, but no further training is possible afterwards.
model.init_sims(replace=True)

model_name = "model-w2v_cbow_%dfeatures_%dminwords_%dcontext" % (num_features, min_word_count, context)
model.save(model_name)
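
The persisted model can later be loaded again without retraining (standard gensim API; the file name is the one defined above):

In [ ]:
# reload the saved model from disk
reloaded = Word2Vec.load(model_name)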

Model Details


In [ ]:
print "%d words in vocabulary." % len(model.wv.vocab)
vocab = [(k, v.count) for k, v in model.wv.vocab.items()]
pd.DataFrame.from_records(vocab, columns=['word', 'count']).sort_values('count', ascending=False).reset_index(drop=True)
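
Each vocabulary word now maps to a dense vector with `num_features` dimensions, e.g. (assuming "pasta" survived the `min_count` filter):

In [ ]:
vec = model.wv["pasta"]   # numpy array with num_features entries
print(vec.shape)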

Word Similarity


In [ ]:
model.most_similar("pasta", topn=20)

In [ ]:
model.most_similar("ofen")

Training skip-gram model

Training takes about 14 minutes on the example data.


In [ ]:
print "Training skip-gram model..."
model2 = Word2Vec(
    sentences,
    sg = 1,
    hs = 1,
    workers=num_workers,
    size=num_features,
    min_count = min_word_count,
    window = context,
    sample = downsampling)

In [ ]:
# precompute L2-normalized vectors in place; saves memory, but no further training is possible afterwards.
model2.init_sims(replace=True)

model_name = "recipes_skip-gram_%dfeatures_%dminwords_%dcontext" % (num_features, min_word_count, context)
model2.save(model_name)

In [ ]:
model2.most_similar("pasta")

In [ ]:
model2.most_similar("ofen")