About

Analysis of recipes with doc2vec.

Prerequisites

  • Python libraries (an install sketch follows this list):
    • nltk - Natural Language Toolkit; used here for its Punkt sentence tokenizer.
    • gensim - provides the doc2vec implementation; cython is recommended so training runs at full speed.
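
The libraries can be installed from PyPI, e.g. from within the notebook (a sketch; BeautifulSoup and pandas are also used below):

In [ ]:
!pip install nltk gensim cython beautifulsoup4 pandas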

In [ ]:
import re                          # Regular Expressions
import pandas as pd                # DataFrames & Manipulation
import nltk.data                   # Sentence tokenizer
from bs4 import BeautifulSoup      # HTML processing
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

In [ ]:
train_input = "../data/recipes.tsv.bz2"

# keep empty strings (http://pandas-docs.github.io/pandas-docs-travis/io.html#na-values)
train = pd.read_csv(train_input, delimiter="\t", quoting=3, encoding="utf-8", keep_default_na=False)
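
A quick sanity check that the recipes loaded as expected; the 'instructions' column is what gets embedded below:

In [ ]:
print(train.shape)
train.head()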

In [ ]:
# download the Punkt sentence tokenizer model and initialize it for German
nltk.download("punkt")

tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
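
A quick illustration of what the Punkt model does with running German text (the sample sentence is made up):

In [ ]:
# "Knead the dough. Then let it rise in a warm place for 30 minutes."
tokenizer.tokenize("Den Teig kneten. Danach 30 Minuten an einem warmen Ort gehen lassen.")
# expected: ['Den Teig kneten.', 'Danach 30 Minuten an einem warmen Ort gehen lassen.']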

In [ ]:
def normalize(text):
    """
    Remove HTML markup and non-letter characters, convert to lower case.
    Return a list of words.
    """

    # remove HTML markup with BeautifulSoup (joining on spaces keeps words separated)
    plain_text = " ".join(BeautifulSoup(text, 'html.parser').strings)

    # retain only letters (including German umlauts and ß)
    only_letters = re.sub(u"[^a-zA-ZäöüÄÖÜß]", " ", plain_text)

    # split into lower-case words
    words = only_letters.lower().split()

    return words

def split_sentences(text):
    """ Split text into sentences and clean each one; drop sentences with no words left. """
    return [words for words in (normalize(sentence) for sentence in tokenizer.tokenize(text)) if words]
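
A short check of the cleaning pipeline on a made-up snippet containing HTML markup:

In [ ]:
normalize("<b>Zwiebeln</b> würfeln und glasig dünsten!")
# expected: ['zwiebeln', 'würfeln', 'und', 'glasig', 'dünsten']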

In [ ]:
sentences = []

size = train['instructions'].size

for i in range(size):

    if (i+1) % 10000 == 0:
        print("Processing %d of %d recipes." % (i+1, size))

    # either keep the complete text or split it into sentences but label all parts with the same ID
    sentences.append(TaggedDocument(normalize(train['instructions'][i]), [i]))
    #sentences += [TaggedDocument(words, [i]) for words in split_sentences(train['instructions'][i])]

In [ ]:
print "Total: %d sentences.\n" % len(sentences)

In [ ]:
# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

In [ ]:
# Import the built-in logging module and configure it so that gensim logs training progress
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [ ]:
print "Training model..."
model = Doc2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)
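
Training can take a while, so persisting the trained model may be worthwhile; a minimal sketch (the file name is arbitrary):

In [ ]:
model.save("recipes.d2v")
# later sessions: model = Doc2Vec.load("recipes.d2v")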

In [ ]:
# recipes most similar to the recipe with ID 1, as (ID, cosine similarity) pairs
similar = model.dv.most_similar(1)
ids = [k for (k, v) in similar]
ids

In [ ]:
similar

In [ ]:
train.loc[ids]

In [ ]:
# words most similar to 'pasta' in the learned word vector space
model.wv.most_similar('pasta')
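
The model can also embed text it has never seen via infer_vector and find the closest training recipes; a sketch reusing normalize from above (the query string is made up):

In [ ]:
# "Cook pasta in boiling salted water until al dente."
new_vec = model.infer_vector(normalize("Nudeln in kochendem Salzwasser bissfest garen."))
model.dv.most_similar([new_vec])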