Analysis of recipes with word2vec.
Partially based on https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors
In [ ]:
import re # Regular Expressions
import pandas as pd # DataFrames & Manipulation
import nltk.data # Sentence tokenizer
from nltk.corpus import stopwords # Import the stop word list
from bs4 import BeautifulSoup # HTML processing
from gensim.models.word2vec import Word2Vec
In [ ]:
train_input = "../data/recipes.tsv.bz2"
# keep empty strings (http://pandas-docs.github.io/pandas-docs-travis/io.html#na-values)
train = pd.read_csv(train_input, delimiter="\t", quoting=3, encoding="utf-8", keep_default_na=False)
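A quick sanity check on the loaded frame (a sketch; the steps below only rely on the instructions column being present):
In [ ]:
print(train.shape)
train.head(3)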
Prepare text processing
In [ ]:
# download the tokenizer model and stop word lists, then load the German sentence tokenizer
nltk.download("punkt")
nltk.download("stopwords")
tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
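A quick check that the tokenizer splits running German text into sentences (the sample string is made up, not from the data set):
In [ ]:
tokenizer.tokenize("Die Zwiebeln würfeln. Dann in heißem Öl glasig dünsten.")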
Define functions for cleaning the text data.
In [ ]:
def clean_text(raw_text, remove_stopwords=False):
    """
    Remove HTML, non-letter characters, and convert to lower case.
    Return a list of words.
    """
    # remove HTML markup with BeautifulSoup (and keep spaces after removal)
    plain_text = " ".join(BeautifulSoup(raw_text, 'html.parser').strings)
    # retain only letters (including German umlauts and ß)
    only_letters = re.sub("[^a-zA-ZäöüÄÖÜß]", " ", plain_text)
    # get lower case words
    words = only_letters.lower().split()
    # optionally remove German stop words (the recipes are German text)
    if remove_stopwords:
        stops = set(stopwords.words("german"))
        words = [w for w in words if w not in stops]
    return words

def clean_sentences(text, remove_stopwords=False):
    """
    Split text into sentences and clean each one.
    Return a list of word lists, dropping empty sentences.
    """
    sentences = tokenizer.tokenize(text)
    cleaned = [clean_text(line, remove_stopwords) for line in sentences]
    return [words for words in cleaned if words]
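A small demonstration of the cleaning pipeline (the snippet is illustrative, not from the data set):
In [ ]:
example = "<p>Die Zwiebeln würfeln. In heißem Öl glasig dünsten!</p>"
clean_sentences(example)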
In [ ]:
sentences = []
size = train['instructions'].size
for i in range(size):
    if (i + 1) % 1000 == 0:
        print("Processing %d of %d recipes." % (i + 1, size))
    sentences += clean_sentences(train['instructions'][i])
In [ ]:
print "Total: %d sentences.\n" % len(sentences)
print "Example Sentences:\n", "\n".join([",".join(sentence) for sentence in sentences[0:3]])
In [ ]:
# Set values for various parameters
num_features = 300 # Word vector dimensionality
min_word_count = 40 # Minimum word count
num_workers = 4 # Number of threads to run in parallel
context = 10 # Context window size
downsampling = 1e-3 # Downsample setting for frequent words
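To get a feel for the downsampling threshold: the original word2vec paper discards each occurrence of a word with relative frequency f with probability 1 - sqrt(t/f), where t is the threshold set above (gensim's exact formula differs slightly, but the effect is comparable). For a hypothetical word covering 1% of the corpus:
In [ ]:
import math
f = 0.01  # hypothetical relative frequency of a very common word
print("discard probability: %.2f" % (1 - math.sqrt(downsampling / f)))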
In [ ]:
# import the built-in logging module and configure it so that gensim reports training progress
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [ ]:
print "Training model..."
model = Word2Vec(sentences, workers=num_workers, \
size=num_features, min_count = min_word_count, \
window = context, sample = downsampling)
In [ ]:
print "%d words in vocabular." % len(model.wv.vocab)
Save the model for later use. Note that init_sims(replace=True) below discards the raw vectors, so the saved model cannot be trained further.
In [ ]:
# precompute the L2-normalized vectors and discard the originals;
# this makes the model much more memory-efficient, but ends training
model.init_sims(replace=True)
model_name = "recipes-words_%dfeatures_%dminwords_%dcontext" % (num_features, min_word_count, context)
model.save(model_name)
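The saved model can be restored in a later session (a minimal sketch; the file name follows from the parameter values above):
In [ ]:
model = Word2Vec.load("recipes-words_300features_40minwords_10context")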
Some model evaluation examples:
In [ ]:
model.wv.doesnt_match("milch brot eier".split())
In [ ]:
model.most_similar("braten")
In [ ]:
model.most_similar("pasta")
In [ ]:
model.most_similar("brownies")
In [ ]:
model.most_similar("lasagne")