About

Visualization of recipe ingredients with t-SNE.

The visualization is based on the word vectors produced by a trained Word2Vec model.


In [ ]:
import pandas as pd                             # DataFrames & Manipulation
from gensim.models.word2vec import Word2Vec     # Word2Vec model
from sklearn.preprocessing import LabelEncoder  # Category Encoder

In [ ]:
# Path of the saved Word2Vec model, as written by gensim's save().
model_file = "recipes-words_300features_40minwords_10context"

# Load the trained model from disk.
model = Word2Vec.load(model_file)

# Parenthesised single-argument print works on both Python 2 and Python 3.
print("The model contains %d words." % len(model.wv.vocab))

Prepare Ingredients

Load additional ingredient information from a separate file (it contains the 'name' and 'productGroup' of every ingredient).


In [ ]:
# Tab-separated ingredient table; the 'name' and 'productGroup' columns are used below.
ingredient_file = "../data/ingredients.tsv"

# quoting=3 is csv.QUOTE_NONE (quote characters are treated as ordinary text);
# keep_default_na=False stops pandas from turning strings such as "NA" or "" into NaN.
ingredients = pd.read_csv(
    ingredient_file,
    sep="\t",
    quoting=3,
    encoding="utf-8",
    keep_default_na=False,
)

# store the product group as a categorical column
ingredients = ingredients.astype({'productGroup': 'category'})

ingredients.head()

Prepare lookup table for ingredient categories


In [ ]:
# Encode ingredient categories as integers.
# fit_transform() fits the encoder and returns the integer codes in one step,
# so a separate fit() call beforehand is redundant; `le` is still fitted afterwards.
le = LabelEncoder()
categories = le.fit_transform(ingredients['productGroup'])

# Lower-cased ingredient names, in the same row order as `categories`.
name_lower = [name.lower() for name in ingredients['name']]

# Map lower-cased name -> category code. If a lower-cased name occurs in more
# than one row, the last row's category wins (dict semantics).
lookup = dict(zip(name_lower, categories))

Match ingredient names with words in model


In [ ]:
# Keep only vocabulary words that exactly match a lower-cased ingredient name.
# `lookup` has exactly the entries of `name_lower` as keys, so testing membership
# in the dict gives the same result as scanning the list, but in constant time
# per word instead of a linear scan for every vocabulary entry.
known_words = [word for word in model.wv.vocab if word in lookup]

# Category code of each matched word, aligned with `known_words`.
word_cats = [lookup[item] for item in known_words]

# Get vector weights through the KeyedVectors object (`model.wv`), consistent
# with the `model.wv.vocab` accesses above; indexing the model itself is deprecated.
X = model.wv[known_words]

# Parenthesised single-argument print works on both Python 2 and Python 3.
print("found %d of %d ingredients (exact match) in %d words." % (len(known_words), len(name_lower), len(model.wv.vocab)))

Visualise Ingredients using t-SNE

See also: http://alexanderfabisch.github.io/t-sne-in-scikit-learn.html


In [ ]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [ ]:
# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Embed the word vectors in 2 dimensions; random_state is fixed so reruns use the same seed.
tsne = TSNE(random_state=0, perplexity=50, n_components=2)
X_tsne = tsne.fit_transform(X)

In [ ]:
# Scatter plot of the 2-D t-SNE embedding, one point per matched word,
# coloured by its integer category code.
fig, ax = plt.subplots()
ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=word_cats)
plt.show()

In [ ]:
# PCA with default settings; only the first two columns of the result are plotted below.
pca = PCA()
X_pca = pca.fit_transform(X)

In [ ]:
# Scatter plot of the first two PCA columns, coloured by integer category code.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=word_cats)
# Render the figure explicitly, as the t-SNE plotting cell does; without this the
# cell's last expression echoes the PathCollection repr into the output.
plt.show()