In [ ]:
import pandas as pd # DataFrames & Manipulation
from gensim.models.word2vec import Word2Vec # Word2Vec model
from sklearn.preprocessing import LabelEncoder # Category Encoder
In [ ]:
# Load the previously trained Word2Vec model from disk.
# (The file name suggests 300 features / min count 40 / context 10 -- not verified here.)
model_file = "recipes-words_300features_40minwords_10context"
model = Word2Vec.load(model_file)
# A parenthesised print with one pre-formatted argument is valid on both
# Python 2 and Python 3, unlike the bare print statement.
print("The model contains %d words." % len(model.wv.vocab))
In [ ]:
# Read the tab-separated ingredient table into a DataFrame.
ingredient_file = "../data/ingredients.tsv"
ingredients = pd.read_csv(
    ingredient_file,
    delimiter="\t",
    quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are treated as plain text
    encoding="utf-8",
    keep_default_na=False,  # do not turn the default NA markers (e.g. empty strings) into NaN
)

# Store the product group column as a pandas categorical.
ingredients['productGroup'] = ingredients['productGroup'].astype('category')

# Preview the first rows (rendered as the cell output in a notebook).
ingredients.head()
Prepare lookup table for ingredient categories
In [ ]:
# encode ingredient categories as integers
le = LabelEncoder()
# fit_transform() both fits the encoder and returns the encoded labels, so a
# separate fit() call on the same column beforehand is redundant.
categories = le.fit_transform(ingredients['productGroup'])
# Lower-cased ingredient names are used as the lookup keys.
name_lower = [name.lower() for name in ingredients['name']]
# Map lower-cased ingredient name -> integer category code.
# If a name occurs more than once, the last occurrence wins.
lookup = dict(zip(name_lower, categories))
Match ingredient names with words in the model
In [ ]:
# Keep only vocabulary words that exactly match a (lower-cased) ingredient name.
# `lookup` is keyed by exactly the entries of `name_lower`, so testing against
# the dict gives the same result as testing against the list, but each test is
# a hash lookup instead of a linear scan.
known_words = [word for word in model.wv.vocab if word in lookup]
# Integer category code for every matched word, in the same order as known_words.
word_cats = [lookup[item] for item in known_words]
# Index through model.wv (as the vocabulary access above does) rather than the
# deprecated model[...] shortcut.
X = model.wv[known_words] # get vector weights
print("found %d of %d ingredients (exact match) in %d words." % (len(known_words), len(name_lower), len(model.wv.vocab)))
See also: http://alexanderfabisch.github.io/t-sne-in-scikit-learn.html
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
In [ ]:
# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Two-dimensional t-SNE embedding of the matched word vectors, with a fixed
# random_state.
tsne = TSNE(
    n_components=2,
    perplexity=50,
    random_state=0,
)
X_tsne = tsne.fit_transform(X)
In [ ]:
# Scatter plot of the t-SNE embedding, one point per matched word,
# coloured by its integer ingredient-category code.
plt.scatter(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    c=word_cats,
)
plt.show()
In [ ]:
# Project the same word vectors with PCA for comparison with the t-SNE result.
# No n_components is passed, so scikit-learn's default is used; only the first
# two columns of X_pca are plotted in the following cell.
X_pca = PCA().fit_transform(X)
In [ ]:
# Scatter plot of the first two PCA components, one point per matched word,
# coloured by its integer ingredient-category code.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=word_cats)
# Display the figure explicitly, as the t-SNE plot cell does; without this the
# plot is not shown when the code runs outside an inline notebook backend.
plt.show()