By Eka Renardi. October 29, 2015
Kaggle posed a challenge to predict the category of a cuisine from its recipe ingredients. This document outlines my approach to solving this Kaggle competition, What's Cooking.
Yummly graciously provided the dataset for this challenge. The dataset consists of a train set and a test set. The train set lists the cuisine and the ingredients for each recipe; the test set lists only the ingredients. A sample training record is as follows:
{
"id": 25693,
"cuisine": "southern_us",
"ingredients": [
"plain flour",
"ground pepper",
"salt",
"tomatoes",
"ground black pepper",
"thyme",
"eggs",
"green tomatoes",
"yellow corn meal",
"milk",
"vegetable oil"
]
}
Attributes: id (the recipe identifier), cuisine (the response label), and ingredients (a list of ingredient strings).
In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from collections import Counter, defaultdict
import seaborn as sns
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2, SelectPercentile, SelectKBest, f_classif
from sklearn.cross_validation import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.decomposition import KernelPCA, TruncatedSVD
from sklearn.lda import LDA
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.svm import LinearSVC
from sklearn import metrics
from scipy.stats import sem
from sklearn.manifold import MDS
from sklearn.metrics import euclidean_distances, roc_curve, auc
from sklearn.datasets import make_classification
import matplotlib.pylab as pyl
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
sns.set_style('whitegrid')
In [2]:
df = pd.read_json('../data/train.json')
In [3]:
cuisines = df.cuisine.value_counts(sort=True)
cuisines.plot(kind="bar", figsize=(12,6), title="Number of Recipes per Cuisine")
print("shape:", df.shape)
print("unique cuisine count:", len(df.cuisine.unique()))
In [4]:
# count ingredient usage overall, per cuisine, and per recipe
cuisine_dict = defaultdict(Counter)
index_dict = defaultdict(Counter)
total_counter = Counter()
# iterate each row
for index, row in df.iterrows():
# get the counters
row_counter = index_dict[row.id]
local_counter = cuisine_dict[row.cuisine]
# count the ingredients usage
arr = row.ingredients
for ingredient in row.ingredients:
key = ingredient.lower()
# increment each word count
total_counter[key] += 1
local_counter[key] += 1
row_counter[key] += 1
    # no reassignment needed: the defaultdicts hand back the same Counter
    # objects, which are mutated in place
In [5]:
# plotting most common ingredients
most_common_ingredients = pd.DataFrame(total_counter.most_common(10), columns=["ingredient", "count"])
most_common_ingredients.plot(kind="bar", x="ingredient", figsize=(10,6), title="10 Most Common Ingredients")
Out[5]:
Let's draw a word cloud for the top 100 ingredients.
In [6]:
wordcloudobj = WordCloud( background_color='white')
wordcloud = wordcloudobj.generate_from_frequencies(total_counter.most_common(100))
plt.imshow(wordcloud)
plt.axis("off")
plt.title("Most common ingredients")
plt.show()
Now let's look at the 100 most common ingredients used for each cuisine.
In [7]:
for key, value in cuisine_dict.iteritems():
wc = wordcloudobj.generate_from_frequencies(value.most_common(100))
plt.imshow(wc)
plt.title("100 most common ingredients for '%s' cuisine" % key)
plt.axis("off")
plt.show()
In [8]:
# bar chart of the 20 most common ingredients for each cuisine
for key, value in cuisine_dict.iteritems():
mc = pd.DataFrame(value.most_common(20), columns=["ingredient", "count"])
mc.plot(kind="bar", x="ingredient", figsize=(10,6), title="20 Most Common Ingredients")
plt.title("20 most common ingredients for '%s' cuisine" % key)
plt.show()
In [9]:
# distribution of ingredient-name lengths (in words) for each cuisine
stemmer = PorterStemmer()
ngram_counters = {}
for key, value in cuisine_dict.iteritems():
rng = [len(word_tokenize(stemmer.stem(wc))) for wc, val in value.iteritems()]
local_ngram_counter = Counter(rng)
ngram_counters[key] = local_ngram_counter
local_series = pd.Series(local_ngram_counter)
local_series.plot(kind="bar", figsize=(10,6), title="NGram for %s cuisine" % key)
plt.show()
In [10]:
# vector of ngram for each cuisine
ngram_index = {}
for key, value in ngram_counters.iteritems():
ngram_index[key] = dict(value)
df_ngram = pd.DataFrame.from_dict(ngram_index, orient='index')
df_ngram = df_ngram.fillna(0)
df_ngram
Out[10]:
In [11]:
# percentage of ingredient names at each word length, across all cuisines
totalcount = df_ngram.sum().sum()
ngram_sum = df_ngram.sum().groupby(level=0).apply(lambda x: 100*x/float(totalcount))
ngram_sum.plot(kind="bar", title="Percentage of Ingredient Names by Word Count")
ngram_sum
Out[11]:
Most ingredient names, about 91%, are 1 to 3 words long.
In [12]:
# get a vector of cuisines and ingredients
index1 = {}
for key, value in cuisine_dict.iteritems():
index1[key] = dict(value)
df1 = pd.DataFrame.from_dict(index1, orient='index')
df1 = df1.fillna(0)
cuisine_labels = [key for key, value in df1.iterrows()]
ingredients_labels = list(df1.columns)
# calculating the table of probabilities for the cuisines
inst_count = df1.sum(axis='columns')
df1_prob = df1.div(inst_count, axis='rows')
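To make the row normalization concrete, here is a minimal sketch on made-up counts (the ingredient numbers are purely illustrative):
toy = pd.DataFrame({'salt': [2, 1], 'cumin': [0, 3]}, index=['a', 'b'])
row_totals = toy.sum(axis='columns')         # a -> 2, b -> 4
toy_prob = toy.div(row_totals, axis='rows')  # each row now sums to 1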
In [13]:
mds = MDS(n_components=2, random_state=1)
X = mds.fit_transform(df1_prob.astype(np.float64))
x1 = X[:,0]
y1 = X[:,1]
In [14]:
pyl.rcParams['figure.figsize'] = (16.0, 12.0)
fig, ax = plt.subplots()
ax=plt.scatter(x1,y1)
plt.title('Profiles of Cuisines')
colors = {
'brazilian': '#216b22',
'british': '#3c00ff',
'cajun_creole': '#216b22',
'chinese': '#f24e06',
'filipino': '#f24e06',
'french': '#3c00ff',
'greek': '#3c00ff',
'indian': '#f24e06',
'irish': '#3c00ff',
'italian': '#3c00ff',
'jamaican': '#216b22',
'japanese': '#f24e06',
'korean': '#f24e06',
'mexican': '#216b22',
'moroccan': '#3c00ff',
'russian': '#3c00ff',
'southern_us': '#216b22',
'spanish': '#3c00ff',
'thai': '#f24e06',
'vietnamese': '#f24e06'
}
for i, name in enumerate(cuisine_labels):
plt.annotate(name, (x1[i], y1[i]), color=colors[name], xytext=(7,1), textcoords='offset points')
plt.show()
It appears that there is some grouping of cuisines by their ingredient profiles.
The response variable for the model is the cuisine. It is a categorical variable.
The predictor variable is the ingredients. It is a list of text.
Given that this is a text classification problem, the following data cleaning steps are performed: replacing branded or verbose ingredient names with simpler forms via a thesaurus file, removing custom stop words, and stemming with the Porter stemmer.
Once the data is cleaned, features are engineered: TF-IDF vectors over the ingredient text, word and character n-grams, and an ingredient-count feature.
The model is subsequently developed following the progression below. For each model, I select the variant with the highest accuracy score, cross-validated across 5 folds.
The final model is then evaluated against held-out data, and predictions are produced for the test.json data.
In [15]:
# read the data again
df = pd.read_json('../data/train.json')
In [16]:
import matplotlib.pylab as pylab
pylab.rcParams['figure.figsize'] = (8.0, 8.0)
In [17]:
# replacement data. Philadelphia Cream Cheese -> cream cheese
thesauri = {}
with open("../code/thesauri.txt", "r") as f:
for line in f:
entry = line.strip().split(",")
key = entry[0]
value = entry[1]
thesauri[key] = value
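Each line of thesauri.txt is a comma-separated pair mapping a verbose or branded ingredient name to a simpler form. Illustrative entries (the first comes from the comment above; the second is hypothetical):
Philadelphia Cream Cheese,cream cheese
Bertolli Classico Olive Oil,olive oil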
In [18]:
# custom stop words
stopwords = []
with open("../code/stopwords.txt", "r") as f:
for line in f:
stopwords.append(line.strip())
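stopwords.txt holds one token per line. Hypothetical entries might look like:
fresh
chopped
large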
In [19]:
# clean ingredients
def clean_ingredients(ingredients):
stemmer = PorterStemmer()
new_ingredients = []
for one in ingredients:
# if match thesauri, then use its simpler form
newone = one
if one in thesauri:
newone = thesauri[one]
# if is not a stop word, then append
new_ingredients.append(
" ".join([stemmer.stem(t) for t in word_tokenize(newone) if not t in stopwords])
)
return new_ingredients
class Tokenizer(object):
def __call__(self, doc):
return doc.split(",")
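As a quick sanity check, the cleaner can be run on a small sample. The exact output depends on the contents of thesauri.txt and stopwords.txt; the stems shown are what PorterStemmer typically produces:
sample = ["Philadelphia Cream Cheese", "green tomatoes", "plain flour"]
print(clean_ingredients(sample))
# with the thesaurus entry above and no matching stop words, this yields
# something like: ['cream chees', 'green tomato', 'plain flour']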
In [20]:
# now do it for all observations
df['ingredients_all'] = df.ingredients.apply(lambda x: ",".join(clean_ingredients(x)))
In [30]:
# cleaned ingredients joined with spaces instead of commas
df['ingredients_string'] = df.ingredients_all.apply(lambda x: x.replace(',', ' '))
# raw ingredients joined with spaces: no stemming, replacement, or commas
df['ingredients_clean'] = df.ingredients.apply(lambda x: " ".join(x))
Setting the response variable.
In [77]:
print df['ingredients_all'][:5]
print df['ingredients_string'][:5]
print df['ingredients_clean'][:5]
In [34]:
# mapping categorical response var
cuisine_mapping = {label:idx for idx,label in enumerate(np.unique(df['cuisine']))}
cuisine_mapping
Out[34]:
In [35]:
# apply the mapping to create the numeric response
df['cuisine_idx'] = df.cuisine.map(cuisine_mapping)
# set the response variable
y = df['cuisine_idx']
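As an aside, the same encoding could be produced with the already-imported LabelEncoder, since it too assigns indices in sorted label order; a minimal equivalent sketch:
le = LabelEncoder()
y_alt = le.fit_transform(df['cuisine'])
# y_alt matches y above; le.inverse_transform(pred) recovers cuisine names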
For each model, the following helper functions are executed. They evaluate the accuracy score through cross validation (5 folds).
In [36]:
# cross validate each model
def cross_val_models(models, X, y, K):
predLst = []
for model in models:
score, sem_score = cross_val_validation(model[1], X, y, K)
predLst.append({'name': model[0],
'score': score,
'sem' : sem_score})
print "Cross_val %s...%0.3f" % (model[0], score)
return predLst
# get the mean score, and standard error mean
def cross_val_validation(clf, X, y, K):
# create a k-fold cross validation iterator of K folds
cv = StratifiedKFold(y=y, n_folds=K, shuffle=True, random_state=1)
#cv = KFold(len(y), K, shuffle=True, random_state=0)
# get the mean score, and standard error mean
scores = cross_val_score(clf, X, y, cv=cv, scoring="accuracy")
return np.mean(scores), sem(scores)
# grid search helper
def grid_search_models(name, clf, param_grid, X, y, K):
grid = GridSearchCV(clf, param_grid, cv=K, scoring='accuracy')
grid.fit(X, y)
print "Grid_search %s...%0.3f" % (name, grid.best_score_)
return dict({'name': name,
'score': grid.best_score_,
'best_params' : grid.best_estimator_,
'scores': grid.grid_scores_
})
def cross_val_model(clf, X, y, K):
score, sem = cross_val_validation(clf, X, y, K)
print "Cross_val ...%0.3f" % (score)
return dict({'score': score,
'sem' : sem
})
# grid mean score plot helper
def plot_grid_mean_scores(rng, result):
    grid_mean_scores = [gs.mean_validation_score for gs in result['scores']]
    plt.plot(rng, grid_mean_scores)
plt.xlabel('Range Value')
plt.ylabel('Cross-Validated Accuracy')
# get grid scores
def get_grid_scores_pd(result):
temp = pd.DataFrame.from_dict(result['scores'])
temp.columns = ['name', 'mean score', 'scores']
return temp
In [37]:
# base models
models = [
('nb',
Pipeline([('vect', CountVectorizer(strip_accents='unicode')),
('clf', MultinomialNB())
])
),
('logistic',
Pipeline([('vect', TfidfVectorizer(strip_accents='unicode', tokenizer=Tokenizer())),
('clf', LogisticRegression(C=1e9))
])
),
]
X = df['ingredients_all']
predLst = cross_val_models(models, X, y, 5)
predDf = pd.DataFrame.from_dict(predLst)
predDf
Out[37]:
Looks like LogisticRegression with TF-IDF has the highest score, 0.723. This is with data cleaning applied and unique ingredients separated by commas.
In [54]:
# testing with ingredients that have not been cleaned
model = Pipeline([('vect', TfidfVectorizer(strip_accents='unicode')),
('clf', LogisticRegression(C=1e9))
])
X = df['ingredients_clean']
cross_val_model(model, X, y, 5)
Out[54]:
With no data cleaning, the score increases to 0.734.
In [38]:
# testing with cleaned ingredients without ','
model = Pipeline([('vect', TfidfVectorizer(strip_accents='unicode')),
('clf', LogisticRegression(C=1e9))
])
X = df['ingredients_string']
cross_val_model(model, X, y, 5)
Out[38]:
With data cleaning but without commas, the score improves further to 0.743.
In [108]:
X = df['ingredients_all']
counter = Counter()
for _, row in X.iteritems():
for item in row.split(','):
n = len(item.split(' '))
counter[n] += 1
word_count = dict(counter)
word_count
Out[108]:
In [109]:
# get the word ngram dataframe
df_word_ngram = pd.DataFrame.from_dict(word_count, orient='index')
df_word_ngram.columns = ['percent']
# calculate percentages
totalcount = df_word_ngram.sum()
ngram_word_sum = df_word_ngram.groupby(level=0).apply(lambda x: 100*x/float(totalcount))
ngram_word_sum.plot(kind="bar", title="Percentage of Ingredient Names by Word Count")
ngram_word_sum
Out[109]:
In [110]:
# the word ngram range
word_ngram_range = [(1,3), (1, 5)]
# create the model
model = Pipeline([('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word')),
('clf', LogisticRegression(C=1e9))
])
param_grid = {
'tfidf__ngram_range': word_ngram_range,
}
# cross validate using grid search
X = df['ingredients_string']
word_ngram_results = grid_search_models('word_ngram', model, param_grid, X, y, 5)
word_ngram_results
Out[110]:
In [111]:
wpd = get_grid_scores_pd(word_ngram_results)
wpd
Out[111]:
Looks like word n-grams increase our score to 0.783.
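To see what the (1,3) setting actually feeds the classifier, the vectorizer's analyzer can be inspected directly; a small illustration:
analyzer = TfidfVectorizer(analyzer='word', ngram_range=(1,3)).build_analyzer()
print(analyzer("green tomato yellow corn meal"))
# unigrams first, then bigrams and trigrams: 'green', ..., 'green tomato',
# ..., 'yellow corn meal', so multi-word ingredients survive as phrases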
In [113]:
# character-length distribution of words in ingredient names
X = df['ingredients_all']
counter = Counter()
for _, row in X.iteritems():
for item in row.split(','):
for one in item.strip().split(' '):
n = len(one)
counter[n] += 1
char_count = dict(counter)
char_count
Out[113]:
In [114]:
# get the char ngram dataframe
df_char_ngram = pd.DataFrame.from_dict(char_count, orient='index')
df_char_ngram.columns = ['percent']
# calculate percentages
totalcount = df_char_ngram.sum()
ngram_char_sum = df_char_ngram.groupby(level=0).apply(lambda x: 100*x/float(totalcount))
ngram_char_sum.plot(kind="bar", title="Percentage of Words by Character Length")
ngram_char_sum
Out[114]:
In [115]:
# the char ngram range
char_ngram_range = [(3,10), (3,7), (4,6)]
# create the model
model = Pipeline([('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='char')),
('clf', LogisticRegression(C=1e9))
])
param_grid = {
'tfidf__ngram_range': char_ngram_range,
}
# cross validate using grid search
X = df['ingredients_string']
char_ngram_results = grid_search_models('char_ngram', model, param_grid, X, y, 5)
char_ngram_results
Out[115]:
In [116]:
cpd = get_grid_scores_pd(char_ngram_results)
cpd
Out[116]:
Looks like char n-grams do not improve the score.
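For comparison, the character analyzer slices across word boundaries, which mostly adds noise here; a small illustration:
char_analyzer = TfidfVectorizer(analyzer='char', ngram_range=(3,7)).build_analyzer()
print(char_analyzer("corn meal")[:5])
# every 3- to 7-character window (spaces included) becomes a feature,
# so the vocabulary grows far faster than with word n-grams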
In [76]:
# total number of ingredients per cuisine
total_ingredients_dict = {}
for key, value in cuisine_dict.iteritems():
total_ingredients_dict[key] = len(value)
# tabulate it
df_total_ingredients = pd.DataFrame.from_dict(total_ingredients_dict, orient='index')
df_total_ingredients = df_total_ingredients.fillna(0)
print df_total_ingredients
print()
print df_total_ingredients.describe()
In [117]:
# testing with a small max_features cap (853, drawn from the per-cuisine ingredient counts above)
model = Pipeline([('tfidf', TfidfVectorizer(strip_accents='unicode',
analyzer='word',
ngram_range=(1,3),
max_features=853
)),
('clf', LogisticRegression(C=1e9))
])
X = df['ingredients_string']
param_grid = {}  # reset: the char n-gram grid from above must not leak in
maxf_results = grid_search_models('max_features', model, param_grid, X, y, 5)
maxf_results
Out[117]:
In [118]:
mpd = get_grid_scores_pd(maxf_results)
mpd
Out[118]:
The score does not improve.
In [119]:
# transformer class for pipeline, extracting the text and ingredient length
class IngredientExtractor(BaseEstimator, TransformerMixin):
def fit(self, x, y=None):
return self
def transform(self, ingredients):
features = np.recarray(shape=(len(ingredients),),
dtype=[('txt', object),
('ingredient_length', object)])
for i, row in enumerate(ingredients):
features['txt'][i] = row.replace(",", " ")
features['ingredient_length'][i] = str(len(row.split(",")))
return features
# transformer class to select the column
class ItemSelector(BaseEstimator, TransformerMixin):
def __init__(self, key):
self.key = key
def fit(self, x, y=None):
return self
def transform(self, data_dict):
return data_dict[self.key]
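A quick check of what these transformers produce (the sample string is illustrative):
feats = IngredientExtractor().transform(["plain flour,salt,milk"])
print(ItemSelector(key='txt').transform(feats))                # ['plain flour salt milk']
print(ItemSelector(key='ingredient_length').transform(feats))  # ['3']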
In [120]:
# model additional feature (ingredient length) and logistic regression
model = Pipeline([
('ingredients', IngredientExtractor()),
('union', FeatureUnion(
[
# adding ingredient length feature
('ingredient_length', Pipeline([
('extract', ItemSelector(key='ingredient_length')),
('tfidf', TfidfVectorizer()),
])),
# adding ingredient text feature
('txt', Pipeline([
('extract', ItemSelector(key='txt')),
('tfidf', TfidfVectorizer(strip_accents='unicode',
analyzer='word', ngram_range=(1,3))),
])),
],
)),
# using logistic classifier
('clf', LogisticRegression(C=1e9))
])
param_grid = {}
# cross validate using grid search
X = df['ingredients_all']
ful_results = grid_search_models('feature_union', model, param_grid, X, y, 5)
ful_results
Out[120]:
In [152]:
fpd = get_grid_scores_pd(ful_results)
fpd
Out[152]:
The score does not improve.
In [121]:
# using different weights
model = Pipeline([
('ingredients', IngredientExtractor()),
('union', FeatureUnion(
[
# adding ingredient length feature
('ingredient_length', Pipeline([
('extract', ItemSelector(key='ingredient_length')),
('tfidf', TfidfVectorizer()),
])),
# adding ingredient text feature
('txt', Pipeline([
('extract', ItemSelector(key='txt')),
('tfidf', TfidfVectorizer(strip_accents='unicode',
analyzer='word', ngram_range=(1,3))),
])),
],
# weight components in FeatureUnion
transformer_weights={
'txt': 0.8,
'ingredient_length': 0.2,
},
)),
# using logistic classifier
('clf', LogisticRegression(C=1e9))
])
X = df['ingredients_all']
ful_results = grid_search_models('feature_union_weighted', model, param_grid, X, y, 5)
ful_results
Out[121]:
In [122]:
ffpd = get_grid_scores_pd(ful_results)
ffpd
Out[122]:
In [128]:
# let's try simple model with no feature union
model = Pipeline([
# using the best parameters
('tfidf', TfidfVectorizer(strip_accents='unicode',
analyzer='word', ngram_range=(1,3)
)),
# using linear svc
('clf', OneVsRestClassifier(LinearSVC(random_state=1)))
])
param_grid = {}
# cross validate using grid search
X = df['ingredients_all']
simple_svc_results = grid_search_models('simple_svc', model, param_grid, X, y, 5)
simple_svc_results
Out[128]:
In [129]:
svcr = get_grid_scores_pd(simple_svc_results)
svcr
Out[129]:
The score improves to 0.785.
In [126]:
# model with SVC, and ingredient length feature addition, with 0.8/0.2 weights
model = Pipeline([
('ingredients', IngredientExtractor()),
('union', FeatureUnion(
[
# adding ingredient length feature
('ingredient_length', Pipeline([
('extract', ItemSelector(key='ingredient_length')),
('vect', TfidfVectorizer()),
])),
# adding ingredient text feature
('txt', Pipeline([
('extract', ItemSelector(key='txt')),
('vect', TfidfVectorizer(strip_accents='unicode',
analyzer='word', ngram_range=(1,3))),
])),
],
# weight components in FeatureUnion
transformer_weights={
'txt': 0.8,
'ingredient_length': 0.2,
},
)),
# using support vector machines
('clf', OneVsRestClassifier(LinearSVC(random_state=1)))
])
# adding weights
param_grid = {}
# cross validate using grid search
X = df['ingredients_all']
svcb_results = grid_search_models('svcb', model, param_grid, X, y, 5)
svcb_results
Out[126]:
In [127]:
svcrb = get_grid_scores_pd(svcb_results)
svcrb
Out[127]:
With weighting, the score is about the same.
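Rather than hand-picking the 0.8/0.2 split, the weights themselves could be grid searched; a sketch of the idea (not run here):
weight_grid = {
    'union__transformer_weights': [
        {'txt': 1.0, 'ingredient_length': 0.0},
        {'txt': 0.9, 'ingredient_length': 0.1},
        {'txt': 0.8, 'ingredient_length': 0.2},
    ],
}
# e.g. grid_search_models('svc_weights', model, weight_grid, X, y, 5)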
In [130]:
# split data to train set and test set
X = df['ingredients_all']
X_train, X_test, y_train, y_test = train_test_split(X, y)
In [144]:
def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print "Accuracy on training set:"
print clf.score(X_train, y_train)
print "Accuracy on testing set:"
print metrics.accuracy_score(y_test, y_pred)
#print clf.score(X_test, y_test)
print "Classification Report:"
print metrics.classification_report(y_test, y_pred)
print "Confusion Matrix:"
cmtrx = metrics.confusion_matrix(y_test, y_pred)
# plot it
fig, ax = plt.subplots(figsize=(12, 12))
ax.matshow(cmtrx, cmap=plt.cm.Blues, alpha=0.3)
for i in range(cmtrx.shape[0]):
for j in range(cmtrx.shape[1]):
ax.text(x=j, y=i,
s=cmtrx[i, j],
va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()
def auc_score(clf, X_test, y_test):
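    # NOTE: assumes a binary classifier exposing predict_proba; it is not
    # invoked below, since the multiclass LinearSVC model provides neither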
y_pred_prob = clf.predict_proba(X_test)[:, 1]
print "AUC Score:"
print metrics.roc_auc_score(y_test, y_pred_prob)
print "Log Loss:"
print metrics.log_loss(y_test, y_pred_prob)
# plot it
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [145]:
# replicating the best model
model = Pipeline([
# using the best parameters
('tfidf', TfidfVectorizer(strip_accents='unicode',
analyzer='word', ngram_range=(1,3)
)),
# using linear svc
('clf', OneVsRestClassifier(LinearSVC(random_state=1)))
])
# and evaluate
train_and_evaluate(model, X_train, X_test, y_train, y_test)
In [133]:
# read test.json
testdf = pd.read_json('../data/test.json')
In [134]:
# do the data prep for test.json data
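# (joined with spaces rather than commas; with TfidfVectorizer's default
# tokenizer both separators tokenize identically, so this matches training)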
testdf['ingredients_all'] = testdf.ingredients.apply(lambda x: " ".join(clean_ingredients(x)))
In [135]:
# predict it
X1_test = testdf['ingredients_all']
y1_pred = model.predict(X1_test)
In [136]:
idx = testdf.id.values.astype(int)
cuisine_inverse_mapping = {v: k for k, v in cuisine_mapping.items()}
cuisine_inverse_mapping
Out[136]:
In [137]:
y1_pred = [cuisine_inverse_mapping[w] for w in y1_pred]
In [138]:
# and generate the output
output_df = pd.DataFrame()
output_df['id'] = idx
output_df['cuisine'] = y1_pred
output_df.to_csv('output.csv',index=False)
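The resulting output.csv follows Kaggle's expected submission format: an id,cuisine header followed by one row per test recipe. Illustrative rows (not actual predictions):
id,cuisine
18009,italian
28583,southern_us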
In this document, I have outlined the approach I took to produce a solution for the What's Cooking Kaggle competition.
The approach follows an iterative process of data exploration, data cleaning, feature engineering, modeling, and finally prediction.