In [38]:
%matplotlib inline
import pandas as pd
import numpy as np
import sys
from sklearn.feature_extraction.text import TfidfVectorizer,HashingVectorizer
from sklearn import cross_validation
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression,SGDClassifier
import re
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import Normalizer,LabelEncoder,normalize,PolynomialFeatures
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import matplotlib
In [39]:
def preprocess(text):
    t = re.sub(r'\W', ' ', text)
    t = re.sub(r'\d', ' ', t)
    return t

# A function for cross-validating a classifier
# for a specific evaluation measure
def crossValidateClassifier(X, y, clf):
    cross_val = cross_validation.StratifiedKFold(y, n_folds=5, shuffle=True)
    cv_score = cross_validation.cross_val_score(clf, X, y, scoring='accuracy', cv=cross_val)
    print cv_score
    print np.mean(cv_score)
    return

# A convenient function for training and prediction
def train_and_test(clf, X_train, y_train, X_test):
    clf.fit(X_train, y_train)
    return clf.predict(X_test)
Let's do a quick inspection of the data by plotting the distribution of the different types of cuisines in the dataset.
In [40]:
train = pd.read_json("train.json")
matplotlib.style.use('ggplot')
cuisine_group = train.groupby('cuisine')
cuisine_group.size().sort_values(ascending=True).plot.barh()
plt.show()
The Italian and Mexican categories dominate the recipes dataset. We may want to take this imbalance into account later in order to make the problem more balanced.
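If the imbalance turns out to hurt the rarer cuisines, one simple option, not pursued in the rest of this notebook, would be to reweight classes inversely to their frequency through the class_weight argument of the linear classifiers (a hypothetical sketch, assuming a scikit-learn version where class_weight='balanced' is available):
In [ ]:
# Hypothetical sketch: counter the class imbalance by reweighting classes
# inversely to their frequency (class_weight='balanced' in recent scikit-learn;
# not used in the rest of this notebook).
clf_weighted = LinearSVC(C=0.5, class_weight='balanced')
# clf_weighted.fit(X_train, y_train)   # X_train / y_train are built below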
We start by performing basic preprocessing and lemmatizing the words in the ingredients field. Then we vectorize using the tf-idf representation. Note that we use unigrams and bigrams as features.
In [41]:
lemmatizer = WordNetLemmatizer()
train = pd.read_json("train.json")
train['ing'] = [' '.join([lemmatizer.lemmatize(preprocess(ingr)) for ingr in recette]).strip() for recette in train['ingredients']]
tfidf = TfidfVectorizer(sublinear_tf=True,max_df=0.5,ngram_range=(1,2),stop_words='english',norm='l2',binary=False)
tfidf.fit(train['ing'])
X_train = tfidf.transform(train['ing'])
y_train = train['cuisine']
In [56]:
# encode string labels
lenc = LabelEncoder()
lenc.fit(y_train)
y_train_enc = lenc.transform(y_train)
#power normalization
X_train.data**=0.5
normalize(X_train,copy=False)
Note that here we use power scaling, which further reduces the effect of frequent terms. After the scaling we re-normalize the data. We use the square root (exponent 0.5) as the default, but this exponent should ideally be tuned, e.g. through random search.
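As a hypothetical sketch of such a tuning step (the candidate range, the C value and the 3-fold CV below are placeholders, not values used in this notebook), a small random search over the exponent could look like this:
In [ ]:
# Hypothetical sketch: tune the power-scaling exponent with a small random search.
# We re-transform the training text so each candidate exponent is applied to a
# fresh copy of the tf-idf matrix (X_train above has already been scaled in place).
best_alpha, best_score = None, -np.inf
for alpha in np.random.uniform(0.3, 1.0, size=5):
    X_pow = tfidf.transform(train['ing'])       # un-scaled tf-idf features
    X_pow.data **= alpha                         # candidate exponent
    normalize(X_pow, copy=False)                 # re-normalize to unit L2 norm
    score = np.mean(cross_validation.cross_val_score(
        LinearSVC(C=0.5), X_pow, y_train, scoring='accuracy', cv=3))
    if score > best_score:
        best_alpha, best_score = alpha, score
print best_alpha, best_score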
In the following we apply the same transformation to the test data.
In [57]:
test = pd.read_json("test.json")
test['ing'] = [' '.join([lemmatizer.lemmatize(preprocess(ingr)) for ingr in recette]).strip() for recette in test['ingredients']]
X_test = tfidf.transform(test['ing'])
In [62]:
X_test.data**=0.5
normalize(X_test,copy=False)
categories = train['cuisine'].unique()
clf = LinearSVC(C=0.5,multi_class='ovr',dual=True)
crossValidateClassifier(X_train,y_train,clf)
We choose Support Vector Machines to train the model, as they provide state-of-the-art results in text classification problems. Cross-validation gives an average accuracy of 79.19%. Let's also try a logistic regression model.
In [59]:
clf = LogisticRegression(C=10.0)
crossValidateClassifier(X_train,y_train,clf)
Accuracy is slightly lower than the SVM's. One should normally run a grid or random search over the parameter space of each classifier in order to select the best one.
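A minimal sketch of such a search, using the old sklearn.grid_search API for consistency with the imports above (the C grids below are placeholders, not tuned values):
In [ ]:
# Hypothetical sketch: grid search over C for both linear classifiers.
from sklearn.grid_search import GridSearchCV

for model, grid in [(LinearSVC(), {'C': [0.1, 0.5, 1.0, 5.0]}),
                    (LogisticRegression(), {'C': [1.0, 10.0, 100.0]})]:
    search = GridSearchCV(model, grid, scoring='accuracy', cv=5)
    search.fit(X_train, y_train)
    print model.__class__.__name__, search.best_params_, search.best_score_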
Great, now we are ready to train the selected model and make predictions for the test set. This gives a decent score of 79.31% on the leaderboard. For my final solution I used Vowpal Wabbit with SGD as the base learner and quadratic features, which was sufficient to reach 14th place; a rough scikit-learn analogue is sketched after the submission cell below.
In [60]:
clf = LinearSVC(C=0.5,multi_class='ovr',dual=True)
test['cuisine']=train_and_test(clf,X_train,y_train,X_test)
test[['id','cuisine']].to_csv("lr_c0.5_power_norm.csv",index=False)
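The Vowpal Wabbit pipeline itself is not reproduced here. As a rough, only loosely equivalent scikit-learn analogue of its ingredients, one could hash the text features and train a linear model with SGD; note that the quadratic interaction features of the actual solution are omitted in this sketch:
In [ ]:
# Rough sklearn analogue of the final VW setup (hashed features + SGD);
# the quadratic interaction features used in the actual solution are omitted.
hasher = HashingVectorizer(ngram_range=(1, 2), stop_words='english', norm='l2')
X_train_hash = hasher.transform(train['ing'])
X_test_hash = hasher.transform(test['ing'])
sgd = SGDClassifier(loss='hinge', alpha=1e-5, n_iter=10)
sgd.fit(X_train_hash, y_train)
sgd_pred = sgd.predict(X_test_hash)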