In [230]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder

try:
    # scikit-learn >= 0.18
    from sklearn.model_selection import train_test_split, cross_val_score
except ImportError:
    # Legacy location; sklearn.cross_validation was removed in scikit-learn 0.20.
    from sklearn.cross_validation import train_test_split, cross_val_score

warnings.filterwarnings('ignore')
%matplotlib inline
In [231]:
# Load the menu table; the path is relative to the notebook's working directory.
df = pd.read_csv('menu.csv')
In [232]:
# Preview the first rows of the freshly loaded table.
df.head()
Out[232]:
In [233]:
## Category will be the target variable; list its distinct values
df['Category'].unique()
Out[233]:
In [234]:
## Tokenize the Items
In [235]:
# Bag-of-words vectoriser for the item names; English stop words are dropped.
countv = CountVectorizer(stop_words='english')
In [236]:
# Learn the vocabulary from the Item column and count token occurrences per row.
# The result is densified with .toarray() in a later cell.
tokenized_words = countv.fit_transform(df['Item'])
In [237]:
# Number of distinct tokens learned from the item names.
# vocabulary_ (term -> column index) exists on every scikit-learn release, whereas
# get_feature_names() was removed in 1.2 in favour of get_feature_names_out().
len(countv.vocabulary_) # 143
Out[237]:
In [238]:
# Densify the count matrix and label every column with the token it counts.
# Sorting the vocabulary by its column index recovers the matrix column order.
# String labels keep the combined frame's column names all-str: mixed int/str
# names are rejected by recent scikit-learn releases when fitting on a DataFrame.
# The 'word_' prefix avoids clashing with the CSV's own column names.
token_names = ['word_' + term
               for term in sorted(countv.vocabulary_, key=countv.vocabulary_.get)]
tokenized_words = pd.DataFrame(tokenized_words.toarray(), columns=token_names)
In [239]:
# Row count of the menu table; compared with the token matrix before concatenating.
len(df)
Out[239]:
In [240]:
# Row count of the token matrix; must equal len(df) for the column-wise concat to line up.
len(tokenized_words)
Out[240]:
In [241]:
# Append the token-count columns to the menu table (axis=1 places them side by side).
# pd.concat aligns rows on the index, so both frames must share the same index --
# presumably the default 0..n-1 on both sides here; verify if the loading step changes.
df = pd.concat([df, tokenized_words], axis=1)
In [242]:
# Inspect the combined frame.
# NOTE(review): 100 rows of a very wide frame is a lot of output; df.head() is probably enough.
df.head(100)
Out[242]:
In [243]:
# The item text is now represented by the token-count columns, so the raw
# string column is removed before modelling.
df = df.drop('Item', axis=1)
In [244]:
# Look at how one raw serving-size string breaks into space-separated pieces.
df.loc[78, 'Serving Size'].split(' ')
Out[244]:
In [245]:
# Peek at the last few raw serving-size strings to see which formats need parsing.
df['Serving Size'].tail()
Out[245]:
In [246]:
# Prototype of the gram extraction: third whitespace token, text after the '('.
df.loc[99, 'Serving Size'].split()[2].split('(')[1]
Out[246]:
In [247]:
def in_grams(value):
    """Extract the numeric part of a serving-size string, preferring grams.

    Parameters
    ----------
    value : str
        Raw serving size, e.g. ``'7.1 oz (202 g)'`` or ``'16 fl oz cup'``.

    Returns
    -------
    str
        The gram amount written in parentheses (``'202'`` for the first
        example). When there is no parenthesised gram amount, the first
        whitespace-separated token is returned instead (``'16'`` for the
        second example). That fallback may be in a different unit.
        The result is a string; callers convert it to a number.

    Raises
    ------
    IndexError
        If ``value`` is empty or whitespace only.
    """
    open_idx = value.find('(')
    close_idx = value.find(')', open_idx + 1)
    if open_idx != -1 and close_idx != -1:
        inner = value[open_idx + 1:close_idx].strip()
        # Only a trailing 'g' unit counts. A bare "'g' in value" test would also
        # fire on words such as 'large' and then index the wrong token.
        if inner.endswith('g'):
            amount = inner[:-1].strip()
            try:
                float(amount)
            except ValueError:
                pass  # e.g. '(5 mg)' or '(large)': not a plain gram quantity
            else:
                return amount
    return value.split()[0]
In [248]:
# Spot-check the parser on a sample string; the gram amount '202' is expected.
in_grams('7.1 oz (202 g)')
Out[248]:
In [249]:
# Parse every serving-size string and make the column numeric straight away.
# in_grams returns strings; casting here surfaces any unparsable token at the
# parsing step instead of relying on the scaler to coerce strings later.
df['Serving Size'] = df['Serving Size'].apply(in_grams).astype(float)
In [250]:
# One scaler instance, re-fitted (fit_transform) for each set of columns it is applied to below.
sc = StandardScaler()
In [251]:
# Standardise the two leading numeric columns one at a time, re-fitting the
# scaler for each; .ravel() turns the (n, 1) result into a plain 1-D column.
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
for column in ('Calories', 'Serving Size'):
    df[column] = sc.fit_transform(df[[column]]).ravel()
In [252]:
# Sanity check: the mean of a standardised column should be approximately 0.
df['Calories'].mean()
Out[252]:
In [253]:
# Standardise the columns at positions 3..22 in a single call; StandardScaler
# centres and scales each column independently, so this matches scaling them
# one by one without 20 separate re-fits.
# Presumably these are the remaining nutrient columns, with the token counts
# starting at position 23 -- verify against the CSV layout.
numeric_cols = df.columns[3:23]
df[numeric_cols] = sc.fit_transform(df[numeric_cols])
In [254]:
# Encoder used below to turn the Category labels into integer class codes.
lencoder = LabelEncoder()
In [255]:
# Number of distinct target classes (a missing value, if any, counts as one class).
df['Category'].nunique(dropna=False)
Out[255]:
In [256]:
# Replace the Category values with the integer codes produced by the label encoder.
df['Category'] = lencoder.fit_transform(df['Category'])
In [257]:
# Features are every column except the first; the target is the first column.
# Assumes 'Category' is column 0 of df -- TODO confirm against the CSV layout.
X = df.iloc[:, 1:]
y = df.iloc[:, 0]
In [258]:
# Hold out 15% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 25)
In [259]:
# Random forest with 1000 trees. The seed matches the one used for the
# train/test split so the scores below are reproducible across runs.
rfc = RandomForestClassifier(n_estimators=1000, random_state=25)
In [260]:
# Train the random forest on the training split.
rfc.fit(X_train, y_train)
Out[260]:
In [261]:
# Predict the class of every held-out row.
rfc_y_pred = rfc.predict(X_test)
In [262]:
# Confusion matrix on the test split (first argument: true labels, second: predictions).
confusion_matrix(y_test, rfc_y_pred)
Out[262]:
In [263]:
# Accuracy of the random forest predictions on the test split.
accuracy_score(y_test, rfc_y_pred)
Out[263]:
In [264]:
# Mean of the 10 cross-validation fold scores for the random forest, on the training split only.
cross_val_score(rfc, X_train, y_train, cv = 10).mean()
Out[264]:
In [266]:
# Gradient boosting with default hyper-parameters. The seed matches the one
# used for the train/test split so the scores below are reproducible across runs.
gbc = GradientBoostingClassifier(random_state=25)
In [267]:
# Train the gradient-boosting model on the training split.
gbc.fit(X_train, y_train)
Out[267]:
In [268]:
# Mean of the 10 cross-validation fold scores for gradient boosting, on the training split only.
cross_val_score(gbc, X_train, y_train, cv = 10).mean()
Out[268]:
In [269]:
# Confusion matrix of the gradient-boosting predictions on the test split.
# NOTE(review): unlike the random forest, no accuracy_score is reported for this model.
confusion_matrix(y_test, gbc.predict(X_test))
Out[269]: