In [230]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

In [231]:
df = pd.read_csv('menu.csv')

In [232]:
df.head()


Out[232]:
Category Item Serving Size Calories Calories from Fat Total Fat Total Fat (% Daily Value) Saturated Fat Saturated Fat (% Daily Value) Trans Fat ... Carbohydrates Carbohydrates (% Daily Value) Dietary Fiber Dietary Fiber (% Daily Value) Sugars Protein Vitamin A (% Daily Value) Vitamin C (% Daily Value) Calcium (% Daily Value) Iron (% Daily Value)
0 Breakfast Egg McMuffin 4.8 oz (136 g) 300 120 13.0 20 5.0 25 0.0 ... 31 10 4 17 3 17 10 0 25 15
1 Breakfast Egg White Delight 4.8 oz (135 g) 250 70 8.0 12 3.0 15 0.0 ... 30 10 4 17 3 18 6 0 25 8
2 Breakfast Sausage McMuffin 3.9 oz (111 g) 370 200 23.0 35 8.0 42 0.0 ... 29 10 4 17 2 14 8 0 25 10
3 Breakfast Sausage McMuffin with Egg 5.7 oz (161 g) 450 250 28.0 43 10.0 52 0.0 ... 30 10 4 17 2 21 15 0 30 15
4 Breakfast Sausage McMuffin with Egg Whites 5.7 oz (161 g) 400 210 23.0 35 8.0 42 0.0 ... 30 10 4 17 2 21 6 0 25 10

5 rows × 24 columns


In [233]:
## Category will be the taget variable
df['Category'].unique()


Out[233]:
array(['Breakfast', 'Beef & Pork', 'Chicken & Fish', 'Salads',
       'Snacks & Sides', 'Desserts', 'Beverages', 'Coffee & Tea',
       'Smoothies & Shakes'], dtype=object)

In [234]:
## Tokenize the Items

In [235]:
countv = CountVectorizer(stop_words='english')

In [236]:
tokenized_words = countv.fit_transform(df['Item'])

In [237]:
len(countv.get_feature_names()) # 143


Out[237]:
143

In [238]:
tokenized_words = pd.DataFrame(tokenized_words.toarray())

In [239]:
len(df)


Out[239]:
260

In [240]:
len(tokenized_words)


Out[240]:
260

In [241]:
df = pd.concat([df, tokenized_words], axis=1)

In [242]:
df.head(100)


Out[242]:
Category Item Serving Size Calories Calories from Fat Total Fat Total Fat (% Daily Value) Saturated Fat Saturated Fat (% Daily Value) Trans Fat ... 133 134 135 136 137 138 139 140 141 142
0 Breakfast Egg McMuffin 4.8 oz (136 g) 300 120 13.0 20 5.0 25 0.0 ... 0 0 0 0 0 0 0 0 0 0
1 Breakfast Egg White Delight 4.8 oz (135 g) 250 70 8.0 12 3.0 15 0.0 ... 0 0 0 0 0 0 1 0 0 0
2 Breakfast Sausage McMuffin 3.9 oz (111 g) 370 200 23.0 35 8.0 42 0.0 ... 0 0 0 0 0 0 0 0 0 0
3 Breakfast Sausage McMuffin with Egg 5.7 oz (161 g) 450 250 28.0 43 10.0 52 0.0 ... 0 0 0 0 0 0 0 0 0 0
4 Breakfast Sausage McMuffin with Egg Whites 5.7 oz (161 g) 400 210 23.0 35 8.0 42 0.0 ... 0 0 0 0 0 0 0 1 0 0
5 Breakfast Steak & Egg McMuffin 6.5 oz (185 g) 430 210 23.0 36 9.0 46 1.0 ... 0 0 0 0 0 0 0 0 0 0
6 Breakfast Bacon, Egg & Cheese Biscuit (Regular Biscuit) 5.3 oz (150 g) 460 230 26.0 40 13.0 65 0.0 ... 0 0 0 0 0 0 0 0 0 0
7 Breakfast Bacon, Egg & Cheese Biscuit (Large Biscuit) 5.8 oz (164 g) 520 270 30.0 47 14.0 68 0.0 ... 0 0 0 0 0 0 0 0 0 0
8 Breakfast Bacon, Egg & Cheese Biscuit with Egg Whites (R... 5.4 oz (153 g) 410 180 20.0 32 11.0 56 0.0 ... 0 0 0 0 0 0 0 1 0 0
9 Breakfast Bacon, Egg & Cheese Biscuit with Egg Whites (L... 5.9 oz (167 g) 470 220 25.0 38 12.0 59 0.0 ... 0 0 0 0 0 0 0 1 0 0
10 Breakfast Sausage Biscuit (Regular Biscuit) 4.1 oz (117 g) 430 240 27.0 42 12.0 62 0.0 ... 0 0 0 0 0 0 0 0 0 0
11 Breakfast Sausage Biscuit (Large Biscuit) 4.6 oz (131 g) 480 280 31.0 48 13.0 65 0.0 ... 0 0 0 0 0 0 0 0 0 0
12 Breakfast Sausage Biscuit with Egg (Regular Biscuit) 5.7 oz (163 g) 510 290 33.0 50 14.0 71 0.0 ... 0 0 0 0 0 0 0 0 0 0
13 Breakfast Sausage Biscuit with Egg (Large Biscuit) 6.2 oz (177 g) 570 330 37.0 57 15.0 74 0.0 ... 0 0 0 0 0 0 0 0 0 0
14 Breakfast Sausage Biscuit with Egg Whites (Regular Biscuit) 5.9 oz (167 g) 460 250 27.0 42 12.0 62 0.0 ... 0 0 0 0 0 0 0 1 0 0
15 Breakfast Sausage Biscuit with Egg Whites (Large Biscuit) 6.4 oz (181 g) 520 280 32.0 49 13.0 65 0.0 ... 0 0 0 0 0 0 0 1 0 0
16 Breakfast Southern Style Chicken Biscuit (Regular Biscuit) 5 oz (143 g) 410 180 20.0 31 8.0 41 0.0 ... 0 0 0 0 0 0 0 0 0 0
17 Breakfast Southern Style Chicken Biscuit (Large Biscuit) 5.5 oz (157 g) 470 220 24.0 37 9.0 45 0.0 ... 0 0 0 0 0 0 0 0 0 0
18 Breakfast Steak & Egg Biscuit (Regular Biscuit) 7.1 oz (201 g) 540 290 32.0 49 16.0 78 1.0 ... 0 0 0 0 0 0 0 0 0 0
19 Breakfast Bacon, Egg & Cheese McGriddles 6.1 oz (174 g) 460 190 21.0 32 9.0 44 0.0 ... 0 0 0 0 0 0 0 0 0 0
20 Breakfast Bacon, Egg & Cheese McGriddles with Egg Whites 6.3 oz (178 g) 400 140 15.0 24 7.0 34 0.0 ... 0 0 0 0 0 0 0 1 0 0
21 Breakfast Sausage McGriddles 5 oz (141 g) 420 200 22.0 34 8.0 40 0.0 ... 0 0 0 0 0 0 0 0 0 0
22 Breakfast Sausage, Egg & Cheese McGriddles 7.1 oz (201 g) 550 280 31.0 48 12.0 61 0.0 ... 0 0 0 0 0 0 0 0 0 0
23 Breakfast Sausage, Egg & Cheese McGriddles with Egg Whites 7.2 oz (205 g) 500 230 26.0 40 10.0 52 0.0 ... 0 0 0 0 0 0 0 1 0 0
24 Breakfast Bacon, Egg & Cheese Bagel 6.9 oz (197 g) 620 280 31.0 48 11.0 56 0.5 ... 0 0 0 0 0 0 0 0 0 0
25 Breakfast Bacon, Egg & Cheese Bagel with Egg Whites 7.1 oz (201 g) 570 230 25.0 39 9.0 45 0.5 ... 0 0 0 0 0 0 0 1 0 0
26 Breakfast Steak, Egg & Cheese Bagel 8.5 oz (241 g) 670 310 35.0 53 13.0 63 1.5 ... 0 0 0 0 0 0 0 0 0 0
27 Breakfast Big Breakfast (Regular Biscuit) 9.5 oz (269 g) 740 430 48.0 73 17.0 87 0.0 ... 0 0 0 0 0 0 0 0 0 0
28 Breakfast Big Breakfast (Large Biscuit) 10 oz (283 g) 800 470 52.0 80 18.0 90 0.0 ... 0 0 0 0 0 0 0 0 0 0
29 Breakfast Big Breakfast with Egg Whites (Regular Biscuit) 9.6 oz (272 g) 640 330 37.0 57 14.0 69 0.0 ... 0 0 0 0 0 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
70 Chicken & Fish Premium McWrap Chicken & Bacon (Crispy Chicken) 11.1 oz (316 g) 630 280 32.0 49 9.0 45 0.5 ... 0 0 0 0 0 0 0 0 0 0
71 Chicken & Fish Premium McWrap Chicken & Bacon (Grilled Chicken) 10.7 oz (302 g) 480 170 19.0 28 7.0 36 0.0 ... 0 0 0 0 0 0 0 0 0 0
72 Chicken & Fish Premium McWrap Chicken & Ranch (Crispy Chicken) 10.9 oz (310 g) 610 280 31.0 47 8.0 40 0.5 ... 0 0 0 0 0 0 0 0 0 0
73 Chicken & Fish Premium McWrap Chicken & Ranch (Grilled Chicken) 10.5 oz (297 g) 450 160 18.0 27 6.0 31 0.5 ... 0 0 0 0 0 0 0 0 0 0
74 Chicken & Fish Premium McWrap Southwest Chicken (Crispy Chicken) 11.1 oz (314 g) 670 300 33.0 51 8.0 40 0.5 ... 0 0 0 0 0 0 0 0 0 0
75 Chicken & Fish Premium McWrap Southwest Chicken (Grilled Chic... 11.2 oz (318 g) 520 180 20.0 31 6.0 32 0.0 ... 0 0 0 0 0 0 0 0 0 0
76 Chicken & Fish Premium McWrap Chicken Sweet Chili (Crispy Chi... 10.7 oz (304 g) 540 200 23.0 35 4.5 23 0.0 ... 0 1 0 0 0 0 0 0 0 0
77 Chicken & Fish Premium McWrap Chicken Sweet Chili (Grilled Ch... 10.3 oz (291 g) 380 90 10.0 15 3.0 14 0.0 ... 0 1 0 0 0 0 0 0 0 0
78 Chicken & Fish Chicken McNuggets (4 piece) 2.3 oz (65 g) 190 110 12.0 18 2.0 10 0.0 ... 0 0 0 0 0 0 0 0 0 0
79 Chicken & Fish Chicken McNuggets (6 piece) 3.4 oz (97 g) 280 160 18.0 27 3.0 15 0.0 ... 0 0 0 0 0 0 0 0 0 0
80 Chicken & Fish Chicken McNuggets (10 piece) 5.7 oz (162 g) 470 270 30.0 45 5.0 25 0.0 ... 0 0 0 0 0 0 0 0 0 0
81 Chicken & Fish Chicken McNuggets (20 piece) 11.4 oz (323 g) 940 530 59.0 91 10.0 50 0.0 ... 0 0 0 0 0 0 0 0 0 0
82 Chicken & Fish Chicken McNuggets (40 piece) 22.8 oz (646 g) 1880 1060 118.0 182 20.0 101 1.0 ... 0 0 0 0 0 0 0 0 0 0
83 Chicken & Fish Filet-O-Fish 5 oz (142 g) 390 170 19.0 29 4.0 19 0.0 ... 0 0 0 0 0 0 0 0 0 0
84 Salads Premium Bacon Ranch Salad (without Chicken) 7.9 oz (223 g) 140 70 7.0 11 3.5 18 0.0 ... 0 0 0 0 0 0 0 0 0 0
85 Salads Premium Bacon Ranch Salad with Crispy Chicken 9 oz (255 g) 380 190 21.0 33 6.0 29 0.0 ... 0 0 0 0 0 0 0 0 0 0
86 Salads Premium Bacon Ranch Salad with Grilled Chicken 8.5 oz (241 g) 220 80 8.0 13 4.0 20 0.0 ... 0 0 0 0 0 0 0 0 0 0
87 Salads Premium Southwest Salad (without Chicken) 8.1 oz (230 g) 140 40 4.5 7 2.0 9 0.0 ... 0 0 0 0 0 0 0 0 0 0
88 Salads Premium Southwest Salad with Crispy Chicken 12.3 oz (348 g) 450 190 22.0 33 4.5 22 0.0 ... 0 0 0 0 0 0 0 0 0 0
89 Salads Premium Southwest Salad with Grilled Chicken 11.8 oz (335 g) 290 80 8.0 13 2.5 13 0.0 ... 0 0 0 0 0 0 0 0 0 0
90 Snacks & Sides Chipotle BBQ Snack Wrap (Crispy Chicken) 4.6 oz (130 g) 340 130 15.0 23 4.5 22 0.0 ... 0 0 0 0 0 0 0 0 1 0
91 Snacks & Sides Chipotle BBQ Snack Wrap (Grilled Chicken) 4.3 oz (123 g) 260 70 8.0 13 3.5 18 0.0 ... 0 0 0 0 0 0 0 0 1 0
92 Snacks & Sides Honey Mustard Snack Wrap (Crispy Chicken) 4.3 oz (123 g) 330 130 15.0 23 4.5 22 0.0 ... 0 0 0 0 0 0 0 0 1 0
93 Snacks & Sides Honey Mustard Snack Wrap (Grilled Chicken) 4.1 oz (116 g) 250 70 8.0 13 3.5 18 0.0 ... 0 0 0 0 0 0 0 0 1 0
94 Snacks & Sides Ranch Snack Wrap (Crispy Chicken) 4.5 oz (128 g) 360 180 20.0 30 5.0 27 0.0 ... 0 0 0 0 0 0 0 0 1 0
95 Snacks & Sides Ranch Snack Wrap (Grilled Chicken) 4.3 oz (121 g) 280 120 13.0 20 4.5 22 0.0 ... 0 0 0 0 0 0 0 0 1 0
96 Snacks & Sides Small French Fries 2.6 oz (75 g) 230 100 11.0 17 1.5 8 0.0 ... 0 0 0 0 0 0 0 0 0 0
97 Snacks & Sides Medium French Fries 3.9 oz (111 g) 340 140 16.0 24 2.5 11 0.0 ... 0 0 0 0 0 0 0 0 0 0
98 Snacks & Sides Large French Fries 5.9 oz (168 g) 510 220 24.0 37 3.5 17 0.0 ... 0 0 0 0 0 0 0 0 0 0
99 Snacks & Sides Kids French Fries 1.3 oz (38 g) 110 50 5.0 8 1.0 4 0.0 ... 0 0 0 0 0 0 0 0 0 0

100 rows × 167 columns


In [243]:
df.drop('Item', inplace=True, axis=1)

In [244]:
df['Serving Size'][78].split(' ')


Out[244]:
['2.3', 'oz', '(65', 'g)']

In [245]:
df['Serving Size'].tail()


Out[245]:
255    10.1 oz (285 g)
256    13.4 oz (381 g)
257     6.7 oz (190 g)
258    14.2 oz (403 g)
259     7.1 oz (202 g)
Name: Serving Size, dtype: object

In [246]:
df['Serving Size'][99].split()[2].split('(')[1]


Out[246]:
'38'

In [247]:
def in_grams(value):
    if  'g' in value:
        return (value.split()[2].split('(')[1])
    else:
        return (value.split()[0])

In [248]:
in_grams('7.1 oz (202 g)')


Out[248]:
'202'

In [249]:
df['Serving Size'] = df['Serving Size'].apply(in_grams)

In [250]:
sc = StandardScaler()

In [251]:
df['Calories'] = sc.fit_transform(df[['Calories']])
df['Serving Size'] = sc.fit_transform(df[['Serving Size']])

In [252]:
df['Calories'].mean()


Out[252]:
5.657867337032048e-18

In [253]:
for i in df.columns[3:23]:
    df[i] = sc.fit_transform(df[[i]])

In [254]:
lencoder = LabelEncoder()

In [255]:
len(df['Category'].unique())


Out[255]:
9

In [256]:
df['Category'] = lencoder.fit_transform(df['Category'])

In [257]:
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

In [258]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 25)

In [259]:
rfc = RandomForestClassifier(n_estimators=1000)

In [260]:
rfc.fit(X_train, y_train)


Out[260]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [261]:
rfc_y_pred = rfc.predict(X_test)

In [262]:
confusion_matrix(y_test, rfc_y_pred)


Out[262]:
array([[ 0,  0,  1,  0,  0,  0,  0],
       [ 0,  5,  0,  0,  0,  0,  0],
       [ 0,  0,  7,  0,  0,  0,  0],
       [ 0,  0,  0,  4,  0,  0,  0],
       [ 0,  0,  0,  0, 14,  0,  0],
       [ 0,  0,  0,  0,  0,  7,  0],
       [ 0,  0,  0,  0,  0,  0,  1]], dtype=int64)

In [263]:
accuracy_score(y_test, rfc_y_pred)


Out[263]:
0.97435897435897434

In [264]:
cross_val_score(rfc, X_train, y_train, cv = 10).mean()


Out[264]:
0.93648131997674322

In [266]:
gbc = GradientBoostingClassifier()

In [267]:
gbc.fit(X_train, y_train)


Out[267]:
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [268]:
cross_val_score(gbc, X_train, y_train, cv = 10).mean()


Out[268]:
0.94488300601000819

In [269]:
confusion_matrix(y_test, gbc.predict(X_test))


Out[269]:
array([[ 0,  0,  1,  0,  0,  0,  0],
       [ 0,  5,  0,  0,  0,  0,  0],
       [ 0,  0,  7,  0,  0,  0,  0],
       [ 0,  0,  1,  3,  0,  0,  0],
       [ 0,  0,  0,  0, 14,  0,  0],
       [ 0,  0,  0,  0,  0,  7,  0],
       [ 0,  0,  0,  0,  0,  0,  1]], dtype=int64)

ends here!