In [1]:
import numpy as np
import pandas as pd
from pandas import *
import csv

In [44]:
#create a list of csvs
files = !ls *.csv
#load into a df
df_list = [pd.read_csv(file) for file in files]

In [45]:
files


Out[45]:
['APP_data_reduced.csv',
 'BB_data_reduced.csv',
 'DS_data_reduced.csv',
 'SLD_data_reduced.csv',
 'SP_data_reduced.csv']

In [279]:
data = pd.concat(df_list, axis = 0)

In [280]:
print data.shape
data.head(2)


(10011, 15)
Out[280]:
id rating bitter meaty piquant salty sour sweet cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds ingredient_list course
0 Bacon-Cheddar-Pinwheels-768341 4 0.833333 0.833333 0.000000 0.833333 0.166667 0.166667 NaN 5 16.0 900.0 2100.0 [Pillsbury™ Refrigerated Crescent Dinner Rolls... Appetizer
1 Fiesta-Corn-Dip-1711704 4 0.500000 0.166667 0.833333 0.833333 0.500000 0.166667 NaN 7 4.0 900.0 900.0 [sour cream, mayonnaise, ground cumin, mexicor... Appetizer

In [281]:
# labelencode the course column
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
data['course_num'] = le.fit_transform(data.course)

In [282]:
#make all column headers, small cases
data.columns = [col.lower() for col in data.columns]

In [283]:
data.isnull().sum()


Out[283]:
id                       0
rating                   0
bitter                3242
meaty                 3242
piquant               3242
salty                 3242
sour                  3242
sweet                 3242
cooktimeinseconds     7406
ingredientcount          0
numberofservings         2
preptimeinseconds     6180
totaltimeinseconds       0
ingredient_list          0
course                   0
course_num               0
dtype: int64

In [284]:
data = data.drop(['cooktimeinseconds','preptimeinseconds', 'course'], axis = 1)

In [285]:
# #clean cook/preptime columns
# for index, row in data.iterrows():
#     if row['cooktimeinseconds'].isnull() and row['preptimeinseconds'].isnull():
#         print row

In [286]:
# clean the ingredient column 
import string
# strip non ascii characters 
printable = set(string.printable)
ingredient = [filter(lambda x: x in printable, z) for z in data['ingredient_list']]

In [287]:
ingredient[0]


Out[287]:
'[Pillsbury Refrigerated Crescent Dinner Rolls, ranch dressing, bacon pieces, shredded cheddar cheese, green onions]'

In [288]:
# remove the braquets
ingredient = [z[1:-1] for z in ingredient]

In [289]:
ingredient[0]


Out[289]:
'Pillsbury Refrigerated Crescent Dinner Rolls, ranch dressing, bacon pieces, shredded cheddar cheese, green onions'

In [290]:
ingredient = [z.split(" ") for z in ingredient]

In [291]:
ingredient[0]


Out[291]:
['Pillsbury',
 'Refrigerated',
 'Crescent',
 'Dinner',
 'Rolls,',
 'ranch',
 'dressing,',
 'bacon',
 'pieces,',
 'shredded',
 'cheddar',
 'cheese,',
 'green',
 'onions']

In [292]:
# perform word lemmatization
from nltk import stem
wordnet_lemm = stem.WordNetLemmatizer()

lem_ingredient = [] # create a new list of lemmatized ingredient list for each recipe
for ing in ingredient:
    lem_ingredient.append([wordnet_lemm.lemmatize(w) for w in ing])
    
# convert back to a string for vectorization
lem_ingredient = [' '.join(z) for z in lem_ingredient]

In [293]:
data['ingredient_lem'] = lem_ingredient
data = data.drop('ingredient_list', axis = 1)
data.head(2)


Out[293]:
id rating bitter meaty piquant salty sour sweet ingredientcount numberofservings totaltimeinseconds course_num ingredient_lem
0 Bacon-Cheddar-Pinwheels-768341 4 0.833333 0.833333 0.000000 0.833333 0.166667 0.166667 5 16.0 2100.0 0 Pillsbury Refrigerated Crescent Dinner Rolls, ...
1 Fiesta-Corn-Dip-1711704 4 0.500000 0.166667 0.833333 0.833333 0.500000 0.166667 7 4.0 900.0 0 sour cream, mayonnaise, ground cumin, mexicorn...

In [294]:
# Vectorize
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
# Assign 'ingredient string' column to 'corpus' variable
# corpus = data['ingredient_lem']
# fit
# dense = vect.fit_transform(corpus).todense()

In [301]:
#array = data.values
# create a validation set
from sklearn import cross_validation

validation_size = 0.20
X = data[['rating', 'bitter', 'meaty', 'piquant', 'salty', 'sour', 'sweet', 'ingredientcount', 'numberofservings', 
        'totaltimeinseconds', 'ingredient_lem']]
print X.shape
Y = data['course_num']
print Y.shape
X_train, X_validation, Y_train, Y_validation = cross_validation.train_test_split(X, Y, test_size = validation_size, random_state=10)
print X_train.shape
print Y_train.shape


(10011, 11)
(10011,)
(8008, 11)
(8008,)

In [ ]: