In [368]:
import numpy as np
import pandas as pd

In [369]:
# list the CSV files in the working directory (IPython shell magic)
files = !ls *.csv
# load each CSV into its own DataFrame
df_list = [pd.read_csv(f) for f in files]
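The shell magic above only works inside IPython; a portable alternative (a small sketch using the standard-library glob module) is:

import glob

# shell-independent listing of the CSV files
files = sorted(glob.glob('*.csv'))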

In [370]:
files


Out[370]:
['APP_data_reduced.csv',
 'BB_data_reduced.csv',
 'DS_data_reduced.csv',
 'SLD_data_reduced.csv',
 'SP_data_reduced.csv']

In [371]:
# stack the five course dataframes; ignore_index avoids duplicate row labels
data = pd.concat(df_list, axis=0, ignore_index=True)

In [372]:
print data.shape
data.head(2)


(10011, 15)
Out[372]:
id rating bitter meaty piquant salty sour sweet cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds ingredient_list course
0 Bacon-Cheddar-Pinwheels-768341 4 0.833333 0.833333 0.000000 0.833333 0.166667 0.166667 NaN 5 16.0 900.0 2100.0 [Pillsbury™ Refrigerated Crescent Dinner Rolls... Appetizer
1 Fiesta-Corn-Dip-1711704 4 0.500000 0.166667 0.833333 0.833333 0.500000 0.166667 NaN 7 4.0 900.0 900.0 [sour cream, mayonnaise, ground cumin, mexicor... Appetizer

In [373]:
# label-encode the course column into integer class ids
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
data['course_num'] = le.fit_transform(data.course)
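To check which integer was assigned to which course, the fitted encoder's classes_ attribute gives the mapping (position = encoded value); this assumes the `le` fitted in the cell above:

print(dict(enumerate(le.classes_)))   # e.g. {0: 'Appetizer', ...}
print(le.inverse_transform([0]))      # recover the original label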

In [374]:
# lowercase all column headers
data.columns = [col.lower() for col in data.columns]

In [375]:
data.isnull().sum()


Out[375]:
id                       0
rating                   0
bitter                3242
meaty                 3242
piquant               3242
salty                 3242
sour                  3242
sweet                 3242
cooktimeinseconds     7406
ingredientcount          0
numberofservings         2
preptimeinseconds     6180
totaltimeinseconds       0
ingredient_list          0
course                   0
course_num               0
dtype: int64

In [376]:
# drop the mostly-null time columns and the original string course labels
data = data.drop(['cooktimeinseconds', 'preptimeinseconds', 'course'], axis = 1)
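numberofservings still carries two missing values that would trip up most estimators downstream; one option (my assumption, not something this notebook does) is a median fill:

# hypothetical: impute the two missing serving counts with the column median
data['numberofservings'] = data['numberofservings'].fillna(data['numberofservings'].median())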

In [378]:
# # exploratory check (run before the drop above): rows missing both time columns
# for index, row in data.iterrows():
#     if pd.isnull(row['cooktimeinseconds']) and pd.isnull(row['preptimeinseconds']):
#         print row

In [379]:
# clean the ingredient column
import string
# strip non-ASCII characters (Python 2: filter over a str returns a str)
printable = set(string.printable)
ingredient = [filter(lambda x: x in printable, z) for z in data['ingredient_list']]
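Relying on filter returning a str is Python 2 behaviour; under Python 3 filter yields an iterator, so a join is needed. A version-agnostic sketch:

import string

printable = set(string.printable)
# works on both Python 2 and 3
ingredient = [''.join(ch for ch in z if ch in printable)
              for z in data['ingredient_list']]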

In [380]:
ingredient[0]


Out[380]:
'[Pillsbury Refrigerated Crescent Dinner Rolls, ranch dressing, bacon pieces, shredded cheddar cheese, green onions]'

In [381]:
# remove the surrounding brackets
ingredient = [z[1:-1] for z in ingredient]

In [382]:
ingredient[0]


Out[382]:
'Pillsbury Refrigerated Crescent Dinner Rolls, ranch dressing, bacon pieces, shredded cheddar cheese, green onions'

In [383]:
# split on single spaces; note the trailing commas stay attached to the tokens (see below)
ingredient = [z.split(" ") for z in ingredient]
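A stricter tokenisation (an alternative sketch, not what this notebook ran) strips the comma separators before splitting, so tokens like 'Rolls,' come out clean:

s = 'Pillsbury Refrigerated Crescent Dinner Rolls, ranch dressing, bacon pieces'
print(s.replace(',', ' ').split())   # no trailing commas on the tokens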

In [384]:
ingredient[0]


Out[384]:
['Pillsbury',
 'Refrigerated',
 'Crescent',
 'Dinner',
 'Rolls,',
 'ranch',
 'dressing,',
 'bacon',
 'pieces,',
 'shredded',
 'cheddar',
 'cheese,',
 'green',
 'onions']

In [385]:
# perform word lemmatization (requires the WordNet corpus: nltk.download('wordnet'))
from nltk import stem
wordnet_lemm = stem.WordNetLemmatizer()

# lemmatize every token of every recipe's ingredient list
lem_ingredient = []
for ing in ingredient:
    lem_ingredient.append([wordnet_lemm.lemmatize(w) for w in ing])

# join back into one string per recipe for vectorization
lem_ingredient = [' '.join(z) for z in lem_ingredient]
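For a quick feel of what the lemmatizer does (nouns are its default part of speech):

from nltk import stem

wordnet_lemm = stem.WordNetLemmatizer()
print(wordnet_lemm.lemmatize('onions'))    # 'onion'
print(wordnet_lemm.lemmatize('cheeses'))   # 'cheese'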

In [386]:
data['ingredient_lem'] = lem_ingredient
data = data.drop('ingredient_list', axis = 1)
data.head(2)


Out[386]:
id rating bitter meaty piquant salty sour sweet ingredientcount numberofservings totaltimeinseconds course_num ingredient_lem
0 Bacon-Cheddar-Pinwheels-768341 4 0.833333 0.833333 0.000000 0.833333 0.166667 0.166667 5 16.0 2100.0 0 Pillsbury Refrigerated Crescent Dinner Rolls, ...
1 Fiesta-Corn-Dip-1711704 4 0.500000 0.166667 0.833333 0.833333 0.500000 0.166667 7 4.0 900.0 0 sour cream, mayonnaise, ground cumin, mexicorn...

In [387]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF: term frequency, down-weighted by how common a token is across recipes
vect = TfidfVectorizer()
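A toy illustration of the vectorizer's behaviour (my own example, not part of the pipeline): it lowercases, tokenises, and emits one idf-weighted column per token.

from sklearn.feature_extraction.text import TfidfVectorizer

toy = ['sour cream mayonnaise', 'sour cream onion']
demo_vect = TfidfVectorizer()
demo_matrix = demo_vect.fit_transform(toy)
print(demo_vect.get_feature_names())   # ['cream', 'mayonnaise', 'onion', 'sour']
print(demo_matrix.shape)               # (2, 4)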

In [388]:
from sklearn import cross_validation  # renamed sklearn.model_selection in sklearn >= 0.18

X = data['ingredient_lem']
print X.shape
Y = data['course_num']
print Y.shape

# default 25% hold-out split
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, random_state=10)

print X_train.shape
print Y_train.shape


(10011,)
(10011,)
(7508,)
(7508,)

In [389]:
# vectorize, then densify the sparse TF-IDF matrices (memory-heavy; see the sparse note below)
X_train_vect = vect.fit_transform(X_train) 
X_train_vect = X_train_vect.toarray() 
X_test_vect = vect.transform(X_test)
X_test_vect = X_test_vect.toarray()

In [390]:
# create a list of the feature names (get_feature_names_out() in sklearn >= 1.0)
features = vect.get_feature_names()
len(features)


Out[390]:
1813

In [402]:
# wrap the vectorized train/test arrays in dataframes and attach the labels
train_vect_df = pd.DataFrame(data = X_train_vect, columns = features)
train_vect_df['course_num'] = Y_train.values
test_vect_df = pd.DataFrame(data = X_test_vect, columns = features)
test_vect_df['course_num'] = Y_test.values
print train_vect_df.shape
print test_vect_df.shape


(7508, 1814)
(2503, 1814)
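Calling .toarray() on a 7508 x 1813 float matrix allocates roughly 100 MB; for larger vocabularies, a sketch of the sparse route (my suggestion, assuming the fitted `vect` and the splits above):

# hypothetical: keep the TF-IDF features in sparse CSR form
X_train_sparse = vect.transform(X_train)   # no .toarray()
X_test_sparse = vect.transform(X_test)
print(X_train_sparse.shape)                # (7508, 1813)
# most scikit-learn estimators (e.g. LogisticRegression) accept sparse input directly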

In [406]:
# train/test split the remaining numeric columns from the main dataframe;
# reusing random_state=10 on the same number of rows reproduces the same
# shuffle, so these rows line up with the TF-IDF split above
array = data.values
_X = array[:, 1:11]
print _X.shape
Y = data['course_num']
print Y.shape
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(_X, Y, random_state=10)
print X_train.shape
print Y_train.shape


(10011, 10)
(10011,)
(7508, 10)
(7508,)
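A more robust way to keep the two splits aligned (a sketch of my own, not the notebook's approach) is to split row indices once and index both feature sets with them:

import numpy as np
from sklearn import cross_validation

# hypothetical: one index split shared by the text and the numeric features
idx_train, idx_test = cross_validation.train_test_split(np.arange(len(data)), random_state=10)
X_text_train = data['ingredient_lem'].iloc[idx_train]
X_num_train = data.iloc[idx_train, 1:11]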

In [407]:
# create a dataframe for training and test set
train_df = pd.DataFrame(data = X_train, columns = list(data.columns[1:11]))
test_df = pd.DataFrame(data = X_test, columns = list(data.columns[1:11]))
print train_df.shape


(7508, 10)

In [408]:
# combine the training and test dataframes so each holds the numeric feature
# columns and the vectorized columns side by side; both use a fresh RangeIndex,
# so the axis=1 concat aligns row-for-row
data_train = pd.concat([train_df, train_vect_df], axis = 1)
data_test = pd.concat([test_df, test_vect_df], axis = 1)
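One caveat: TF-IDF feature names can collide with the numeric columns (the ingredient token 'sour' from 'sour cream', for instance, matches the 'sour' flavour column), leaving duplicate labels in the concatenated frames. A hedged guard using pandas rename:

# hypothetical: namespace the vectorized columns to avoid duplicate labels
rename_map = {f: 'tfidf_' + f for f in features}
train_vect_df = train_vect_df.rename(columns=rename_map)
test_vect_df = test_vect_df.rename(columns=rename_map)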
