notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd
from pandas import *
import csv



In [44]:

    
#create a list of csvs
files = !ls *.csv
#load into a df
df_list = [pd.read_csv(file) for file in files]



In [45]:

    
files









    Out[45]:





['APP_data_reduced.csv',
 'BB_data_reduced.csv',
 'DS_data_reduced.csv',
 'SLD_data_reduced.csv',
 'SP_data_reduced.csv']



In [279]:

    
data = pd.concat(df_list, axis = 0)



In [280]:

    
print data.shape
data.head(2)









    



(10011, 15)






    Out[280]:






  
    
      
      id
      rating
      bitter
      meaty
      piquant
      salty
      sour
      sweet
      cookTimeInSeconds
      ingredientCount
      numberOfServings
      prepTimeInSeconds
      totalTimeInSeconds
      ingredient_list
      course
    
  
  
    
      0
      Bacon-Cheddar-Pinwheels-768341
      4
      0.833333
      0.833333
      0.000000
      0.833333
      0.166667
      0.166667
      NaN
      5
      16.0
      900.0
      2100.0
      [Pillsbury™ Refrigerated Crescent Dinner Rolls...
      Appetizer
    
    
      1
      Fiesta-Corn-Dip-1711704
      4
      0.500000
      0.166667
      0.833333
      0.833333
      0.500000
      0.166667
      NaN
      7
      4.0
      900.0
      900.0
      [sour cream, mayonnaise, ground cumin, mexicor...
      Appetizer



In [281]:

    
# labelencode the course column
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
data['course_num'] = le.fit_transform(data.course)



In [282]:

    
#make all column headers, small cases
data.columns = [col.lower() for col in data.columns]



In [283]:

    
data.isnull().sum()









    Out[283]:





id                       0
rating                   0
bitter                3242
meaty                 3242
piquant               3242
salty                 3242
sour                  3242
sweet                 3242
cooktimeinseconds     7406
ingredientcount          0
numberofservings         2
preptimeinseconds     6180
totaltimeinseconds       0
ingredient_list          0
course                   0
course_num               0
dtype: int64



In [284]:

    
data = data.drop(['cooktimeinseconds','preptimeinseconds', 'course'], axis = 1)



In [285]:

    
# #clean cook/preptime columns
# for index, row in data.iterrows():
#     if row['cooktimeinseconds'].isnull() and row['preptimeinseconds'].isnull():
#         print row



In [286]:

    
# clean the ingredient column 
import string
# strip non ascii characters 
printable = set(string.printable)
ingredient = [filter(lambda x: x in printable, z) for z in data['ingredient_list']]



In [287]:

    
ingredient[0]









    Out[287]:





'[Pillsbury Refrigerated Crescent Dinner Rolls, ranch dressing, bacon pieces, shredded cheddar cheese, green onions]'



In [288]:

    
# remove the braquets
ingredient = [z[1:-1] for z in ingredient]



In [289]:

    
ingredient[0]









    Out[289]:





'Pillsbury Refrigerated Crescent Dinner Rolls, ranch dressing, bacon pieces, shredded cheddar cheese, green onions'



In [290]:

    
ingredient = [z.split(" ") for z in ingredient]



In [291]:

    
ingredient[0]









    Out[291]:





['Pillsbury',
 'Refrigerated',
 'Crescent',
 'Dinner',
 'Rolls,',
 'ranch',
 'dressing,',
 'bacon',
 'pieces,',
 'shredded',
 'cheddar',
 'cheese,',
 'green',
 'onions']



In [292]:

    
# perform word lemmatization
from nltk import stem
wordnet_lemm = stem.WordNetLemmatizer()

lem_ingredient = [] # create a new list of lemmatized ingredient list for each recipe
for ing in ingredient:
    lem_ingredient.append([wordnet_lemm.lemmatize(w) for w in ing])
    
# convert back to a string for vectorization
lem_ingredient = [' '.join(z) for z in lem_ingredient]



In [293]:

    
data['ingredient_lem'] = lem_ingredient
data = data.drop('ingredient_list', axis = 1)
data.head(2)









    Out[293]:






  
    
      
      id
      rating
      bitter
      meaty
      piquant
      salty
      sour
      sweet
      ingredientcount
      numberofservings
      totaltimeinseconds
      course_num
      ingredient_lem
    
  
  
    
      0
      Bacon-Cheddar-Pinwheels-768341
      4
      0.833333
      0.833333
      0.000000
      0.833333
      0.166667
      0.166667
      5
      16.0
      2100.0
      0
      Pillsbury Refrigerated Crescent Dinner Rolls, ...
    
    
      1
      Fiesta-Corn-Dip-1711704
      4
      0.500000
      0.166667
      0.833333
      0.833333
      0.500000
      0.166667
      7
      4.0
      900.0
      0
      sour cream, mayonnaise, ground cumin, mexicorn...



In [294]:

    
# Vectorize
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
# Assign 'ingredient string' column to 'corpus' variable
# corpus = data['ingredient_lem']
# fit
# dense = vect.fit_transform(corpus).todense()



In [301]:

    
#array = data.values
# create a validation set
from sklearn import cross_validation

validation_size = 0.20
X = data[['rating', 'bitter', 'meaty', 'piquant', 'salty', 'sour', 'sweet', 'ingredientcount', 'numberofservings', 
        'totaltimeinseconds', 'ingredient_lem']]
print X.shape
Y = data['course_num']
print Y.shape
X_train, X_validation, Y_train, Y_validation = cross_validation.train_test_split(X, Y, test_size = validation_size, random_state=10)
print X_train.shape
print Y_train.shape









    



(10011, 11)
(10011,)
(8008, 11)
(8008,)



In [ ]:

	id	rating	bitter	meaty	piquant	salty	sour	sweet	cookTimeInSeconds	ingredientCount	numberOfServings	prepTimeInSeconds	totalTimeInSeconds	ingredient_list	course
0	Bacon-Cheddar-Pinwheels-768341	4	0.833333	0.833333	0.000000	0.833333	0.166667	0.166667	NaN	5	16.0	900.0	2100.0	[Pillsbury™ Refrigerated Crescent Dinner Rolls...	Appetizer
1	Fiesta-Corn-Dip-1711704	4	0.500000	0.166667	0.833333	0.833333	0.500000	0.166667	NaN	7	4.0	900.0	900.0	[sour cream, mayonnaise, ground cumin, mexicor...	Appetizer