In [368]:
import numpy as np
import pandas as pd

In [369]:
# list the CSV files in the working directory (IPython shell magic)
files = !ls *.csv
# load each CSV into its own DataFrame
df_list = [pd.read_csv(f) for f in files]
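The shell magic above only works inside IPython; a portable alternative (a small sketch using the standard-library glob module) is:

import glob

# shell-independent listing of the CSV files
files = sorted(glob.glob('*.csv'))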

In [370]:
files


Out[370]:
['APP_data_reduced.csv',
 'BB_data_reduced.csv',
 'DS_data_reduced.csv',
 'SLD_data_reduced.csv',
 'SP_data_reduced.csv']

In [371]:
# stack the five course dataframes; ignore_index avoids duplicate row labels
data = pd.concat(df_list, axis=0, ignore_index=True)

In [372]:
print data.shape
data.head(2)


(10011, 15)
Out[372]:
id rating bitter meaty piquant salty sour sweet cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds ingredient_list course
0 Bacon-Cheddar-Pinwheels-768341 4 0.833333 0.833333 0.000000 0.833333 0.166667 0.166667 NaN 5 16.0 900.0 2100.0 [Pillsbury™ Refrigerated Crescent Dinner Rolls... Appetizer
1 Fiesta-Corn-Dip-1711704 4 0.500000 0.166667 0.833333 0.833333 0.500000 0.166667 NaN 7 4.0 900.0 900.0 [sour cream, mayonnaise, ground cumin, mexicor... Appetizer

In [373]:
# label-encode the course column into integer class ids
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
data['course_num'] = le.fit_transform(data.course)
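To check which integer was assigned to which course, the fitted encoder's classes_ attribute gives the mapping (position = encoded value); this assumes the `le` fitted in the cell above:

print(dict(enumerate(le.classes_)))   # e.g. {0: 'Appetizer', ...}
print(le.inverse_transform([0]))      # recover the original label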

In [374]:
# lowercase all column headers
data.columns = [col.lower() for col in data.columns]

In [375]:
data.isnull().sum()


Out[375]:
id                       0
rating                   0
bitter                3242
meaty                 3242
piquant               3242
salty                 3242
sour                  3242
sweet                 3242
cooktimeinseconds     7406
ingredientcount          0
numberofservings         2
preptimeinseconds     6180
totaltimeinseconds       0
ingredient_list          0
course                   0
course_num               0
dtype: int64

In [376]:
# drop the mostly-null time columns and the original string course labels
data = data.drop(['cooktimeinseconds', 'preptimeinseconds', 'course'], axis = 1)
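numberofservings still carries two missing values that would trip up most estimators downstream; one option (my assumption, not something this notebook does) is a median fill:

# hypothetical: impute the two missing serving counts with the column median
data['numberofservings'] = data['numberofservings'].fillna(data['numberofservings'].median())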

In [378]:
# # exploratory check (run before the drop above): rows missing both time columns
# for index, row in data.iterrows():
#     if pd.isnull(row['cooktimeinseconds']) and pd.isnull(row['preptimeinseconds']):
#         print row

In [379]:
# clean the ingredient column
import string
# strip non-ASCII characters (Python 2: filter over a str returns a str)
printable = set(string.printable)
ingredient = [filter(lambda x: x in printable, z) for z in data['ingredient_list']]
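Relying on filter returning a str is Python 2 behaviour; under Python 3 filter yields an iterator, so a join is needed. A version-agnostic sketch:

import string

printable = set(string.printable)
# works on both Python 2 and 3
ingredient = [''.join(ch for ch in z if ch in printable)
              for z in data['ingredient_list']]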

In [380]:
ingredient[0]


Out[380]:
'[Pillsbury Refrigerated Crescent Dinner Rolls, ranch dressing, bacon pieces, shredded cheddar cheese, green onions]'

In [381]:
# remove the surrounding brackets
ingredient = [z[1:-1] for z in ingredient]

In [382]:
ingredient[0]


Out[382]:
'Pillsbury Refrigerated Crescent Dinner Rolls, ranch dressing, bacon pieces, shredded cheddar cheese, green onions'

In [383]:
# split on single spaces; note the trailing commas stay attached to the tokens (see below)
ingredient = [z.split(" ") for z in ingredient]
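A stricter tokenisation (an alternative sketch, not what this notebook ran) strips the comma separators before splitting, so tokens like 'Rolls,' come out clean:

s = 'Pillsbury Refrigerated Crescent Dinner Rolls, ranch dressing, bacon pieces'
print(s.replace(',', ' ').split())   # no trailing commas on the tokens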

In [384]:
ingredient[0]


Out[384]:
['Pillsbury',
 'Refrigerated',
 'Crescent',
 'Dinner',
 'Rolls,',
 'ranch',
 'dressing,',
 'bacon',
 'pieces,',
 'shredded',
 'cheddar',
 'cheese,',
 'green',
 'onions']

In [385]:
# perform word lemmatization (requires the WordNet corpus: nltk.download('wordnet'))
from nltk import stem
wordnet_lemm = stem.WordNetLemmatizer()

# lemmatize every token of every recipe's ingredient list
lem_ingredient = []
for ing in ingredient:
    lem_ingredient.append([wordnet_lemm.lemmatize(w) for w in ing])

# join back into one string per recipe for vectorization
lem_ingredient = [' '.join(z) for z in lem_ingredient]
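For a quick feel of what the lemmatizer does (nouns are its default part of speech):

from nltk import stem

wordnet_lemm = stem.WordNetLemmatizer()
print(wordnet_lemm.lemmatize('onions'))    # 'onion'
print(wordnet_lemm.lemmatize('cheeses'))   # 'cheese'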

In [386]:
data['ingredient_lem'] = lem_ingredient
data = data.drop('ingredient_list', axis = 1)
data.head(2)


Out[386]:
id rating bitter meaty piquant salty sour sweet ingredientcount numberofservings totaltimeinseconds course_num ingredient_lem
0 Bacon-Cheddar-Pinwheels-768341 4 0.833333 0.833333 0.000000 0.833333 0.166667 0.166667 5 16.0 2100.0 0 Pillsbury Refrigerated Crescent Dinner Rolls, ...
1 Fiesta-Corn-Dip-1711704 4 0.500000 0.166667 0.833333 0.833333 0.500000 0.166667 7 4.0 900.0 0 sour cream, mayonnaise, ground cumin, mexicorn...

In [387]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF: term frequency, down-weighted by how common a token is across recipes
vect = TfidfVectorizer()
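A toy illustration of the vectorizer's behaviour (my own example, not part of the pipeline): it lowercases, tokenises, and emits one idf-weighted column per token.

from sklearn.feature_extraction.text import TfidfVectorizer

toy = ['sour cream mayonnaise', 'sour cream onion']
demo_vect = TfidfVectorizer()
demo_matrix = demo_vect.fit_transform(toy)
print(demo_vect.get_feature_names())   # ['cream', 'mayonnaise', 'onion', 'sour']
print(demo_matrix.shape)               # (2, 4)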

In [388]:
from sklearn import cross_validation  # renamed sklearn.model_selection in sklearn >= 0.18

X = data['ingredient_lem']
print X.shape
Y = data['course_num']
print Y.shape

# default 25% hold-out split
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, random_state=10)

print X_train.shape
print Y_train.shape


(10011,)
(10011,)
(7508,)
(7508,)

In [389]:
# vectorize, then densify the sparse TF-IDF matrices (memory-heavy; see the sparse note below)
X_train_vect = vect.fit_transform(X_train) 
X_train_vect = X_train_vect.toarray() 
X_test_vect = vect.transform(X_test)
X_test_vect = X_test_vect.toarray()

In [390]:
# create a list of the feature names (get_feature_names_out() in sklearn >= 1.0)
features = vect.get_feature_names()
len(features)


Out[390]:
1813

In [402]:
# wrap the vectorized train/test arrays in dataframes and attach the labels
train_vect_df = pd.DataFrame(data = X_train_vect, columns = features)
train_vect_df['course_num'] = Y_train.values
test_vect_df = pd.DataFrame(data = X_test_vect, columns = features)
test_vect_df['course_num'] = Y_test.values
print train_vect_df.shape
print test_vect_df.shape


(7508, 1814)
(2503, 1814)
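Calling .toarray() on a 7508 x 1813 float matrix allocates roughly 100 MB; for larger vocabularies, a sketch of the sparse route (my suggestion, assuming the fitted `vect` and the splits above):

# hypothetical: keep the TF-IDF features in sparse CSR form
X_train_sparse = vect.transform(X_train)   # no .toarray()
X_test_sparse = vect.transform(X_test)
print(X_train_sparse.shape)                # (7508, 1813)
# most scikit-learn estimators (e.g. LogisticRegression) accept sparse input directly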

In [406]:
# train/test split the remaining numeric columns from the main dataframe;
# reusing random_state=10 on the same number of rows reproduces the same
# shuffle, so these rows line up with the TF-IDF split above
array = data.values
_X = array[:, 1:11]
print _X.shape
Y = data['course_num']
print Y.shape
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(_X, Y, random_state=10)
print X_train.shape
print Y_train.shape


(10011, 10)
(10011,)
(7508, 10)
(7508,)
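A more robust way to keep the two splits aligned (a sketch of my own, not the notebook's approach) is to split row indices once and index both feature sets with them:

import numpy as np
from sklearn import cross_validation

# hypothetical: one index split shared by the text and the numeric features
idx_train, idx_test = cross_validation.train_test_split(np.arange(len(data)), random_state=10)
X_text_train = data['ingredient_lem'].iloc[idx_train]
X_num_train = data.iloc[idx_train, 1:11]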

In [407]:
# create a dataframe for training and test set
train_df = pd.DataFrame(data = X_train, columns = list(data.columns[1:11]))
test_df = pd.DataFrame(data = X_test, columns = list(data.columns[1:11]))
print train_df.shape


(7508, 10)

In [408]:
# combine the training and test dataframes so each holds the numeric feature
# columns and the vectorized columns side by side; both use a fresh RangeIndex,
# so the axis=1 concat aligns row-for-row
data_train = pd.concat([train_df, train_vect_df], axis = 1)
data_test = pd.concat([test_df, test_vect_df], axis = 1)
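One caveat: TF-IDF feature names can collide with the numeric columns (the ingredient token 'sour' from 'sour cream', for instance, matches the 'sour' flavour column), leaving duplicate labels in the concatenated frames. A hedged guard using pandas rename:

# hypothetical: namespace the vectorized columns to avoid duplicate labels
rename_map = {f: 'tfidf_' + f for f in features}
train_vect_df = train_vect_df.rename(columns=rename_map)
test_vect_df = test_vect_df.rename(columns=rename_map)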
