In [1]:

    
import pandas as pd
import numpy as np

Soup

Main



In [53]:

    
df = pd.read_csv('SP_main.csv')
df1 = pd.read_csv('SP_main_1.csv')
df2 = pd.read_csv('SP_main_2.csv')
df3 = pd.read_csv('SP_main_3.csv')



In [54]:

    
print df.shape
df.head(3)









    



(500, 5)






    Out[54]:






  
    
      
      Unnamed: 0
      id
      rating
      recipeName
      sourceDisplayName
    
  
  
    
      0
      0
      Cheesy-Cauliflower-Soup-1710335
      4
      Cheesy Cauliflower Soup
      Souffles and Sawdust
    
    
      1
      1
      Cauliflower-Chowder-1709202
      4
      Cauliflower Chowder
      Damn Delicious
    
    
      2
      2
      Creamy-White-Bean-Tomato-Soup---perfect-for-an...
      4
      Creamy White Bean Tomato Soup - perfect for an...
      Vegetarian Mamma



In [55]:

    
df = df.drop('Unnamed: 0', 1)



In [56]:

    
print df1.shape
df1 = df1.drop('Unnamed: 0', 1)
df1.head(3)









    



(500, 5)






    Out[56]:






  
    
      
      id
      rating
      recipeName
      sourceDisplayName
    
  
  
    
      0
      Coconut-chicken-soup-345533
      5
      Coconut Chicken Soup
      Elana's Pantry
    
    
      1
      Okra-Soup-1707002
      4
      Okra Soup
      Heghine
    
    
      2
      Vegetable-_-Bean-Soup-1634117
      4
      Vegetable & Bean Soup
      Food Hunter



In [57]:

    
print df2.shape
df2 = df2.drop('Unnamed: 0', 1)
df2.head(3)









    



(500, 5)






    Out[57]:






  
    
      
      id
      rating
      recipeName
      sourceDisplayName
    
  
  
    
      0
      Healthy-Carrot-_-Ginger-Soup-Maker-Soup-1521247
      4
      Healthy Carrot & Ginger Soup Maker Soup
      Recipe This
    
    
      1
      Italian-Orzo-Spinach-Soup-682577
      4
      Italian Orzo Spinach Soup
      Gimme Some Oven
    
    
      2
      Creamy-Cauliflower-Soup-1625274
      4
      Creamy Cauliflower Soup
      Hello Sweetie



In [58]:

    
print df3.shape
df3 = df3.drop('Unnamed: 0', 1)
df3.head(3)









    



(500, 5)






    Out[58]:






  
    
      
      id
      rating
      recipeName
      sourceDisplayName
    
  
  
    
      0
      Best-tomato-soup-345204
      5
      Best Tomato Soup
      Lottie + Doof
    
    
      1
      Thai-Green-Curry-Spinach-Soup-763031
      4
      Thai Green Curry Spinach Soup
      Amuse Your Bouche
    
    
      2
      Spinach-tofu-soup-348926
      5
      Spinach-Tofu Soup
      Every Day with Rachael Ray



In [59]:

    
#concatenate the main tables. 
SP_main= pd.concat([df, df1, df2, df3])
#create a new dataframe with selected columns
SP_main_reduced = SP_main.drop(['recipeName', 'sourceDisplayName'], axis = 1)



In [60]:

    
#peek at dataframe
print SP_main.shape
SP_main = SP_main.sort_values(by = 'id')
SP_main.head(3)









    



(2000, 4)






    Out[60]:






  
    
      
      id
      rating
      recipeName
      sourceDisplayName
    
  
  
    
      203
      0_-of-the-calories_-Beet-soup-1684369
      4
      0% of the calories! Beet soup
      Good food. Best Recipes
    
    
      69
      10-Minute-Creamy-Cauliflower-Soup-988293
      4
      10 Minute Creamy Cauliflower Soup
      Super Healthy Kids
    
    
      136
      10-Minute-Paleo-Tomato-Soup-1006193
      4
      10-Minute Paleo Tomato Soup
      Cook Eat Paleo



In [61]:

    
for i in SP_main.duplicated('id'):
    if i == True:
        print i
SP_main = SP_main.drop_duplicates('id')
SP_main.shape









    



True
True






    Out[61]:





(1998, 4)



In [62]:

    
fdf = pd.read_csv('SP_flavors.csv')
fdf1 = pd.read_csv('SP_flavors_1.csv')
fdf2 = pd.read_csv('SP_flavors_2.csv')
fdf3 = pd.read_csv('SP_flavors_3.csv')



In [63]:

    
print fdf.shape
fdf = fdf.drop('Unnamed: 0', 1)
fdf = fdf.rename(columns = {'index':'id'})
fdf.head(3)









    



(500, 8)






    Out[63]:






  
    
      
      id
      bitter
      meaty
      piquant
      salty
      sour
      sweet
    
  
  
    
      0
      0_-of-the-calories_-Beet-soup-1684369
      0.166667
      0.166667
      0.0
      0.166667
      0.833333
      0.333333
    
    
      1
      20-Minute-Broccoli-Cheese-Soup-1311351
      0.166667
      0.166667
      0.0
      0.333333
      0.833333
      0.166667
    
    
      2
      20-Minute-Homemade-Chicken-Noodle-Soup-1278881
      0.166667
      0.166667
      0.0
      0.166667
      0.166667
      0.166667



In [64]:

    
print fdf1.shape
fdf1 = fdf1.drop('Unnamed: 0', 1)
fdf1 = fdf1.rename(columns = {'index':'id'})
fdf1.head(3)









    



(500, 8)






    Out[64]:






  
    
      
      id
      bitter
      meaty
      piquant
      salty
      sour
      sweet
    
  
  
    
      0
      10-Minute-Creamy-Cauliflower-Soup-988293
      0.500000
      0.166667
      0.000000
      0.333333
      0.333333
      0.333333
    
    
      1
      10-Minute-Paleo-Tomato-Soup-1006193
      0.333333
      0.166667
      0.166667
      0.166667
      0.500000
      0.166667
    
    
      2
      13-Bean-Soup-1682837
      0.166667
      0.000000
      0.166667
      0.166667
      0.166667
      0.166667



In [65]:

    
print fdf2.shape
fdf2 = fdf2.drop('Unnamed: 0', 1)
fdf2 = fdf2.rename(columns = {'index':'id'})
fdf2.head(3)









    



(500, 8)






    Out[65]:






  
    
      
      id
      bitter
      meaty
      piquant
      salty
      sour
      sweet
    
  
  
    
      0
      10-Minute-Tomato-Soup-472890
      0.166667
      0.166667
      0.0
      0.500000
      0.666667
      0.166667
    
    
      1
      15-Bean-Turkey-Soup-1378516
      0.166667
      0.166667
      0.0
      0.166667
      0.166667
      0.166667
    
    
      2
      15-Minute-Black-Bean-Soup-1633689
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN



In [66]:

    
print fdf3.shape
fdf3 = fdf3.drop('Unnamed: 0', 1)
fdf3 = fdf3.rename(columns = {'index':'id'})
fdf3.head(3)









    



(500, 8)






    Out[66]:






  
    
      
      id
      bitter
      meaty
      piquant
      salty
      sour
      sweet
    
  
  
    
      0
      10_-New-Mexican-Red-Chile-Posole-1264640
      0.166667
      0.833333
      0.666667
      0.166667
      0.166667
      0.166667
    
    
      1
      20-Minute-Sausage-and-Tortellini-Soup-477120
      0.166667
      0.166667
      0.666667
      0.166667
      0.333333
      0.166667
    
    
      2
      5-Ingredient-Pumpkin-Curry-Soup-1413710
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN



In [67]:

    
#concatenate the flavors tables. 
SP_flavors= pd.concat([fdf, fdf1, fdf2, fdf3])



In [68]:

    
#peek at dataframe
print SP_flavors.shape
SP_flavors = SP_flavors.sort_values(by= 'id')
SP_flavors.head(3)









    



(2000, 7)






    Out[68]:






  
    
      
      id
      bitter
      meaty
      piquant
      salty
      sour
      sweet
    
  
  
    
      0
      0_-of-the-calories_-Beet-soup-1684369
      0.166667
      0.166667
      0.000000
      0.166667
      0.833333
      0.333333
    
    
      0
      10-Minute-Creamy-Cauliflower-Soup-988293
      0.500000
      0.166667
      0.000000
      0.333333
      0.333333
      0.333333
    
    
      1
      10-Minute-Paleo-Tomato-Soup-1006193
      0.333333
      0.166667
      0.166667
      0.166667
      0.500000
      0.166667



In [69]:

    
len([i for i, j in zip(SP_flavors.id, SP_main.id) if i == j])









    Out[69]:





975



In [70]:

    
for i in SP_flavors.duplicated('id'):
    if i == True:
        print i
SP_flavors = SP_flavors.drop_duplicates('id')
SP_flavors.shape









    



True
True






    Out[70]:





(1998, 7)

Cuisine



In [71]:

    
cdf = pd.read_csv('SP_cuisines.csv')
cdf1 = pd.read_csv('SP_cuisines_1.csv')
cdf2 = pd.read_csv('SP_cuisines_2.csv')
cdf3 = pd.read_csv('SP_cuisines_3.csv')



In [72]:

    
print cdf.shape
cdf = cdf.drop('Unnamed: 0', 1)
cdf = cdf.rename(columns = {'index':'id'})
print cdf.columns
cdf.head(3)









    



(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')






    Out[72]:






  
    
      
      id
      American
      Asian
      Barbecue
      Cajun & Creole
      Chinese
      Cuban
      English
      French
      German
      ...
      Kid-Friendly
      Mediterranean
      Mexican
      Moroccan
      Portuguese
      Southern & Soul Food
      Southwestern
      Spanish
      Swedish
      Thai
    
  
  
    
      0
      0_-of-the-calories_-Beet-soup-1684369
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      20-Minute-Broccoli-Cheese-Soup-1311351
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      20-Minute-Homemade-Chicken-Noodle-Soup-1278881
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

3 rows × 27 columns



In [73]:

    
print cdf1.shape
cdf1 = cdf1.drop('Unnamed: 0', 1)
cdf1 = cdf1.rename(columns = {'index':'id'})
print cdf1.columns
cdf1.head(3)









    



(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')






    Out[73]:






  
    
      
      id
      American
      Asian
      Barbecue
      Cajun & Creole
      Chinese
      Cuban
      English
      French
      German
      ...
      Kid-Friendly
      Mediterranean
      Mexican
      Moroccan
      Portuguese
      Southern & Soul Food
      Southwestern
      Spanish
      Swedish
      Thai
    
  
  
    
      0
      10-Minute-Creamy-Cauliflower-Soup-988293
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      10-Minute-Paleo-Tomato-Soup-1006193
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      13-Bean-Soup-1682837
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

3 rows × 27 columns



In [74]:

    
print cdf2.shape
cdf2 = cdf2.drop('Unnamed: 0', 1)
cdf2 = cdf2.rename(columns = {'index':'id'})
print cdf2.columns
cdf2.head(3)









    



(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')






    Out[74]:






  
    
      
      id
      American
      Asian
      Barbecue
      Cajun & Creole
      Chinese
      Cuban
      English
      French
      German
      ...
      Kid-Friendly
      Mediterranean
      Mexican
      Moroccan
      Portuguese
      Southern & Soul Food
      Southwestern
      Spanish
      Swedish
      Thai
    
  
  
    
      0
      10-Minute-Tomato-Soup-472890
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      15-Bean-Turkey-Soup-1378516
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      15-Minute-Black-Bean-Soup-1633689
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

3 rows × 27 columns



In [75]:

    
print cdf3.shape
cdf3 = cdf3.drop('Unnamed: 0', 1)
cdf3 = cdf3.rename(columns = {'index':'id'})
print cdf3.columns
cdf3.head(3)









    



(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')






    Out[75]:






  
    
      
      id
      American
      Asian
      Barbecue
      Cajun & Creole
      Chinese
      Cuban
      English
      French
      German
      ...
      Kid-Friendly
      Mediterranean
      Mexican
      Moroccan
      Portuguese
      Southern & Soul Food
      Southwestern
      Spanish
      Swedish
      Thai
    
  
  
    
      0
      10_-New-Mexican-Red-Chile-Posole-1264640
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      20-Minute-Sausage-and-Tortellini-Soup-477120
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      5-Ingredient-Pumpkin-Curry-Soup-1413710
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

3 rows × 27 columns



In [76]:

    
#concatenate the cuisine tables. 
SP_cuisines= pd.concat([cdf, cdf1, cdf2, cdf3])



In [77]:

    
#peek at dataframe
print SP_cuisines.shape
cols = list(SP_cuisines)
cols.insert(0, cols.pop(cols.index('id')))
SP_cuisines = SP_cuisines.ix[:, cols]
SP_cuisines = SP_cuisines.sort_values(by ='id')
SP_cuisines.head(3)









    



(2000, 27)






    Out[77]:






  
    
      
      id
      American
      Asian
      Barbecue
      Cajun & Creole
      Chinese
      Cuban
      English
      French
      German
      ...
      Kid-Friendly
      Mediterranean
      Mexican
      Moroccan
      Portuguese
      Southern & Soul Food
      Southwestern
      Spanish
      Swedish
      Thai
    
  
  
    
      0
      0_-of-the-calories_-Beet-soup-1684369
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      0
      10-Minute-Creamy-Cauliflower-Soup-988293
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      10-Minute-Paleo-Tomato-Soup-1006193
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

3 rows × 27 columns



In [78]:

    
len([i for i, j in zip(SP_main.id, SP_cuisines.id) if i == j])









    Out[78]:





975



In [79]:

    
for i in SP_cuisines.duplicated('id'):
    if i == True:
        print i
SP_cuisines = SP_cuisines.drop_duplicates('id')
SP_cuisines.shape









    



True
True






    Out[79]:





(1998, 27)

Details



In [80]:

    
ddf = pd.read_csv('SP_details.csv')
ddf1 = pd.read_csv('SP_details_1.csv')
ddf2 = pd.read_csv('SP_details_2.csv')
ddf3 = pd.read_csv('SP_details_3.csv')



In [81]:

    
print len(set(ddf1.id) & set(df1.id))
print len(set(ddf2.id) & set(df2.id))
print len(set(ddf3.id) & set(df3.id))



In [82]:

    
print ddf.shape
ddf = ddf.drop('Unnamed: 0', 1)
print ddf.columns
ddf.head(3)









    



(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')






    Out[82]:






  
    
      
      id
      cookTimeInSeconds
      ingredientCount
      numberOfServings
      prepTimeInSeconds
      totalTimeInSeconds
    
  
  
    
      0
      Cheesy-Cauliflower-Soup-1710335
      NaN
      7
      4
      NaN
      2100
    
    
      1
      Cauliflower-Chowder-1709202
      NaN
      14
      6
      NaN
      3300
    
    
      2
      Creamy-White-Bean-Tomato-Soup---perfect-for-an...
      NaN
      5
      4
      NaN
      1800



In [83]:

    
print ddf1.shape
ddf1 = ddf1.drop('Unnamed: 0', 1)
print ddf1.columns
ddf1.head(3)









    



(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')






    Out[83]:






  
    
      
      id
      cookTimeInSeconds
      ingredientCount
      numberOfServings
      prepTimeInSeconds
      totalTimeInSeconds
    
  
  
    
      0
      Coconut-chicken-soup-345533
      NaN
      10
      4
      NaN
      1800.0
    
    
      1
      Okra-Soup-1707002
      NaN
      17
      4
      NaN
      3900.0
    
    
      2
      Vegetable-_-Bean-Soup-1634117
      NaN
      11
      4
      NaN
      3900.0



In [84]:

    
print ddf2.shape
ddf2 = ddf2.drop('Unnamed: 0', 1)
print ddf2.columns
ddf2.head(3)









    



(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')






    Out[84]:






  
    
      
      id
      cookTimeInSeconds
      ingredientCount
      numberOfServings
      prepTimeInSeconds
      totalTimeInSeconds
    
  
  
    
      0
      Healthy-Carrot-_-Ginger-Soup-Maker-Soup-1521247
      1500.0
      8
      2
      900.0
      2400
    
    
      1
      Italian-Orzo-Spinach-Soup-682577
      NaN
      14
      5
      NaN
      3000
    
    
      2
      Creamy-Cauliflower-Soup-1625274
      NaN
      9
      4
      NaN
      3000



In [85]:

    
print ddf3.shape
ddf3 = ddf3.drop('Unnamed: 0', 1)
print ddf3.columns
ddf3.head(3)









    



(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')






    Out[85]:






  
    
      
      id
      cookTimeInSeconds
      ingredientCount
      numberOfServings
      prepTimeInSeconds
      totalTimeInSeconds
    
  
  
    
      0
      Best-tomato-soup-345204
      NaN
      12
      6
      NaN
      4500.0
    
    
      1
      Thai-Green-Curry-Spinach-Soup-763031
      900.0
      12
      2
      300.0
      1200.0
    
    
      2
      Spinach-tofu-soup-348926
      900.0
      7
      4
      300.0
      1200.0



In [86]:

    
#concatenate the details tables. 
SP_details= pd.concat([ddf, ddf1, ddf2, ddf3])



In [87]:

    
#peek at dataframe
print SP_details.shape
SP_details = SP_details.sort_values(by = 'id')
SP_details.head(3)









    



(2000, 6)






    Out[87]:






  
    
      
      id
      cookTimeInSeconds
      ingredientCount
      numberOfServings
      prepTimeInSeconds
      totalTimeInSeconds
    
  
  
    
      203
      0_-of-the-calories_-Beet-soup-1684369
      NaN
      7
      4
      NaN
      3000.0
    
    
      69
      10-Minute-Creamy-Cauliflower-Soup-988293
      NaN
      7
      4
      NaN
      1800.0
    
    
      136
      10-Minute-Paleo-Tomato-Soup-1006193
      NaN
      7
      4
      NaN
      1200.0



In [88]:

    
len([i for i, j in zip(SP_main.id, SP_details.id) if i == j])









    Out[88]:





975



In [89]:

    
for i in SP_details.duplicated('id'):
    if i == True:
        print i
SP_details = SP_details.drop_duplicates('id')
SP_details.shape









    



True
True






    Out[89]:





(1998, 6)

Ingredients



In [90]:

    
idf = pd.read_csv('SP_ingredients.csv')
idf1 = pd.read_csv('SP_ingredients_1.csv')
idf2 = pd.read_csv('SP_ingredients_2.csv')
idf3 = pd.read_csv('SP_ingredients_3.csv')



In [91]:

    
print len(set(idf1.id) & set(ddf1.id))
print len(set(idf2.id) & set(ddf2.id))
print len(set(idf3.id) & set(ddf3.id))



In [92]:

    
print idf.shape
idf = idf.drop('Unnamed: 0', 1)
print idf.columns
idf.head(3)









    



(500, 838)
Index([u'id', u'course', u'acorn squash', u'agave nectar',
       u'aged cheddar cheese', u'all purpose potatoes', u'all-purpose flour',
       u'allspice', u'allspice berries', u'almond extract',
       ...
       u'worcestershire sauce', u'yellow bell pepper', u'yellow onion',
       u'yellow peppers', u'yellow split peas', u'yellow squash', u'yoghurt',
       u'yukon gold potatoes', u'zest', u'zucchini'],
      dtype='object', length=837)






    Out[92]:






  
    
      
      id
      course
      acorn squash
      agave nectar
      aged cheddar cheese
      all purpose potatoes
      all-purpose flour
      allspice
      allspice berries
      almond extract
      ...
      worcestershire sauce
      yellow bell pepper
      yellow onion
      yellow peppers
      yellow split peas
      yellow squash
      yoghurt
      yukon gold potatoes
      zest
      zucchini
    
  
  
    
      0
      Cheesy-Cauliflower-Soup-1710335
      Breakfast and Brunch
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      1
      Cauliflower-Chowder-1709202
      Breakfast and Brunch
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      2
      Creamy-White-Bean-Tomato-Soup---perfect-for-an...
      Breakfast and Brunch
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
  

3 rows × 837 columns



In [93]:

    
print idf1.shape
idf1 = idf1.drop('Unnamed: 0', 1)
print idf1.columns
idf1.head(3)









    



(500, 801)
Index([u'id', u'course', u'acorn squash', u'agave nectar', u'ale',
       u'all-purpose flour', u'allspice', u'almond milk', u'almonds',
       u'ancho chili pepper',
       ...
       u'yellow corn', u'yellow food coloring', u'yellow onion',
       u'yellow peppers', u'yellow split peas', u'yellow squash',
       u'yellow tomato', u'yoghurt', u'yukon gold potatoes', u'zucchini'],
      dtype='object', length=800)






    Out[93]:






  
    
      
      id
      course
      acorn squash
      agave nectar
      ale
      all-purpose flour
      allspice
      almond milk
      almonds
      ancho chili pepper
      ...
      yellow corn
      yellow food coloring
      yellow onion
      yellow peppers
      yellow split peas
      yellow squash
      yellow tomato
      yoghurt
      yukon gold potatoes
      zucchini
    
  
  
    
      0
      Coconut-chicken-soup-345533
      Breakfast and Brunch
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      1
      Okra-Soup-1707002
      Breakfast and Brunch
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      2
      Vegetable-_-Bean-Soup-1634117
      Breakfast and Brunch
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
  

3 rows × 800 columns



In [94]:

    
print idf2.shape
idf2 = idf2.drop('Unnamed: 0', 1)
print idf2.columns
idf2.head(3)









    



(500, 795)
Index([u'id', u'course', u'a1 steak sauce', u'all-purpose flour', u'allspice',
       u'allspice berries', u'almond milk', u'almonds', u'alphabet pasta',
       u'american cheese',
       ...
       u'worcestershire sauce', u'yeast', u'yellow hominy', u'yellow miso',
       u'yellow onion', u'yellow peppers', u'yellow squash', u'yoghurt',
       u'yukon gold potatoes', u'zucchini'],
      dtype='object', length=794)






    Out[94]:






  
    
      
      id
      course
      a1 steak sauce
      all-purpose flour
      allspice
      allspice berries
      almond milk
      almonds
      alphabet pasta
      american cheese
      ...
      worcestershire sauce
      yeast
      yellow hominy
      yellow miso
      yellow onion
      yellow peppers
      yellow squash
      yoghurt
      yukon gold potatoes
      zucchini
    
  
  
    
      0
      Healthy-Carrot-_-Ginger-Soup-Maker-Soup-1521247
      Breakfast and Brunch
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      1
      Italian-Orzo-Spinach-Soup-682577
      Breakfast and Brunch
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      2
      Creamy-Cauliflower-Soup-1625274
      Breakfast and Brunch
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
  

3 rows × 794 columns



In [95]:

    
print idf3.shape
idf3 = idf3.drop('Unnamed: 0', 1)
print idf3.columns
idf3.head(3)









    



(500, 778)
Index([u'id', u'course', u'acorn squash', u'ale', u'all purpose potatoes',
       u'all purpose seasoning', u'all-purpose flour', u'allspice',
       u'almond butter', u'almond milk',
       ...
       u'xanthan gum', u'yams', u'yeast', u'yellow mustard seeds',
       u'yellow onion', u'yellow split peas', u'yellow squash', u'yoghurt',
       u'yukon gold potatoes', u'zucchini'],
      dtype='object', length=777)






    Out[95]:






  
    
      
      id
      course
      acorn squash
      ale
      all purpose potatoes
      all purpose seasoning
      all-purpose flour
      allspice
      almond butter
      almond milk
      ...
      xanthan gum
      yams
      yeast
      yellow mustard seeds
      yellow onion
      yellow split peas
      yellow squash
      yoghurt
      yukon gold potatoes
      zucchini
    
  
  
    
      0
      Best-tomato-soup-345204
      Breakfast and Brunch
      0.0
      0.0
      0.0
      0.0
      1.0
      1.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      1
      Thai-Green-Curry-Spinach-Soup-763031
      Breakfast and Brunch
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      2
      Spinach-tofu-soup-348926
      Breakfast and Brunch
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
  

3 rows × 777 columns



In [96]:

    
#concatenate the ingredients tables. 
SP_ing= pd.concat([idf, idf1, idf2, idf3])
#create a new dataframe with selected columns
SP_ing_reduced = SP_ing[['id', 'ingredient_list']]



In [97]:

    
print SP_ing.shape
SP_ing.head(3)









    



(2000, 1515)






    Out[97]:






  
    
      
      a1 steak sauce
      acorn squash
      agave nectar
      aged cheddar cheese
      ale
      all purpose potatoes
      all purpose seasoning
      all-purpose flour
      allspice
      allspice berries
      ...
      yellow mustard seeds
      yellow onion
      yellow peppers
      yellow split peas
      yellow squash
      yellow tomato
      yoghurt
      yukon gold potatoes
      zest
      zucchini
    
  
  
    
      0
      NaN
      0.0
      0.0
      0.0
      NaN
      0.0
      NaN
      0.0
      0.0
      0.0
      ...
      NaN
      0.0
      0.0
      0.0
      0.0
      NaN
      0.0
      0.0
      0.0
      0.0
    
    
      1
      NaN
      0.0
      0.0
      0.0
      NaN
      0.0
      NaN
      0.0
      0.0
      0.0
      ...
      NaN
      0.0
      0.0
      0.0
      0.0
      NaN
      0.0
      0.0
      0.0
      0.0
    
    
      2
      NaN
      0.0
      0.0
      0.0
      NaN
      0.0
      NaN
      0.0
      0.0
      0.0
      ...
      NaN
      0.0
      0.0
      0.0
      0.0
      NaN
      0.0
      0.0
      0.0
      0.0
    
  

3 rows × 1515 columns



In [98]:

    
#drop unnamed column & make id first column
#SP_ing = SP_ing.drop('Unnamed: 0', 1)
cols = list(SP_ing)
cols.insert(0, cols.pop(cols.index('id')))
SP_ing = SP_ing.ix[:, cols]



In [99]:

    
print SP_ing.shape
SP_ing = SP_ing.sort_values(by = 'id')
SP_ing.head(3)









    



(2000, 1515)






    Out[99]:






  
    
      
      id
      a1 steak sauce
      acorn squash
      agave nectar
      aged cheddar cheese
      ale
      all purpose potatoes
      all purpose seasoning
      all-purpose flour
      allspice
      ...
      yellow mustard seeds
      yellow onion
      yellow peppers
      yellow split peas
      yellow squash
      yellow tomato
      yoghurt
      yukon gold potatoes
      zest
      zucchini
    
  
  
    
      203
      0_-of-the-calories_-Beet-soup-1684369
      NaN
      0.0
      0.0
      0.0
      NaN
      0.0
      NaN
      0.0
      0.0
      ...
      NaN
      0.0
      0.0
      0.0
      0.0
      NaN
      0.0
      0.0
      0.0
      0.0
    
    
      69
      10-Minute-Creamy-Cauliflower-Soup-988293
      NaN
      0.0
      0.0
      NaN
      0.0
      NaN
      NaN
      0.0
      0.0
      ...
      NaN
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      NaN
      0.0
    
    
      136
      10-Minute-Paleo-Tomato-Soup-1006193
      NaN
      0.0
      0.0
      NaN
      0.0
      NaN
      NaN
      0.0
      0.0
      ...
      NaN
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      NaN
      0.0
    
  

3 rows × 1515 columns



In [100]:

    
len([i for i, j in zip(SP_main.id, SP_ing.id) if i != j])









    Out[100]:





1023



In [101]:

    
for i in SP_ing.duplicated('id'):
    if i == True:
        print i
SP_ing = SP_ing.drop_duplicates('id')
SP_ing.shape









    



True
True






    Out[101]:





(1998, 1515)

Join all tables for SOUP



In [102]:

    
# set index to column 'id'
_df = [SP_main, SP_main_reduced, SP_cuisines, SP_flavors, SP_details, SP_ing, SP_ing_reduced]

for df in _df:
     df.set_index('id', inplace = True)



In [103]:

    
# join dataframes
SP_data = SP_main.join([SP_cuisines, SP_flavors, SP_details, SP_ing])
SP_data_reduced = SP_main_reduced.join([SP_flavors, SP_details, SP_ing_reduced])
# create a course column
SP_data['course'] = 'soup'
SP_data_reduced['course'] = 'soup'



In [104]:

    
print SP_data.shape
SP_data.head(3)









    



(1998, 1554)






    Out[104]:






  
    
      
      rating
      recipeName
      sourceDisplayName
      American
      Asian
      Barbecue
      Cajun & Creole
      Chinese
      Cuban
      English
      ...
      yellow mustard seeds
      yellow onion
      yellow peppers
      yellow split peas
      yellow squash
      yellow tomato
      yoghurt
      yukon gold potatoes
      zest
      zucchini
    
    
      id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      0_-of-the-calories_-Beet-soup-1684369
      4
      0% of the calories! Beet soup
      Good food. Best Recipes
      0
      0
      0
      0
      0
      0
      0
      ...
      NaN
      0.0
      0.0
      0.0
      0.0
      NaN
      0.0
      0.0
      0.0
      0.0
    
    
      10-Minute-Creamy-Cauliflower-Soup-988293
      4
      10 Minute Creamy Cauliflower Soup
      Super Healthy Kids
      0
      0
      0
      0
      0
      0
      0
      ...
      NaN
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      NaN
      0.0
    
    
      10-Minute-Paleo-Tomato-Soup-1006193
      4
      10-Minute Paleo Tomato Soup
      Cook Eat Paleo
      0
      0
      0
      0
      0
      0
      0
      ...
      NaN
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      NaN
      0.0
    
  

3 rows × 1554 columns



In [105]:

    
# save to csv
SP_data.to_csv('SP_data.csv')
SP_data_reduced.to_csv('SP_data_reduced.csv')

	Unnamed: 0	id	rating	recipeName	sourceDisplayName
0	0	Cheesy-Cauliflower-Soup-1710335	4	Cheesy Cauliflower Soup	Souffles and Sawdust
1	1	Cauliflower-Chowder-1709202	4	Cauliflower Chowder	Damn Delicious
2	2	Creamy-White-Bean-Tomato-Soup---perfect-for-an...	4	Creamy White Bean Tomato Soup - perfect for an...	Vegetarian Mamma

	id	rating	recipeName	sourceDisplayName
0	Coconut-chicken-soup-345533	5	Coconut Chicken Soup	Elana's Pantry
1	Okra-Soup-1707002	4	Okra Soup	Heghine
2	Vegetable-_-Bean-Soup-1634117	4	Vegetable & Bean Soup	Food Hunter

	id	rating	recipeName	sourceDisplayName
0	Healthy-Carrot-_-Ginger-Soup-Maker-Soup-1521247	4	Healthy Carrot & Ginger Soup Maker Soup	Recipe This
1	Italian-Orzo-Spinach-Soup-682577	4	Italian Orzo Spinach Soup	Gimme Some Oven
2	Creamy-Cauliflower-Soup-1625274	4	Creamy Cauliflower Soup	Hello Sweetie

	id	rating	recipeName	sourceDisplayName
0	Best-tomato-soup-345204	5	Best Tomato Soup	Lottie + Doof
1	Thai-Green-Curry-Spinach-Soup-763031	4	Thai Green Curry Spinach Soup	Amuse Your Bouche
2	Spinach-tofu-soup-348926	5	Spinach-Tofu Soup	Every Day with Rachael Ray

	id	rating	recipeName	sourceDisplayName
203	0_-of-the-calories_-Beet-soup-1684369	4	0% of the calories! Beet soup	Good food. Best Recipes
69	10-Minute-Creamy-Cauliflower-Soup-988293	4	10 Minute Creamy Cauliflower Soup	Super Healthy Kids
136	10-Minute-Paleo-Tomato-Soup-1006193	4	10-Minute Paleo Tomato Soup	Cook Eat Paleo

	id	bitter	meaty	salty	sour	sweet
0	0_-of-the-calories_-Beet-soup-1684369	0.166667	0.166667	0.166667	0.833333	0.333333
1	20-Minute-Broccoli-Cheese-Soup-1311351	0.166667	0.166667	0.333333	0.833333	0.166667
2	20-Minute-Homemade-Chicken-Noodle-Soup-1278881	0.166667	0.166667	0.166667	0.166667	0.166667

	id	bitter	meaty	piquant	salty	sour	sweet
0	10-Minute-Creamy-Cauliflower-Soup-988293	0.500000	0.166667	0.000000	0.333333	0.333333	0.333333
1	10-Minute-Paleo-Tomato-Soup-1006193	0.333333	0.166667	0.166667	0.166667	0.500000	0.166667
2	13-Bean-Soup-1682837	0.166667	0.000000	0.166667	0.166667	0.166667	0.166667

	id	bitter	meaty	piquant	salty	sour	sweet
0	10-Minute-Tomato-Soup-472890	0.166667	0.166667	0.0	0.500000	0.666667	0.166667
1	15-Bean-Turkey-Soup-1378516	0.166667	0.166667	0.0	0.166667	0.166667	0.166667
2	15-Minute-Black-Bean-Soup-1633689	NaN	NaN	NaN	NaN	NaN	NaN

	id	bitter	meaty	piquant	salty	sour	sweet
0	10_-New-Mexican-Red-Chile-Posole-1264640	0.166667	0.833333	0.666667	0.166667	0.166667	0.166667
1	20-Minute-Sausage-and-Tortellini-Soup-477120	0.166667	0.166667	0.666667	0.166667	0.333333	0.166667
2	5-Ingredient-Pumpkin-Curry-Soup-1413710	NaN	NaN	NaN	NaN	NaN	NaN

	id	cookTimeInSeconds	ingredientCount	numberOfServings	prepTimeInSeconds	totalTimeInSeconds
0	Cheesy-Cauliflower-Soup-1710335	NaN	7	4	NaN	2100
1	Cauliflower-Chowder-1709202	NaN	14	6	NaN	3300
2	Creamy-White-Bean-Tomato-Soup---perfect-for-an...	NaN	5	4	NaN	1800

	id	cookTimeInSeconds	ingredientCount	numberOfServings	prepTimeInSeconds	totalTimeInSeconds
0	Coconut-chicken-soup-345533	NaN	10	4	NaN	1800.0
1	Okra-Soup-1707002	NaN	17	4	NaN	3900.0
2	Vegetable-_-Bean-Soup-1634117	NaN	11	4	NaN	3900.0

	id	cookTimeInSeconds	ingredientCount	numberOfServings	prepTimeInSeconds	totalTimeInSeconds
0	Healthy-Carrot-_-Ginger-Soup-Maker-Soup-1521247	1500.0	8	2	900.0	2400
1	Italian-Orzo-Spinach-Soup-682577	NaN	14	5	NaN	3000
2	Creamy-Cauliflower-Soup-1625274	NaN	9	4	NaN	3000

	id	cookTimeInSeconds	ingredientCount	numberOfServings	prepTimeInSeconds	totalTimeInSeconds
0	Best-tomato-soup-345204	NaN	12	6	NaN	4500.0
1	Thai-Green-Curry-Spinach-Soup-763031	900.0	12	2	300.0	1200.0
2	Spinach-tofu-soup-348926	900.0	7	4	300.0	1200.0

	id	cookTimeInSeconds	ingredientCount	numberOfServings	prepTimeInSeconds	totalTimeInSeconds
203	0_-of-the-calories_-Beet-soup-1684369	NaN	7	4	NaN	3000.0
69	10-Minute-Creamy-Cauliflower-Soup-988293	NaN	7	4	NaN	1800.0
136	10-Minute-Paleo-Tomato-Soup-1006193	NaN	7	4	NaN	1200.0

	id	course	...
0	Cheesy-Cauliflower-Soup-1710335	Breakfast and Brunch	...
1	Cauliflower-Chowder-1709202	Breakfast and Brunch	...
2	Creamy-White-Bean-Tomato-Soup---perfect-for-an...	Breakfast and Brunch	...

	a1 steak sauce	ale	all purpose seasoning	...	yellow mustard seeds	yellow tomato
0	NaN	NaN	NaN	...	NaN	NaN
1	NaN	NaN	NaN	...	NaN	NaN
2	NaN	NaN	NaN	...	NaN	NaN

	a1 steak sauce	ale	all purpose seasoning	...	yellow mustard seeds	yellow tomato
0	NaN	NaN	NaN	...	NaN	NaN
1	NaN	NaN	NaN	...	NaN	NaN
2	NaN	NaN	NaN	...	NaN	NaN