In [1]:
import pandas as pd
import numpy as np

Soup

  • Main

In [53]:
df = pd.read_csv('SP_main.csv')
df1 = pd.read_csv('SP_main_1.csv')
df2 = pd.read_csv('SP_main_2.csv')
df3 = pd.read_csv('SP_main_3.csv')

In [54]:
print df.shape
df.head(3)


(500, 5)
Out[54]:
Unnamed: 0 id rating recipeName sourceDisplayName
0 0 Cheesy-Cauliflower-Soup-1710335 4 Cheesy Cauliflower Soup Souffles and Sawdust
1 1 Cauliflower-Chowder-1709202 4 Cauliflower Chowder Damn Delicious
2 2 Creamy-White-Bean-Tomato-Soup---perfect-for-an... 4 Creamy White Bean Tomato Soup - perfect for an... Vegetarian Mamma

In [55]:
df = df.drop('Unnamed: 0', 1)

In [56]:
print df1.shape
df1 = df1.drop('Unnamed: 0', 1)
df1.head(3)


(500, 5)
Out[56]:
id rating recipeName sourceDisplayName
0 Coconut-chicken-soup-345533 5 Coconut Chicken Soup Elana's Pantry
1 Okra-Soup-1707002 4 Okra Soup Heghine
2 Vegetable-_-Bean-Soup-1634117 4 Vegetable & Bean Soup Food Hunter

In [57]:
print df2.shape
df2 = df2.drop('Unnamed: 0', 1)
df2.head(3)


(500, 5)
Out[57]:
id rating recipeName sourceDisplayName
0 Healthy-Carrot-_-Ginger-Soup-Maker-Soup-1521247 4 Healthy Carrot & Ginger Soup Maker Soup Recipe This
1 Italian-Orzo-Spinach-Soup-682577 4 Italian Orzo Spinach Soup Gimme Some Oven
2 Creamy-Cauliflower-Soup-1625274 4 Creamy Cauliflower Soup Hello Sweetie

In [58]:
print df3.shape
df3 = df3.drop('Unnamed: 0', 1)
df3.head(3)


(500, 5)
Out[58]:
id rating recipeName sourceDisplayName
0 Best-tomato-soup-345204 5 Best Tomato Soup Lottie + Doof
1 Thai-Green-Curry-Spinach-Soup-763031 4 Thai Green Curry Spinach Soup Amuse Your Bouche
2 Spinach-tofu-soup-348926 5 Spinach-Tofu Soup Every Day with Rachael Ray

In [59]:
#concatenate the main tables. 
SP_main= pd.concat([df, df1, df2, df3])
#create a new dataframe with selected columns
SP_main_reduced = SP_main.drop(['recipeName', 'sourceDisplayName'], axis = 1)

In [60]:
#peek at dataframe
print SP_main.shape
SP_main = SP_main.sort_values(by = 'id')
SP_main.head(3)


(2000, 4)
Out[60]:
id rating recipeName sourceDisplayName
203 0_-of-the-calories_-Beet-soup-1684369 4 0% of the calories! Beet soup Good food. Best Recipes
69 10-Minute-Creamy-Cauliflower-Soup-988293 4 10 Minute Creamy Cauliflower Soup Super Healthy Kids
136 10-Minute-Paleo-Tomato-Soup-1006193 4 10-Minute Paleo Tomato Soup Cook Eat Paleo

In [61]:
for i in SP_main.duplicated('id'):
    if i == True:
        print i
SP_main = SP_main.drop_duplicates('id')
SP_main.shape


True
True
Out[61]:
(1998, 4)

In [62]:
fdf = pd.read_csv('SP_flavors.csv')
fdf1 = pd.read_csv('SP_flavors_1.csv')
fdf2 = pd.read_csv('SP_flavors_2.csv')
fdf3 = pd.read_csv('SP_flavors_3.csv')

In [63]:
print fdf.shape
fdf = fdf.drop('Unnamed: 0', 1)
fdf = fdf.rename(columns = {'index':'id'})
fdf.head(3)


(500, 8)
Out[63]:
id bitter meaty piquant salty sour sweet
0 0_-of-the-calories_-Beet-soup-1684369 0.166667 0.166667 0.0 0.166667 0.833333 0.333333
1 20-Minute-Broccoli-Cheese-Soup-1311351 0.166667 0.166667 0.0 0.333333 0.833333 0.166667
2 20-Minute-Homemade-Chicken-Noodle-Soup-1278881 0.166667 0.166667 0.0 0.166667 0.166667 0.166667

In [64]:
print fdf1.shape
fdf1 = fdf1.drop('Unnamed: 0', 1)
fdf1 = fdf1.rename(columns = {'index':'id'})
fdf1.head(3)


(500, 8)
Out[64]:
id bitter meaty piquant salty sour sweet
0 10-Minute-Creamy-Cauliflower-Soup-988293 0.500000 0.166667 0.000000 0.333333 0.333333 0.333333
1 10-Minute-Paleo-Tomato-Soup-1006193 0.333333 0.166667 0.166667 0.166667 0.500000 0.166667
2 13-Bean-Soup-1682837 0.166667 0.000000 0.166667 0.166667 0.166667 0.166667

In [65]:
print fdf2.shape
fdf2 = fdf2.drop('Unnamed: 0', 1)
fdf2 = fdf2.rename(columns = {'index':'id'})
fdf2.head(3)


(500, 8)
Out[65]:
id bitter meaty piquant salty sour sweet
0 10-Minute-Tomato-Soup-472890 0.166667 0.166667 0.0 0.500000 0.666667 0.166667
1 15-Bean-Turkey-Soup-1378516 0.166667 0.166667 0.0 0.166667 0.166667 0.166667
2 15-Minute-Black-Bean-Soup-1633689 NaN NaN NaN NaN NaN NaN

In [66]:
print fdf3.shape
fdf3 = fdf3.drop('Unnamed: 0', 1)
fdf3 = fdf3.rename(columns = {'index':'id'})
fdf3.head(3)


(500, 8)
Out[66]:
id bitter meaty piquant salty sour sweet
0 10_-New-Mexican-Red-Chile-Posole-1264640 0.166667 0.833333 0.666667 0.166667 0.166667 0.166667
1 20-Minute-Sausage-and-Tortellini-Soup-477120 0.166667 0.166667 0.666667 0.166667 0.333333 0.166667
2 5-Ingredient-Pumpkin-Curry-Soup-1413710 NaN NaN NaN NaN NaN NaN

In [67]:
#concatenate the flavors tables. 
SP_flavors= pd.concat([fdf, fdf1, fdf2, fdf3])

In [68]:
#peek at dataframe
print SP_flavors.shape
SP_flavors = SP_flavors.sort_values(by= 'id')
SP_flavors.head(3)


(2000, 7)
Out[68]:
id bitter meaty piquant salty sour sweet
0 0_-of-the-calories_-Beet-soup-1684369 0.166667 0.166667 0.000000 0.166667 0.833333 0.333333
0 10-Minute-Creamy-Cauliflower-Soup-988293 0.500000 0.166667 0.000000 0.333333 0.333333 0.333333
1 10-Minute-Paleo-Tomato-Soup-1006193 0.333333 0.166667 0.166667 0.166667 0.500000 0.166667

In [69]:
len([i for i, j in zip(SP_flavors.id, SP_main.id) if i == j])


Out[69]:
975

In [70]:
for i in SP_flavors.duplicated('id'):
    if i == True:
        print i
SP_flavors = SP_flavors.drop_duplicates('id')
SP_flavors.shape


True
True
Out[70]:
(1998, 7)
  • Cuisine

In [71]:
cdf = pd.read_csv('SP_cuisines.csv')
cdf1 = pd.read_csv('SP_cuisines_1.csv')
cdf2 = pd.read_csv('SP_cuisines_2.csv')
cdf3 = pd.read_csv('SP_cuisines_3.csv')

In [72]:
print cdf.shape
cdf = cdf.drop('Unnamed: 0', 1)
cdf = cdf.rename(columns = {'index':'id'})
print cdf.columns
cdf.head(3)


(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')
Out[72]:
id American Asian Barbecue Cajun & Creole Chinese Cuban English French German ... Kid-Friendly Mediterranean Mexican Moroccan Portuguese Southern & Soul Food Southwestern Spanish Swedish Thai
0 0_-of-the-calories_-Beet-soup-1684369 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 20-Minute-Broccoli-Cheese-Soup-1311351 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 20-Minute-Homemade-Chicken-Noodle-Soup-1278881 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 27 columns


In [73]:
print cdf1.shape
cdf1 = cdf1.drop('Unnamed: 0', 1)
cdf1 = cdf1.rename(columns = {'index':'id'})
print cdf1.columns
cdf1.head(3)


(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')
Out[73]:
id American Asian Barbecue Cajun & Creole Chinese Cuban English French German ... Kid-Friendly Mediterranean Mexican Moroccan Portuguese Southern & Soul Food Southwestern Spanish Swedish Thai
0 10-Minute-Creamy-Cauliflower-Soup-988293 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 10-Minute-Paleo-Tomato-Soup-1006193 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 13-Bean-Soup-1682837 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 27 columns


In [74]:
print cdf2.shape
cdf2 = cdf2.drop('Unnamed: 0', 1)
cdf2 = cdf2.rename(columns = {'index':'id'})
print cdf2.columns
cdf2.head(3)


(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')
Out[74]:
id American Asian Barbecue Cajun & Creole Chinese Cuban English French German ... Kid-Friendly Mediterranean Mexican Moroccan Portuguese Southern & Soul Food Southwestern Spanish Swedish Thai
0 10-Minute-Tomato-Soup-472890 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 15-Bean-Turkey-Soup-1378516 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 15-Minute-Black-Bean-Soup-1633689 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 27 columns


In [75]:
print cdf3.shape
cdf3 = cdf3.drop('Unnamed: 0', 1)
cdf3 = cdf3.rename(columns = {'index':'id'})
print cdf3.columns
cdf3.head(3)


(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')
Out[75]:
id American Asian Barbecue Cajun & Creole Chinese Cuban English French German ... Kid-Friendly Mediterranean Mexican Moroccan Portuguese Southern & Soul Food Southwestern Spanish Swedish Thai
0 10_-New-Mexican-Red-Chile-Posole-1264640 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
1 20-Minute-Sausage-and-Tortellini-Soup-477120 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 5-Ingredient-Pumpkin-Curry-Soup-1413710 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 27 columns


In [76]:
#concatenate the cuisine tables. 
SP_cuisines= pd.concat([cdf, cdf1, cdf2, cdf3])

In [77]:
#peek at dataframe
print SP_cuisines.shape
cols = list(SP_cuisines)
cols.insert(0, cols.pop(cols.index('id')))
SP_cuisines = SP_cuisines.ix[:, cols]
SP_cuisines = SP_cuisines.sort_values(by ='id')
SP_cuisines.head(3)


(2000, 27)
Out[77]:
id American Asian Barbecue Cajun & Creole Chinese Cuban English French German ... Kid-Friendly Mediterranean Mexican Moroccan Portuguese Southern & Soul Food Southwestern Spanish Swedish Thai
0 0_-of-the-calories_-Beet-soup-1684369 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
0 10-Minute-Creamy-Cauliflower-Soup-988293 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 10-Minute-Paleo-Tomato-Soup-1006193 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 27 columns


In [78]:
len([i for i, j in zip(SP_main.id, SP_cuisines.id) if i == j])


Out[78]:
975

In [79]:
for i in SP_cuisines.duplicated('id'):
    if i == True:
        print i
SP_cuisines = SP_cuisines.drop_duplicates('id')
SP_cuisines.shape


True
True
Out[79]:
(1998, 27)
  • Details

In [80]:
ddf = pd.read_csv('SP_details.csv')
ddf1 = pd.read_csv('SP_details_1.csv')
ddf2 = pd.read_csv('SP_details_2.csv')
ddf3 = pd.read_csv('SP_details_3.csv')

In [81]:
print len(set(ddf1.id) & set(df1.id))
print len(set(ddf2.id) & set(df2.id))
print len(set(ddf3.id) & set(df3.id))


500
500
500

In [82]:
print ddf.shape
ddf = ddf.drop('Unnamed: 0', 1)
print ddf.columns
ddf.head(3)


(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')
Out[82]:
id cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds
0 Cheesy-Cauliflower-Soup-1710335 NaN 7 4 NaN 2100
1 Cauliflower-Chowder-1709202 NaN 14 6 NaN 3300
2 Creamy-White-Bean-Tomato-Soup---perfect-for-an... NaN 5 4 NaN 1800

In [83]:
print ddf1.shape
ddf1 = ddf1.drop('Unnamed: 0', 1)
print ddf1.columns
ddf1.head(3)


(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')
Out[83]:
id cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds
0 Coconut-chicken-soup-345533 NaN 10 4 NaN 1800.0
1 Okra-Soup-1707002 NaN 17 4 NaN 3900.0
2 Vegetable-_-Bean-Soup-1634117 NaN 11 4 NaN 3900.0

In [84]:
print ddf2.shape
ddf2 = ddf2.drop('Unnamed: 0', 1)
print ddf2.columns
ddf2.head(3)


(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')
Out[84]:
id cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds
0 Healthy-Carrot-_-Ginger-Soup-Maker-Soup-1521247 1500.0 8 2 900.0 2400
1 Italian-Orzo-Spinach-Soup-682577 NaN 14 5 NaN 3000
2 Creamy-Cauliflower-Soup-1625274 NaN 9 4 NaN 3000

In [85]:
print ddf3.shape
ddf3 = ddf3.drop('Unnamed: 0', 1)
print ddf3.columns
ddf3.head(3)


(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')
Out[85]:
id cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds
0 Best-tomato-soup-345204 NaN 12 6 NaN 4500.0
1 Thai-Green-Curry-Spinach-Soup-763031 900.0 12 2 300.0 1200.0
2 Spinach-tofu-soup-348926 900.0 7 4 300.0 1200.0

In [86]:
#concatenate the details tables. 
SP_details= pd.concat([ddf, ddf1, ddf2, ddf3])

In [87]:
#peek at dataframe
print SP_details.shape
SP_details = SP_details.sort_values(by = 'id')
SP_details.head(3)


(2000, 6)
Out[87]:
id cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds
203 0_-of-the-calories_-Beet-soup-1684369 NaN 7 4 NaN 3000.0
69 10-Minute-Creamy-Cauliflower-Soup-988293 NaN 7 4 NaN 1800.0
136 10-Minute-Paleo-Tomato-Soup-1006193 NaN 7 4 NaN 1200.0

In [88]:
len([i for i, j in zip(SP_main.id, SP_details.id) if i == j])


Out[88]:
975

In [89]:
for i in SP_details.duplicated('id'):
    if i == True:
        print i
SP_details = SP_details.drop_duplicates('id')
SP_details.shape


True
True
Out[89]:
(1998, 6)
  • Ingredients

In [90]:
idf = pd.read_csv('SP_ingredients.csv')
idf1 = pd.read_csv('SP_ingredients_1.csv')
idf2 = pd.read_csv('SP_ingredients_2.csv')
idf3 = pd.read_csv('SP_ingredients_3.csv')

In [91]:
print len(set(idf1.id) & set(ddf1.id))
print len(set(idf2.id) & set(ddf2.id))
print len(set(idf3.id) & set(ddf3.id))


500
500
500

In [92]:
print idf.shape
idf = idf.drop('Unnamed: 0', 1)
print idf.columns
idf.head(3)


(500, 838)
Index([u'id', u'course', u'acorn squash', u'agave nectar',
       u'aged cheddar cheese', u'all purpose potatoes', u'all-purpose flour',
       u'allspice', u'allspice berries', u'almond extract',
       ...
       u'worcestershire sauce', u'yellow bell pepper', u'yellow onion',
       u'yellow peppers', u'yellow split peas', u'yellow squash', u'yoghurt',
       u'yukon gold potatoes', u'zest', u'zucchini'],
      dtype='object', length=837)
Out[92]:
id course acorn squash agave nectar aged cheddar cheese all purpose potatoes all-purpose flour allspice allspice berries almond extract ... worcestershire sauce yellow bell pepper yellow onion yellow peppers yellow split peas yellow squash yoghurt yukon gold potatoes zest zucchini
0 Cheesy-Cauliflower-Soup-1710335 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Cauliflower-Chowder-1709202 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Creamy-White-Bean-Tomato-Soup---perfect-for-an... Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 837 columns


In [93]:
print idf1.shape
idf1 = idf1.drop('Unnamed: 0', 1)
print idf1.columns
idf1.head(3)


(500, 801)
Index([u'id', u'course', u'acorn squash', u'agave nectar', u'ale',
       u'all-purpose flour', u'allspice', u'almond milk', u'almonds',
       u'ancho chili pepper',
       ...
       u'yellow corn', u'yellow food coloring', u'yellow onion',
       u'yellow peppers', u'yellow split peas', u'yellow squash',
       u'yellow tomato', u'yoghurt', u'yukon gold potatoes', u'zucchini'],
      dtype='object', length=800)
Out[93]:
id course acorn squash agave nectar ale all-purpose flour allspice almond milk almonds ancho chili pepper ... yellow corn yellow food coloring yellow onion yellow peppers yellow split peas yellow squash yellow tomato yoghurt yukon gold potatoes zucchini
0 Coconut-chicken-soup-345533 Breakfast and Brunch 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Okra-Soup-1707002 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Vegetable-_-Bean-Soup-1634117 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 800 columns


In [94]:
print idf2.shape
idf2 = idf2.drop('Unnamed: 0', 1)
print idf2.columns
idf2.head(3)


(500, 795)
Index([u'id', u'course', u'a1 steak sauce', u'all-purpose flour', u'allspice',
       u'allspice berries', u'almond milk', u'almonds', u'alphabet pasta',
       u'american cheese',
       ...
       u'worcestershire sauce', u'yeast', u'yellow hominy', u'yellow miso',
       u'yellow onion', u'yellow peppers', u'yellow squash', u'yoghurt',
       u'yukon gold potatoes', u'zucchini'],
      dtype='object', length=794)
Out[94]:
id course a1 steak sauce all-purpose flour allspice allspice berries almond milk almonds alphabet pasta american cheese ... worcestershire sauce yeast yellow hominy yellow miso yellow onion yellow peppers yellow squash yoghurt yukon gold potatoes zucchini
0 Healthy-Carrot-_-Ginger-Soup-Maker-Soup-1521247 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Italian-Orzo-Spinach-Soup-682577 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Creamy-Cauliflower-Soup-1625274 Breakfast and Brunch 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 794 columns


In [95]:
print idf3.shape
idf3 = idf3.drop('Unnamed: 0', 1)
print idf3.columns
idf3.head(3)


(500, 778)
Index([u'id', u'course', u'acorn squash', u'ale', u'all purpose potatoes',
       u'all purpose seasoning', u'all-purpose flour', u'allspice',
       u'almond butter', u'almond milk',
       ...
       u'xanthan gum', u'yams', u'yeast', u'yellow mustard seeds',
       u'yellow onion', u'yellow split peas', u'yellow squash', u'yoghurt',
       u'yukon gold potatoes', u'zucchini'],
      dtype='object', length=777)
Out[95]:
id course acorn squash ale all purpose potatoes all purpose seasoning all-purpose flour allspice almond butter almond milk ... xanthan gum yams yeast yellow mustard seeds yellow onion yellow split peas yellow squash yoghurt yukon gold potatoes zucchini
0 Best-tomato-soup-345204 Breakfast and Brunch 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Thai-Green-Curry-Spinach-Soup-763031 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Spinach-tofu-soup-348926 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 777 columns


In [96]:
#concatenate the ingredients tables. 
SP_ing= pd.concat([idf, idf1, idf2, idf3])
#create a new dataframe with selected columns
SP_ing_reduced = SP_ing[['id', 'ingredient_list']]

In [97]:
print SP_ing.shape
SP_ing.head(3)


(2000, 1515)
Out[97]:
a1 steak sauce acorn squash agave nectar aged cheddar cheese ale all purpose potatoes all purpose seasoning all-purpose flour allspice allspice berries ... yellow mustard seeds yellow onion yellow peppers yellow split peas yellow squash yellow tomato yoghurt yukon gold potatoes zest zucchini
0 NaN 0.0 0.0 0.0 NaN 0.0 NaN 0.0 0.0 0.0 ... NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0
1 NaN 0.0 0.0 0.0 NaN 0.0 NaN 0.0 0.0 0.0 ... NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0
2 NaN 0.0 0.0 0.0 NaN 0.0 NaN 0.0 0.0 0.0 ... NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0

3 rows × 1515 columns


In [98]:
#drop unnamed column & make id first column
#SP_ing = SP_ing.drop('Unnamed: 0', 1)
cols = list(SP_ing)
cols.insert(0, cols.pop(cols.index('id')))
SP_ing = SP_ing.ix[:, cols]

In [99]:
print SP_ing.shape
SP_ing = SP_ing.sort_values(by = 'id')
SP_ing.head(3)


(2000, 1515)
Out[99]:
id a1 steak sauce acorn squash agave nectar aged cheddar cheese ale all purpose potatoes all purpose seasoning all-purpose flour allspice ... yellow mustard seeds yellow onion yellow peppers yellow split peas yellow squash yellow tomato yoghurt yukon gold potatoes zest zucchini
203 0_-of-the-calories_-Beet-soup-1684369 NaN 0.0 0.0 0.0 NaN 0.0 NaN 0.0 0.0 ... NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0
69 10-Minute-Creamy-Cauliflower-Soup-988293 NaN 0.0 0.0 NaN 0.0 NaN NaN 0.0 0.0 ... NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0
136 10-Minute-Paleo-Tomato-Soup-1006193 NaN 0.0 0.0 NaN 0.0 NaN NaN 0.0 0.0 ... NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0

3 rows × 1515 columns


In [100]:
len([i for i, j in zip(SP_main.id, SP_ing.id) if i != j])


Out[100]:
1023

In [101]:
for i in SP_ing.duplicated('id'):
    if i == True:
        print i
SP_ing = SP_ing.drop_duplicates('id')
SP_ing.shape


True
True
Out[101]:
(1998, 1515)

Join all tables for SOUP


In [102]:
# set index to column 'id'
_df = [SP_main, SP_main_reduced, SP_cuisines, SP_flavors, SP_details, SP_ing, SP_ing_reduced]

for df in _df:
     df.set_index('id', inplace = True)

In [103]:
# join dataframes
SP_data = SP_main.join([SP_cuisines, SP_flavors, SP_details, SP_ing])
SP_data_reduced = SP_main_reduced.join([SP_flavors, SP_details, SP_ing_reduced])
# create a course column
SP_data['course'] = 'soup'
SP_data_reduced['course'] = 'soup'

In [104]:
print SP_data.shape
SP_data.head(3)


(1998, 1554)
Out[104]:
rating recipeName sourceDisplayName American Asian Barbecue Cajun & Creole Chinese Cuban English ... yellow mustard seeds yellow onion yellow peppers yellow split peas yellow squash yellow tomato yoghurt yukon gold potatoes zest zucchini
id
0_-of-the-calories_-Beet-soup-1684369 4 0% of the calories! Beet soup Good food. Best Recipes 0 0 0 0 0 0 0 ... NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0
10-Minute-Creamy-Cauliflower-Soup-988293 4 10 Minute Creamy Cauliflower Soup Super Healthy Kids 0 0 0 0 0 0 0 ... NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0
10-Minute-Paleo-Tomato-Soup-1006193 4 10-Minute Paleo Tomato Soup Cook Eat Paleo 0 0 0 0 0 0 0 ... NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0

3 rows × 1554 columns


In [105]:
# save to csv
SP_data.to_csv('SP_data.csv')
SP_data_reduced.to_csv('SP_data_reduced.csv')