In [1]:
import pandas as pd
import numpy as np

Salads

  • Main

In [2]:
# Read the four "main" recipe chunks, in the same order as before.
df, df1, df2, df3 = map(
    pd.read_csv,
    ('SLD_main.csv', 'SLD_main_1.csv', 'SLD_main_2.csv', 'SLD_main_3.csv'),
)

In [3]:
# print() with one argument gives the same output on Python 2 and 3;
# the bare `print x` statement is a SyntaxError on Python 3.
print(df.shape)
# First rows of the raw chunk (still carries the 'Unnamed: 0' column).
df.head(3)


(500, 5)
Out[3]:
Unnamed: 0 id rating recipeName sourceDisplayName
0 0 Greek-Pasta-Salad-1712241 4 Greek Pasta Salad Live Serendipity
1 1 Caprese-Salad-Recipe-With-Tomatoes_-Basil-And-... 4 Caprese Salad Recipe With Tomatoes, Basil And ... Melanie Cooks
2 2 Grandmas-Cucumber-Salad-1710308 4 Grandma’s Cucumber Salad Chin Deep

In [4]:
# Drop the stray 'Unnamed: 0' column. The axis is passed by keyword because the
# positional form drop(label, 1) was deprecated in pandas 1.3 and removed in 2.0.
df = df.drop('Unnamed: 0', axis=1)

In [5]:
# Shape is reported before the stray column is removed.
print(df1.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
df1 = df1.drop('Unnamed: 0', axis=1)
df1.head(3)


(501, 5)
Out[5]:
id rating recipeName sourceDisplayName
0 Apple-Coleslaw-1680966 4 Apple Coleslaw FoodForYourGood.com
1 Watermelon-Salad-with-Mint-and-Feta-1668127 4 Watermelon Salad with Mint and Feta The Organic Kitchen
2 Sweet-Kale-Salad-1693185 4 Sweet Kale Salad ifoodreal

In [6]:
# Shape is reported before the stray column is removed.
print(df2.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
df2 = df2.drop('Unnamed: 0', axis=1)
df2.head(3)


(500, 5)
Out[6]:
id rating recipeName sourceDisplayName
0 Lemony-Coleslaw-with-Apples-1700377 3 Lemony Coleslaw with Apples Weight Watchers
1 Thai-Curry-Pasta-Salad-_Vegan_-Gluten-Free_-17... 3 Thai Curry Pasta Salad [Vegan, Gluten-Free] One Green Planet
2 Paleo-Ranch-Dressing-1620911 4 Paleo Ranch Dressing Follow the Ruels

In [7]:
# Shape is reported before the stray column is removed.
print(df3.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
df3 = df3.drop('Unnamed: 0', axis=1)
df3.head(3)


(500, 5)
Out[7]:
id rating recipeName sourceDisplayName
0 Cucumber-Chickpea-Salad-with-Balsamic-Tahini-D... 4 Cucumber Chickpea Salad with Balsamic Tahini D... Veggie Inspired
1 Cucumber-Tomato-Salad-1102770 4 Cucumber Tomato Salad Barefeet In The Kitchen
2 Quinoa-Tabbouleh-Salad-The-Shiksa-Blog-48849 5 Quinoa Tabbouleh Salad Tori Avey

In [8]:
# Concatenate the main tables (row-wise; the per-chunk row index is kept as is).
SLD_main = pd.concat([df, df1, df2, df3])
# Create a reduced frame without the free-text columns.
# SLD_main still contains a repeated 'id' at this point (it is only
# de-duplicated in a later cell), so the reduced copy is de-duplicated here;
# otherwise it would carry the duplicate row into the final join.
SLD_main_reduced = (
    SLD_main
    .drop(['recipeName', 'sourceDisplayName'], axis=1)
    .drop_duplicates('id')
)

In [9]:
# Peek at the combined main table.
# print() with one argument behaves identically on Python 2 and 3.
print(SLD_main.shape)
SLD_main.head(3)


(2001, 4)
Out[9]:
id rating recipeName sourceDisplayName
0 Greek-Pasta-Salad-1712241 4 Greek Pasta Salad Live Serendipity
1 Caprese-Salad-Recipe-With-Tomatoes_-Basil-And-... 4 Caprese Salad Recipe With Tomatoes, Basil And ... Melanie Cooks
2 Grandmas-Cucumber-Salad-1710308 4 Grandma’s Cucumber Salad Chin Deep

In [10]:
# Print True once for every row whose 'id' repeats an earlier row's.
# (Truthiness test instead of `== True`; print() works on Python 2 and 3.)
for is_dup in SLD_main.duplicated('id'):
    if is_dup:
        print(is_dup)
# Keep only the first occurrence of each id.
SLD_main = SLD_main.drop_duplicates('id')
SLD_main.shape


True
Out[10]:
(2000, 4)
  • Flavors

In [11]:
# Read the four flavor chunks, in the same order as before.
fdf, fdf1, fdf2, fdf3 = map(
    pd.read_csv,
    ('SLD_flavors.csv', 'SLD_flavors_1.csv', 'SLD_flavors_2.csv', 'SLD_flavors_3.csv'),
)

In [12]:
# Shape is reported before the stray column is removed.
print(fdf.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
fdf = fdf.drop('Unnamed: 0', axis=1)
# The recipe key is stored as 'index' in this file; rename it to 'id'.
fdf = fdf.rename(columns={'index': 'id'})
fdf.head(3)


(500, 8)
Out[12]:
id bitter meaty piquant salty sour sweet
0 10-Minute-Thai-Shrimp_-Cucumber-_-Avocado-Sala... 0.500000 0.166667 0.166667 0.833333 0.166667 0.166667
1 3-Ingredient-Cucumber-Dill-Ribbon-Salad-1624433 0.166667 0.166667 0.000000 0.166667 0.166667 0.166667
2 5-Ingredient-Corn-Salad-1695247 NaN NaN NaN NaN NaN NaN

In [13]:
# Shape is reported before the stray column is removed.
print(fdf1.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
fdf1 = fdf1.drop('Unnamed: 0', axis=1)
# The recipe key is stored as 'index' in this file; rename it to 'id'.
fdf1 = fdf1.rename(columns={'index': 'id'})
fdf1.head(3)


(501, 8)
Out[13]:
id bitter meaty piquant salty sour sweet
0 1-Minute-Lentil-Salad-1639395 NaN NaN NaN NaN NaN NaN
1 10-Minute-Tomato-and-Walnut-Salad-1681561 NaN NaN NaN NaN NaN NaN
2 4-Ingredient-Chickpea-Salad-1708777 NaN NaN NaN NaN NaN NaN

In [14]:
# Shape is reported before the stray column is removed.
print(fdf2.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
fdf2 = fdf2.drop('Unnamed: 0', axis=1)
# The recipe key is stored as 'index' in this file; rename it to 'id'.
fdf2 = fdf2.rename(columns={'index': 'id'})
fdf2.head(3)


(500, 8)
Out[14]:
id bitter meaty piquant salty sour sweet
0 3-Healthy-Salad-Dressings_-1707545 0.166667 0.833333 0.166667 0.500000 0.500000 0.166667
1 4-Ingredient-Frito-_Salad_-1641651 NaN NaN NaN NaN NaN NaN
2 4-Ingredient-Tangy-Potato-Salad-1641682 0.166667 0.166667 0.166667 0.166667 0.833333 0.166667

In [15]:
# Shape is reported before the stray column is removed.
print(fdf3.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
fdf3 = fdf3.drop('Unnamed: 0', axis=1)
# The recipe key is stored as 'index' in this file; rename it to 'id'.
fdf3 = fdf3.rename(columns={'index': 'id'})
fdf3.head(3)


(500, 8)
Out[15]:
id bitter meaty piquant salty sour sweet
0 15-Minute-Greek-Cucumber-Salad-1119811 0.166667 0.166667 0.333333 0.500000 0.333333 0.166667
1 5-Minute-Apple-Cranberry-Salad-1336215 0.166667 0.166667 0.000000 0.166667 0.166667 0.500000
2 A-Sweet-And-Sour-Side-Salad-1627813 0.166667 0.166667 0.000000 0.000000 0.166667 0.166667

In [16]:
# Stack the four flavor chunks row-wise into a single table.
SLD_flavors = pd.concat(
    [fdf, fdf1, fdf2, fdf3],
    axis=0,
)

In [17]:
# Peek at the combined flavors table.
# print() with one argument behaves identically on Python 2 and 3.
print(SLD_flavors.shape)
SLD_flavors.head(3)


(2001, 7)
Out[17]:
id bitter meaty piquant salty sour sweet
0 10-Minute-Thai-Shrimp_-Cucumber-_-Avocado-Sala... 0.500000 0.166667 0.166667 0.833333 0.166667 0.166667
1 3-Ingredient-Cucumber-Dill-Ribbon-Salad-1624433 0.166667 0.166667 0.000000 0.166667 0.166667 0.166667
2 5-Ingredient-Corn-Salad-1695247 NaN NaN NaN NaN NaN NaN

In [18]:
# Print True once for every row whose 'id' repeats an earlier row's.
# (Truthiness test instead of `== True`; print() works on Python 2 and 3.)
for is_dup in SLD_flavors.duplicated('id'):
    if is_dup:
        print(is_dup)
# Keep only the first occurrence of each id.
SLD_flavors = SLD_flavors.drop_duplicates('id')
SLD_flavors.shape


True
Out[18]:
(2000, 7)
  • Cuisine

In [19]:
# Read the four cuisine chunks, in the same order as before.
cdf, cdf1, cdf2, cdf3 = map(
    pd.read_csv,
    ('SLD_cuisines.csv', 'SLD_cuisines_1.csv', 'SLD_cuisines_2.csv', 'SLD_cuisines_3.csv'),
)

In [20]:
# Shape is reported before the stray column is removed.
print(cdf.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
cdf = cdf.drop('Unnamed: 0', axis=1)
# The recipe key is stored as 'index' in this file; rename it to 'id'.
cdf = cdf.rename(columns={'index': 'id'})
# List the columns so the chunks can be compared by eye.
print(cdf.columns)
cdf.head(3)


(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')
Out[20]:
id American Asian Barbecue Cajun & Creole Chinese Cuban English French German ... Kid-Friendly Mediterranean Mexican Moroccan Portuguese Southern & Soul Food Southwestern Spanish Swedish Thai
0 10-Minute-Thai-Shrimp_-Cucumber-_-Avocado-Sala... 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 3-Ingredient-Cucumber-Dill-Ribbon-Salad-1624433 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 5-Ingredient-Corn-Salad-1695247 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 27 columns


In [21]:
# Shape is reported before the stray column is removed.
print(cdf1.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
cdf1 = cdf1.drop('Unnamed: 0', axis=1)
# The recipe key is stored as 'index' in this file; rename it to 'id'.
cdf1 = cdf1.rename(columns={'index': 'id'})
# List the columns so the chunks can be compared by eye.
print(cdf1.columns)
cdf1.head(3)


(501, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')
Out[21]:
id American Asian Barbecue Cajun & Creole Chinese Cuban English French German ... Kid-Friendly Mediterranean Mexican Moroccan Portuguese Southern & Soul Food Southwestern Spanish Swedish Thai
0 1-Minute-Lentil-Salad-1639395 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 10-Minute-Tomato-and-Walnut-Salad-1681561 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 4-Ingredient-Chickpea-Salad-1708777 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 27 columns


In [22]:
# Shape is reported before the stray column is removed.
print(cdf2.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
cdf2 = cdf2.drop('Unnamed: 0', axis=1)
# The recipe key is stored as 'index' in this file; rename it to 'id'.
cdf2 = cdf2.rename(columns={'index': 'id'})
# List the columns so the chunks can be compared by eye.
print(cdf2.columns)
cdf2.head(3)


(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')
Out[22]:
id American Asian Barbecue Cajun & Creole Chinese Cuban English French German ... Kid-Friendly Mediterranean Mexican Moroccan Portuguese Southern & Soul Food Southwestern Spanish Swedish Thai
0 3-Healthy-Salad-Dressings_-1707545 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 4-Ingredient-Frito-_Salad_-1641651 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 4-Ingredient-Tangy-Potato-Salad-1641682 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 27 columns


In [23]:
# Shape is reported before the stray column is removed.
print(cdf3.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
cdf3 = cdf3.drop('Unnamed: 0', axis=1)
# The recipe key is stored as 'index' in this file; rename it to 'id'.
cdf3 = cdf3.rename(columns={'index': 'id'})
# List the columns so the chunks can be compared by eye.
print(cdf3.columns)
cdf3.head(3)


(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')
Out[23]:
id American Asian Barbecue Cajun & Creole Chinese Cuban English French German ... Kid-Friendly Mediterranean Mexican Moroccan Portuguese Southern & Soul Food Southwestern Spanish Swedish Thai
0 15-Minute-Greek-Cucumber-Salad-1119811 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 5-Minute-Apple-Cranberry-Salad-1336215 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 A-Sweet-And-Sour-Side-Salad-1627813 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 27 columns


In [24]:
# Stack the four cuisine chunks row-wise into a single table.
SLD_cuisines = pd.concat(
    [cdf, cdf1, cdf2, cdf3],
    axis=0,
)

In [25]:
# Peek at the combined cuisines table.
# print() with one argument behaves identically on Python 2 and 3.
print(SLD_cuisines.shape)
SLD_cuisines.head(3)


(2001, 27)
Out[25]:
id American Asian Barbecue Cajun & Creole Chinese Cuban English French German ... Kid-Friendly Mediterranean Mexican Moroccan Portuguese Southern & Soul Food Southwestern Spanish Swedish Thai
0 10-Minute-Thai-Shrimp_-Cucumber-_-Avocado-Sala... 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 3-Ingredient-Cucumber-Dill-Ribbon-Salad-1624433 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 5-Ingredient-Corn-Salad-1695247 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 27 columns


In [26]:
# Print True once for every row whose 'id' repeats an earlier row's.
# (Truthiness test instead of `== True`; print() works on Python 2 and 3.)
for is_dup in SLD_cuisines.duplicated('id'):
    if is_dup:
        print(is_dup)
# Keep only the first occurrence of each id.
SLD_cuisines = SLD_cuisines.drop_duplicates('id')
SLD_cuisines.shape


True
Out[26]:
(2000, 27)
  • Details

In [27]:
# Read the four details chunks, in the same order as before.
ddf, ddf1, ddf2, ddf3 = map(
    pd.read_csv,
    ('SLD_details.csv', 'SLD_details_1.csv', 'SLD_details_2.csv', 'SLD_details_3.csv'),
)

In [28]:
# Shape is reported before the stray column is removed.
print(ddf.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
ddf = ddf.drop('Unnamed: 0', axis=1)
# List the columns so the chunks can be compared by eye.
print(ddf.columns)
ddf.head(3)


(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')
Out[28]:
id cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds
0 Greek-Pasta-Salad-1712241 NaN 7 4 NaN 2400
1 Caprese-Salad-Recipe-With-Tomatoes_-Basil-And-... NaN 7 4 NaN 900
2 Grandmas-Cucumber-Salad-1710308 NaN 9 8 NaN 1500

In [29]:
# Shape is reported before the stray column is removed.
print(ddf1.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
ddf1 = ddf1.drop('Unnamed: 0', axis=1)
# List the columns so the chunks can be compared by eye.
print(ddf1.columns)
ddf1.head(3)


(501, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')
Out[29]:
id cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds
0 Apple-Coleslaw-1680966 NaN 8 4 NaN 1200
1 Watermelon-Salad-with-Mint-and-Feta-1668127 NaN 5 4 600.0 600
2 Sweet-Kale-Salad-1693185 NaN 15 4 NaN 1200

In [30]:
# Shape is reported before the stray column is removed.
print(ddf2.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
ddf2 = ddf2.drop('Unnamed: 0', axis=1)
# List the columns so the chunks can be compared by eye.
print(ddf2.columns)
ddf2.head(3)


(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')
Out[30]:
id cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds
0 Lemony-Coleslaw-with-Apples-1700377 NaN 11 12 900.0 900
1 Thai-Curry-Pasta-Salad-_Vegan_-Gluten-Free_-17... NaN 21 1 NaN 1800
2 Paleo-Ranch-Dressing-1620911 NaN 12 4 NaN 600

In [31]:
# Shape is reported before the stray column is removed.
print(ddf3.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
ddf3 = ddf3.drop('Unnamed: 0', axis=1)
# List the columns so the chunks can be compared by eye.
print(ddf3.columns)
ddf3.head(3)


(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')
Out[31]:
id cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds
0 Cucumber-Chickpea-Salad-with-Balsamic-Tahini-D... NaN 12 4 NaN 1200.0
1 Cucumber-Tomato-Salad-1102770 NaN 8 7 NaN 900.0
2 Quinoa-Tabbouleh-Salad-The-Shiksa-Blog-48849 NaN 8 7 NaN 2400.0

In [32]:
# Stack the four details chunks row-wise into a single table.
SLD_details = pd.concat(
    [ddf, ddf1, ddf2, ddf3],
    axis=0,
)

In [33]:
# Peek at the combined details table.
# print() with one argument behaves identically on Python 2 and 3.
print(SLD_details.shape)
SLD_details.head(3)


(2001, 6)
Out[33]:
id cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds
0 Greek-Pasta-Salad-1712241 NaN 7 4 NaN 2400.0
1 Caprese-Salad-Recipe-With-Tomatoes_-Basil-And-... NaN 7 4 NaN 900.0
2 Grandmas-Cucumber-Salad-1710308 NaN 9 8 NaN 1500.0

In [34]:
# Print True once for every row whose 'id' repeats an earlier row's.
# (Truthiness test instead of `== True`; print() works on Python 2 and 3.)
for is_dup in SLD_details.duplicated('id'):
    if is_dup:
        print(is_dup)
# Keep only the first occurrence of each id.
SLD_details = SLD_details.drop_duplicates('id')
SLD_details.shape


True
Out[34]:
(2000, 6)
  • Ingredients

In [35]:
# Read the four ingredient chunks, in the same order as before.
idf, idf1, idf2, idf3 = map(
    pd.read_csv,
    ('SLD_ingredients.csv', 'SLD_ingredients_1.csv', 'SLD_ingredients_2.csv', 'SLD_ingredients_3.csv'),
)

In [36]:
# Shape is reported before the stray column is removed.
print(idf.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
idf = idf.drop('Unnamed: 0', axis=1)
# Each chunk has its own set of ingredient columns; list them for comparison.
print(idf.columns)
idf.head(3)


(500, 733)
Index([u'id', u'course', u'agave nectar', u'almond butter', u'almonds',
       u'anchovy fillets', u'anchovy paste', u'apple butter',
       u'apple cider vinegar', u'apples',
       ...
       u'whole kernel corn', u'whole milk', u'whole wheat cheese tortellini',
       u'wine vinegar', u'worcestershire sauce', u'yellow bell pepper',
       u'yellow mustard', u'yoghurt', u'zucchini', u'zucchini noodles'],
      dtype='object', length=732)
Out[36]:
id course agave nectar almond butter almonds anchovy fillets anchovy paste apple butter apple cider vinegar apples ... whole kernel corn whole milk whole wheat cheese tortellini wine vinegar worcestershire sauce yellow bell pepper yellow mustard yoghurt zucchini zucchini noodles
0 Greek-Pasta-Salad-1712241 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Caprese-Salad-Recipe-With-Tomatoes_-Basil-And-... Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Grandmas-Cucumber-Salad-1710308 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 732 columns


In [37]:
# Shape is reported before the stray column is removed.
print(idf1.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
idf1 = idf1.drop('Unnamed: 0', axis=1)
# Each chunk has its own set of ingredient columns; list them for comparison.
print(idf1.columns)
idf1.head(3)


(501, 752)
Index([u'id', u'course', u'agave nectar', u'aged balsamic vinegar',
       u'allspice', u'almond butter', u'almond oil', u'almonds',
       u'anchovy fillets', u'anchovy paste',
       ...
       u'yellow onion', u'yellow peppers', u'yellow squash', u'yellow tomato',
       u'yoghurt', u'yogurt', u'yukon gold', u'yukon gold potatoes', u'zest',
       u'zucchini'],
      dtype='object', length=751)
Out[37]:
id course agave nectar aged balsamic vinegar allspice almond butter almond oil almonds anchovy fillets anchovy paste ... yellow onion yellow peppers yellow squash yellow tomato yoghurt yogurt yukon gold yukon gold potatoes zest zucchini
0 Apple-Coleslaw-1680966 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Watermelon-Salad-with-Mint-and-Feta-1668127 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Sweet-Kale-Salad-1693185 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 751 columns


In [38]:
# Shape is reported before the stray column is removed.
print(idf2.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
idf2 = idf2.drop('Unnamed: 0', axis=1)
# Each chunk has its own set of ingredient columns; list them for comparison.
print(idf2.columns)
idf2.head(3)


(500, 766)
Index([u'id', u'course', u'agave nectar', u'alfalfa sprouts',
       u'all purpose potatoes', u'allspice', u'almond butter', u'almond milk',
       u'almonds', u'anchovy fillets',
       ...
       u'yellow mustard', u'yellow onion', u'yellow peppers',
       u'yellow summer squash', u'yoghurt', u'yukon gold potatoes', u'za'atar',
       u'zesty italian dressing', u'ziti', u'zucchini'],
      dtype='object', length=765)
Out[38]:
id course agave nectar alfalfa sprouts all purpose potatoes allspice almond butter almond milk almonds anchovy fillets ... yellow mustard yellow onion yellow peppers yellow summer squash yoghurt yukon gold potatoes za'atar zesty italian dressing ziti zucchini
0 Lemony-Coleslaw-with-Apples-1700377 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Thai-Curry-Pasta-Salad-_Vegan_-Gluten-Free_-17... Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Paleo-Ranch-Dressing-1620911 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 765 columns


In [39]:
# Shape is reported before the stray column is removed.
print(idf3.shape)
# Keyword axis: the positional form drop(label, 1) was removed in pandas 2.0.
idf3 = idf3.drop('Unnamed: 0', axis=1)
# Each chunk has its own set of ingredient columns; list them for comparison.
print(idf3.columns)
idf3.head(3)


(500, 761)
Index([u'id', u'course', u'adobo sauce', u'agave nectar',
       u'aged balsamic vinegar', u'alessi sea salt', u'almond butter',
       u'almonds', u'anchovies', u'anchovy paste',
       ...
       u'yellow mustard', u'yellow onion', u'yellow peppers', u'yellow squash',
       u'yellow summer squash', u'yoghurt', u'yukon gold potatoes', u'za'atar',
       u'zest', u'zucchini'],
      dtype='object', length=760)
Out[39]:
id course adobo sauce agave nectar aged balsamic vinegar alessi sea salt almond butter almonds anchovies anchovy paste ... yellow mustard yellow onion yellow peppers yellow squash yellow summer squash yoghurt yukon gold potatoes za'atar zest zucchini
0 Cucumber-Chickpea-Salad-with-Balsamic-Tahini-D... Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Cucumber-Tomato-Salad-1102770 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Quinoa-Tabbouleh-Salad-The-Shiksa-Blog-48849 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 760 columns


In [40]:
# Concatenate the ingredients tables. The chunks have different ingredient
# columns, so a column missing from a chunk shows up as NaN for that chunk's rows.
# NOTE(review): NaN here presumably means "ingredient not used" (i.e. 0) - confirm
# before modelling on these columns.
SLD_ing = pd.concat([idf, idf1, idf2, idf3])
# Create a new dataframe with selected columns.
# NOTE(review): 'ingredient_list' is not visible in the truncated column listings
# of the chunks - confirm it exists in the CSVs.
# SLD_ing is only de-duplicated in a later cell, so the reduced frame is
# de-duplicated here; otherwise it would keep the repeated 'id' and multiply rows
# in the final join. drop_duplicates also returns a new frame, so later edits to
# the reduced table do not act on a slice of SLD_ing.
SLD_ing_reduced = SLD_ing[['id', 'ingredient_list']].drop_duplicates('id')

In [41]:
# Show the first three rows of the combined ingredients table.
SLD_ing.head(n=3)


Out[41]:
adobo sauce agave nectar aged balsamic vinegar alessi sea salt alfalfa sprouts all purpose potatoes allspice almond butter almond milk almond oil ... yoghurt yogurt yukon gold yukon gold potatoes za'atar zest zesty italian dressing ziti zucchini zucchini noodles
0 NaN 0.0 NaN NaN NaN NaN NaN 0.0 NaN NaN ... 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 0.0
1 NaN 0.0 NaN NaN NaN NaN NaN 0.0 NaN NaN ... 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 0.0
2 NaN 0.0 NaN NaN NaN NaN NaN 0.0 NaN NaN ... 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 0.0

3 rows × 1397 columns


In [45]:
# Make 'id' the first column (nothing is dropped here; the column set is unchanged).
cols = list(SLD_ing)
cols.insert(0, cols.pop(cols.index('id')))
# .loc replaces .ix, which was deprecated in pandas 0.20 and removed in 1.0;
# with a list of column labels the two select the same columns.
SLD_ing = SLD_ing.loc[:, cols]

In [46]:
# Check that 'id' now leads the ingredient columns.
SLD_ing.head(n=3)


Out[46]:
id adobo sauce agave nectar aged balsamic vinegar alessi sea salt alfalfa sprouts all purpose potatoes allspice almond butter almond milk ... yoghurt yogurt yukon gold yukon gold potatoes za'atar zest zesty italian dressing ziti zucchini zucchini noodles
0 Greek-Pasta-Salad-1712241 NaN 0.0 NaN NaN NaN NaN NaN 0.0 NaN ... 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 0.0
1 Caprese-Salad-Recipe-With-Tomatoes_-Basil-And-... NaN 0.0 NaN NaN NaN NaN NaN 0.0 NaN ... 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 0.0
2 Grandmas-Cucumber-Salad-1710308 NaN 0.0 NaN NaN NaN NaN NaN 0.0 NaN ... 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 0.0

3 rows × 1397 columns


In [47]:
# Print True once for every row whose 'id' repeats an earlier row's.
# (Truthiness test instead of `== True`; print() works on Python 2 and 3.)
for is_dup in SLD_ing.duplicated('id'):
    if is_dup:
        print(is_dup)
# Keep only the first occurrence of each id.
SLD_ing = SLD_ing.drop_duplicates('id')
SLD_ing.shape


Out[47]:
(2000, 1397)

Join all tables for Salads


In [49]:
# Set the index of every table to its 'id' column so the joins align on it.
# Each name is rebound to a new, indexed frame instead of looping with
# `for df in ...: df.set_index(..., inplace=True)`: that loop overwrote the `df`
# frame loaded at the top of the notebook and mutated frames in place, including
# ones that may be slices of another frame.
_df = [
    frame.set_index('id')
    for frame in (SLD_main, SLD_main_reduced, SLD_cuisines, SLD_flavors,
                  SLD_details, SLD_ing, SLD_ing_reduced)
]
(SLD_main, SLD_main_reduced, SLD_cuisines, SLD_flavors,
 SLD_details, SLD_ing, SLD_ing_reduced) = _df

In [50]:
# Full table: left-join every per-recipe table onto the main table by index.
SLD_data = SLD_main.join(
    [SLD_cuisines, SLD_flavors, SLD_details, SLD_ing],
    how='left',
)
# Label the course; this assigns 'salad' to every row of the 'course' column.
SLD_data['course'] = 'salad'

# Reduced table: same join without cuisines, using the reduced main/ingredient frames.
SLD_data_reduced = SLD_main_reduced.join(
    [SLD_flavors, SLD_details, SLD_ing_reduced],
    how='left',
)
SLD_data_reduced['course'] = 'salad'

In [51]:
# Show the first three rows of the joined salad table.
SLD_data.head(n=3)


Out[51]:
rating recipeName sourceDisplayName American Asian Barbecue Cajun & Creole Chinese Cuban English ... yoghurt yogurt yukon gold yukon gold potatoes za'atar zest zesty italian dressing ziti zucchini zucchini noodles
id
Greek-Pasta-Salad-1712241 4 Greek Pasta Salad Live Serendipity 0 0 0 0 0 0 0 ... 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 0.0
Caprese-Salad-Recipe-With-Tomatoes_-Basil-And-Mozarella-1710947 4 Caprese Salad Recipe With Tomatoes, Basil And ... Melanie Cooks 0 0 0 0 0 0 0 ... 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 0.0
Grandmas-Cucumber-Salad-1710308 4 Grandma’s Cucumber Salad Chin Deep 0 0 0 0 0 0 0 ... 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 0.0

3 rows × 1436 columns


In [52]:
# Write both joined tables to CSV; the 'id' index is written as the first column.
SLD_data.to_csv(path_or_buf='SLD_data.csv')
SLD_data_reduced.to_csv(path_or_buf='SLD_data_reduced.csv')