In [1]:
import pandas as pd
import numpy as np

In [59]:
df = pd.read_csv('DS_main.csv')
df1 = pd.read_csv('DS_main_1.csv')
df2 = pd.read_csv('DS_main_2.csv')
df3 = pd.read_csv('DS_main_3.csv')

In [60]:
print df.shape
df.head(3)


(500, 5)
Out[60]:
Unnamed: 0 id rating recipeName sourceDisplayName
0 0 Easy_-No-Bake_-Mud-Pie-1708171 4 Easy, No Bake, Mud Pie Ann's Entitled Life
1 1 Tita_s-Fruit-Dessert-_SundaySupper-1706994 4 Tita's Fruit Dessert #SundaySupper Basic N Delicious
2 2 No-Bake-Oreo-Cheesecake-1710284 3 No Bake Oreo Cheesecake Cook and Share

In [61]:
df = df.drop('Unnamed: 0', 1)

In [62]:
print df1.shape
df1 = df1.drop('Unnamed: 0', 1)
df1.head(3)


(500, 5)
Out[62]:
id rating recipeName sourceDisplayName
0 Lemonade-Popsicles-1682120 3 Lemonade Popsicles Beachbody
1 Best-Peanut-Butter-Buttercream-Frosting-1682977 4 Best Peanut Butter Buttercream Frosting Two Sisters Crafting
2 Omega-Chocolate-Bars-1695745 4 Omega Chocolate Bars A Healthy Life For Me

In [63]:
print df2.shape
df2 = df2.drop('Unnamed: 0', 1)
df2.head(3)


(500, 5)
Out[63]:
id rating recipeName sourceDisplayName
0 Raw-Brownies-With-Frosting-1707657 4 Raw Brownies With Frosting Planticize
1 Easy-Cinnamon-Roll-Cake-1552842 4 Easy Cinnamon Roll Cake Cookies & Cups
2 Coconut-Ice-Cream-1711710 4 Coconut Ice Cream Get Inspired Everyday!

In [64]:
print df3.shape
df3 = df3.drop('Unnamed: 0', 1)
df3.head(3)


(500, 5)
Out[64]:
id rating recipeName sourceDisplayName
0 Snickers-Chocolate-Cake-Mix-Bars-1710321 4 Snickers Chocolate Cake Mix Bars Kleinworth Co.
1 Mango-Coconut-Popsicles-1699009 4 Mango Coconut Popsicles Eat Good 4 Life
2 Man-Bars-1703874 4 Man Bars South your mouth

In [65]:
#concatenate the main tables. 
DS_main= pd.concat([df, df1, df2, df3])
#create a new dataframe with selected columns
DS_main_reduced = DS_main.drop(['recipeName', 'sourceDisplayName'], axis = 1)

In [66]:
#peek at dataframe
print DS_main.shape
DS_main.head(3)


(2000, 4)
Out[66]:
id rating recipeName sourceDisplayName
0 Easy_-No-Bake_-Mud-Pie-1708171 4 Easy, No Bake, Mud Pie Ann's Entitled Life
1 Tita_s-Fruit-Dessert-_SundaySupper-1706994 4 Tita's Fruit Dessert #SundaySupper Basic N Delicious
2 No-Bake-Oreo-Cheesecake-1710284 3 No Bake Oreo Cheesecake Cook and Share

In [67]:
for i in DS_main.duplicated('id'):
    if i == True:
        print i
DS_main = DS_main.drop_duplicates('id')
DS_main.shape


True
Out[67]:
(1999, 4)
  • Flavors

In [68]:
fdf = pd.read_csv('DS_flavors.csv')
fdf1 = pd.read_csv('DS_flavors_1.csv')
fdf2 = pd.read_csv('DS_flavors_2.csv')
fdf3 = pd.read_csv('DS_flavors_3.csv')

In [69]:
print fdf.shape
fdf = fdf.drop('Unnamed: 0', 1)
fdf = fdf.rename(columns = {'index':'id'})
fdf.head(3)


(500, 8)
Out[69]:
id bitter meaty piquant salty sour sweet
0 -No-Bake-Caramel-Cheesecake-Shooters-1706735 NaN NaN NaN NaN NaN NaN
1 1-2-3-4-Cupcakes-760182 0.166667 0.333333 0.0 0.333333 0.166667 0.833333
2 1-Minute-Chocolate-Frosting-1690971 NaN NaN NaN NaN NaN NaN

In [70]:
print fdf1.shape
fdf1 = fdf1.drop('Unnamed: 0', 1)
fdf1 = fdf1.rename(columns = {'index':'id'})
fdf1.head(3)


(500, 8)
Out[70]:
id bitter meaty piquant salty sour sweet
0 10-Layer-Chocolate-Caramel-Mousse-Cake-1697890 0.166667 0.166667 0.0 0.5 0.166667 0.833333
1 2-Bite-Cherry-Pies-1709499 NaN NaN NaN NaN NaN NaN
2 3-Ingredient-Brownies-1703671 0.166667 0.166667 0.0 0.0 0.166667 0.833333

In [71]:
print fdf2.shape
fdf2 = fdf2.drop('Unnamed: 0', 1)
fdf2 = fdf2.rename(columns = {'index':'id'})
fdf2.head(3)


(500, 8)
Out[71]:
id bitter meaty piquant salty sour sweet
0 1-Minute-Sugar-Free-Peanut-Butter-Mug-Cake-169... 0.833333 0.833333 0.0 0.833333 0.166667 0.333333
1 10-Minute-Strawberry-Jam-1702695 0.000000 0.166667 0.0 0.000000 0.833333 0.666667
2 1_2-Syn-Strawberry-Mousse-1703461 NaN NaN NaN NaN NaN NaN

In [72]:
print fdf3.shape
fdf3 = fdf3.drop('Unnamed: 0', 1)
fdf3 = fdf3.rename(columns = {'index':'id'})
fdf3.head(3)


(500, 8)
Out[72]:
id bitter meaty piquant salty sour sweet
0 100-Calorie-Chocolate-Peanut-Butter-Squares-15... NaN NaN NaN NaN NaN NaN
1 15-Minute-Strawberry-Jam-1683439 0.0 0.166667 0.0 0.0 0.833333 0.666667
2 1_-Classic-Chocolate-Cake-1708247 NaN NaN NaN NaN NaN NaN

In [73]:
#concatenate the flavor tables. 
DS_flavors= pd.concat([fdf, fdf1, fdf2, fdf3])

In [74]:
#peek at dataframe
print DS_flavors.shape
DS_flavors.head(3)


(2000, 7)
Out[74]:
id bitter meaty piquant salty sour sweet
0 -No-Bake-Caramel-Cheesecake-Shooters-1706735 NaN NaN NaN NaN NaN NaN
1 1-2-3-4-Cupcakes-760182 0.166667 0.333333 0.0 0.333333 0.166667 0.833333
2 1-Minute-Chocolate-Frosting-1690971 NaN NaN NaN NaN NaN NaN

In [75]:
for i in DS_flavors.duplicated('id'):
    if i == True:
        print i
DS_flavors = DS_flavors.drop_duplicates('id')
DS_flavors.shape


True
Out[75]:
(1999, 7)
  • Cuisine

In [76]:
cdf = pd.read_csv('DS_cuisines.csv')
cdf1 = pd.read_csv('DS_cuisines_1.csv')
cdf2 = pd.read_csv('DS_cuisines_2.csv')
cdf3 = pd.read_csv('DS_cuisines_3.csv')

In [77]:
print cdf.shape
cdf = cdf.drop('Unnamed: 0', 1)
cdf = cdf.rename(columns = {'index':'id'})
print cdf.columns
cdf.head(3)


(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')
Out[77]:
id American Asian Barbecue Cajun & Creole Chinese Cuban English French German ... Kid-Friendly Mediterranean Mexican Moroccan Portuguese Southern & Soul Food Southwestern Spanish Swedish Thai
0 -No-Bake-Caramel-Cheesecake-Shooters-1706735 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 1-2-3-4-Cupcakes-760182 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1-Minute-Chocolate-Frosting-1690971 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 27 columns


In [78]:
print cdf1.shape
cdf1 = cdf1.drop('Unnamed: 0', 1)
cdf1 = cdf1.rename(columns = {'index':'id'})
print cdf1.columns
cdf1.head(3)


(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')
Out[78]:
id American Asian Barbecue Cajun & Creole Chinese Cuban English French German ... Kid-Friendly Mediterranean Mexican Moroccan Portuguese Southern & Soul Food Southwestern Spanish Swedish Thai
0 10-Layer-Chocolate-Caramel-Mousse-Cake-1697890 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 2-Bite-Cherry-Pies-1709499 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
2 3-Ingredient-Brownies-1703671 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 27 columns


In [79]:
print cdf2.shape
cdf2 = cdf2.drop('Unnamed: 0', 1)
cdf2 = cdf2.rename(columns = {'index':'id'})
print cdf2.columns
cdf2.head(3)


(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')
Out[79]:
id American Asian Barbecue Cajun & Creole Chinese Cuban English French German ... Kid-Friendly Mediterranean Mexican Moroccan Portuguese Southern & Soul Food Southwestern Spanish Swedish Thai
0 1-Minute-Sugar-Free-Peanut-Butter-Mug-Cake-169... 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 10-Minute-Strawberry-Jam-1702695 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1_2-Syn-Strawberry-Mousse-1703461 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 27 columns


In [80]:
print cdf3.shape
cdf3 = cdf3.drop('Unnamed: 0', 1)
cdf3 = cdf3.rename(columns = {'index':'id'})
print cdf3.columns
cdf3.head(3)


(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')
Out[80]:
id American Asian Barbecue Cajun & Creole Chinese Cuban English French German ... Kid-Friendly Mediterranean Mexican Moroccan Portuguese Southern & Soul Food Southwestern Spanish Swedish Thai
0 100-Calorie-Chocolate-Peanut-Butter-Squares-15... 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 15-Minute-Strawberry-Jam-1683439 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1_-Classic-Chocolate-Cake-1708247 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 27 columns


In [81]:
#concatenate the cuisines tables. 
DS_cuisines= pd.concat([cdf, cdf1, cdf2, cdf3])

In [82]:
#peek at dataframe
print DS_cuisines.shape
DS_cuisines.head(3)


(2000, 27)
Out[82]:
id American Asian Barbecue Cajun & Creole Chinese Cuban English French German ... Kid-Friendly Mediterranean Mexican Moroccan Portuguese Southern & Soul Food Southwestern Spanish Swedish Thai
0 -No-Bake-Caramel-Cheesecake-Shooters-1706735 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 1-2-3-4-Cupcakes-760182 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1-Minute-Chocolate-Frosting-1690971 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 27 columns


In [83]:
for i in DS_cuisines.duplicated('id'):
    if i == True:
        print i
DS_cuisines = DS_cuisines.drop_duplicates('id')
DS_cuisines.shape


True
Out[83]:
(1999, 27)
  • Details

In [84]:
ddf = pd.read_csv('DS_details.csv')
ddf1 = pd.read_csv('DS_details_1.csv')
ddf2 = pd.read_csv('DS_details_2.csv')
ddf3 = pd.read_csv('DS_details_3.csv')

In [85]:
print ddf.shape
ddf = ddf.drop('Unnamed: 0', 1)
print ddf.columns
ddf.head(3)


(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')
Out[85]:
id cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds
0 Easy_-No-Bake_-Mud-Pie-1708171 11700.0 5 6 900.0 12600
1 Tita_s-Fruit-Dessert-_SundaySupper-1706994 NaN 6 4 900.0 900
2 No-Bake-Oreo-Cheesecake-1710284 NaN 6 4 1800.0 1800

In [86]:
print ddf1.shape
ddf1 = ddf1.drop('Unnamed: 0', 1)
print ddf1.columns
ddf1.head(3)


(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')
Out[86]:
id cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds
0 Lemonade-Popsicles-1682120 NaN 3 4.0 600.0 15000
1 Best-Peanut-Butter-Buttercream-Frosting-1682977 NaN 5 4.0 NaN 1200
2 Omega-Chocolate-Bars-1695745 120.0 5 25.0 1080.0 1200

In [87]:
print ddf2.shape
ddf2 = ddf2.drop('Unnamed: 0', 1)
print ddf2.columns
ddf2.head(3)


(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')
Out[87]:
id cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds
0 Raw-Brownies-With-Frosting-1707657 NaN 10 4 900.0 900
1 Easy-Cinnamon-Roll-Cake-1552842 NaN 9 10 NaN 3000
2 Coconut-Ice-Cream-1711710 NaN 2 4 300.0 300

In [88]:
print ddf3.shape
ddf3 = ddf3.drop('Unnamed: 0', 1)
print ddf3.columns
ddf3.head(3)


(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')
Out[88]:
id cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds
0 Snickers-Chocolate-Cake-Mix-Bars-1710321 3300.0 4 4 480.0 3780
1 Mango-Coconut-Popsicles-1699009 NaN 5 10 NaN 300
2 Man-Bars-1703874 NaN 7 24 NaN 2400

In [89]:
#concatenate the detail tables. 
DS_details= pd.concat([ddf, ddf1, ddf2, ddf3])

In [90]:
#peek at dataframe
print DS_details.shape
DS_details.head(3)


(2000, 6)
Out[90]:
id cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds
0 Easy_-No-Bake_-Mud-Pie-1708171 11700.0 5 6.0 900.0 12600
1 Tita_s-Fruit-Dessert-_SundaySupper-1706994 NaN 6 4.0 900.0 900
2 No-Bake-Oreo-Cheesecake-1710284 NaN 6 4.0 1800.0 1800

In [91]:
for i in DS_details.duplicated('id'):
    if i == True:
        print i
DS_details = DS_details.drop_duplicates('id')
DS_details.shape


True
Out[91]:
(1999, 6)
  • Ingredients

In [92]:
idf = pd.read_csv('DS_ingredients.csv')
idf1 = pd.read_csv('DS_ingredients_1.csv')
idf2 = pd.read_csv('DS_ingredients_2.csv')
idf3 = pd.read_csv('DS_ingredients_3.csv')

In [93]:
print idf.shape
idf = idf.drop('Unnamed: 0', 1)
print idf.columns
idf.head(3)


(500, 603)
Index([u'id', u'course', u'active dry yeast', u'agave nectar', u'alcohol',
       u'all-purpose flour', u'allspice', u'almond butter', u'almond extract',
       u'almond flour',
       ...
       u'xylitol sweetener', u'yellow cake mix', u'yoghurt', u'yogurt',
       u'yolk', u'yoplait', u'yoplait greek 100 blackberry pie yogurt',
       u'yoplait greek key lime pie yogurt',
       u'yoplait greek lemon meringue yogurt', u'zucchini'],
      dtype='object', length=602)
Out[93]:
id course active dry yeast agave nectar alcohol all-purpose flour allspice almond butter almond extract almond flour ... xylitol sweetener yellow cake mix yoghurt yogurt yolk yoplait yoplait greek 100 blackberry pie yogurt yoplait greek key lime pie yogurt yoplait greek lemon meringue yogurt zucchini
0 Easy_-No-Bake_-Mud-Pie-1708171 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Tita_s-Fruit-Dessert-_SundaySupper-1706994 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 No-Bake-Oreo-Cheesecake-1710284 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 602 columns


In [94]:
print idf1.shape
idf1 = idf1.drop('Unnamed: 0', 1)
print idf1.columns
idf1.head(3)


(500, 609)
Index([u'id', u'course', u'agar', u'agave nectar', u'all-purpose flour',
       u'allspice', u'almond butter', u'almond extract', u'almond flour',
       u'almond meal',
       ...
       u'yellow cake mix', u'yellow cornmeal', u'yellow food coloring',
       u'yoghurt', u'yogurt', u'yolk', u'yoplait greek 100 apple pie yogurt',
       u'yoplait greek 100 mango yogurt',
       u'yoplait greek 100 raspberry yogurt', u'zest'],
      dtype='object', length=608)
Out[94]:
id course agar agave nectar all-purpose flour allspice almond butter almond extract almond flour almond meal ... yellow cake mix yellow cornmeal yellow food coloring yoghurt yogurt yolk yoplait greek 100 apple pie yogurt yoplait greek 100 mango yogurt yoplait greek 100 raspberry yogurt zest
0 Lemonade-Popsicles-1682120 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Best-Peanut-Butter-Buttercream-Frosting-1682977 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Omega-Chocolate-Bars-1695745 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 608 columns


In [95]:
print idf2.shape
idf2 = idf2.drop('Unnamed: 0', 1)
print idf2.columns
idf2.head(3)


(500, 629)
Index([u'id', u'course', u'9 inch chocolate crumb crust', u'agar',
       u'agave nectar', u'all-purpose flour', u'allspice', u'almond butter',
       u'almond extract', u'almond flour',
       ...
       u'whole wheat flour', u'whole wheat white flour', u'yellow cake mix',
       u'yellow food coloring', u'yoghurt', u'yogurt', u'yolk', u'yoplait',
       u'yoplait greek 100 pineapple yogurt', u'yoplait greek caramel yogurt'],
      dtype='object', length=628)
Out[95]:
id course 9 inch chocolate crumb crust agar agave nectar all-purpose flour allspice almond butter almond extract almond flour ... whole wheat flour whole wheat white flour yellow cake mix yellow food coloring yoghurt yogurt yolk yoplait yoplait greek 100 pineapple yogurt yoplait greek caramel yogurt
0 Raw-Brownies-With-Frosting-1707657 Breakfast and Brunch 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Easy-Cinnamon-Roll-Cake-1552842 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Coconut-Ice-Cream-1711710 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 628 columns


In [96]:
print idf3.shape
idf3 = idf3.drop('Unnamed: 0', 1)
print idf3.columns
idf3.head(3)


(500, 652)
Index([u'id', u'course', u'agave nectar', u'all-purpose flour', u'allspice',
       u'almond butter', u'almond extract', u'almond flour', u'almond meal',
       u'almond milk',
       ...
       u'whole wheat flour', u'whole wheat pastry flour',
       u'whole wheat white flour', u'xanthum gum', u'yellow cake mix',
       u'yellow food coloring', u'yoghurt', u'yolk', u'yoplait', u'zest'],
      dtype='object', length=651)
Out[96]:
id course agave nectar all-purpose flour allspice almond butter almond extract almond flour almond meal almond milk ... whole wheat flour whole wheat pastry flour whole wheat white flour xanthum gum yellow cake mix yellow food coloring yoghurt yolk yoplait zest
0 Snickers-Chocolate-Cake-Mix-Bars-1710321 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Mango-Coconut-Popsicles-1699009 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Man-Bars-1703874 Breakfast and Brunch 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 651 columns


In [97]:
#concatenate the ingredients tables. 
DS_ing= pd.concat([idf, idf1, idf2, idf3])
#create a new dataframe with selected columns
DS_ing_reduced = DS_ing[['id', 'ingredient_list']]

In [98]:
DS_ing.head(3)


Out[98]:
9 inch chocolate crumb crust active dry yeast agar agave nectar alcohol all-purpose flour allspice almond butter almond extract almond flour ... yoplait greek 100 apple pie yogurt yoplait greek 100 blackberry pie yogurt yoplait greek 100 mango yogurt yoplait greek 100 pineapple yogurt yoplait greek 100 raspberry yogurt yoplait greek caramel yogurt yoplait greek key lime pie yogurt yoplait greek lemon meringue yogurt zest zucchini
0 NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN 0.0 NaN NaN NaN NaN 0.0 0.0 NaN 0.0
1 NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN 0.0 NaN NaN NaN NaN 0.0 0.0 NaN 0.0
2 NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN 0.0 NaN NaN NaN NaN 0.0 0.0 NaN 0.0

3 rows × 1174 columns


In [99]:
#make id first column
cols = list(DS_ing)
cols.insert(0, cols.pop(cols.index('id')))
DS_ing = DS_ing.ix[:, cols]

In [100]:
DS_ing.head(3)


Out[100]:
id 9 inch chocolate crumb crust active dry yeast agar agave nectar alcohol all-purpose flour allspice almond butter almond extract ... yoplait greek 100 apple pie yogurt yoplait greek 100 blackberry pie yogurt yoplait greek 100 mango yogurt yoplait greek 100 pineapple yogurt yoplait greek 100 raspberry yogurt yoplait greek caramel yogurt yoplait greek key lime pie yogurt yoplait greek lemon meringue yogurt zest zucchini
0 Easy_-No-Bake_-Mud-Pie-1708171 NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN 0.0 NaN NaN NaN NaN 0.0 0.0 NaN 0.0
1 Tita_s-Fruit-Dessert-_SundaySupper-1706994 NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN 0.0 NaN NaN NaN NaN 0.0 0.0 NaN 0.0
2 No-Bake-Oreo-Cheesecake-1710284 NaN 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN 0.0 NaN NaN NaN NaN 0.0 0.0 NaN 0.0

3 rows × 1174 columns


In [101]:
for i in DS_ing.duplicated('id'):
    if i == True:
        print i
DS_ing = DS_ing.drop_duplicates('id')
DS_ing.shape


True
Out[101]:
(1999, 1174)

Join all tables for Dessert


In [102]:
# set index to column 'id'
_df = [DS_main, DS_main_reduced, DS_cuisines, DS_flavors, DS_details, DS_ing, DS_ing_reduced]

for df in _df:
    df.set_index('id', inplace = True)

In [105]:
# join dataframes
DS_data = DS_main.join([DS_cuisines, DS_flavors, DS_details, DS_ing])
DS_data_reduced = DS_main_reduced.join([DS_flavors, DS_details, DS_ing_reduced])

# create a course column
DS_data['course'] = 'dessert'
DS_data_reduced ['course'] = 'dessert'

In [106]:
DS_data.shape


Out[106]:
(1999, 1213)

In [107]:
# save into csv
DS_data.to_csv('DS_data.csv')
DS_data_reduced.to_csv('DS_data_reduced.csv')