In [3]:
import pandas as pd
import numpy as np

Breakfast & Brunch

  • Main

In [211]:
# Load the four scraped Breakfast & Brunch "main" batches (500 rows each).
df, df1, df2, df3 = (
    pd.read_csv(name)
    for name in ('BB_main.csv', 'BB_main_1.csv', 'BB_main_2.csv', 'BB_main_3.csv')
)

In [212]:
# Sanity check: batch shape, plus uniqueness of recipe ids.
# Use the print() function so the cell also runs under Python 3.
print(df.shape)
# NOTE(review): this cell prints df's shape but checks df2's ids —
# possibly a copy/paste slip; confirm which frame was meant.
df2.id.duplicated().sum()


(500, 5)
Out[212]:
0

In [213]:
df = df.drop('Unnamed: 0', 1)

In [214]:
# Batch sanity check; strip the CSV's saved index column.
print(df1.shape)
df1 = df1.drop('Unnamed: 0', axis=1)
# NOTE(review): this checks df, not df1 — probably intended df1.id here;
# confirm against the other batch-check cells.
df.id.duplicated().sum()


(500, 5)
Out[214]:
0

In [215]:
# Batch sanity check; strip the CSV's saved index column and peek at rows.
print(df2.shape)
df2 = df2.drop('Unnamed: 0', axis=1)
df2.head(3)


(500, 5)
Out[215]:
id rating recipeName sourceDisplayName
0 Gluten-Free-Waffles-1629571 4 Gluten-Free Waffles Dinner Was Delish
1 Peanut-Butter-Pancakes-1342291 4 Peanut Butter Pancakes Just So Tasty
2 Kahlua-and-Cream-French-Toast-1597605 4 Kahlua and Cream French Toast Dizzy Busy and Hungry

In [216]:
# Batch sanity check; strip the CSV's saved index column and peek at rows.
print(df3.shape)
df3 = df3.drop('Unnamed: 0', axis=1)
df3.head(3)


(500, 5)
Out[216]:
id rating recipeName sourceDisplayName
0 Chocolate-Breakfast-Oatmeal-1237224 4 Chocolate Breakfast Oatmeal Family Food on the Table
1 Fluffy-French-Toast-785994 5 Fluffy French Toast AllRecipes
2 Overnight-Cranberry-Orange-French-Toast-Casser... 4 Overnight Cranberry Orange French Toast Casserole Kristine's Kitchen

In [217]:
# Stack the four main batches into one table, renumbering the row index.
main_batches = [df, df1, df2, df3]
BB_main = pd.concat(main_batches, join='outer', axis=0, ignore_index=True)

In [218]:
# Peek at the combined main table (print() keeps the cell Python 3 safe).
print(BB_main.shape)
BB_main.head(3)


(2000, 4)
Out[218]:
id rating recipeName sourceDisplayName
0 Clean-Eating-Banana-Pancakes-1708094 3 Clean Eating Banana Pancakes The Happier Homemaker
1 Norwegian-Pancakes-1707570 4 Norwegian Pancakes Enjoy cooking
2 Egg-Bake-Breakfast-Foil-Packet-1708169 4 Egg Bake Breakfast Foil Packet Julie's Eats & Treats

In [219]:
# Count duplicate recipe ids introduced by concatenating batches
# (vectorized sum replaces the per-row loop), then keep first occurrences.
print(BB_main.duplicated('id').sum())
BB_main = BB_main.drop_duplicates('id')
BB_main.shape


True
Out[219]:
(1999, 4)
  • Flavors

In [220]:
# Load the four scraped flavor batches (500 rows each).
fdf, fdf1, fdf2, fdf3 = (
    pd.read_csv(name)
    for name in ('BB_flavors.csv', 'BB_flavors_1.csv',
                 'BB_flavors_2.csv', 'BB_flavors_3.csv')
)

In [221]:
# Batch sanity check; strip saved index column and normalise the id column name.
print(fdf.shape)
fdf = fdf.drop('Unnamed: 0', axis=1)
fdf = fdf.rename(columns={'index': 'id'})
# NOTE(review): this checks fdf1 (still uncleaned at this point), not fdf —
# possibly intended fdf.id; confirm.
fdf1.id.duplicated().sum()


(500, 8)
Out[221]:
0

In [222]:
# Batch sanity check; strip the CSV's saved index column and peek at rows.
print(fdf1.shape)
fdf1 = fdf1.drop('Unnamed: 0', axis=1)
fdf1.head(3)


(500, 8)
Out[222]:
id bitter meaty piquant salty sour sweet
0 1-2-3-Gluten-Free-Pancakes-1533270 0.333333 0.500000 0.0 0.166667 0.166667 0.5
1 2-INGREDIENT-PANCAKES-1683615 NaN NaN NaN NaN NaN NaN
2 2-Ingredient-Pancakes-1264118 0.166667 0.166667 0.0 0.166667 0.333333 0.5

In [223]:
# Batch sanity check; strip saved index column, normalise id name, check dupes.
print(fdf2.shape)
fdf2 = fdf2.drop('Unnamed: 0', axis=1)
fdf2 = fdf2.rename(columns={'index': 'id'})
fdf2.id.duplicated().sum()


(500, 8)
Out[223]:
0

In [224]:
# Batch sanity check; strip saved index column, normalise id name, check dupes.
print(fdf3.shape)
fdf3 = fdf3.drop('Unnamed: 0', axis=1)
fdf3 = fdf3.rename(columns={'index': 'id'})
fdf3.id.duplicated().sum()


(500, 8)
Out[224]:
0

In [225]:
# Stack the four flavor batches into one table, renumbering the row index.
flavor_batches = [fdf, fdf1, fdf2, fdf3]
BB_flavors = pd.concat(flavor_batches, join='outer', axis=0, ignore_index=True)

In [226]:
# Peek at the combined flavors table.
print(BB_flavors.shape)
BB_flavors.head(2)


(2000, 7)
Out[226]:
id bitter meaty piquant salty sour sweet
0 2--Minute-Low-Carb-English-Muffin-1697908 NaN NaN NaN NaN NaN NaN
1 2-Ingredient-Pancakes-1686555 0.166667 0.166667 0.0 0.166667 0.333333 0.5

In [227]:
# Count duplicate recipe ids (vectorized), then keep first occurrences.
print(BB_flavors.duplicated('id').sum())
BB_flavors = BB_flavors.drop_duplicates('id')
BB_flavors.shape


True
Out[227]:
(1999, 7)
  • Cuisine

In [228]:
# Load the four scraped cuisine batches (500 rows each).
cdf, cdf1, cdf2, cdf3 = (
    pd.read_csv(name)
    for name in ('BB_cuisines.csv', 'BB_cuisines_1.csv',
                 'BB_cuisines_2.csv', 'BB_cuisines_3.csv')
)

In [229]:
# Batch sanity check; strip saved index column, normalise id name,
# confirm the expected cuisine columns and id uniqueness.
print(cdf.shape)
cdf = cdf.drop('Unnamed: 0', axis=1)
cdf = cdf.rename(columns={'index': 'id'})
print(cdf.columns)
cdf.id.duplicated().sum()


(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')
Out[229]:
0

In [230]:
# Batch sanity check; strip saved index column, confirm columns and ids.
print(cdf1.shape)
cdf1 = cdf1.drop('Unnamed: 0', axis=1)
print(cdf1.columns)
cdf1.id.duplicated().sum()


(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')
Out[230]:
0

In [231]:
# Batch sanity check; strip saved index column, normalise id name,
# confirm columns and id uniqueness.
print(cdf2.shape)
cdf2 = cdf2.drop('Unnamed: 0', axis=1)
cdf2 = cdf2.rename(columns={'index': 'id'})
print(cdf2.columns)
cdf2.id.duplicated().sum()


(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')
Out[231]:
0

In [232]:
# Batch sanity check; strip saved index column, normalise id name,
# confirm columns and id uniqueness.
print(cdf3.shape)
cdf3 = cdf3.drop('Unnamed: 0', axis=1)
cdf3 = cdf3.rename(columns={'index': 'id'})
print(cdf3.columns)
cdf3.id.duplicated().sum()


(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')
Out[232]:
0

In [233]:
# Stack the four cuisine batches into one table, renumbering the row index.
cuisine_batches = [cdf, cdf1, cdf2, cdf3]
BB_cuisines = pd.concat(cuisine_batches, join='outer', axis=0, ignore_index=True)

In [236]:
# Peek at the combined cuisines table.
print(BB_cuisines.shape)
BB_cuisines.head(3)


(2000, 27)
Out[236]:
id American Asian Barbecue Cajun & Creole Chinese Cuban English French German ... Kid-Friendly Mediterranean Mexican Moroccan Portuguese Southern & Soul Food Southwestern Spanish Swedish Thai
0 2--Minute-Low-Carb-English-Muffin-1697908 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 2-Ingredient-Pancakes-1686555 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 2-Ingredient-Protein-Crepes-1684862 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 27 columns


In [237]:
# Count duplicate recipe ids (vectorized), then keep first occurrences.
print(BB_cuisines.duplicated('id').sum())
BB_cuisines = BB_cuisines.drop_duplicates('id')
BB_cuisines.shape


True
Out[237]:
(1999, 27)
  • Details

In [238]:
# Load the four scraped recipe-detail batches (500 rows each).
ddf, ddf1, ddf2, ddf3 = (
    pd.read_csv(name)
    for name in ('BB_details.csv', 'BB_details_1.csv',
                 'BB_details_2.csv', 'BB_details_3.csv')
)

In [239]:
len(set(ddf3.id) & set(ddf2.id))


Out[239]:
0

In [240]:
# Batch sanity check; strip saved index column, confirm columns and ids.
print(ddf.shape)
ddf = ddf.drop('Unnamed: 0', axis=1)
print(ddf.columns)
ddf.id.duplicated().sum()


(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')
Out[240]:
0

In [241]:
# Batch sanity check; strip saved index column, confirm columns and ids.
print(ddf1.shape)
ddf1 = ddf1.drop('Unnamed: 0', axis=1)
print(ddf1.columns)
ddf1.id.duplicated().sum()


(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')
Out[241]:
0

In [242]:
# Batch sanity check; strip saved index column, confirm columns and ids.
print(ddf2.shape)
ddf2 = ddf2.drop('Unnamed: 0', axis=1)
print(ddf2.columns)
ddf2.id.duplicated().sum()


(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')
Out[242]:
0

In [243]:
# Batch sanity check; strip saved index column, confirm columns and ids.
print(ddf3.shape)
ddf3 = ddf3.drop('Unnamed: 0', axis=1)
print(ddf3.columns)
ddf3.id.duplicated().sum()


(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')
Out[243]:
0

In [244]:
# Stack the four detail batches into one table, renumbering the row index.
detail_batches = [ddf, ddf1, ddf2, ddf3]
BB_details = pd.concat(detail_batches, join='outer', axis=0, ignore_index=True)

In [246]:
# Peek at the combined details table.
print(BB_details.shape)
BB_details.head(3)


(2000, 6)
Out[246]:
id cookTimeInSeconds ingredientCount numberOfServings prepTimeInSeconds totalTimeInSeconds
0 Clean-Eating-Banana-Pancakes-1708094 NaN 4 4 NaN 1500
1 Norwegian-Pancakes-1707570 NaN 5 6 NaN 1500
2 Egg-Bake-Breakfast-Foil-Packet-1708169 600.0 7 4 600.0 1200

In [247]:
# Count duplicate recipe ids (vectorized), then keep first occurrences.
print(BB_details.duplicated('id').sum())
BB_details = BB_details.drop_duplicates('id')
BB_details.shape


True
Out[247]:
(1999, 6)
  • Ingredients

In [258]:
# Load the four scraped ingredient batches (500 rows each; wide one-hot tables).
idf, idf1, idf2, idf3 = (
    pd.read_csv(name)
    for name in ('BB_ingredients.csv', 'BB_ingredients_1.csv',
                 'BB_ingredients_2.csv', 'BB_ingredients_3.csv')
)

In [259]:
len(set(idf2.id) & set(idf.id))


Out[259]:
0

In [260]:
# Batch sanity check; strip saved index column, confirm columns and ids.
print(idf.shape)
idf = idf.drop('Unnamed: 0', axis=1)
print(idf.columns)
idf.id.duplicated().sum()


(500, 708)
Index([u'id', u'course', u'active dry yeast', u'agave nectar',
       u'all-purpose flour', u'almond butter', u'almond extract',
       u'almond flour', u'almond meal', u'almond milk',
       ...
       u'whole wheat tortillas', u'whole wheat white flour',
       u'wholemeal bread', u'wild rice', u'yellow bell pepper',
       u'yellow cornmeal', u'yellow onion', u'yellow squash', u'yoghurt',
       u'zucchini'],
      dtype='object', length=707)
Out[260]:
0

In [261]:
# Batch sanity check; strip saved index column, confirm columns and ids.
print(idf1.shape)
idf1 = idf1.drop('Unnamed: 0', axis=1)
print(idf1.columns)
idf1.id.duplicated().sum()


(500, 699)
Index([u'id', u'course', u'active dry yeast', u'agave nectar',
       u'aged cheddar cheese', u'all-purpose flour', u'allspice',
       u'almond butter', u'almond extract', u'almond flour',
       ...
       u'whole wheat tortillas', u'whole wheat white flour', u'wild rice',
       u'worcestershire sauce', u'yeast', u'yellow onion', u'yellow peppers',
       u'yellow squash', u'yoghurt', u'zucchini'],
      dtype='object', length=698)
Out[261]:
0

In [262]:
# Batch sanity check; strip saved index column, confirm columns and ids.
print(idf2.shape)
idf2 = idf2.drop('Unnamed: 0', axis=1)
print(idf2.columns)
idf2.id.duplicated().sum()


(500, 633)
Index([u'id', u'course', u'achiote powder', u'active dry yeast',
       u'agave nectar', u'all purpose potatoes', u'all-purpose flour',
       u'allspice', u'almond butter', u'almond extract',
       ...
       u'wholemeal bread', u'wholemeal flour', u'xanthan gum', u'yeast',
       u'yellow mustard', u'yellow onion', u'yellow peppers', u'yoghurt',
       u'yoplait', u'zucchini'],
      dtype='object', length=632)
Out[262]:
0

In [263]:
# Batch sanity check; strip saved index column, confirm columns and ids.
print(idf3.shape)
idf3 = idf3.drop('Unnamed: 0', axis=1)
print(idf3.columns)
idf3.id.duplicated().sum()


(500, 679)
Index([u'id', u'course', u'adams natural peanut butter', u'agave nectar',
       u'all purpose potatoes', u'all purpose seasoning', u'all-purpose flour',
       u'allspice', u'almond butter', u'almond chocolate milk',
       ...
       u'xanthan gum', u'xylitol sweetener', u'yeast', u'yellow cornmeal',
       u'yellow onion', u'yoghurt', u'yoplait',
       u'yoplait greek 100 apple pie yogurt', u'yukon gold potatoes',
       u'zucchini'],
      dtype='object', length=678)
Out[263]:
0

In [264]:
# Stack the four ingredient batches; outer join unions the differing
# ingredient columns across batches (missing values become NaN).
ingredient_batches = [idf, idf1, idf2, idf3]
BB_ing = pd.concat(ingredient_batches, join='outer', axis=0, ignore_index=True)

In [267]:
# Move the 'id' column to the front so this table matches the others' layout.
# Uses .loc — the .ix indexer was deprecated in pandas 0.20 and removed in 1.0.
cols = list(BB_ing)
cols.insert(0, cols.pop(cols.index('id')))
BB_ing = BB_ing.loc[:, cols]

In [268]:
BB_ing.head(3)


Out[268]:
id achiote powder active dry yeast adams natural peanut butter agave nectar aged cheddar cheese all purpose potatoes all purpose seasoning all-purpose flour allspice ... yellow cornmeal yellow mustard yellow onion yellow peppers yellow squash yoghurt yoplait yoplait greek 100 apple pie yogurt yukon gold potatoes zucchini
0 Clean-Eating-Banana-Pancakes-1708094 NaN 0.0 NaN 0.0 NaN NaN NaN 0.0 NaN ... 0.0 NaN 0.0 NaN 0.0 0.0 NaN NaN NaN 0.0
1 Norwegian-Pancakes-1707570 NaN 0.0 NaN 0.0 NaN NaN NaN 1.0 NaN ... 0.0 NaN 0.0 NaN 0.0 0.0 NaN NaN NaN 0.0
2 Egg-Bake-Breakfast-Foil-Packet-1708169 NaN 0.0 NaN 0.0 NaN NaN NaN 0.0 NaN ... 0.0 NaN 0.0 NaN 0.0 0.0 NaN NaN NaN 0.0

3 rows × 1299 columns


In [269]:
# Count duplicate recipe ids (vectorized), then keep first occurrences.
print(BB_ing.duplicated('id').sum())
BB_ing = BB_ing.drop_duplicates('id')
BB_ing.shape


True
Out[269]:
(1999, 1299)

Join all tables for Breakfast and Brunch


In [270]:
# Index every table by recipe id so they can be joined on it.
# set_index(inplace=True) mutates each frame in place; the loop variable is
# named 'frame' so it no longer clobbers the batch frame 'df' loaded earlier.
_df = [BB_main, BB_cuisines, BB_flavors, BB_details, BB_ing]

for frame in _df:
    frame.set_index('id', inplace=True)

In [274]:
# join dataframes
# Left-join every feature table onto the main table via the shared 'id' index.
BB_data = BB_main.join([BB_cuisines, BB_flavors, BB_details, BB_ing])

# add course column-Breakfast&Brunch
# Tags every row with the course label; NOTE(review): this overwrites the
# 'course' column that came in from the ingredients tables — presumably
# intentional, confirm.
BB_data['course'] = 'Breakfast_Brunch'

In [275]:
# Final peek at the fully joined dataset (print() keeps it Python 3 safe).
print(BB_data.shape)
BB_data.head(3)


(1999, 1338)
Out[275]:
rating recipeName sourceDisplayName American Asian Barbecue Cajun & Creole Chinese Cuban English ... yellow cornmeal yellow mustard yellow onion yellow peppers yellow squash yoghurt yoplait yoplait greek 100 apple pie yogurt yukon gold potatoes zucchini
id
Clean-Eating-Banana-Pancakes-1708094 3 Clean Eating Banana Pancakes The Happier Homemaker 0 0 0 0 0 0 0 ... 0.0 NaN 0.0 NaN 0.0 0.0 NaN NaN NaN 0.0
Norwegian-Pancakes-1707570 4 Norwegian Pancakes Enjoy cooking 0 0 0 0 0 0 0 ... 0.0 NaN 0.0 NaN 0.0 0.0 NaN NaN NaN 0.0
Egg-Bake-Breakfast-Foil-Packet-1708169 4 Egg Bake Breakfast Foil Packet Julie's Eats & Treats 0 0 1 0 0 0 0 ... 0.0 NaN 0.0 NaN 0.0 0.0 NaN NaN NaN 0.0

3 rows × 1338 columns


In [276]:
# save to csv
# Persist the merged Breakfast & Brunch dataset for downstream notebooks.
BB_data.to_csv('BB_Data.csv')