In [1]:

    
import pandas as pd
import numpy as np

Breakfast & Brunch

Main



In [74]:

    
df = pd.read_csv('BB_main.csv')
df1 = pd.read_csv('BB_main_1.csv')
df2 = pd.read_csv('BB_main_2.csv')
df3 = pd.read_csv('BB_main_3.csv')



In [75]:

    
print df.shape
df2.id.duplicated().sum()









    



(500, 5)






    Out[75]:





0



In [76]:

    
df = df.drop('Unnamed: 0', 1)



In [77]:

    
print df1.shape
df1 = df1.drop('Unnamed: 0', 1)
df.id.duplicated().sum()









    



(500, 5)






    Out[77]:





0



In [78]:

    
print df2.shape
df2 = df2.drop('Unnamed: 0', 1)
df2.head(3)









    



(500, 5)






    Out[78]:






  
    
      
      id
      rating
      recipeName
      sourceDisplayName
    
  
  
    
      0
      Gluten-Free-Waffles-1629571
      4
      Gluten-Free Waffles
      Dinner Was Delish
    
    
      1
      Peanut-Butter-Pancakes-1342291
      4
      Peanut Butter Pancakes
      Just So Tasty
    
    
      2
      Kahlua-and-Cream-French-Toast-1597605
      4
      Kahlua and Cream French Toast
      Dizzy Busy and Hungry



In [79]:

    
print df3.shape
df3 = df3.drop('Unnamed: 0', 1)
df3.head(3)









    



(500, 5)






    Out[79]:






  
    
      
      id
      rating
      recipeName
      sourceDisplayName
    
  
  
    
      0
      Chocolate-Breakfast-Oatmeal-1237224
      4
      Chocolate Breakfast Oatmeal
      Family Food on the Table
    
    
      1
      Fluffy-French-Toast-785994
      5
      Fluffy French Toast
      AllRecipes
    
    
      2
      Overnight-Cranberry-Orange-French-Toast-Casser...
      4
      Overnight Cranberry Orange French Toast Casserole
      Kristine's Kitchen



In [80]:

    
#concatenate the main tables. 
BB_main= pd.concat([df, df1, df2, df3], join = 'outer', axis = 0, ignore_index = True)
#create a new dataframe with selected columns
BB_main_reduced = BB_main.drop(['recipeName', 'sourceDisplayName'], axis = 1)



In [81]:

    
#peek at dataframe
print BB_main.shape
BB_main.head(3)









    



(2000, 4)






    Out[81]:






  
    
      
      id
      rating
      recipeName
      sourceDisplayName
    
  
  
    
      0
      Clean-Eating-Banana-Pancakes-1708094
      3
      Clean Eating Banana Pancakes
      The Happier Homemaker
    
    
      1
      Norwegian-Pancakes-1707570
      4
      Norwegian Pancakes
      Enjoy cooking
    
    
      2
      Egg-Bake-Breakfast-Foil-Packet-1708169
      4
      Egg Bake Breakfast Foil Packet
      Julie's Eats & Treats



In [82]:

    
for i in BB_main.duplicated('id'):
    if i == True:
        print i
BB_main = BB_main.drop_duplicates('id')
BB_main.shape









    



True






    Out[82]:





(1999, 4)

Flavors



In [83]:

    
fdf = pd.read_csv('BB_flavors.csv')
fdf1 = pd.read_csv('BB_flavors_1.csv')
fdf2 = pd.read_csv('BB_flavors_2.csv')
fdf3 = pd.read_csv('BB_flavors_3.csv')



In [84]:

    
print fdf.shape
fdf = fdf.drop('Unnamed: 0', 1)
fdf = fdf.rename(columns = {'index':'id'})
fdf1.id.duplicated().sum()









    



(500, 8)






    Out[84]:





0



In [85]:

    
print fdf1.shape
fdf1 = fdf1.drop('Unnamed: 0', 1)
fdf1.head(3)









    



(500, 8)






    Out[85]:






  
    
      
      id
      bitter
      meaty
      piquant
      salty
      sour
      sweet
    
  
  
    
      0
      1-2-3-Gluten-Free-Pancakes-1533270
      0.333333
      0.500000
      0.0
      0.166667
      0.166667
      0.5
    
    
      1
      2-INGREDIENT-PANCAKES-1683615
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      2
      2-Ingredient-Pancakes-1264118
      0.166667
      0.166667
      0.0
      0.166667
      0.333333
      0.5



In [86]:

    
print fdf2.shape
fdf2 = fdf2.drop('Unnamed: 0', 1)
fdf2 = fdf2.rename(columns = {'index':'id'})
fdf2.id.duplicated().sum()









    



(500, 8)






    Out[86]:





0



In [87]:

    
print fdf3.shape
fdf3 = fdf3.drop('Unnamed: 0', 1)
fdf3 = fdf3.rename(columns = {'index':'id'})
fdf3.id.duplicated().sum()









    



(500, 8)






    Out[87]:





0



In [88]:

    
#concatenate the flavors tables. 
BB_flavors= pd.concat([fdf, fdf1, fdf2, fdf3], join = 'outer', axis = 0, ignore_index = True)



In [89]:

    
#peek at dataframe
print BB_flavors.shape
BB_flavors.head(2)









    



(2000, 7)






    Out[89]:






  
    
      
      id
      bitter
      meaty
      piquant
      salty
      sour
      sweet
    
  
  
    
      0
      2--Minute-Low-Carb-English-Muffin-1697908
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      1
      2-Ingredient-Pancakes-1686555
      0.166667
      0.166667
      0.0
      0.166667
      0.333333
      0.5



In [90]:

    
for i in BB_flavors.duplicated('id'):
    if i == True:
        print i
BB_flavors = BB_flavors.drop_duplicates('id')
BB_flavors.shape









    



True






    Out[90]:





(1999, 7)

Cuisine



In [91]:

    
cdf = pd.read_csv('BB_cuisines.csv')
cdf1 = pd.read_csv('BB_cuisines_1.csv')
cdf2 = pd.read_csv('BB_cuisines_2.csv')
cdf3 = pd.read_csv('BB_cuisines_3.csv')



In [92]:

    
print cdf.shape
cdf = cdf.drop('Unnamed: 0', 1)
cdf = cdf.rename(columns = {'index':'id'})
print cdf.columns
cdf.id.duplicated().sum()









    



(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')






    Out[92]:





0



In [93]:

    
print cdf1.shape
cdf1 = cdf1.drop('Unnamed: 0', 1)
print cdf1.columns
cdf1.id.duplicated().sum()









    



(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')






    Out[93]:





0



In [94]:

    
print cdf2.shape
cdf2 = cdf2.drop('Unnamed: 0', 1)
cdf2 = cdf2.rename(columns = {'index':'id'})
print cdf2.columns
cdf2.id.duplicated().sum()









    



(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')






    Out[94]:





0



In [95]:

    
print cdf3.shape
cdf3 = cdf3.drop('Unnamed: 0', 1)
cdf3 = cdf3.rename(columns = {'index':'id'})
print cdf3.columns
cdf3.id.duplicated().sum()









    



(500, 28)
Index([u'id', u'American', u'Asian', u'Barbecue', u'Cajun & Creole',
       u'Chinese', u'Cuban', u'English', u'French', u'German', u'Greek',
       u'Hawaiian', u'Hungarian', u'Indian', u'Irish', u'Italian', u'Japanese',
       u'Kid-Friendly', u'Mediterranean', u'Mexican', u'Moroccan',
       u'Portuguese', u'Southern & Soul Food', u'Southwestern', u'Spanish',
       u'Swedish', u'Thai'],
      dtype='object')






    Out[95]:





0



In [96]:

    
#concatenate the cuisines tables. 
BB_cuisines= pd.concat([cdf, cdf1, cdf2, cdf3], join = 'outer', axis = 0, ignore_index = True)



In [97]:

    
#peek at dataframe
print BB_cuisines.shape
BB_cuisines.head(3)









    



(2000, 27)






    Out[97]:






  
    
      
      id
      American
      Asian
      Barbecue
      Cajun & Creole
      Chinese
      Cuban
      English
      French
      German
      ...
      Kid-Friendly
      Mediterranean
      Mexican
      Moroccan
      Portuguese
      Southern & Soul Food
      Southwestern
      Spanish
      Swedish
      Thai
    
  
  
    
      0
      2--Minute-Low-Carb-English-Muffin-1697908
      0
      0
      0
      0
      0
      0
      1
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      2-Ingredient-Pancakes-1686555
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      2-Ingredient-Protein-Crepes-1684862
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

3 rows × 27 columns



In [98]:

    
for i in BB_cuisines.duplicated('id'):
    if i == True:
        print i
BB_cuisines = BB_cuisines.drop_duplicates('id')
BB_cuisines.shape









    



True






    Out[98]:





(1999, 27)

Details



In [99]:

    
ddf = pd.read_csv('BB_details.csv')
ddf1 = pd.read_csv('BB_details_1.csv')
ddf2 = pd.read_csv('BB_details_2.csv')
ddf3 = pd.read_csv('BB_details_3.csv')



In [100]:

    
len(set(ddf3.id) & set(ddf2.id))









    Out[100]:





0



In [101]:

    
print ddf.shape
ddf = ddf.drop('Unnamed: 0', 1)
print ddf.columns
ddf.id.duplicated().sum()









    



(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')






    Out[101]:





0



In [102]:

    
print ddf1.shape
ddf1 = ddf1.drop('Unnamed: 0', 1)
print ddf1.columns
ddf1.id.duplicated().sum()









    



(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')






    Out[102]:





0



In [103]:

    
print ddf2.shape
ddf2 = ddf2.drop('Unnamed: 0', 1)
print ddf2.columns
ddf2.id.duplicated().sum()









    



(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')






    Out[103]:





0



In [104]:

    
print ddf3.shape
ddf3 = ddf3.drop('Unnamed: 0', 1)
print ddf3.columns
ddf3.id.duplicated().sum()









    



(500, 7)
Index([u'id', u'cookTimeInSeconds', u'ingredientCount', u'numberOfServings',
       u'prepTimeInSeconds', u'totalTimeInSeconds'],
      dtype='object')






    Out[104]:





0



In [105]:

    
#concatenate the details tables. 
BB_details= pd.concat([ddf, ddf1, ddf2, ddf3], join = 'outer', axis = 0, ignore_index = True)



In [106]:

    
#peek at dataframe
print BB_details.shape
BB_details.head(3)









    



(2000, 6)






    Out[106]:






  
    
      
      id
      cookTimeInSeconds
      ingredientCount
      numberOfServings
      prepTimeInSeconds
      totalTimeInSeconds
    
  
  
    
      0
      Clean-Eating-Banana-Pancakes-1708094
      NaN
      4
      4
      NaN
      1500
    
    
      1
      Norwegian-Pancakes-1707570
      NaN
      5
      6
      NaN
      1500
    
    
      2
      Egg-Bake-Breakfast-Foil-Packet-1708169
      600.0
      7
      4
      600.0
      1200



In [107]:

    
for i in BB_details.duplicated('id'):
    if i == True:
        print i
BB_details = BB_details.drop_duplicates('id')
BB_details.shape









    



True






    Out[107]:





(1999, 6)

Ingredients



In [108]:

    
idf = pd.read_csv('BB_ingredients.csv')
idf1 = pd.read_csv('BB_ingredients_1.csv')
idf2 = pd.read_csv('BB_ingredients_2.csv')
idf3 = pd.read_csv('BB_ingredients_3.csv')



In [109]:

    
len(set(idf2.id) & set(idf.id))









    Out[109]:





0



In [110]:

    
print idf.shape
idf = idf.drop('Unnamed: 0', 1)
print idf.columns
idf.id.duplicated().sum()









    



(500, 708)
Index([u'id', u'course', u'active dry yeast', u'agave nectar',
       u'all-purpose flour', u'almond butter', u'almond extract',
       u'almond flour', u'almond meal', u'almond milk',
       ...
       u'whole wheat tortillas', u'whole wheat white flour',
       u'wholemeal bread', u'wild rice', u'yellow bell pepper',
       u'yellow cornmeal', u'yellow onion', u'yellow squash', u'yoghurt',
       u'zucchini'],
      dtype='object', length=707)






    Out[110]:





0



In [111]:

    
print idf1.shape
idf1 = idf1.drop('Unnamed: 0', 1)
print idf1.columns
idf1.id.duplicated().sum()









    



(500, 699)
Index([u'id', u'course', u'active dry yeast', u'agave nectar',
       u'aged cheddar cheese', u'all-purpose flour', u'allspice',
       u'almond butter', u'almond extract', u'almond flour',
       ...
       u'whole wheat tortillas', u'whole wheat white flour', u'wild rice',
       u'worcestershire sauce', u'yeast', u'yellow onion', u'yellow peppers',
       u'yellow squash', u'yoghurt', u'zucchini'],
      dtype='object', length=698)






    Out[111]:





0



In [112]:

    
print idf2.shape
idf2 = idf2.drop('Unnamed: 0', 1)
print idf2.columns
idf2.id.duplicated().sum()









    



(500, 633)
Index([u'id', u'course', u'achiote powder', u'active dry yeast',
       u'agave nectar', u'all purpose potatoes', u'all-purpose flour',
       u'allspice', u'almond butter', u'almond extract',
       ...
       u'wholemeal bread', u'wholemeal flour', u'xanthan gum', u'yeast',
       u'yellow mustard', u'yellow onion', u'yellow peppers', u'yoghurt',
       u'yoplait', u'zucchini'],
      dtype='object', length=632)






    Out[112]:





0



In [113]:

    
print idf3.shape
idf3 = idf3.drop('Unnamed: 0', 1)
print idf3.columns
idf3.id.duplicated().sum()









    



(500, 679)
Index([u'id', u'course', u'adams natural peanut butter', u'agave nectar',
       u'all purpose potatoes', u'all purpose seasoning', u'all-purpose flour',
       u'allspice', u'almond butter', u'almond chocolate milk',
       ...
       u'xanthan gum', u'xylitol sweetener', u'yeast', u'yellow cornmeal',
       u'yellow onion', u'yoghurt', u'yoplait',
       u'yoplait greek 100 apple pie yogurt', u'yukon gold potatoes',
       u'zucchini'],
      dtype='object', length=678)






    Out[113]:





0



In [114]:

    
#concatenate the ingredients tables. 
BB_ing= pd.concat([idf, idf1, idf2, idf3], join = 'outer', axis = 0, ignore_index = True)
#create a new dataframe with selected columns
BB_ing_reduced = BB_ing[['id', 'ingredient_list']]



In [115]:

    
#make id first column
cols = list(BB_ing)
cols.insert(0, cols.pop(cols.index('id')))
BB_ing = BB_ing.ix[:, cols]



In [116]:

    
BB_ing.head(3)









    Out[116]:






  
    
      
      id
      achiote powder
      active dry yeast
      adams natural peanut butter
      agave nectar
      aged cheddar cheese
      all purpose potatoes
      all purpose seasoning
      all-purpose flour
      allspice
      ...
      yellow cornmeal
      yellow mustard
      yellow onion
      yellow peppers
      yellow squash
      yoghurt
      yoplait
      yoplait greek 100 apple pie yogurt
      yukon gold potatoes
      zucchini
    
  
  
    
      0
      Clean-Eating-Banana-Pancakes-1708094
      NaN
      0.0
      NaN
      0.0
      NaN
      NaN
      NaN
      0.0
      NaN
      ...
      0.0
      NaN
      0.0
      NaN
      0.0
      0.0
      NaN
      NaN
      NaN
      0.0
    
    
      1
      Norwegian-Pancakes-1707570
      NaN
      0.0
      NaN
      0.0
      NaN
      NaN
      NaN
      1.0
      NaN
      ...
      0.0
      NaN
      0.0
      NaN
      0.0
      0.0
      NaN
      NaN
      NaN
      0.0
    
    
      2
      Egg-Bake-Breakfast-Foil-Packet-1708169
      NaN
      0.0
      NaN
      0.0
      NaN
      NaN
      NaN
      0.0
      NaN
      ...
      0.0
      NaN
      0.0
      NaN
      0.0
      0.0
      NaN
      NaN
      NaN
      0.0
    
  

3 rows × 1299 columns



In [117]:

    
for i in BB_ing.duplicated('id'):
    if i == True:
        print i
BB_ing = BB_ing.drop_duplicates('id')
BB_ing.shape









    



True






    Out[117]:





(1999, 1299)

Join all tables for Breakfast and Brunch



In [118]:

    
# set index to column 'id'
_df = [BB_main, BB_main_reduced, BB_cuisines, BB_flavors, BB_details, BB_ing, BB_ing_reduced]

for df in _df:
    df.set_index('id', inplace = True)



In [119]:

    
# join dataframes
BB_data = BB_main.join([BB_cuisines, BB_flavors, BB_details, BB_ing])
BB_data_reduced = BB_main_reduced.join([BB_flavors, BB_details, BB_ing_reduced])

# add course column-Breakfast&Brunch
BB_data['course'] = 'Breakfast_Brunch'
BB_data_reduced['course'] = 'Breakfast_Brunch'



In [54]:

    
print BB_data.shape
BB_data.head(3)









    



(1999, 1338)






    Out[54]:






  
    
      
      rating
      recipeName
      sourceDisplayName
      American
      Asian
      Barbecue
      Cajun & Creole
      Chinese
      Cuban
      English
      ...
      yellow cornmeal
      yellow mustard
      yellow onion
      yellow peppers
      yellow squash
      yoghurt
      yoplait
      yoplait greek 100 apple pie yogurt
      yukon gold potatoes
      zucchini
    
    
      id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      Clean-Eating-Banana-Pancakes-1708094
      3
      Clean Eating Banana Pancakes
      The Happier Homemaker
      0
      0
      0
      0
      0
      0
      0
      ...
      0.0
      NaN
      0.0
      NaN
      0.0
      0.0
      NaN
      NaN
      NaN
      0.0
    
    
      Norwegian-Pancakes-1707570
      4
      Norwegian Pancakes
      Enjoy cooking
      0
      0
      0
      0
      0
      0
      0
      ...
      0.0
      NaN
      0.0
      NaN
      0.0
      0.0
      NaN
      NaN
      NaN
      0.0
    
    
      Egg-Bake-Breakfast-Foil-Packet-1708169
      4
      Egg Bake Breakfast Foil Packet
      Julie's Eats & Treats
      0
      0
      1
      0
      0
      0
      0
      ...
      0.0
      NaN
      0.0
      NaN
      0.0
      0.0
      NaN
      NaN
      NaN
      0.0
    
  

3 rows × 1338 columns



In [120]:

    
# save to csv
BB_data.to_csv('BB_Data.csv')
BB_data_reduced.to_csv('BB_data_reduced.csv')

	id	rating	recipeName	sourceDisplayName
0	Gluten-Free-Waffles-1629571	4	Gluten-Free Waffles	Dinner Was Delish
1	Peanut-Butter-Pancakes-1342291	4	Peanut Butter Pancakes	Just So Tasty
2	Kahlua-and-Cream-French-Toast-1597605	4	Kahlua and Cream French Toast	Dizzy Busy and Hungry

	id	rating	recipeName	sourceDisplayName
0	Chocolate-Breakfast-Oatmeal-1237224	4	Chocolate Breakfast Oatmeal	Family Food on the Table
1	Fluffy-French-Toast-785994	5	Fluffy French Toast	AllRecipes
2	Overnight-Cranberry-Orange-French-Toast-Casser...	4	Overnight Cranberry Orange French Toast Casserole	Kristine's Kitchen

	id	rating	recipeName	sourceDisplayName
0	Clean-Eating-Banana-Pancakes-1708094	3	Clean Eating Banana Pancakes	The Happier Homemaker
1	Norwegian-Pancakes-1707570	4	Norwegian Pancakes	Enjoy cooking
2	Egg-Bake-Breakfast-Foil-Packet-1708169	4	Egg Bake Breakfast Foil Packet	Julie's Eats & Treats

	id	bitter	meaty	piquant	salty	sour	sweet
0	1-2-3-Gluten-Free-Pancakes-1533270	0.333333	0.500000	0.0	0.166667	0.166667	0.5
1	2-INGREDIENT-PANCAKES-1683615	NaN	NaN	NaN	NaN	NaN	NaN
2	2-Ingredient-Pancakes-1264118	0.166667	0.166667	0.0	0.166667	0.333333	0.5

	id	bitter	meaty	piquant	salty	sour	sweet
0	2--Minute-Low-Carb-English-Muffin-1697908	NaN	NaN	NaN	NaN	NaN	NaN
1	2-Ingredient-Pancakes-1686555	0.166667	0.166667	0.0	0.166667	0.333333	0.5

	id	English	...
0	2--Minute-Low-Carb-English-Muffin-1697908	1	...
1	2-Ingredient-Pancakes-1686555	0	...
2	2-Ingredient-Protein-Crepes-1684862	0	...

	id	cookTimeInSeconds	ingredientCount	numberOfServings	prepTimeInSeconds	totalTimeInSeconds
0	Clean-Eating-Banana-Pancakes-1708094	NaN	4	4	NaN	1500
1	Norwegian-Pancakes-1707570	NaN	5	6	NaN	1500
2	Egg-Bake-Breakfast-Foil-Packet-1708169	600.0	7	4	600.0	1200