In [1]:
from yummly import Client
import json
import requests
import pandas as pd
import numpy as np 
import re

In [2]:
# API call for the first 500 BB recipes labeled as such only!
header= {'X-Yummly-App-ID':'79663a75', 'X-Yummly-App-Key':'02b233108f476f3110e0f65437c4d6dd'}
url='http://api.yummly.com/v1/api/recipes?'
parameters={
            'allowedCourse[]':'course^course-Breakfast and Brunch',
            'excludedCourse[]': ['course^course-Main Dishes','course^course-Appetizers', 'course^course-Salads', 'course^course-Lunch',
                                'course^course-Side Dishes','course^course-Desserts','course^course-Breads',
                                 'course^course-Soups', 'course^course-Beverages', 'course^course-Condiments and Sauces',
                                'course^course-Cocktails', 'course^course-Snacks'],
            'maxResult': 501,
            'start': 500
            }

response=requests.get(url, headers = header, params = parameters)

In [3]:
response.status_code


Out[3]:
200

In [4]:
BB=response.json()

print type(BB)
print BB.keys()


<type 'dict'>
[u'matches', u'totalMatchCount', u'attribution', u'facetCounts', u'criteria']

In [5]:
#only interrested in the information under matches. 
print len(BB['matches'])
print type(BB['matches'])
print BB['matches'][0].keys()


500
<type 'list'>
[u'flavors', u'rating', u'totalTimeInSeconds', u'ingredients', u'smallImageUrls', u'sourceDisplayName', u'recipeName', u'attributes', u'id', u'imageUrlsBySize']

In [6]:
#checkout one recipe
BB_matches=BB['matches']
BB_matches[0]


Out[6]:
{u'attributes': {u'course': [u'Breakfast and Brunch']},
 u'flavors': {u'bitter': 0.8333333333333334,
  u'meaty': 0.16666666666666666,
  u'piquant': 0.0,
  u'salty': 0.6666666666666666,
  u'sour': 0.16666666666666666,
  u'sweet': 0.5},
 u'id': u'Eggless-Peach-Waffles-1703856',
 u'imageUrlsBySize': {u'90': u'https://lh3.googleusercontent.com/hEwX3IPjP0fAMlsV6AQFC3_ubxWwLjYA_LoFLlm6NBObamN89Z0QqC6xWJ6tRhcPIsnCGrMVo-v3RFpqbBnq=s90-c'},
 u'ingredients': [u'whole wheat flour',
  u'all-purpose flour',
  u'cornmeal',
  u'baking powder',
  u'salt',
  u'sugar',
  u'ground nutmeg',
  u'ground cinnamon',
  u'milk',
  u'oil',
  u'peaches'],
 u'rating': 4,
 u'recipeName': u'Eggless Peach Waffles',
 u'smallImageUrls': [u'https://lh3.googleusercontent.com/gaGodnc9Oe1ELEGtFoIonTL117_wuptGCNrVYuCPq67JnLE4JJyXsB_CFAFDII881ggNdm7fCc2eR8dHWaGwuA=s90'],
 u'sourceDisplayName': u'The Schizo Chef',
 u'totalTimeInSeconds': 1500}

In [7]:
#import previous list of recipes collected
df=pd.read_csv('BB_main.csv')
df1=pd.read_csv('BB_main_1.csv')
BB_ids=df.id
BB1_ids=df1.id
print BB_ids[0]
print BB1_ids[0]
BB2_ids=[]
for recipe in BB_matches:
    BB2_ids.append(recipe['id'])
print BB2_ids[0]
#check if there are dupplicate recipes
print [i for i, j in zip(BB_ids, BB2_ids) if i == j]
print [i for i, j in zip(BB1_ids, BB2_ids) if i == j]


Healthy-Chocolate-Porridge-1711204
Baked-French-Toast-Casserole-1636754
Eggless-Peach-Waffles-1703856
[]
['Whole-30-California-Avocado-Sweet-Potato-Hash-1593576', 'Perfect-Pancakes-The-Pioneer-Woman-Cooks-_-Ree-Drummond-41350', 'Cheesy-Baked-Egg-in-Toast-1242287', 'Make-Ahead-Fruit-_-Yogurt-Breakfast-Parfaits-475395', 'Cottage-Cheese-Scrambled-Eggs-recipe-137-calories-781513', 'Broken-arm-breakfast-casserole-with-cottage-cheese_-bacon_-feta_-and-green-onions-310079', 'Blueberry-Muffin-Overnight-Oats-1294948', 'Spinach-And-Cheese-Egg-Muffins-1267448', 'Spaghetti-Squash-Egg-Baskets-1249325']

In [8]:
#remove duplicate recipe from the recipe
BB_matches[:] = [d for d in BB_matches if d.get('id') != 'French-Toast-with-Vegan-Nog-964692']
BB_matches[:] = [d for d in BB_matches if d.get('id') != 'Quick-and-Easy-Waffles-1537027'] 
                 #'Quick-and-Easy-Waffles-1537027'

# check to see if recipes have been removed
BB2_ids = []
for recipe in BB_matches:
    BB2_ids.append(recipe['id'])
    
print [i for i, j in zip(BB1_ids, BB2_ids) if i == j]
len(BB_matches)


['Whole-30-California-Avocado-Sweet-Potato-Hash-1593576', 'Perfect-Pancakes-The-Pioneer-Woman-Cooks-_-Ree-Drummond-41350', 'Cheesy-Baked-Egg-in-Toast-1242287', 'Make-Ahead-Fruit-_-Yogurt-Breakfast-Parfaits-475395', 'Cottage-Cheese-Scrambled-Eggs-recipe-137-calories-781513', 'Lemon-Poppy-Seed-Pancakes-1583289', 'Perfect-Brunch-Eggs-573684', 'Pancake-1286748', 'Chocolate-Chip-Pancakes-1056165']
Out[8]:
498

In [9]:
#forming lists to create dataframes of the features we want. 
main_list = []
ingredients_list = []
attributes_list = []

for food in BB_matches:

    _d1 = {}
    _d1['id'] = food['id']
    _d1['rating'] = food['rating']
    _d1['recipeName'] = food['recipeName']
    _d1['sourceDisplayName'] = food['sourceDisplayName']
    main_list.append(_d1)
    
    _d2 = {}
    _d2['id'] =food['id']
    _d2['course']= 'Breakfast and Brunch'
    _d2['ingredient_list'] =  food['ingredients']
    
    for i in food['ingredients']:
        i = i.lower() # additional code to conver to lowercase
        i = re.sub(r'\d+%\s', '', i) # additional code to remove 1%, 2%, etc
        i = re.sub(r'\xae', '', i) # remove '\xae' characters
        i = re.sub(r'shredded\s', '', i)
        i = re.sub(r'chopped\s', '', i)
        i = re.sub(r'diced\s', '', i)
        i = re.sub(r'crumbled\s', '', i)
        i = re.sub(r'fresh\s', '', i)
        i = re.sub(r'grated\s', '', i)
        i = re.sub(r'fat free\s', '', i)
        i = re.sub(r'boneless\s', '', i)
        i = re.sub(r'boneless skinless\s', '', i)
        i = re.sub(r'minced\s', '', i)
        i = re.sub(r'sliced\s', '', i)
        i = re.sub(r'(?!ground beef)ground ', '', i)
        i = re.sub(r'^dried\s', '', i)
        i = re.sub(r'^cooked\s', '', i)
        
        _d2[i] = 1
    ingredients_list.append(_d2)

    _d3 = {}
    _d3['id'] = food['id']
    for k, v in food['attributes'].items():
        for i in v:
            _d3[i] = 1
    attributes_list.append(_d3)
    
flavors_dict = {}

for food in BB_matches:
    flavors_dict[food.get('id')] = food.get('flavors')

In [11]:
# read in dictionary for course and cuisine and create list of possible values for each
cuisine_df = pd.read_csv('cuisine_headers.csv', names=['cuisine'])

cuisine_list= cuisine_df.cuisine

In [12]:
#create dictionary of cuisine and course for each recipe
cuisine_dict={}
for food in BB_matches:
    cuisine_dict[food.get('id')]= food['attributes'].get('cuisine')

        
_cuisines= {}       

for k, v in cuisine_dict.iteritems():
    cuisine_val = {}
    for course in cuisine_list:
        try:
            if course in v :
                cuisine_val[course] = 1
            else:
                cuisine_val[course] = 0
        except TypeError:
            cuisine_val[course] = 0
    
        _cuisines[k] = cuisine_val

In [13]:
# second api call to get other features for each recipe
key_id= '_app_id=79663a75&_app_key=02b233108f476f3110e0f65437c4d6dd'
url='http://api.yummly.com/v1/api/recipe/'

In [14]:
# retrieve other features for all recipes

def get_recipe(_id):
    response = requests.get(url + _id + '?' + key_id)
    return response.json()

recipes=[]
for _id in BB2_ids :
    recipes.append(get_recipe(_id))

In [15]:
response.status_code


Out[15]:
200

In [16]:
print len(recipes)
print recipes[1].keys()


498
[u'totalTime', u'ingredientLines', u'attribution', u'name', u'prepTimeInSeconds', u'rating', u'cookTimeInSeconds', u'numberOfServings', u'yield', u'nutritionEstimates', u'source', u'flavors', u'images', u'attributes', u'cookTime', u'id', u'prepTime', u'totalTimeInSeconds']

In [17]:
#for each recipe create a new dictionary of selected attributes and append into a list

recipe_details=[]
for recipe in recipes:
    _dict={}
    #import pdb; pdb.set_trace()
    _dict['id']=recipe['id']
    _dict['ingredientCount']= len(recipe['ingredientLines'])
    _dict['numberOfServings']= recipe['numberOfServings']
    _dict['prepTimeInSeconds'] = recipe.get('prepTimeInSeconds')
    _dict['cookTimeInSeconds'] = recipe.get('cookTimeInSeconds')
    _dict['totalTimeInSeconds']= recipe.get('totalTimeInSeconds')
     
    recipe_details.append(_dict)

In [10]:
#create dataframes, arrange column index and save into csv
# df_main = pd.DataFrame(main_list)
# df_main.to_csv('BB_main_2.csv', encoding ='utf-8')

df_ingredients = pd.DataFrame(ingredients_list)
df_ingredients = df_ingredients.fillna(0)
cols = list(df_ingredients)
cols.insert(0, cols.pop(cols.index('id')))
cols.insert(1, cols.pop(cols.index('course')))
df_ingredients= df_ingredients.ix[:,cols]
df_ingredients.to_csv('BB_ingredients_2.csv', encoding ='utf-8')

# df_attributes = pd.DataFrame(attributes_list)
# df_attributes = df_attributes.fillna(0)
# cols = list(df_attributes)
# cols.insert(0, cols.pop(cols.index('id')))
# df_attributes = df_attributes.ix[:,cols]
# df_attributes.to_csv('BB_attributes_2.csv')

# df_flavors = pd.DataFrame(flavors_dict).transpose()
# df_flavors.reset_index(level=0, inplace=True)
# df_flavors.to_csv('BB_flavors_2.csv')

# df_cuisines = pd.DataFrame(_cuisines).transpose()
# df_cuisines.reset_index(level=0, inplace=True)
# df_cuisines.to_csv('BB_cuisines_2.csv')

# df_details=pd.DataFrame(recipe_details)
# cols = list(df_details)
# cols.insert(0, cols.pop(cols.index('id')))
# df_details=df_details.ix[:,cols]
# df_details.to_csv('BB_details_2.csv')