In [1]:
from yummly import Client
import json
import requests
import pandas as pd
import numpy as np
import re
In [31]:
# Search the Yummly API for recipes labelled *only* as "Breakfast and Brunch":
# that course is allowed and every other course is explicitly excluded.
# With 'start': 500 the request skips the first 500 matches, i.e. it fetches
# the page that follows them (presumably the earlier matches were saved by a
# previous run -- TODO confirm against BB_main.csv / BB_main_1.csv).
# NOTE(review): the app id and key below are hardcoded credentials; consider
# loading them from the environment before sharing this notebook.
header = {
    'X-Yummly-App-ID': '79663a75',
    'X-Yummly-App-Key': '02b233108f476f3110e0f65437c4d6dd',
}
url = 'http://api.yummly.com/v1/api/recipes?'
parameters = {
    'allowedCourse[]': 'course^course-Breakfast and Brunch',
    'excludedCourse[]': [
        'course^course-Main Dishes',
        'course^course-Appetizers',
        'course^course-Salads',
        'course^course-Lunch',
        'course^course-Side Dishes',
        'course^course-Desserts',
        'course^course-Breads',
        'course^course-Soups',
        'course^course-Beverages',
        'course^course-Condiments and Sauces',
        'course^course-Cocktails',
        'course^course-Snacks',
    ],
    'maxResult': 501,
    'start': 500,
}
# requests applies no timeout by default; without one a stalled connection
# would block the kernel indefinitely.
response = requests.get(url, headers=header, params=parameters, timeout=30)
In [32]:
# Display the HTTP status code of the search request made above.
response.status_code
Out[32]:
In [52]:
# Decode the JSON body of the search response and inspect its top-level shape.
BB = response.json()
print(type(BB))
print(BB.keys())
In [53]:
# Only the information stored under 'matches' is of interest:
# show how many entries there are, the container type, and the keys of one entry.
print(len(BB['matches']))
print(type(BB['matches']))
print(BB['matches'][0].keys())
In [54]:
# Keep a handle on the list of matched recipes and display the first one
# to check out what a single recipe record looks like.
BB_matches=BB['matches']
BB_matches[0]
Out[54]:
In [55]:
# Import the lists of recipes collected by previous runs.
df = pd.read_csv('BB_main.csv')
df1 = pd.read_csv('BB_main_1.csv')
BB_ids = df.id
BB1_ids = df1.id
print(BB_ids[0])
print(BB1_ids[0])

# Ids of the recipes returned by the current API call.
BB2_ids = [recipe['id'] for recipe in BB_matches]
print(BB2_ids[0])

# Check whether any newly fetched recipe was already collected.
# Membership is tested against the full set of earlier ids: pairing the lists
# with zip() would only detect a duplicate that happens to sit at the same
# position in both lists.
known_ids = set(BB_ids)
known_ids_1 = set(BB1_ids)
print([_id for _id in BB2_ids if _id in known_ids])
print([_id for _id in BB2_ids if _id in known_ids_1])
In [57]:
# Remove the recipes that were already collected in an earlier file.
# The list is filtered in place (slice assignment) so every existing
# reference to BB_matches sees the change.
duplicate_ids = {
    'French-Toast-with-Vegan-Nog-964692',
    'Quick-and-Easy-Waffles-1537027',
}
BB_matches[:] = [d for d in BB_matches if d.get('id') not in duplicate_ids]

# Rebuild the id list and verify that nothing fetched here overlaps with
# either previously saved file. Set membership is used because a positional
# zip() comparison only catches duplicates at identical indexes.
BB2_ids = [recipe['id'] for recipe in BB_matches]
previous_ids = set(BB_ids) | set(BB1_ids)
print([_id for _id in BB2_ids if _id in previous_ids])
len(BB_matches)
Out[57]:
In [69]:
# Build one record per recipe for each feature table we want as a dataframe:
# general info, ingredient indicators and attribute indicators.
main_list = []
ingredients_list = []
attributes_list = []
for match in BB_matches:
    # General information about the recipe.
    main_list.append({
        'id': match['id'],
        'rating': match['rating'],
        'recipeName': match['recipeName'],
        'sourceDisplayName': match['sourceDisplayName'],
    })

    # One indicator (1) per normalised ingredient name.
    ingredient_row = {'id': match['id'], 'course': 'Breakfast and Brunch'}
    for raw_name in match['ingredients']:
        cleaned = raw_name.lower()                    # case-insensitive names
        cleaned = re.sub(r'\d+%\s', '', cleaned)      # drop "1% ", "2% ", etc.
        cleaned = re.sub(r'\xae', '', cleaned)        # drop '\xae' characters
        ingredient_row[cleaned] = 1
    ingredients_list.append(ingredient_row)

    # One indicator (1) per attribute value, whatever attribute group it is in.
    attribute_row = {'id': match['id']}
    for values in match['attributes'].values():
        attribute_row.update((value, 1) for value in values)
    attributes_list.append(attribute_row)

# Flavor information keyed by recipe id (None when a recipe has no 'flavors').
flavors_dict = {match.get('id'): match.get('flavors') for match in BB_matches}
In [59]:
# Read the cuisine header file into a single 'cuisine' column; that column is
# the list of possible cuisine values used for the indicator encoding.
cuisine_df = pd.read_csv('cuisine_headers.csv', names=['cuisine'])
cuisine_list = cuisine_df['cuisine']
In [61]:
# Map each recipe id to the value stored under attributes['cuisine']
# (None when the recipe carries no cuisine attribute).
cuisine_dict = {}
for food in BB_matches:
    cuisine_dict[food.get('id')] = food['attributes'].get('cuisine')

# For every recipe, build a {cuisine: 0/1} indicator over all known cuisines.
# .items() is used instead of .iteritems() so the cell runs on Python 2 and 3,
# and a missing cuisine entry is handled explicitly rather than by catching
# the TypeError raised by `x in None`.
_cuisines = {}
for recipe_id, recipe_cuisines in cuisine_dict.items():
    present = recipe_cuisines or ()
    _cuisines[recipe_id] = {
        cuisine: (1 if cuisine in present else 0) for cuisine in cuisine_list
    }
In [64]:
# Second API call: settings for fetching additional features of each recipe.
# key_id is the credential query string appended to every detail request.
# NOTE(review): these are hardcoded credentials -- consider reading them from
# the environment before sharing this notebook.
key_id= '_app_id=79663a75&_app_key=02b233108f476f3110e0f65437c4d6dd'
# NOTE: this rebinds `url`, replacing the search endpoint assigned earlier in
# the notebook with the per-recipe endpoint.
url='http://api.yummly.com/v1/api/recipe/'
In [65]:
# Retrieve other features for all recipes.
def get_recipe(_id):
    """Fetch the detail record of a single recipe.

    Parameters
    ----------
    _id : str
        Recipe id; it is appended to the module-level ``url`` and followed by
        the ``key_id`` credential query string.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status, so that an error payload is
        never stored as if it were a recipe.
    requests.Timeout
        If the server does not answer within 30 seconds.
    """
    # requests has no default timeout; set one so a stalled call cannot hang
    # the whole fetch loop.
    response = requests.get(url + _id + '?' + key_id, timeout=30)
    response.raise_for_status()
    return response.json()


# Append one by one so the records fetched so far survive a failing request.
recipes = []
for _id in BB2_ids:
    recipes.append(get_recipe(_id))
In [62]:
# NOTE: `response` here is the module-level variable assigned by the search
# request; the `response` inside get_recipe() is local to that function, so
# this value does not reflect the per-recipe detail calls.
response.status_code
Out[62]:
In [66]:
# Sanity check: number of detail records fetched and the keys of one of them.
print(len(recipes))
print(recipes[1].keys())
In [67]:
# Collect, for every recipe, a dictionary with the selected detail attributes.
recipe_details = []
for recipe in recipes:
    recipe_details.append({
        'id': recipe['id'],
        'ingredientCount': len(recipe['ingredientLines']),
        'numberOfServings': recipe['numberOfServings'],
        # Prep and cook times are optional in the payload: None when absent.
        'prepTimeInSeconds': recipe.get('prepTimeInSeconds'),
        'cookTimeInSeconds': recipe.get('cookTimeInSeconds'),
        'totalTimeInSeconds': recipe['totalTimeInSeconds'],
    })
In [71]:
# Create the dataframes, move the identifying columns to the front and save
# each table as a CSV file.
def _columns_first(frame, leading):
    """Return `frame` with the `leading` columns first; the other columns keep their order."""
    remaining = [col for col in frame.columns if col not in leading]
    # .loc replaces the .ix indexer, which has been removed from pandas.
    return frame.loc[:, list(leading) + remaining]


# Every file is written with an explicit utf-8 encoding so that non-ASCII
# text is handled the same way for all tables.
df_main = pd.DataFrame(main_list)
df_main.to_csv('BB_main_2.csv', encoding='utf-8')

# Missing indicators mean "not present", hence the fill with 0.
df_ingredients = pd.DataFrame(ingredients_list).fillna(0)
df_ingredients = _columns_first(df_ingredients, ['id', 'course'])
df_ingredients.to_csv('BB_ingredients_2.csv', encoding='utf-8')

df_attributes = pd.DataFrame(attributes_list).fillna(0)
df_attributes = _columns_first(df_attributes, ['id'])
df_attributes.to_csv('BB_attributes_2.csv', encoding='utf-8')

# flavors_dict / _cuisines are keyed by recipe id: transpose so each recipe is
# a row, then turn the id index into a regular column.
df_flavors = pd.DataFrame(flavors_dict).transpose().reset_index(level=0)
df_flavors.to_csv('BB_flavors_2.csv', encoding='utf-8')

df_cuisines = pd.DataFrame(_cuisines).transpose().reset_index(level=0)
df_cuisines.to_csv('BB_cuisines_2.csv', encoding='utf-8')

df_details = _columns_first(pd.DataFrame(recipe_details), ['id'])
df_details.to_csv('BB_details_2.csv', encoding='utf-8')
In [27]:
# Print every ingredient column whose name contains 'egg'.
for column_name in df_ingredients.columns:
    if 'egg' not in column_name:
        continue
    print(column_name)
In [29]:
# Ingredient frequency across recipes, most common first.
# 'id' and 'course' hold strings, not indicators, so they are dropped before
# summing; otherwise the totals would mix text with counts.
df_ingredients.drop(['id', 'course'], axis=1).sum(axis=0).sort_values(ascending=False)
Out[29]:
In [ ]: