In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import json
import string
from fractions import Fraction
from collections import defaultdict
import apriori
In [3]:
# Load the recipe dump, which holds one JSON document per line.
# The context manager closes the file even if reading raises.
with open('recipeitems-latest.json', 'r') as f:
    lines = f.readlines()

# Whitespace-only lines are skipped because json.loads() rejects them.
recipes = [json.loads(line) for line in lines if line.strip()]
In [4]:
# Flatten the recipes into two parallel lists: every ingredient line, and the
# name of the recipe that line belongs to (repeated once per ingredient line).
allingredients = []
allnames = []
for recipe in recipes:
    name = recipe['name']
    ingredients = recipe['ingredients'].split('\n')
    allingredients += ingredients
    allnames += [name] * len(ingredients)
In [5]:
# Descriptive words that carry no ingredient identity; they are dropped when an
# ingredient name is assembled. 'thinly' is the intended spelling; the original
# misspelling 'thinkly' is kept as well so text containing it is still filtered.
empty_words = ['freerange', 'organic', 'ripe', 'freshly', 'dried', 'fresh', 'thin', 'thick', 'chopped',
               'ground', 'cooked', 'kosher', 'cut', 'into', 'wedges', 'diced', 'trimmed', 'grated',
               'halved', 'lengthwise', 'and', 'pieces', 'medium', 'peeled', 'thinly', 'thinkly', 'sliced',
               'quartered', 'good-quality', 'good', 'quality']
In [6]:
# Words recognised as units, containers or size qualifiers when they directly
# follow the numeric amount of an ingredient line.
iwords = [
    'cup', 'teaspoon', 'tablespoons', 'cups', 'whole', 'tablespoon',
    'teaspoons', 'ounces', 'tbsp', 'tsp', 'large', 'pound',
    'can', 'pinch', 'pounds', 'tbs', 'medium', 'small',
    'slices', 'package', 'bunch', 'g', 'dash', 'lb',
    'oz', 'stick', 'cans', 'fl', 'sprigs', 'stalks',
    'box', 'ounce', 'pieces', 'fluid', 'sticks', 'pint',
    'bag', 'dashes', 'jar', 'quart', 'pinches', 'ml',
    'packages', 'quarts', 'grams', 'sprig', 'drops', 'tb',
    'stalk', 'bottle', 'handful', 'c', 'strips', 'container',
    'bunches', 'cubes', 'slice', 'heads', 'handfuls', 'piece',
    'milliliters', 't', 'kg', 'pints', 'lbs', 'cube',
    'gallon', 'bulb', 'block', 'boxes', 'packet', 'jars',
]
In [7]:
# Lookup tables keyed by word: a listed word maps to 1, and because both are
# defaultdict(int) any word that is not listed reads back as 0.
confirmed_measurement_terms = defaultdict(int, dict.fromkeys(iwords, 1))
filter_map = defaultdict(int, dict.fromkeys(empty_words, 1))
In [8]:
def _sum_fractions(text):
    """Add up the whitespace-separated numbers/fractions in text as a float."""
    return float(sum(Fraction(part) for part in text.split()))


def get_numeric(x):
    """Convert a quantity token to a float, or return -1 when it is not numeric.

    Before parsing, '~' is removed and '?' is read as the digit 0.
    Accepted forms:
      * one or more whitespace-separated numbers or fractions, which are
        summed ('1 1/2' -> 1.5, '3/4' -> 0.75); an empty string yields 0.0;
      * exactly two such groups joined by a single '-', whose values are
        added together ('1-1/2' -> 1.5, '1-2' -> 3.0).

    Anything else, including a fraction with a zero denominator, returns -1.
    """
    x = x.replace('~', '').replace('?', '0')
    try:
        return _sum_fractions(x)
    except (ValueError, ZeroDivisionError):
        # Not a plain number: fall through and try the "a-b" form.
        pass

    halves = x.split('-')
    if len(halves) != 2:
        return -1
    try:
        return _sum_fractions(halves[0]) + _sum_fractions(halves[1])
    except (ValueError, ZeroDivisionError):
        # Fraction raises ZeroDivisionError (not ValueError) for 'n/0'.
        return -1
In [9]:
def remove_parenthesis_content(x):
    """Return x with every '(...)' group removed, parentheses included.

    The string is scanned left to right; each '(' is paired with the next ')'
    after it and the span between them is dropped. A '(' with no ')' after it
    stops the scan, and the rest of the string is kept unchanged. Text around
    a removed group is not re-spaced, so 'a (b) c' becomes 'a  c'.
    """
    kept = []
    pos = 0
    while True:
        start = x.find('(', pos)
        if start == -1:
            break
        end = x.find(')', start)
        if end == -1:
            # Unbalanced '(': leave the remainder exactly as it is.
            break
        kept.append(x[pos:start])
        pos = end + 1
    kept.append(x[pos:])
    return ''.join(kept)
In [10]:
def _split_terms(text):
    """Split a cleaned, lower-case ingredient line into non-empty tokens.

    Tokens are separated by any whitespace. A token that get_numeric() accepts
    (for example '1/2') is kept whole so fractions keep their value; every
    other token is additionally split on '/', so 'and/or' yields 'and', 'or'.
    """
    terms = []
    for token in text.split():
        try:
            is_number = get_numeric(token) != -1
        except ZeroDivisionError:
            # A token such as '1/0' is not a usable quantity; treat it as text.
            is_number = False
        if is_number:
            terms.append(token)
        else:
            terms.extend(part for part in token.split('/') if part)
    return terms


# Parse every ingredient line into (amount, measurement, ingredient name).
# The lists below are parallel and become the columns of `df`.
unmatched = []
raw = []
names = []
measures = defaultdict(int)
pingredients = []
measurel = []
amts = []
for ingredient, name in zip(allingredients, allnames):
    oingredient = ingredient
    # Commas become spaces and every non-ASCII character becomes '?'. Decoding
    # back to text keeps the str operations below valid on Python 2 and 3.
    ingredient = ingredient.replace(',', ' ').encode('ascii', 'replace').decode('ascii')
    ingredient = remove_parenthesis_content(ingredient)
    terms = _split_terms(ingredient.lower())

    # 1) Leading numeric tokens are summed into the amount. The scan stops at
    #    the first token get_numeric() rejects (-1) or after a negative value.
    i = 0
    tamt = 0
    amt = 0
    while i < len(terms) and amt >= 0:
        amt = get_numeric(terms[i])
        if amt != -1:
            tamt += amt
            i += 1

    # 2) Consecutive measurement words follow; each is appended with a trailing
    #    space. .get() is used so unknown words are not inserted into the table.
    measure = ''
    while i < len(terms) and confirmed_measurement_terms.get(terms[i], 0) > 0:
        measure += terms[i] + ' '
        i += 1

    # 3) The remaining tokens, minus the filler words, form the ingredient name.
    j = i
    ing = ' '.join(term for term in terms[j:] if filter_map.get(term, 0) == 0)

    # A missing (or zero) amount is recorded as 1.
    if tamt == 0:
        tamt = 1.0

    # Skip lines whose amount/measurement prefix text equals the ingredient
    # text (in particular empty lines, where both are '').
    if ' '.join(terms[0:j]) != ing:
        amts.append(tamt)
        pingredients.append(ing)
        measurel.append(measure)
        raw.append(oingredient)
        names.append(name)

# `ing` is already pure ASCII text at this point, so no re-encoding is applied
# to the 'ingredient' column.
df = pd.DataFrame({'raw': raw, 'name': names, 'ingredient': pingredients, 'amount': amts, 'measurement': measurel})
df.head()
Out[10]:
In [11]:
# Write the parsed table to disk as UTF-8, tab-separated, without the row index.
df.to_csv(
    'recipes.csv',
    index=False,
    encoding='utf-8',
    sep='\t',
)
In [12]:
# Number of parsed rows per distinct ingredient name, most frequent first.
gp = pd.DataFrame(df.groupby(['ingredient'])['amount'].count())
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
# Rebinding (instead of inplace=True) keeps this cell safe to re-run.
gp = gp.sort_values('amount', ascending=False)

# Plot the counts by rank (plain array) so the string index is not used as the
# x-coordinate; the y-axis is clipped to 0-10 to show the long tail.
plt.plot(gp['amount'].values)
plt.xlabel('ingredient rank (most frequent first)')
plt.ylabel('rows per ingredient')
plt.ylim([0, 10])
Out[12]:
In [13]:
# Mean number of rows per distinct ingredient name.
gp['amount'].mean()
Out[13]:
In [ ]: