notebook.community

Edit and run



In [2]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import json
import string
from fractions import Fraction
from collections import defaultdict
import apriori



In [3]:

    
f = open('recipeitems-latest.json', 'r')
lines = f.readlines()
f.close()
recipes = []
for line in lines:
    recipe = json.loads(line)
    recipes.append(recipe)



In [4]:

    
allingredients = []
allnames = []
for recipe in recipes:
    name = recipe['name']
    ingredients = recipe['ingredients'].split('\n')
    allingredients.extend(ingredients)
    for i in ingredients:
        allnames.append(name)



In [5]:

    
empty_words = ['freerange', 'organic', 'ripe', 'freshly', 'dried', 'fresh', 'thin', 'thick', 'chopped',
               'ground', 'cooked', 'kosher', 'cut', 'into', 'wedges', 'diced', 'trimmed', 'grated',
               'halved', 'lengthwise', 'and', 'pieces', 'medium', 'peeled', 'grated', 'thinkly', 'sliced',
              'quartered', 'good-quality', 'good', 'quality']



In [6]:

    
iwords = ['cup', 'teaspoon', 'tablespoons', 'cups', 'whole', 'tablespoon', 'teaspoons', 'ounces', 'tbsp', 'tsp',
 'large', 'pound', 'can', 'pinch', 'pounds', 'tbs', 'medium', 'small', 'slices', 'package', 'bunch', 'g',
 'dash', 'lb', 'oz', 'stick', 'cans', 'fl', 'sprigs', 'stalks', 'box', 'ounce', 'pieces', 'fluid', 'sticks',
 'pint', 'bag', 'dashes', 'jar', 'quart', 'pinches', 'ml', 'packages', 'quarts', 'grams', 'sprig',
 'drops', 'tb', 'stalk', 'bottle', 'handful', 'c', 'strips', 'container', 'bunches', 'cubes', 'slice', 'heads',
 'handfuls', 'piece', 'milliliters', 't', 'kg', 'pints', 'lbs', 'cube', 'gallon', 'bulb', 'block', 'boxes',
 'packet', 'jars']



In [7]:

    
confirmed_measurement_terms = defaultdict(int)
filter_map = defaultdict(int)

for word in empty_words:
    filter_map[word] = 1

for word in iwords:
    confirmed_measurement_terms[word] = 1



In [8]:

    
def get_numeric(x):
    x = x.replace('~', '')
    x = x.replace('?', '0')
    try:
        amt = float(sum(Fraction(s) for s in x.split()))
        return amt
    except ValueError:
        arr = x.split('-')
        if len(arr)==2:
            try:
                amt1 = float(sum(Fraction(s) for s in arr[0].split()))
                amt2 = float(sum(Fraction(s) for s in arr[1].split()))
                return amt1 + amt2
            except ValueError:
                return -1
        return -1



In [9]:

    
def remove_parenthesis_content(x):
    i = x.find('(')
    if i == -1:
        return x
    j = x.find(')', i)
    if j == -1:
        return x
    return x[0:i] + x[(j+1):len(x)]



In [10]:

    
unmatched = []
raw = []
names = []
measures = defaultdict(int)
pingredients = []
measurel = []
amts = []

for ingredient, name in zip(allingredients, allnames):
    oingredient = ingredient
    i = 0
    ingredient = ingredient.replace(',', ' ').encode('ascii', 'replace')
    ingredient = remove_parenthesis_content(ingredient)
    terms = ingredient.lower().strip().replace('\t', ' ').replace('/', ' ').split(' ')
    terms = [term.strip() for term in terms]
    try:
        terms.remove('')
    except ValueError:
        i=0
    tamt = 0
    amt = 0
    while i < len(terms) and amt >= 0:
        amt = get_numeric(terms[i])
        if amt != -1:
            tamt += amt
            i += 1
    measure = ''
    c = 1
    ing = ''
    while i < len(terms) and c > 0:
        c = confirmed_measurement_terms[terms[i]]
        if c > 0:
            measure += terms[i] + ' '
            i += 1
    ing = ''
    j = i
    while i < len(terms):
        c2 = filter_map[terms[i]]
        if c2 == 0:
            if len(ing) > 0:
                ing += ' ' + terms[i]
            else:
                ing += terms[i]
        i+=1
    if tamt == 0:
        tamt = 1.0
    if ' '.join(terms[0:j]) != ing:
        #print tamt, '|', measure, '|', ing, '|', ingredient
        amts.append(tamt)
        pingredients.append(ing)
        measurel.append(measure)
        raw.append(oingredient)
        names.append(name)

df = pd.DataFrame({'raw': raw, 'name': names, 'ingredient': pingredients, 'amount': amts, 'measurement': measurel})
df['ingredient'] = df['ingredient'].apply(lambda x: x.encode('ascii', 'replace'))
df.head()









    Out[10]:






  
    
      
      amount
      ingredient
      measurement
      name
      raw
    
  
  
    
      0
      1
      biscuits
      
      Drop Biscuits and Sausage Gravy
      Biscuits
    
    
      1
      3
      all-purpose flour
      cups
      Drop Biscuits and Sausage Gravy
      3 cups All-purpose Flour
    
    
      2
      2
      baking powder
      tablespoons
      Drop Biscuits and Sausage Gravy
      2 Tablespoons Baking Powder
    
    
      3
      3
      salt
      teaspoon
      Drop Biscuits and Sausage Gravy
      1/2 teaspoon Salt
    
    
      4
      4
      cold butter
      stick
      Drop Biscuits and Sausage Gravy
      1-1/2 stick (3/4 Cup) Cold Butter, Cut Into Pi...



In [11]:

    
df.to_csv('recipes.csv', sep='\t', index=False, encoding='utf-8')



In [12]:

    
gp = pd.DataFrame(df.groupby(['ingredient'])['amount'].count())
gp.sort('amount', inplace=True, ascending=0)
plt.plot(gp.amount)
plt.ylim([0, 10])









    Out[12]:





(0, 10)



In [13]:

    
gp.amount.mean()









    Out[13]:





5.8387741407769127



In [ ]:

	amount	ingredient	measurement	name	raw
0	1	biscuits		Drop Biscuits and Sausage Gravy	Biscuits
1	3	all-purpose flour	cups	Drop Biscuits and Sausage Gravy	3 cups All-purpose Flour
2	2	baking powder	tablespoons	Drop Biscuits and Sausage Gravy	2 Tablespoons Baking Powder
3	3	salt	teaspoon	Drop Biscuits and Sausage Gravy	1/2 teaspoon Salt
4	4	cold butter	stick	Drop Biscuits and Sausage Gravy	1-1/2 stick (3/4 Cup) Cold Butter, Cut Into Pi...