In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json, urllib2
import os, glob, sys, time
import nltk,re
import itertools
from nltk.corpus import stopwords
%matplotlib inline
In [3]:
# Read 70k recipes from allrecipes.com into a list of dictionaries
def read_data():
    folder = 'recipes/'
    files = pd.read_csv('all_files.txt', header=None)[0].values
    recipes = []
    for k, filename in enumerate(files, 1):
        with open(folder + filename, 'r') as f:
            recipes.append(json.load(f))
        if k % 10000 == 0:  # progress indicator
            print k
    return recipes
# Remove and replace some noisy symbols
def clean_string(s):
    sep_symbols = [';']
    for ss in sep_symbols:  # treat separators as sentence boundaries
        s = s.replace(ss, '.')
    for i in range(10):  # collapse runs of periods
        s = s.replace('..', '.')
    bad_symbols = [')', '(', '!', '-']
    for bs in bad_symbols:
        s = s.replace(bs, ' ')
    s = s.replace(',', ' , ')
    s = s.replace('  ', ' ')  # collapse double spaces
    s = s.replace('. ', '.')
    return s
# Raw direction text -> list of single directions
def get_clean_directions(recipe):
    raw = recipe['directions']
    direction = ''
    for dd in raw:
        direction = direction + dd + '.'
    direction = clean_string(direction).lower()
    return direction.split('.')
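A quick check of the cleaning pipeline on a hand-made recipe dict (a sketch; real dicts come from read_data and carry more fields):
In [ ]:
toy_recipe = {'directions': ['Preheat oven to 350 degrees F (175 degrees C).',
                             'Mix flour, sugar; stir well']}
print get_clean_directions(toy_recipe)
# roughly: ['preheat oven to 350 degrees f 175 degrees c ', 'mix flour , sugar', 'stir well', '']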
In [4]:
def bright(l, idx1, idx2, idx3):
    # wrap each word in an ANSI color according to its label mask
    l2 = []
    for i in range(len(l)):
        if idx1[i]:
            l2.append(yellow(l[i]))
        elif idx2[i]:
            l2.append(blue(l[i]))
        elif idx3[i]:
            l2.append(purple(l[i]))
        else:
            l2.append(l[i])
    return ' '.join(l2)
def purple(string):
    return '\x1b[1;45m' + string + '\x1b[0m'
def yellow(string):
    return '\x1b[1;43m' + string + '\x1b[0m'
def blue(string):
    return '\x1b[1;46m' + string + '\x1b[0m'
def highlight_recipe(recipes, recipe_id):
    dirs = get_clean_directions(recipes[recipe_id])
    ingr_words = list(set(ingr_words_func(recipes[recipe_id]['ingr'])))
    for d in dirs:
        if len(d) > 0:
            d_words = np.array(d.split(' '))
            ingr_idx, measure_idx = np.array(define_ingr_measure(d_words, ingr_words))
            action_idx = np.array([(word in actions_set) for word in d_words]).astype(np.int32)
            colored_string = bright(d_words, action_idx, ingr_idx, measure_idx)
            print colored_string
            print create_instructions(d)
            print '_____________________________________'
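The highlighting helpers can be sanity-checked without a full recipe; a minimal sketch with hand-set label masks (hypothetical words, not corpus output):
In [ ]:
words = ['stir', 'the', 'flour', 'into', '1', 'cup', 'butter']
action_idx  = [1, 0, 0, 0, 0, 0, 0]  # actions -> yellow
ingr_idx    = [0, 0, 1, 0, 0, 0, 1]  # ingredients -> blue
measure_idx = [0, 0, 0, 0, 0, 1, 0]  # measures -> purple
print bright(words, action_idx, ingr_idx, measure_idx)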
In [5]:
# Clean and read ingredients and measures
def read_measure_list(path):
    measures = pd.read_csv(path, header=None)
    return measures[0].values
def remove_stopwords(text_list):
    stop = stopwords.words('english')
    return [w for w in text_list if w.lower() not in stop]
def remove_digits(text_list):
    content = []
    for w in text_list:
        content.append(re.sub('[./]', ' ', w).split())
    content = list(itertools.chain.from_iterable(content))
    return [w for w in content if not w.isdigit()]
def get_clean_text(text):
    return (text.replace('(', '').replace(')', '').replace(',', '')
                .replace('-', ' ').replace('/', ' ').replace(';', ' ')
                .replace('  ', ' '))
def ingr_words_func(ingr_list):
    # every content word used in the ingredient lines of one recipe
    recipe_words = []
    for recipe in ingr_list:
        recipe = get_clean_text(recipe)
        recipe_words.append(recipe.lower().split())
    recipe_words = list(itertools.chain.from_iterable(recipe_words))
    return remove_stopwords(remove_digits(recipe_words))
# Label direction words as ingredients or measures
def define_ingr_measure(dirs_words, ingr_words):
    if_ingr = [0] * len(dirs_words)
    if_measure = [0] * len(dirs_words)
    for i, dirs_word in enumerate(dirs_words):
        for ingrs in ingr_words:
            if dirs_word == ingrs:
                if dirs_word not in measure_list:
                    if_ingr[i] = 1
                else:
                    if_measure[i] = 1
    return if_ingr, if_measure
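A toy call shows the two masks (a sketch; it assumes measure_list is already loaded and contains 'cup'):
In [ ]:
d_words = ['stir', 'in', '1', 'cup', 'of', 'flour']
ingr_words = ['flour', 'cup', 'butter']  # as ingr_words_func would produce
print define_ingr_measure(d_words, ingr_words)
# expected: ([0, 0, 0, 0, 0, 1], [0, 0, 0, 1, 0, 0])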
In [6]:
from practnlptools.tools import Annotator
annotator = Annotator()
def create_instructions(phrase, flag=1):
    # semantic role labels: V = verb, A1 = object, A2 = target
    annotated = annotator.getAnnotations(phrase)['srl']
    annotated_steps = []
    for i in xrange(len(annotated)):
        annotated_step = dict()
        annotated_step['action'] = annotated[i]['V']
        if 'A1' in annotated[i]:
            annotated_step['object'] = annotated[i]['A1']
        if 'A2' in annotated[i]:
            annotated_step['target'] = annotated[i]['A2']
        annotated_steps.append(annotated_step)
    # imperative directions have no subject, which often confuses the SRL;
    # retry once with a dummy subject prepended
    if len(annotated_steps) == 0 and flag:
        return create_instructions('they ' + phrase, 0)
    return annotated_steps
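For a typical imperative direction the result looks roughly like this (illustrative; the exact spans depend on the SENNA model behind practNLPTools):
In [ ]:
print create_instructions('add the flour to the butter mixture')
# e.g. [{'action': 'add', 'object': 'the flour', 'target': 'to the butter mixture'}]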
In [274]:
%time recipes = read_data()
actions = pd.read_csv('actions_dict_sorted.txt', sep=' ')  # verb -> frequency table
measure_list = read_measure_list('measure_list.txt')
In [8]:
actions_set = set(actions[:100].word.values)  # the 100 most frequent action verbs
In [9]:
recipes[6710]['recipe_id']
Out[9]:
In [10]:
recipes[6718]['ingr']
Out[10]:
In [11]:
highlight_recipe(recipes,6718)
In [12]:
df=pd.read_csv('labeled_recipes/l13394.txt')
df
Out[12]:
In [101]:
# Rename the hand-labeled files so each one matches its recipe_id
directory = "labeled_recipes"
for filename in os.listdir(directory)[2:]:
    idx = int(filename.split('.')[0][1:])
    new_filename = os.path.join(directory, 'l' + str(recipes[idx]['recipe_id']) + '.txt')
    os.rename(os.path.join(directory, filename), new_filename)
In [16]:
features = df.values.tolist()
features
Out[16]:
In [14]:
recipe_id = 7777
dirs = get_clean_directions(recipes[recipe_id])
ingr_words = list(set(ingr_words_func(recipes[recipe_id]['ingr'])))
for d in dirs:
    if len(d) > 0:
        print d
        print create_instructions(d)
        print '____________________________________________'
In [13]:
from nltk.corpus import wordnet as wn
def is_known(word):
    """Return True if this word exists in WordNet as a verb
    (or at least in nltk.corpus.stopwords)."""
    if word.lower() in stopwords.words('english'):
        return True
    return len(wn.synsets(word, pos=wn.VERB)) > 0
In [19]:
# Keep only action words that WordNet recognizes as verbs
actions_dict_upd = dict()
actions_dict = dict(actions.values.tolist())
for word in actions_dict.keys():
    if is_known(word):
        actions_dict_upd[word] = actions_dict[word]
In [21]:
import operator
sorted_x = sorted(actions_dict_upd.items(), key=operator.itemgetter(1),reverse=True)
In [251]:
# Save verbs occurring more than 40 times
with open('action_dict_wordnet.txt', 'w') as f:
    for (x, y) in sorted_x:
        if y > 40:
            f.write(x + ' ' + str(y) + '\n')
In [937]:
from __future__ import division
import operator
import string
from scipy.stats import binom
from nltk.tokenize import RegexpTokenizer
def isValid(word):
    # drop hashtags and any token containing punctuation
    if word.startswith("#"):
        return False
    vword = word.translate(string.maketrans("", ""), string.punctuation)
    return len(vword) == len(word)
def llr(c1, c2, c12, n):
    # log-likelihood ratio test for the collocation (w1, w2)
    # H0: independence, p(w1,w2) = p(w1,~w2) = c2/N
    p0 = c2 / n
    # H1: dependence, p(w1,w2) = c12/N
    p10 = c12 / n
    # H1: p(~w1,w2) = (c2 - c12)/N
    p11 = (c2 - c12) / n
    # binomial log-probabilities of the observed counts under each hypothesis
    # H0: b(c12; c1, p0), b(c2-c12; N-c1, p0)
    # H1: b(c12; c1, p10), b(c2-c12; N-c1, p11)
    probs = np.matrix([
        [binom(c1, p0).logpmf(c12), binom(n - c1, p0).logpmf(c2 - c12)],
        [binom(c1, p10).logpmf(c12), binom(n - c1, p11).logpmf(c2 - c12)]])
    # LLR = log p(data | H1) - log p(data | H0)
    return np.sum(probs[1, :]) - np.sum(probs[0, :])
def isLikelyNGram(ngram, phrases):
    # an n-gram qualifies only if its (n-1)-gram prefix already scored as a phrase
    if len(ngram) == 2:
        return True
    return ngram[:-1] in phrases
def main():
    # accumulate tokenized directions and unigram frequencies
    lines = []
    unigramFD = nltk.FreqDist()
    tokenizer = RegexpTokenizer(r'\w+')
    for line in recipes[:10]:
        words = tokenizer.tokenize(''.join(line['directions']).strip('.').lower())
        for x in words:
            unigramFD[x] += 1
        lines.append(words)
    # identify likely phrases using a multi-pass algorithm based on the
    # LLR approach described in the book "Building Search Applications:
    # Lucene, LingPipe and GATE", except that n-gram collocations beyond
    # 2 are treated as an (n-1)-gram plus a unigram
    phrases = nltk.defaultdict(float)
    prevGramFD = None
    for i in range(2, 4):
        ngramFD = nltk.FreqDist()
        for words in lines:
            nextGrams = nltk.skipgrams(words, i, i)
            nextGrams = filter(lambda x: isLikelyNGram(x, phrases), nextGrams)
            for x in nextGrams:
                ngramFD[x] += 1
        for k, v in ngramFD.iteritems():
            if 1 < v < 5:
                c1 = unigramFD[k[0]] if prevGramFD is None else prevGramFD[k[:-1]]
                c2 = unigramFD[k[1]] if prevGramFD is None else unigramFD[k[len(k) - 1]]
                c12 = ngramFD[k]
                n = unigramFD.N() if prevGramFD is None else prevGramFD.N()
                phrases[k] = llr(c1, c2, c12, n)
        # only keep n-grams where LLR > 0, i.e. P(H1) > P(H0)
        likelyPhrases = nltk.defaultdict(float)
        likelyPhrases.update([(k, v) for (k, v) in phrases.iteritems()
                              if len(k) == i and v > 0])
        print "==== #-grams = %d ====" % (i)
        sortedPhrases = sorted(likelyPhrases.items(),
                               key=operator.itemgetter(1), reverse=True)
        for k, v in sortedPhrases:
            print k, v
        prevGramFD = ngramFD
if __name__ == "__main__":
    main()
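A quick numeric check of llr with made-up counts (hypothetical numbers, not taken from the corpus): a pair that co-occurs far more often than independence predicts should score positive:
In [ ]:
print llr(c1=50, c2=40, c12=30, n=10000)
# positive -> the dependent model (a real phrase) beats independence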
In [896]:
from nltk import ngrams
def word_grams(words, min=1, max=4):
    # all n-grams for n in [min, max), joined into strings
    s = []
    for n in range(min, max):
        for ngram in ngrams(words, n):
            s.append(' '.join(str(i) for i in ngram))
    return s
print word_grams('one two three four'.split(' '))
In [922]:
def everygrams(sequence, min_len=1, max_len=-1):
    """
    Return all ngrams of the sequence for n from min_len to max_len;
    max_len=-1 means up to len(sequence).
    >>> list(everygrams('a b c'.split()))
    [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c'), ('a', 'b', 'c')]
    """
    if max_len == -1:
        max_len = len(sequence)
    for n in range(min_len, max_len + 1):
        for ng in nltk.ngrams(sequence, n):
            yield ng
doc1 = "Singularity is still a confusing phenomenon in physics".split()
doc2 = "Quantum theory still wins over String theory".split()
_vec1 = list(everygrams(doc1, min_len=2, max_len=2))
print _vec1
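With min_len and max_len now honored, a small check (expected output shown as a comment):
In [ ]:
print list(everygrams('a b c'.split(), min_len=2, max_len=2))
# [('a', 'b'), ('b', 'c')]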
In [98]:
from itertools import combinations
from nltk.tokenize import RegexpTokenizer
def skipgrams(sequence, n, k):
    # n-grams that may skip up to k tokens after the head word
    for ngram in nltk.ngrams(sequence, n + k, pad_right=True):
        head = ngram[:1]
        tail = ngram[1:]
        for skip_tail in combinations(tail, n - 1):
            if skip_tail[-1] is None:
                continue
            yield head + skip_tail
sent = "Insurgents killed in ongoing fighting".split()
list(skipgrams(sent, 2, 3))
Out[98]:
[('Insurgents', 'killed'), ('Insurgents', 'in'), ('Insurgents', 'ongoing'), ('Insurgents', 'fighting'), ('killed', 'in'), ('killed', 'ongoing'), ('killed', 'fighting'), ('in', 'ongoing'), ('in', 'fighting'), ('ongoing', 'fighting')]
In [125]:
def bigram_sent(sent):
    # skip-bigrams of one direction, plus its leading word (the verb)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sent.lower())
    filtered_words = [word for word in tokens if word not in stopwords.words('english')]
    bigrams_list = list(skipgrams(filtered_words, 2, 10))
    return bigrams_list, tokens[0]
In [126]:
def bigrams_count(recipes):
    # skip-bigrams over the direction text of many recipes
    recipe_corpus = []
    for text in recipes:
        text_join = ''.join(text['directions'])
        result = ''.join(i for i in text_join if not i.isdigit())  # strip digits
        recipe_corpus.append(result)
    all_bigrams = []
    tokenizer = RegexpTokenizer(r'\w+')
    stop = set(stopwords.words('english'))
    for merge in recipe_corpus:
        tokens = tokenizer.tokenize(merge.lower())
        tokens = [word for word in tokens if word not in stop]
        for (x, y) in skipgrams(tokens, 2, 10):
            all_bigrams.append((x, y))
    return all_bigrams
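A toy run on two hand-made recipes shows the shape of the result and how it gets counted later:
In [ ]:
from collections import Counter
toy = [{'directions': ['Mix the flour with butter.']},
       {'directions': ['Mix the flour and sugar.']}]
print Counter(bigrams_count(toy)).most_common(3)
# ('mix', 'flour') should top the list with count 2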
In [127]:
pairs, verb = bigram_sent(' '.join(sent))  # sent is still the token list from the skipgrams demo
print pairs
In [305]:
sent = ['bake in 350 degrees F (175 degrees C) oven for 15 minutes or until golden brown']
print sent
In [129]:
bigrams_with_verb
Out[129]:
In [276]:
all_recipes = bigrams_count(recipes[:10000])  # skip-bigrams from the first 10k recipes
In [306]:
from collections import Counter
all_recipes_sorted = dict(Counter(all_recipes))  # skip-bigram -> corpus frequency
verb_probas = dict()
for s in sent:
    pairs, verb = bigram_sent(s)
    print verb
    bigrams_with_verb = [pair for pair in pairs if verb in pair]
    # P(pair | verb) = count(pair in corpus) / count(verb)
    bigram_probas = dict()
    for (x, y) in bigrams_with_verb:
        if (x, y) in all_recipes_sorted:
            bigram_probas[(x, y)] = all_recipes_sorted[(x, y)] / float(actions_dict_upd[verb])
    verb_probas[verb] = bigram_probas
    verb_pr = 1 / float(actions_dict_upd[verb])
# naive product of the pair probabilities, seeded with the verb prior
proba_sent = verb_pr
print verb_probas
for key in verb_probas[verb]:
    if verb_probas[verb][key] > 0:
        proba_sent = proba_sent * verb_probas[verb][key]
        print key
print proba_sent
In [304]:
# same product, but keeping only pairs with probability above 0.1
proba_sent = verb_pr
for key in verb_probas[verb]:
    if verb_probas[verb][key] > 0.1:
        proba_sent = proba_sent * verb_probas[verb][key]
        print key
print proba_sent
print verb_probas
In [213]:
# Variant: score the pairs that do NOT contain the verb
bigrams_with_verb = []
for pair in pairs:
    if verb not in pair:
        bigrams_with_verb.append(pair)
bigram_probas = dict()
for (x, y) in bigrams_with_verb:
    if (x, y) in all_recipes_sorted:
        bigram_probas[(x, y)] = all_recipes_sorted[(x, y)] / float(410)  # 410: hard-coded verb count
verb_probas[verb] = bigram_probas
verb_pr = 1 / float(actions_dict_upd[verb])
In [214]:
verb_probas
Out[214]:
In [346]:
# For each verb, find the most probable co-occurring non-utensil word
for verb in verbs:
    scores = {}
    for (x, y) in all_recipes:
        if x == verb and y not in utensils:
            if (x, y) in scores:
                scores[(x, y)] += 1 / float(actions_dict_upd[verb])
            else:
                scores[(x, y)] = 1 / float(actions_dict_upd[verb])
    print max(scores.iteritems(), key=operator.itemgetter(1))
In [330]:
scores
Out[330]:
In [317]:
actions_dict_upd['bake']
Out[317]:
In [308]:
utensils = ['bowl', 'oven', 'pan', 'saucepan', 'heat', 'dish', 'skillet']  # equipment/location words to exclude
In [331]:
verbs = ['combine', 'mix', 'stir', 'fry', 'cool', 'boil', 'place']  # verbs to rank objects for
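The same ranking reads more directly with collections.Counter; a sketch on hypothetical toy bigrams (not corpus output), reusing the utensils list above:
In [ ]:
from collections import Counter
toy_bigrams = [('mix', 'flour'), ('mix', 'flour'), ('mix', 'bowl'), ('stir', 'sugar')]
for verb in ['mix', 'stir']:
    counts = Counter((x, y) for (x, y) in toy_bigrams
                     if x == verb and y not in utensils)
    print verb, counts.most_common(1)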