In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json, urllib2
import os, glob, sys, time
import nltk,re
import itertools
from nltk.corpus import stopwords
%matplotlib inline

Helper functions


In [3]:
#Read 70k recipes from allrecipes.com into a list of dictionaries
def read_data():
    folder='recipes/'
    files=pd.read_csv('all_files.txt',header=None)[0].values
    k=0
    recipes=[]
    for filename in files:
        #close each file handle promptly: we open ~70k files
        with open(folder+filename,'r') as f:
            recipes.append(json.load(f))
        k+=1
        if k%10000==0:
            print k
    return recipes

#Remove and replace some noisy symbols
def clean_string(s):
    sep_symbols=[';']
    for ss in sep_symbols:
        s=s.replace(ss,'.')
    #collapse runs of periods into a single sentence separator
    for i in range(10):
        s=s.replace('..','.')
    bad_symbols=[')','(','!','-']
    for bs in bad_symbols:
        s=s.replace(bs,' ')
    s=s.replace(',',' , ')
    s=s.replace('  ',' ')
    s=s.replace('. ','.')
    return s

#Raw direction text -> list of single directions
def get_clean_directions(recipe):
    raw=recipe['directions']
    direction=''.join(dd+'.' for dd in raw)
    direction=clean_string(direction).lower()
    return direction.split('.')
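
A quick sanity check of the cleaning helpers on a made-up recipe dict (illustrative input, not from the dataset):

In [ ]:
sample = {'directions': ['Preheat oven to 350 degrees F (175 degrees C); grease a pan.',
                         'Mix flour, sugar, and salt']}
#each element is one lower-cased direction; a trailing empty string may remain
print get_clean_directions(sample)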

Highlighting recipes


In [4]:
def bright(l,idx1,idx2,idx3):
    l2=[]
    for i in range(len(l)):
        if idx1[i]:
            l2.append(yellow(l[i]))
        elif idx2[i]:
            l2.append(blue(l[i]))
        elif idx3[i]:
            l2.append(purple(l[i]))
        else:
            l2.append(l[i])
    l2=' '.join(l2)
    return l2

def purple(string):
    return '\x1b[1;45m'+string+'\x1b[0m'

def yellow(string):
    return '\x1b[1;43m'+string+'\x1b[0m'

def blue(string):
    return '\x1b[1;46m'+string+'\x1b[0m'

def highlight_recipe(recipes,recipe_id):
    dirs=get_clean_directions(recipes[recipe_id])
    ingr_words=list(set(ingr_words_func(recipes[recipe_id]['ingr'])))
    for d in dirs:
        if len(d)>0:
            d_words=np.array(d.split(' '))
            ingr_idx,measure_idx=np.array(define_ingr_measure(d_words, ingr_words))
            action_idx=np.array([(word in actions_set) for word in d_words]).astype(np.int32)
            colored_string=bright(d_words,action_idx,ingr_idx,measure_idx)
            print colored_string
            print create_instructions(d)
            print '_____________________________________'

Ingredients and measures


In [5]:
#cleaning and reading ingredients and measures

def read_measure_list(path):
    measures=pd.read_csv(path,header=None)
    measure_list=measures[0].values
    return measure_list

def remove_stopwords(text_list):
    stop = stopwords.words('english')
    content = [w for w in text_list if w.lower() not in stop]
    return content

def remove_digits(text_list):
    content=[]
    for w in text_list:
        w = re.sub('[./]', ' ', w).split() 
        content.append(w)
    content = list(itertools.chain.from_iterable(content))
    content = [w for w in content if not w.isdigit()]
    return content

def get_clean_text(text):
    return text.replace('(','').replace(')','').replace(',','').replace('-',' ').replace('/',' ').replace(';',' ').replace('  ',' ')

def ingr_words_func(ingr_list):
    recipe_words=[]
    for recipe in ingr_list:
        recipe=get_clean_text(recipe)
        recipe_words.append(recipe.lower().split())
    recipe_words = list(itertools.chain.from_iterable(recipe_words))
    recipe_words=remove_stopwords(remove_digits(recipe_words))
    return recipe_words

#defining ingredients and measures (relies on the global measure_list)
def define_ingr_measure(dirs_words, ingr_words):
    if_ingr=[0]*len(dirs_words)
    if_measure=[0]*len(dirs_words)
    for i,dirs_word in enumerate(dirs_words):
        if dirs_word in ingr_words:
            if dirs_word not in measure_list:
                if_ingr[i]=1
            else:
                if_measure[i]=1
    return if_ingr,if_measure
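
A small illustration of the two helpers above, with a toy measure_list (the real one is loaded from measure_list.txt in the Main section below and overwrites this stand-in):

In [ ]:
measure_list = ['cup','tablespoon']   #toy stand-in
print ingr_words_func(['1/2 cup chopped red onion'])
#['cup', 'chopped', 'red', 'onion']
print define_ingr_measure('add 1 cup of flour'.split(), ['flour','cup'])
#([0, 0, 0, 0, 1], [0, 0, 1, 0, 0])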

Semantic role labeling part


In [6]:
from practnlptools.tools import Annotator
annotator=Annotator()
def create_instructions(phrase,flag=1):
    #SRL frames: V = verb, A1 = object acted upon, A2 = target/destination
    annotated = annotator.getAnnotations(phrase)['srl']
    annotated_steps = []
    for i in xrange(len(annotated)):
        annotated_step = dict()
        annotated_step['action'] = annotated[i]['V']
        if set(['A1','A2']).issubset(annotated[i].keys()):
            annotated_step['object'] = annotated[i]['A1']
            annotated_step['target'] = annotated[i]['A2']
        elif 'A2' in annotated[i]:
            annotated_step['target'] = annotated[i]['A2']
        elif 'A1' in annotated[i]:
            annotated_step['object'] = annotated[i]['A1']
        annotated_steps.append(annotated_step)
    #imperative directions have no subject, which often makes SRL fail,
    #so retry once with a dummy subject prepended
    if len(annotated_steps)==0 and flag:
        return create_instructions('they '+phrase,0)
    return annotated_steps
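
A quick single-phrase check (the expected shape mirrors the tagging results below; exact spans depend on the SENNA models behind practnlptools):

In [ ]:
print create_instructions('add milk and cheese')
#expected, per the tagging results below: [{'action': 'add', 'object': 'milk and cheese'}]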

Main

Loading data

In [274]:
%time recipes=read_data()
actions=pd.read_csv('actions_dict_sorted.txt',sep=' ')
measure_list=read_measure_list('measure_list.txt')


10000
20000
30000
40000
50000
60000
70000
CPU times: user 10.1 s, sys: 12.1 s, total: 22.3 s
Wall time: 39.6 s

Tagging results


In [8]:
actions_set=set(actions[:100].word.values)

In [9]:
recipes[6710]['recipe_id']


Out[9]:
13388

In [10]:
recipes[6718]['ingr']


Out[10]:
[u'1/2 cup chopped red onion',
 u'1/4 cup chopped celery',
 u'1 teaspoon garlic powder',
 u'2 tablespoons margarine',
 u'1/2 cup all-purpose flour',
 u'4 cups chicken broth',
 u'1 1/2 cups chopped baby carrots',
 u'2 potatoes, peeled and diced',
 u'1 tablespoon chopped fresh parsley',
 u'1 teaspoon freshly ground black pepper',
 u'1 pinch chopped fresh dill weed',
 u'3 cups milk',
 u'3 cups shredded Cheddar cheese']

In [11]:
highlight_recipe(recipes,6718)


using a large stock pot , saute onions , celery and garlic powder in butter , over low heat until onions are tender
[{'action': 'using', 'object': 'a large stock pot , saute onions , celery and garlic powder in butter'}]
_____________________________________
slowly stir in flour with 1 cup of chicken broth to make a rue
[{'action': 'stir', 'object': 'with 1 cup of chicken broth', 'target': 'in flour'}, {'action': 'make', 'object': 'a rue'}]
_____________________________________
stir until well mixed
[{'action': 'stir', 'target': 'until well mixed'}]
_____________________________________
add 3 cups chicken broth , carrots , potatoes , parsley , pepper and dill
[{'action': 'add', 'object': '3 cups chicken broth , carrots , potatoes , parsley , pepper and dill'}]
_____________________________________
bring to a boil and then add milk and cheese
[{'action': 'bring'}, {'action': 'add', 'object': 'milk and cheese'}]
_____________________________________
stir until cheese is melted , reduce heat to low and simmer for one hour , stirring occasionally
[{'action': 'melted', 'object': 'cheese'}, {'action': 'reduce', 'object': 'heat', 'target': 'to low'}, {'action': 'simmer'}, {'action': 'stirring'}]
_____________________________________

In [12]:
df=pd.read_csv('labeled_recipes/l13394.txt')
df


Out[12]:
    line_id  action  object         target   no                nt  description
0         0  heat    oil            skillet  1 1/2 tablespoon   0  0
1         1  add     cabbage        body     1/4                0  0
2         1  add     pork           body     4 ounce            0  0
3         2  fry     body           0        0                  0  until pork in no longer pink inside approximat...
4         3  stir    body           0        0                  0  while it is frying
5         4  add     chicken broth  body     6                  0  0
6         4  add     soy sause      body     2 tablespoon       0  0
7         4  add     ginger         body     1/2 teaspoon       0  0
8         4  bring   body           boil     0                  0  0
9         5  reduce  heat           low      0                  0  0
10        6  simmer  body           0        0                  0  10 minutes stirring occasionally
11        7  stir    onion          body     8                  0  0
12        7  add     noodles        body     4 ounce            0  0
13        8  cook    body           0        0                  0  until noodles are tender 2 to 4 minutes

In [101]:
import os
directory = "labeled_recipes"
#rename label files from list-index naming to recipe_id naming
#(the [2:] presumably skips two non-label entries in the listing)
for filename in os.listdir(directory)[2:]:
    print filename
    idx = int(filename.split('.')[0][1:])
    new_filename = os.path.join(directory,'l'+ str(recipes[idx]['recipe_id']) + '.txt')
    os.rename(os.path.join(directory,filename),new_filename)


l6665.txt
l6667.txt
l6668.txt
l6669.txt
l6670.txt
l6671.txt
l6673.txt
l6674.txt
l6675.txt
l6676.txt
l6677.txt
l6678.txt
l6679.txt
l6680.txt
l6681.txt
l6682.txt
l6683.txt
l6684.txt
l6685.txt
l6686.txt
l6688.txt
l6689.txt
l6690.txt
l6691.txt
l6692.txt
l6693.txt
l6694.txt
l6698.txt
l6699.txt
l6700.txt
l6701.txt
l6702.txt
l6703.txt
l6704.txt
l6705.txt
l6706.txt


In [16]:
features = df.values.tolist()
features


Out[16]:
[[0, 'heat', 'oil', 'pot', '6 tablespoon', 0, '0'],
 [0, 'add', 'tortillas', 'body', '8 (6 inch)', 0, '0'],
 [0, 'add', 'garlic', 'body', '6 clove', 0, '0'],
 [0, 'add', 'cilantro', 'body', '1/2 cup', 0, '0'],
 [0, 'add', 'onion', 'body', '1', 0, '0'],
 [1, 'saute', 'body', '0', '0', 0, ' for 2 to 3 minutes'],
 [2, 'stir', 'tomatoes', 'body', '1 (29 ounce)', 0, '0'],
 [2, 'bring', 'body', 'boil', '0', 0, '0'],
 [3, 'add', 'cumin', 'body', '2 tablespoon', 0, '0'],
 [3, 'add', 'chili powder', 'body', '1 tablespoon', 0, '0'],
 [3, 'add', 'bay leaves', 'body', '3', 0, '0'],
 [3, 'add', 'chicken', 'body', '6 cup', 0, '0'],
 [4, 'return', 'body', 'boil', '0', 0, '0'],
 [4, 'reduce', 'heat', 'medium', '0', 0, '0'],
 [4, 'add', 'salt', 'body', '1 teaspoon', 0, '0'],
 [4, 'add', 'cayenne', 'body', '1/2 teaspoon', 0, '0'],
 [5, 'simmer', 'body', '0', '0', 0, 'for 30 minutes'],
 [5, 'remove', 'bay leaves', '0', '0', 0, '0'],
 [5, 'stir', 'chicken', 'body', '0', 0, '0'],
 [6, 'heat', 'body', '0', '0', 0, '0'],
 [6, 'serve', 'body', '0', '0', 0, '0']]

First attempt to build a flow of commands


In [14]:
recipe_id=7777
dirs=get_clean_directions(recipes[recipe_id])
ingr_words=list(set(ingr_words_func(recipes[recipe_id]['ingr'])))
for d in dirs:
    if len(d)>0:
        print d
        print create_instructions(d)
        print '____________________________________________'


in a large pan or wok , heat oil over medium high heat
[]
____________________________________________
stir in the garlic , ginger , and chile pepper
[{'action': 'stir', 'target': 'in the garlic , ginger , and chile pepper'}]
____________________________________________
as the garlic and ginger begin to sweat , add beans , toss to mix , cover and reduce heat
[{'action': 'begin', 'object': 'the garlic and ginger'}, {'action': 'sweat', 'object': 'the garlic and ginger'}, {'action': 'add', 'object': 'beans'}, {'action': 'toss', 'object': 'to mix'}, {'action': 'mix'}, {'action': 'reduce', 'object': 'heat'}]
____________________________________________
steam for 5 8 minutes
[]
____________________________________________
add 1/4 cup of water if necessary
[{'action': 'add', 'object': '1/4 cup of water'}]
____________________________________________
remove cover , increase heat to high
[{'action': 'remove', 'object': 'cover , increase heat to high'}]
____________________________________________
add soy and oyster sauce and stir for two minutes more or until sauce thickens
[{'action': 'add', 'object': 'soy and oyster sauce'}, {'action': 'stir'}, {'action': 'thickens', 'object': 'sauce'}]
____________________________________________
serve warm
[{'action': 'serve', 'target': 'warm'}]
____________________________________________

Bayesian approach


In [13]:
from nltk.corpus import wordnet as wn
def is_known(word):
    """return True if this word "exists" in WordNet as a verb
       (or at least in nltk.corpus.stopwords)."""
    if word.lower() in nltk.corpus.stopwords.words('english'):
        return True
    return len(wn.synsets(word,pos=wn.VERB)) > 0
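
Sanity check: a common cooking verb should be known as a WordNet verb, while a nonsense token should not:

In [ ]:
print is_known('mix'), is_known('qwerty')
#True False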

In [19]:
actions_dict_upd = dict()
actions_dict = dict(actions.values.tolist())
word_actions = actions_dict.keys()
for word in word_actions:
    if is_known(word):
        actions_dict_upd[word] = actions_dict[word]

In [21]:
import operator
sorted_x = sorted(actions_dict_upd.items(), key=operator.itemgetter(1),reverse=True)

In [251]:
with open('action_dict_wordnet.txt','w') as f:
    for (x,y) in sorted_x:
        if y > 40:
            f.write(x + ' ' + str(y) + '\n')

In [937]:
from __future__ import division
import operator
import nltk
import numpy as np
from scipy.stats import binom
import string
from nltk.tokenize import RegexpTokenizer

def isValid(word):
  if word.startswith("#"):
    return False # no hashtag
  else:
    vword = word.translate(string.maketrans("", ""), string.punctuation)
    return len(vword) == len(word)

def llr(c1, c2, c12, n):
  # H0: Independence p(w1,w2) = p(w1,~w2) = c2/N
  p0 = c2 / n
  # H1: Dependence, p(w1,w2) = c12/N
  p10 = c12 / n
  # H1: p(~w1,w2) = (c2-c12)/N
  p11 = (c2 - c12) / n
  # binomial probabilities
  # H0: b(c12; c1, p0),  b(c2-c12; N-c1, p0)
  # H1: b(c12, c1, p10), b(c2-c12; N-c1, p11)
  probs = np.matrix([
    [binom(c1, p0).logpmf(c12), binom(n - c1, p0).logpmf(c2 - c12)],
    [binom(c1, p10).logpmf(c12), binom(n - c1, p11).logpmf(c2 - c12)]])
  # log-likelihood ratio: log p(H1) - log p(H0)
  return np.sum(probs[1, :]) - np.sum(probs[0, :])

def isLikelyNGram(ngram, phrases):
  if len(ngram) == 2:
    return True
  prevGram = ngram[:-1]
  return prevGram in phrases

def main():
  # accumulate words and word frequency distributions
  lines = []
  unigramFD = nltk.FreqDist()
  
  i = 0
  for line in recipes[:10]:
    i += 1
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(''.join(line['directions']).strip('.').lower())
    #words = filter(lambda x: isValid(x), words)
    for x in words:
      unigramFD[x] += 1
    lines.append(words)
    if i > 1000:
      break
  
  # identify likely phrases using a multi-pass algorithm based
  # on the LLR approach described in the Building Search Applications
  # Lucene, LingPipe and GATE book, except that we treat n-gram
  # collocations beyond 2 as n-1 gram plus a unigram.
  phrases = nltk.defaultdict(float)
  prevGramFD = None
  for i in range(2, 4):
    ngramFD = nltk.FreqDist()
    for words in lines:
      nextGrams = nltk.skipgrams(words, i,i)
      nextGrams = filter(lambda x: isLikelyNGram(x, phrases), nextGrams)
      for x in nextGrams:
        ngramFD[x] += 1
    for k, v in ngramFD.iteritems():
      if v > 1 and v < 5:
        c1 = unigramFD[k[0]] if prevGramFD is None else prevGramFD[k[:-1]]
        c2 = unigramFD[k[-1]]
        c12 = ngramFD[k]
        n = unigramFD.N() if prevGramFD is None else prevGramFD.N()
        phrases[k] = llr(c1, c2, c12, n)
    # only consider bigrams where LLR > 0, ie P(H1) > P(H0)
    likelyPhrases = nltk.defaultdict(float)
    likelyPhrases.update([(k, v) for (k, v)
      in phrases.iteritems() if len(k) == i and v > 0])
    
    print "==== #-grams = %d ====" % (i)
    sortedPhrases = sorted(likelyPhrases.items(),
      key=operator.itemgetter(1), reverse=True)
    for k, v in sortedPhrases:
        print k, v
    prevGramFD = ngramFD

if __name__ == "__main__":
  main()


==== #-grams = 2 ====
(u'350', u'175') 3.99425974384
(u'f', u'175') 3.97415922752
(u'degrees', u'175') 3.9454442042
(u'and', u'beat') 3.79038307827
(u'plastic', u'wrap') 2.99677187865
(u'375', u'190') 2.99677187865
(u'smooth', u'elastic') 2.99461902529
(u'baking', u'soda') 2.98816046521
(u'cover', u'plastic') 2.98170190512
(u'bake', u'at') 2.98170190512
(u'f', u'190') 2.97954905176
(u'with', u'plastic') 2.9666319316
(u'with', u'wrap') 2.9666319316
(u'with', u'towel') 2.9666319316
(u'let', u'stand') 2.96232622488
(u'degrees', u'190') 2.95802051815
(u'until', u'combined') 2.93433913118
(u'and', u'elastic') 2.84176643665
(u'away', u'sides') 1.99856562259
(u'pulls', u'away') 1.99856562259
(u'tester', u'inserted') 1.99856562259
(u'7', u'tablespoons') 1.99713090212
(u'after', u'addition') 1.99569618164
(u'from', u'sides') 1.99569618164
(u'each', u'addition') 1.99282674069
(u'loaves', u'are') 1.99282674069
(u'eggs', u'orange') 1.99139202022
(u'floured', u'counter') 1.99139202022
(u'eggs', u'juice') 1.99139202022
(u'pan', u'cornbread') 1.98995729974
(u'bake', u'preheated') 1.98708785879
(u'2', u'cup') 1.98565313831
(u'well', u'addition') 1.98565313831
(u'with', u'hands') 1.97704481546
(u'bowl', u'sift') 1.97561009498
(u'1', u'cup') 1.97561009498
(u'1', u'tablespoon') 1.97561009498
(u'let', u'sit') 1.97417537451
(u'let', u'covered') 1.97417537451
(u'add', u'whole') 1.97130593356
(u'a', u'counter') 1.95982816975
(u'until', u'doubles') 1.95552400832
(u'until', u'pulls') 1.95552400832
(u'until', u'tester') 1.95552400832
(u'until', u'away') 1.95552400832
(u'to', u'ingredients') 1.95408928785
(u'in', u'preheated') 1.94261152404
(u'the', u'milk') 1.93400320119
(u'the', u'sides') 1.93400320119
(u'punch', u'down') 1.49224350438
(u'golden', u'brown') 0.745993232362
(u'floured', u'surface') 0.741685981084
(u'let', u'rest') 0.724456975971
(u'minutes', u'cool') 0.711535222136
(u'a', u'surface') 0.71009947171
(u'in', u'size') 0.692870466596
(u'and', u'9') 0.644054952109
(u'and', u'rest') 0.644054952109
(u'loaf', u'pans') 0.173729702741
(u'into', u'loaves') 0.153614756752
(u'comes', u'clean') 0.0890220882527
(u'whole', u'wheat') 0.0890220882527
(u'margarine', u'hot') 0.0890220882527
(u'9', u'x') 0.0875868531713
(u'preheat', u'375') 0.0868692356307
(u'each', u'sheet') 0.08615161809
(u'large', u'mixing') 0.0854340005493
(u'baking', u'sheets') 0.0854340005493
(u'large', u'7') 0.0854340005493
(u'out', u'clean') 0.0847163830087
(u'is', u'elastic') 0.083998765468
(u'salt', u'soda') 0.083998765468
(u'oven', u'375') 0.0832811479273
(u'c', u'grease') 0.0832811479273
(u'cover', u'wrap') 0.0832811479273
(u'rise', u'again') 0.0825635303867
(u'place', u'on') 0.0825635303867
(u'bowl', u'7') 0.077540207602
(u'degrees', u'grease') 0.0753873549801
(u'a', u'towel') 0.0696464146547
(u'a', u'small') 0.0696464146547
(u'a', u'mixing') 0.0696464146547
(u'to', u'375') 0.0667759444921
(u'to', u'make') 0.0667759444921
(u'in', u'small') 0.0610350041668
(u'in', u'medium') 0.0610350041668
(u'the', u'liquid') 0.0567292989228
(u'and', u'soda') 0.0366360077842
(u'and', u'stand') 0.0366360077842
==== #-grams = 3 ====
(u'350', u'f', u'175') 3.99805997888
(u'350', u'degrees', u'175') 3.99805997888
(u'375', u'degrees', u'190') 2.99890882635
(u'375', u'f', u'190') 2.99890882635
(u'smooth', u'and', u'elastic') 2.99890882635
(u'with', u'plastic', u'wrap') 2.99890882635
(u'cover', u'plastic', u'wrap') 2.99890882635
(u'from', u'the', u'sides') 1.99951507314
(u'away', u'from', u'sides') 1.99951507314
(u'out', u'floured', u'counter') 1.99951507314
(u'pulls', u'from', u'sides') 1.99951507314
(u'well', u'after', u'addition') 1.99951507314
(u'pulls', u'away', u'sides') 1.99951507314
(u'flour', u'after', u'addition') 1.99951507314
(u'flour', u'well', u'addition') 1.99951507314
(u'until', u'tester', u'inserted') 1.99951507314
(u'out', u'a', u'counter') 1.99951507314
(u'well', u'each', u'addition') 1.99951507314
(u'away', u'the', u'sides') 1.99951507314
(u'until', u'loaves', u'are') 1.99951507314
(u'after', u'each', u'addition') 1.99951507314
(u'pulls', u'the', u'sides') 1.99951507314
(u'as', u'bread', u'easily') 1.99951507314
(u'tester', u'inserted', u'comes') 1.99951507314
(u'until', u'pulls', u'away') 1.99951507314
(u'5', u'minutes', u'creamy') 1.99903010707
(u'dissolve', u'the', u'milk') 1.99903010707
(u'bake', u'in', u'preheated') 1.99903010707
(u'dough', u'smooth', u'elastic') 0.0899724506468
(u'with', u'a', u'towel') 0.0899724506468
(u'is', u'smooth', u'elastic') 0.0899724506468
(u'comes', u'out', u'clean') 0.0899724506468
(u'oven', u'375', u'190') 0.0899724506468
(u'to', u'375', u'190') 0.0899724506468
(u'2', u'3', u'more') 0.0899724506468
(u'cover', u'a', u'towel') 0.0899724506468
(u'is', u'and', u'elastic') 0.0899724506468
(u'salt', u'baking', u'soda') 0.0899724506468
(u'flour', u'baking', u'soda') 0.0899724506468
(u'add', u'whole', u'wheat') 0.0899724506468
(u'until', u'smooth', u'elastic') 0.0899724506468
(u'f', u'190', u'grease') 0.0897299382169
(u'190', u'degrees', u'grease') 0.0897299382169
(u'dough', u'is', u'elastic') 0.0897299382169
(u'bowl', u'with', u'plastic') 0.0897299382169
(u'190', u'c', u'grease') 0.0897299382169
(u'and', u'beat', u'combined') 0.0894874257869
(u'until', u'and', u'elastic') 0.0894874257869
(u'dough', u'and', u'elastic') 0.0894874257869
(u'bowl', u'cover', u'plastic') 0.0894874257869
(u'2', u'cups', u'more') 0.0894874257869
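
A toy call to llr() in isolation, with made-up counts; a positive value means the dependence hypothesis H1 beats independence H0:

In [ ]:
#w1 seen 10 times, w2 seen 8 times, the pair 5 times, in a 1000-token corpus
print llr(10, 8, 5, 1000)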

In [896]:
from nltk import ngrams

def word_grams(words, min=1, max=4):
    s = []
    for n in range(min, max):
        for ngram in ngrams(words, n):
            s.append(' '.join(str(i) for i in ngram))
    return s

print word_grams('one two three four'.split(' '))


['one', 'two', 'three', 'four', 'one two', 'two three', 'three four', 'one two three', 'two three four']

In [922]:
def everygrams(sequence,min_len=1, max_len=-1):
    """
    This function returns all possible ngrams for n
    ranging from min_len to max_len (-1 means len(sequence)).
    >>> list(everygrams('a b c'.split()))
    [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c'), ('a', 'b', 'c')]
    """
    if max_len == -1:
        max_len = len(sequence)
    for n in range(min_len, max_len+1):
        for ng in nltk.ngrams(sequence, n):
            yield ng

doc1 = "Singularity is still a confusing phenomenon in physics".split()
doc2 = "Quantum theory still wins over String theory".split()
_vec1 = list(everygrams(doc1,min_len=2, max_len=2))
print _vec1


[('Singularity', 'is'), ('is', 'still'), ('still', 'a'), ('a', 'confusing'), ('confusing', 'phenomenon'), ('phenomenon', 'in'), ('in', 'physics')]

In [98]:
from itertools import islice, chain, combinations
from nltk.tokenize import RegexpTokenizer
import nltk
def skipgrams(sequence, n, k):
    #k-skip-n-grams: the n tokens may be spread over a window of n+k positions
    for ngram in nltk.ngrams(sequence, n + k, pad_right=True):
        head = ngram[:1]
        tail = ngram[1:]
        for skip_tail in combinations(tail, n - 1):
            if skip_tail[-1] is None:
                continue
            yield head + skip_tail

sent = "Insurgents killed in ongoing fighting".split()
list(skipgrams(sent, 2, 3))


Out[98]:
[('Insurgents', 'killed'),
 ('Insurgents', 'in'),
 ('Insurgents', 'ongoing'),
 ('Insurgents', 'fighting'),
 ('killed', 'in'),
 ('killed', 'ongoing'),
 ('killed', 'fighting'),
 ('in', 'ongoing'),
 ('in', 'fighting'),
 ('ongoing', 'fighting')]


In [125]:
def bigram_sent(sent):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sent.lower())
    filtered_words = [word for word in tokens if word not in stopwords.words('english')]
    bigrams_list = list(skipgrams(filtered_words,2,10))
    #assume the first token of a direction is its verb
    return bigrams_list,tokens[0]

In [126]:
def bigrams_count(recipes):
    recipe_corpus = []
    for text in recipes:
        text_join = ''.join(text['directions'])
        #strip digits so quantities don't pollute the bigram counts
        result = ''.join(i for i in text_join if not i.isdigit())
        recipe_corpus.append(result)
    all_bigrams = []
    tokenizer = RegexpTokenizer(r'\w+')
    for merge in recipe_corpus:
        tokens = tokenizer.tokenize(merge.lower())
        tokens = [word for word in tokens if word not in stopwords.words('english')]
        for pair in skipgrams(tokens,2,10):
            all_bigrams.append(pair)
    return all_bigrams

In [127]:
pairs,verb = bigram_sent(sent)
print pairs


[('combine', 'parmesan'), ('combine', 'cheese'), ('combine', 'pepper'), ('combine', 'garlic'), ('combine', 'powder'), ('parmesan', 'cheese'), ('parmesan', 'pepper'), ('parmesan', 'garlic'), ('parmesan', 'powder'), ('cheese', 'pepper'), ('cheese', 'garlic'), ('cheese', 'powder'), ('pepper', 'garlic'), ('pepper', 'powder'), ('garlic', 'powder')]

In [305]:
sent = ['bake in 350 degrees F (175 degrees C) oven for 15 minutes or until golden brown']
print sent


['bake in 350 degrees F (175 degrees C) oven for 15 minutes or until golden brown']

In [129]:
bigrams_with_verb


Out[129]:
[('combine', 'parmesan'),
 ('combine', 'cheese'),
 ('combine', 'pepper'),
 ('combine', 'garlic'),
 ('combine', 'powder')]

In [276]:
all_recipes = bigrams_count(recipes[:10000])

In [306]:
from collections import Counter
all_recipes_sorted = dict(Counter(all_recipes))
verb_probas = dict()
for s in sent:
    pairs,verb = bigram_sent(s)
    bigrams_with_verb = [pair for pair in pairs if verb in pair]
    print verb
    bigram_probas = dict()
    for (x,y) in bigrams_with_verb:
        if (x,y) in all_recipes_sorted:
            bigram_probas[(x,y)] = all_recipes_sorted[(x,y)]/float(actions_dict_upd[verb])
    verb_probas[verb] = bigram_probas
    verb_pr = 1/float(actions_dict_upd[verb])
    proba_sent = verb_pr
    print verb_probas
    for key in verb_probas[verb]:
        if verb_probas[verb][key] > 0:
            proba_sent = proba_sent * verb_probas[verb][key]
            print key
    print proba_sent


bake
{'bake': {('bake', 'golden'): 0.02630698172568445, ('bake', 'degrees'): 0.17223375058571525, ('bake', 'f'): 0.08966463618716113, ('bake', 'brown'): 0.03156168418234152, ('bake', 'c'): 0.08705401968003212, ('bake', 'minutes'): 0.2136689202757882, ('bake', 'oven'): 0.07895441461945243}}
('bake', 'golden')
('bake', 'degrees')
('bake', 'f')
('bake', 'brown')
('bake', 'c')
('bake', 'minutes')
('bake', 'oven')
6.30268674243e-13
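
The cell above can be restated as one scoring function (my reading of the same computation, not a different method): the sentence score is the verb prior 1/count(verb) times the product of P(bigram | verb) over the distinct skip-bigrams containing the verb, a naive-Bayes-style independence assumption:

In [ ]:
def sentence_score(s, bigram_counts, verb_counts):
    pairs, verb = bigram_sent(s)
    score = 1/float(verb_counts[verb])        #verb prior
    for pair in set(pairs):                   #each distinct skip-bigram once
        if verb in pair and pair in bigram_counts:
            score *= bigram_counts[pair]/float(verb_counts[verb])
    return score

print sentence_score(sent[0], all_recipes_sorted, actions_dict_upd)
#should reproduce the value printed above, up to float rounding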

In [304]:
proba_sent = verb_pr
for key in verb_probas[verb]:
    if verb_probas[verb][key] > 0.1:
        proba_sent = proba_sent * verb_probas[verb][key]
        print key
print proba_sent
print verb_probas


3.24801870859e-05
{'place': {('place', 'ungreased'): 0.008899571261530466, ('place', 'cookie'): 0.02478238274652462, ('place', 'sheet'): 0.0223463687150838}}

In [213]:
bigrams_with_verb = [pair for pair in pairs if verb not in pair]
bigram_probas = dict()
for (x,y) in bigrams_with_verb:
    if (x,y) in all_recipes_sorted:
        bigram_probas[(x,y)] = all_recipes_sorted[(x,y)]/float(410)  #410: hard-coded denominator here
verb_probas[verb] = bigram_probas
verb_pr = 1/float(actions_dict_upd[verb])

In [214]:
verb_probas


Out[214]:
{'combine': {('soda', 'bowl'): 0.7902439024390244,
  ('soup', 'bowl'): 0.10975609756097561,
  ('soup', 'soda'): 0.00975609756097561,
  ('tomato', 'bowl'): 0.06585365853658537,
  ('tomato', 'soda'): 0.012195121951219513,
  ('tomato', 'soup'): 0.11951219512195121}}

In [346]:
for verb in verbs:
    scores = {}
    for (x,y) in all_recipes:
        if x == verb and y not in utensils:
            scores[(x,y)] = scores.get((x,y),0) + 1/float(actions_dict_upd[verb])
    print max(scores.iteritems(), key=operator.itemgetter(1))


((u'combine', u'sugar'), 0.09327397554172816)
((u'mix', u'well'), 0.10216435857036596)
((u'stir', u'mixture'), 0.04374850585703928)
((u'fry', u'brown'), 0.08408953418027812)
((u'cool', u'wire'), 0.050940487203206396)
((u'boil', u'minutes'), 0.46467817896388086)
((u'place', u'minutes'), 0.04043783292191641)

In [330]:
scores


Out[330]:
{(u'mix', u'bowl'): 656,
 (u'mix', u'dish'): 302,
 (u'mix', u'heat'): 394,
 (u'mix', u'oven'): 301,
 (u'mix', u'pan'): 684,
 (u'mix', u'saucepan'): 101,
 (u'mix', u'skillet'): 87}

In [317]:
actions_dict_upd['bake']


Out[317]:
29878

In [308]:
utensils = ['bowl','oven','pan','saucepan','heat','dish','skillet']

In [331]:
verbs = ['combine','mix','stir','fry','cool','boil','place']