In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json, urllib2
import os, glob, sys, time
import nltk,re
import itertools
from nltk.corpus import stopwords
%matplotlib inline
In [3]:
# Read 70k recipes from allrecipes.com into a list of dictionaries
def read_data():
    folder = 'recipes/'
    files = pd.read_csv('all_files.txt', header=None)[0].values
    recipes = []
    for k, filename in enumerate(files, 1):
        with open(folder + filename, 'r') as f:
            recipes.append(json.load(f))
        if k % 10000 == 0:  # progress indicator
            print k
    return recipes
# Remove and replace some noisy symbols
def clean_string(s):
    sep_symbols = [';']
    for ss in sep_symbols:  # treat separators as sentence boundaries
        s = s.replace(ss, '.')
    for i in range(10):  # collapse runs of periods
        s = s.replace('..', '.')
    bad_symbols = [')', '(', '!', '-']
    for bs in bad_symbols:
        s = s.replace(bs, ' ')
    s = s.replace(',', ' , ')
    s = s.replace('  ', ' ')  # collapse double spaces
    s = s.replace('. ', '.')
    return s
# Raw direction text -> list of single directions
def get_clean_directions(recipe):
    raw = recipe['directions']
    direction = ''
    for dd in raw:
        direction = direction + dd + '.'
    direction = clean_string(direction).lower()
    return direction.split('.')
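A quick check of the cleaning pipeline on a hand-made recipe dict (a sketch; real dicts come from read_data and carry more fields):
In [ ]:
toy_recipe = {'directions': ['Preheat oven to 350 degrees F (175 degrees C).',
                             'Mix flour, sugar; stir well']}
print get_clean_directions(toy_recipe)
# roughly: ['preheat oven to 350 degrees f 175 degrees c ', 'mix flour , sugar', 'stir well', '']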
In [4]:
def bright(l, idx1, idx2, idx3):
    # wrap each word in an ANSI color according to its label mask
    l2 = []
    for i in range(len(l)):
        if idx1[i]:
            l2.append(yellow(l[i]))
        elif idx2[i]:
            l2.append(blue(l[i]))
        elif idx3[i]:
            l2.append(purple(l[i]))
        else:
            l2.append(l[i])
    return ' '.join(l2)
def purple(string):
    return '\x1b[1;45m' + string + '\x1b[0m'
def yellow(string):
    return '\x1b[1;43m' + string + '\x1b[0m'
def blue(string):
    return '\x1b[1;46m' + string + '\x1b[0m'
def highlight_recipe(recipes, recipe_id):
    dirs = get_clean_directions(recipes[recipe_id])
    ingr_words = list(set(ingr_words_func(recipes[recipe_id]['ingr'])))
    for d in dirs:
        if len(d) > 0:
            d_words = np.array(d.split(' '))
            ingr_idx, measure_idx = np.array(define_ingr_measure(d_words, ingr_words))
            action_idx = np.array([(word in actions_set) for word in d_words]).astype(np.int32)
            colored_string = bright(d_words, action_idx, ingr_idx, measure_idx)
            print colored_string
            print create_instructions(d)
            print '_____________________________________'
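The highlighting helpers can be sanity-checked without a full recipe; a minimal sketch with hand-set label masks (hypothetical words, not corpus output):
In [ ]:
words = ['stir', 'the', 'flour', 'into', '1', 'cup', 'butter']
action_idx  = [1, 0, 0, 0, 0, 0, 0]  # actions -> yellow
ingr_idx    = [0, 0, 1, 0, 0, 0, 1]  # ingredients -> blue
measure_idx = [0, 0, 0, 0, 0, 1, 0]  # measures -> purple
print bright(words, action_idx, ingr_idx, measure_idx)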
In [5]:
# Clean and read ingredients and measures
def read_measure_list(path):
    measures = pd.read_csv(path, header=None)
    return measures[0].values
def remove_stopwords(text_list):
    stop = stopwords.words('english')
    return [w for w in text_list if w.lower() not in stop]
def remove_digits(text_list):
    content = []
    for w in text_list:
        content.append(re.sub('[./]', ' ', w).split())
    content = list(itertools.chain.from_iterable(content))
    return [w for w in content if not w.isdigit()]
def get_clean_text(text):
    return (text.replace('(', '').replace(')', '').replace(',', '')
                .replace('-', ' ').replace('/', ' ').replace(';', ' ')
                .replace('  ', ' '))
def ingr_words_func(ingr_list):
    # every content word used in the ingredient lines of one recipe
    recipe_words = []
    for recipe in ingr_list:
        recipe = get_clean_text(recipe)
        recipe_words.append(recipe.lower().split())
    recipe_words = list(itertools.chain.from_iterable(recipe_words))
    return remove_stopwords(remove_digits(recipe_words))
# Label direction words as ingredients or measures
def define_ingr_measure(dirs_words, ingr_words):
    if_ingr = [0] * len(dirs_words)
    if_measure = [0] * len(dirs_words)
    for i, dirs_word in enumerate(dirs_words):
        for ingrs in ingr_words:
            if dirs_word == ingrs:
                if dirs_word not in measure_list:
                    if_ingr[i] = 1
                else:
                    if_measure[i] = 1
    return if_ingr, if_measure
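A toy call shows the two masks (a sketch; it assumes measure_list is already loaded and contains 'cup'):
In [ ]:
d_words = ['stir', 'in', '1', 'cup', 'of', 'flour']
ingr_words = ['flour', 'cup', 'butter']  # as ingr_words_func would produce
print define_ingr_measure(d_words, ingr_words)
# expected: ([0, 0, 0, 0, 0, 1], [0, 0, 0, 1, 0, 0])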
In [6]:
from practnlptools.tools import Annotator
annotator = Annotator()
def create_instructions(phrase, flag=1):
    # semantic role labels: V = verb, A1 = object, A2 = target
    annotated = annotator.getAnnotations(phrase)['srl']
    annotated_steps = []
    for i in xrange(len(annotated)):
        annotated_step = dict()
        annotated_step['action'] = annotated[i]['V']
        if 'A1' in annotated[i]:
            annotated_step['object'] = annotated[i]['A1']
        if 'A2' in annotated[i]:
            annotated_step['target'] = annotated[i]['A2']
        annotated_steps.append(annotated_step)
    # imperative directions have no subject, which often confuses the SRL;
    # retry once with a dummy subject prepended
    if len(annotated_steps) == 0 and flag:
        return create_instructions('they ' + phrase, 0)
    return annotated_steps
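For a typical imperative direction the result looks roughly like this (illustrative; the exact spans depend on the SENNA model behind practNLPTools):
In [ ]:
print create_instructions('add the flour to the butter mixture')
# e.g. [{'action': 'add', 'object': 'the flour', 'target': 'to the butter mixture'}]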
In [274]:
%time recipes = read_data()
actions = pd.read_csv('actions_dict_sorted.txt', sep=' ')  # verb -> frequency table
measure_list = read_measure_list('measure_list.txt')
In [8]:
actions_set = set(actions[:100].word.values)  # the 100 most frequent action verbs
In [9]:
recipes[6710]['recipe_id']
Out[9]:
In [10]:
recipes[6718]['ingr']
Out[10]:
In [11]:
highlight_recipe(recipes,6718)
In [12]:
df=pd.read_csv('labeled_recipes/l13394.txt')
df
Out[12]:
In [101]:
# Rename the hand-labeled files so each one matches its recipe_id
directory = "labeled_recipes"
for filename in os.listdir(directory)[2:]:
    idx = int(filename.split('.')[0][1:])
    new_filename = os.path.join(directory, 'l' + str(recipes[idx]['recipe_id']) + '.txt')
    os.rename(os.path.join(directory, filename), new_filename)
In [16]:
features = df.values.tolist()
features
Out[16]:
In [14]:
recipe_id = 7777
dirs = get_clean_directions(recipes[recipe_id])
ingr_words = list(set(ingr_words_func(recipes[recipe_id]['ingr'])))
for d in dirs:
    if len(d) > 0:
        print d
        print create_instructions(d)
        print '____________________________________________'
In [13]:
from nltk.corpus import wordnet as wn
def is_known(word):
    """Return True if this word exists in WordNet as a verb
    (or at least in nltk.corpus.stopwords)."""
    if word.lower() in stopwords.words('english'):
        return True
    return len(wn.synsets(word, pos=wn.VERB)) > 0
In [19]:
# Keep only action words that WordNet recognizes as verbs
actions_dict_upd = dict()
actions_dict = dict(actions.values.tolist())
for word in actions_dict.keys():
    if is_known(word):
        actions_dict_upd[word] = actions_dict[word]
In [21]:
import operator
sorted_x = sorted(actions_dict_upd.items(), key=operator.itemgetter(1),reverse=True)
In [251]:
# Save verbs occurring more than 40 times
with open('action_dict_wordnet.txt', 'w') as f:
    for (x, y) in sorted_x:
        if y > 40:
            f.write(x + ' ' + str(y) + '\n')
In [937]:
from __future__ import division
import operator
import string
from scipy.stats import binom
from nltk.tokenize import RegexpTokenizer
def isValid(word):
    # drop hashtags and any token containing punctuation
    if word.startswith("#"):
        return False
    vword = word.translate(string.maketrans("", ""), string.punctuation)
    return len(vword) == len(word)
def llr(c1, c2, c12, n):
    # log-likelihood ratio test for the collocation (w1, w2)
    # H0: independence, p(w1,w2) = p(w1,~w2) = c2/N
    p0 = c2 / n
    # H1: dependence, p(w1,w2) = c12/N
    p10 = c12 / n
    # H1: p(~w1,w2) = (c2 - c12)/N
    p11 = (c2 - c12) / n
    # binomial log-probabilities of the observed counts under each hypothesis
    # H0: b(c12; c1, p0), b(c2-c12; N-c1, p0)
    # H1: b(c12; c1, p10), b(c2-c12; N-c1, p11)
    probs = np.matrix([
        [binom(c1, p0).logpmf(c12), binom(n - c1, p0).logpmf(c2 - c12)],
        [binom(c1, p10).logpmf(c12), binom(n - c1, p11).logpmf(c2 - c12)]])
    # LLR = log p(data | H1) - log p(data | H0)
    return np.sum(probs[1, :]) - np.sum(probs[0, :])
def isLikelyNGram(ngram, phrases):
    # an n-gram qualifies only if its (n-1)-gram prefix already scored as a phrase
    if len(ngram) == 2:
        return True
    return ngram[:-1] in phrases
def main():
    # accumulate tokenized directions and unigram frequencies
    lines = []
    unigramFD = nltk.FreqDist()
    tokenizer = RegexpTokenizer(r'\w+')
    for line in recipes[:10]:
        words = tokenizer.tokenize(''.join(line['directions']).strip('.').lower())
        for x in words:
            unigramFD[x] += 1
        lines.append(words)
    # identify likely phrases using a multi-pass algorithm based on the
    # LLR approach described in the book "Building Search Applications:
    # Lucene, LingPipe and GATE", except that n-gram collocations beyond
    # 2 are treated as an (n-1)-gram plus a unigram
    phrases = nltk.defaultdict(float)
    prevGramFD = None
    for i in range(2, 4):
        ngramFD = nltk.FreqDist()
        for words in lines:
            nextGrams = nltk.skipgrams(words, i, i)
            nextGrams = filter(lambda x: isLikelyNGram(x, phrases), nextGrams)
            for x in nextGrams:
                ngramFD[x] += 1
        for k, v in ngramFD.iteritems():
            if 1 < v < 5:
                c1 = unigramFD[k[0]] if prevGramFD is None else prevGramFD[k[:-1]]
                c2 = unigramFD[k[1]] if prevGramFD is None else unigramFD[k[len(k) - 1]]
                c12 = ngramFD[k]
                n = unigramFD.N() if prevGramFD is None else prevGramFD.N()
                phrases[k] = llr(c1, c2, c12, n)
        # only keep n-grams where LLR > 0, i.e. P(H1) > P(H0)
        likelyPhrases = nltk.defaultdict(float)
        likelyPhrases.update([(k, v) for (k, v) in phrases.iteritems()
                              if len(k) == i and v > 0])
        print "==== #-grams = %d ====" % (i)
        sortedPhrases = sorted(likelyPhrases.items(),
                               key=operator.itemgetter(1), reverse=True)
        for k, v in sortedPhrases:
            print k, v
        prevGramFD = ngramFD
if __name__ == "__main__":
    main()
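A quick numeric check of llr with made-up counts (hypothetical numbers, not taken from the corpus): a pair that co-occurs far more often than independence predicts should score positive:
In [ ]:
print llr(c1=50, c2=40, c12=30, n=10000)
# positive -> the dependent model (a real phrase) beats independence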
In [896]:
from nltk import ngrams
def word_grams(words, min=1, max=4):
    # all n-grams for n in [min, max), joined into strings
    s = []
    for n in range(min, max):
        for ngram in ngrams(words, n):
            s.append(' '.join(str(i) for i in ngram))
    return s
print word_grams('one two three four'.split(' '))
In [922]:
def everygrams(sequence, min_len=1, max_len=-1):
    """
    Return all ngrams of the sequence for n from min_len to max_len;
    max_len=-1 means up to len(sequence).
    >>> list(everygrams('a b c'.split()))
    [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c'), ('a', 'b', 'c')]
    """
    if max_len == -1:
        max_len = len(sequence)
    for n in range(min_len, max_len + 1):
        for ng in nltk.ngrams(sequence, n):
            yield ng
doc1 = "Singularity is still a confusing phenomenon in physics".split()
doc2 = "Quantum theory still wins over String theory".split()
_vec1 = list(everygrams(doc1, min_len=2, max_len=2))
print _vec1
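With min_len and max_len now honored, a small check (expected output shown as a comment):
In [ ]:
print list(everygrams('a b c'.split(), min_len=2, max_len=2))
# [('a', 'b'), ('b', 'c')]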
In [98]:
from itertools import combinations
from nltk.tokenize import RegexpTokenizer
def skipgrams(sequence, n, k):
    # n-grams that may skip up to k tokens after the head word
    for ngram in nltk.ngrams(sequence, n + k, pad_right=True):
        head = ngram[:1]
        tail = ngram[1:]
        for skip_tail in combinations(tail, n - 1):
            if skip_tail[-1] is None:
                continue
            yield head + skip_tail
sent = "Insurgents killed in ongoing fighting".split()
list(skipgrams(sent, 2, 3))
Out[98]:
[('Insurgents', 'killed'), ('Insurgents', 'in'), ('Insurgents', 'ongoing'), ('Insurgents', 'fighting'), ('killed', 'in'), ('killed', 'ongoing'), ('killed', 'fighting'), ('in', 'ongoing'), ('in', 'fighting'), ('ongoing', 'fighting')]
In [125]:
def bigram_sent(sent):
    # skip-bigrams of one direction, plus its leading word (the verb)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sent.lower())
    filtered_words = [word for word in tokens if word not in stopwords.words('english')]
    bigrams_list = list(skipgrams(filtered_words, 2, 10))
    return bigrams_list, tokens[0]
In [126]:
def bigrams_count(recipes):
    # skip-bigrams over the direction text of many recipes
    recipe_corpus = []
    for text in recipes:
        text_join = ''.join(text['directions'])
        result = ''.join(i for i in text_join if not i.isdigit())  # strip digits
        recipe_corpus.append(result)
    all_bigrams = []
    tokenizer = RegexpTokenizer(r'\w+')
    stop = set(stopwords.words('english'))
    for merge in recipe_corpus:
        tokens = tokenizer.tokenize(merge.lower())
        tokens = [word for word in tokens if word not in stop]
        for (x, y) in skipgrams(tokens, 2, 10):
            all_bigrams.append((x, y))
    return all_bigrams
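A toy run on two hand-made recipes shows the shape of the result and how it gets counted later:
In [ ]:
from collections import Counter
toy = [{'directions': ['Mix the flour with butter.']},
       {'directions': ['Mix the flour and sugar.']}]
print Counter(bigrams_count(toy)).most_common(3)
# ('mix', 'flour') should top the list with count 2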
In [127]:
pairs, verb = bigram_sent(' '.join(sent))  # sent is still the token list from the skipgrams demo
print pairs
In [305]:
sent = ['bake in 350 degrees F (175 degrees C) oven for 15 minutes or until golden brown']
print sent
In [129]:
bigrams_with_verb
Out[129]:
In [276]:
all_recipes = bigrams_count(recipes[:10000])  # skip-bigrams from the first 10k recipes
In [306]:
from collections import Counter
all_recipes_sorted = dict(Counter(all_recipes))  # skip-bigram -> corpus frequency
verb_probas = dict()
for s in sent:
    pairs, verb = bigram_sent(s)
    print verb
    bigrams_with_verb = [pair for pair in pairs if verb in pair]
    # P(pair | verb) = count(pair in corpus) / count(verb)
    bigram_probas = dict()
    for (x, y) in bigrams_with_verb:
        if (x, y) in all_recipes_sorted:
            bigram_probas[(x, y)] = all_recipes_sorted[(x, y)] / float(actions_dict_upd[verb])
    verb_probas[verb] = bigram_probas
    verb_pr = 1 / float(actions_dict_upd[verb])
# naive product of the pair probabilities, seeded with the verb prior
proba_sent = verb_pr
print verb_probas
for key in verb_probas[verb]:
    if verb_probas[verb][key] > 0:
        proba_sent = proba_sent * verb_probas[verb][key]
        print key
print proba_sent
In [304]:
# same product, but keeping only pairs with probability above 0.1
proba_sent = verb_pr
for key in verb_probas[verb]:
    if verb_probas[verb][key] > 0.1:
        proba_sent = proba_sent * verb_probas[verb][key]
        print key
print proba_sent
print verb_probas
In [213]:
# Variant: score the pairs that do NOT contain the verb
bigrams_with_verb = []
for pair in pairs:
    if verb not in pair:
        bigrams_with_verb.append(pair)
bigram_probas = dict()
for (x, y) in bigrams_with_verb:
    if (x, y) in all_recipes_sorted:
        bigram_probas[(x, y)] = all_recipes_sorted[(x, y)] / float(410)  # 410: hard-coded verb count
verb_probas[verb] = bigram_probas
verb_pr = 1 / float(actions_dict_upd[verb])
In [214]:
verb_probas
Out[214]:
In [346]:
# For each verb, find the most probable co-occurring non-utensil word
for verb in verbs:
    scores = {}
    for (x, y) in all_recipes:
        if x == verb and y not in utensils:
            if (x, y) in scores:
                scores[(x, y)] += 1 / float(actions_dict_upd[verb])
            else:
                scores[(x, y)] = 1 / float(actions_dict_upd[verb])
    print max(scores.iteritems(), key=operator.itemgetter(1))
In [330]:
scores
Out[330]:
In [317]:
actions_dict_upd['bake']
Out[317]:
In [308]:
utensils = ['bowl', 'oven', 'pan', 'saucepan', 'heat', 'dish', 'skillet']  # equipment/location words to exclude
In [331]:
verbs = ['combine', 'mix', 'stir', 'fry', 'cool', 'boil', 'place']  # verbs to rank objects for
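The same ranking reads more directly with collections.Counter; a sketch on hypothetical toy bigrams (not corpus output), reusing the utensils list above:
In [ ]:
from collections import Counter
toy_bigrams = [('mix', 'flour'), ('mix', 'flour'), ('mix', 'bowl'), ('stir', 'sugar')]
for verb in ['mix', 'stir']:
    counts = Counter((x, y) for (x, y) in toy_bigrams
                     if x == verb and y not in utensils)
    print verb, counts.most_common(1)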