In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json, urllib2
import os, glob, sys, time
import nltk,re
from practnlptools.tools import Annotator
import itertools
from nltk.corpus import stopwords
%matplotlib inline
In [2]:
#Read 70k recipes from allrecipes.com into a list of dictionaries
def read_data():
    folder='recipes/'
    files=pd.read_csv('all_files.txt',header=None)[0].values
    k=0
    recipes=[]
    st=time.time()
    for filename in files:
        with open(folder+filename,'r') as f:
            r=json.load(f)
        recipes.append(r)
        k+=1
        if k%10000==0:
            print k
    return recipes

#Removing and replacing some noisy symbols
def clean_string(s):
    sep_symbols=[';']
    for ss in sep_symbols:
        s=s.replace(ss,'.')
    for i in range(10):
        s=s.replace('..','.')
    bad_symbols=[')','(','!','-']
    for bs in bad_symbols:
        s=s.replace(bs,' ')
    s=s.replace(',',' , ')
    s=s.replace('  ',' ')   #collapse double spaces
    s=s.replace('. ','.')
    return s

#Raw direction text -> list of single directions
def get_clean_directions(recipe):
    raw=recipe['directions']
    direction=''
    for dd in raw:
        direction=direction+dd+'.'
    direction=clean_string(direction).lower()
    s=direction.split('.')
    return s
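In [ ]:
# Illustrative only: clean_string / get_clean_directions on a hand-made recipe
# dict, so it does not depend on the scraped files.
toy_recipe={'directions':['Preheat oven to 350 degrees F (175 degrees C); grease a 9x9-inch pan',
                          'Mix flour, sugar and salt']}
for step in get_clean_directions(toy_recipe):
    if len(step)>0:
        print step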
In [3]:
def bright(l,idx1,idx2,idx3,idx4):
    l2=[]
    for i in range(len(l)):
        if idx2[i]:
            l2.append(blue(l[i]))
        elif idx1[i]:
            l2.append(yellow(l[i]))
        elif idx3[i]:
            l2.append(purple(l[i]))
        elif idx4[i]:
            l2.append(gray(l[i]))
        else:
            l2.append(l[i])
    l2=' '.join(l2)
    return l2

def purple(string):
    return '\x1b[1;45m'+string+'\x1b[0m'
def yellow(string):
    return '\x1b[1;43m'+string+'\x1b[0m'
def blue(string):
    return '\x1b[1;46m'+string+'\x1b[0m'
def gray(string):
    return '\x1b[1;47m'+string+'\x1b[0m'
import re
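In [ ]:
# Illustrative only: the ANSI-highlighting helper on a hand-made example, with the
# 1st token marked as an action (yellow), the 2nd as an ingredient (blue) and the
# 4th as a utensil (gray); the four 0/1 lists are made up just to show the call.
toy_words=['combine','flour','in','bowl']
print bright(toy_words,[1,0,0,0],[0,1,0,0],[0,0,0,0],[0,0,0,1])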
In [4]:
#Cleaning and reading ingredients and measures
def read_measure_list(path):
    measures=pd.read_csv(path,header=None)
    measure_list=measures[0].values
    return measure_list

def remove_stopwords(text_list):
    stop = stopwords.words('english')
    content = [w for w in text_list if w.lower() not in stop]
    return content

def remove_digits(text_list):
    content=[]
    for w in text_list:
        w = re.sub('[./]', ' ', w).split()
        content.append(w)
    content = list(itertools.chain.from_iterable(content))
    content = [w for w in content if not w.isdigit()]
    return content

def get_clean_text(text):
    return text.replace('(','').replace(')','').replace(',','').replace('-',' ').replace('/',' ').replace(';',' ').replace('  ',' ')

def ingr_words_func(ingr_list):
    recipe_words=[]
    for recipe in ingr_list:
        recipe=get_clean_text(recipe)
        recipe_words.append([element for element in recipe.lower().split()])
    recipe_words = list(itertools.chain.from_iterable(recipe_words))
    recipe_words=remove_stopwords(remove_digits(recipe_words))
    return recipe_words

#Defining ingredients and measures
def define_ingr_measure(dirs_words, ingr_words):
    if_ingr=[0]*len(dirs_words)
    if_measure=[0]*len(dirs_words)
    for i,dirs_word in enumerate(dirs_words):
        for ingrs in ingr_words:
            if dirs_word==ingrs:
                if dirs_word not in measure_list:
                    if_ingr[i]=1
                else:
                    if_measure[i]=1
    return if_ingr,if_measure

def highlight_recipe(recipes,recipe_id):
    dirs=get_clean_directions(recipes[recipe_id])
    ingr_words=list(set(ingr_words_func(recipes[recipe_id]['ingr'])))
    r=0
    for d in dirs:
        if len(d)>0:
            d_words=np.array(d.split(' '))
            ingr_idx,measure_idx=np.array(define_ingr_measure(d_words, ingr_words))
            measure_idx=np.array([(word in set(measure_list)) for word in d_words]).astype(np.int32)
            action_idx=np.array([(word in actions_set) for word in d_words]).astype(np.int32)
            utensil_idx=np.array([(word in set(utensils_list)) for word in d_words]).astype(np.int32)
            colored_string=bright(d_words,action_idx,ingr_idx,measure_idx,utensil_idx)
            print r,colored_string
            print '_____________________________________'
            r+=1
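In [ ]:
# Illustrative only: ingr_words_func on two hand-written ingredient lines.
# Assumes the NLTK stopwords corpus has been downloaded (nltk.download('stopwords')).
toy_ingr=['2 cups all-purpose flour','1 (8 ounce) package cream cheese, softened']
print ingr_words_func(toy_ingr)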
In [7]:
%time recipes=read_data()
actions=pd.read_csv('action_dict_wordnet.txt',sep=' ')
actions_set=set(actions[:100].word.values)
measure_list=read_measure_list('measure_list.txt')
utensils_list=read_measure_list('utensils_list.txt')
annotator=Annotator()
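In [ ]:
# Illustrative only: tag which tokens of one direction are ingredients vs. measures.
# The exact output depends on the contents of measure_list.txt (e.g. whether 'cup'
# is listed there), so treat this as a sketch of the call, not a fixed result.
toy_dir_words='stir the flour into one cup of milk'.split()
toy_ingr_words=['flour','milk','cup']
print define_ingr_measure(toy_dir_words,toy_ingr_words)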
In [29]:
def obj_elements_list(strr):
    try:
        output_list=[]
        #make the list of separate ingredients: split on ' , ' and ' and ', cut everything after 'or'
        ingr_in_one_dir_list=strr.split(' , ')
        for i, ingr in enumerate(ingr_in_one_dir_list):   #for all objects if it's a list of ingredients
            ingr_without_and_list=ingr.split(' and ')     #dealing with 'and'
            for ingr_without_and in ingr_without_and_list:
                ingr_temp=ingr_without_and.split(' ')
                for ingr_temp_el in ingr_temp:
                    if ingr_temp_el=='or':                #dealing with 'or'
                        idx=ingr_temp.index('or')
                        ingr_temp = ingr_temp[:idx]
                ingr_temp = remove_stopwords(ingr_temp)
                output_list.append(' '.join(ingr_temp))
        return output_list
    except:
        return ['0']

def add_action_line(df,ls):
    cur_flow=pd.Series(ls,index=['line_id', 'action', 'object', 'target', 'no', 'nt', 'description'])
    return df.append(cur_flow,ignore_index=True)
def create_instructions(phrase,flag=1):
    #directions are imperative, so prepend 'they' to give the SRL parser an explicit subject
    annotated = annotator.getAnnotations('they '+phrase)['srl']
    annotated_steps = []
    for i in xrange(len(annotated)):
        annotated_step = dict()
        annotated_step['object']=''
        annotated_step['target']=''
        annotated_step['action'] = annotated[i]['V']
        if 'A2' in annotated[i]:
            annotated_step['target'] = annotated[i]['A2']
        if 'AM-LOC' in annotated[i]:
            annotated_step['target'] = annotated[i]['AM-LOC']
        if 'A1' in annotated[i]:
            annotated_step['object'] = annotated[i]['A1']
        annotated_steps.append(annotated_step)
    #if (len(annotated_steps)==0) & (flag):
    #    return create_instructions('they '+phrase,0)
    return annotated_steps
def line_score(lt,lp):
    #per-row score over (action, object, target); any 'body...' label counts as 'body'
    sc=np.zeros(3)
    ltt=lt.copy()
    if (ltt[2][:4]=='body') & (len(ltt[2])>4):
        ltt[2]='body'
    if (ltt[3][:4]=='body') & (len(ltt[3])>4):
        ltt[3]='body'
    if lt[1]==lp[1]:
        sc+=(ltt[1:4]==lp[1:4])
    return sc

def calc_score(y_t,y_p):
    cur_t=y_t.values
    cur_p=y_p.values
    score=np.zeros((len(cur_t),3)).astype(np.uint8)
    for t in range(len(cur_t)):
        cur_score=np.zeros(3)
        for p in range(len(cur_p)):
            temp_score=line_score(cur_t[t],cur_p[p])
            if sum(temp_score)>sum(cur_score):
                cur_score=temp_score
        score[t]=cur_score
    #score_1: per-column accuracy (action, object, target); score_2: share of rows with all three correct
    score_2=round(np.sum(np.sum(score,axis=1)==3)/(len(cur_t)+0.),2)
    score_1=np.round(np.sum(score,axis=0)/(len(cur_t)+0.),2)
    return score_1,score_2
def get_prediction(recipes,recipe_id):
    cols=['line_id', 'action', 'object', 'target', 'no', 'nt', 'description']
    prediction=pd.DataFrame(columns=cols)
    dirs=get_clean_directions(recipes[recipe_id])
    ingr_words=ingr_words_func(recipes[recipe_id]['ingr'])
    r=0
    bbb=divide_ingr_plus_measure(recipe_id)
    ingrs_in_rec=list(zip(*bbb)[0])
    measures_in_rec=list(zip(*bbb)[1])
    cur_line_id=0
    for d in dirs:
        if len(d)>0:
            d_words=np.array(d.split(' '))
            ingr_idx,measure_idx=np.array(define_ingr_measure(d_words, ingr_words))
            action_idx=np.array([(word in actions_set) for word in d_words]).astype(np.int32)
            srl=create_instructions(d)
            for t in range(len(srl)):
                cur_srl=srl[t]
                objects=obj_elements_list(cur_srl['object'])
                if (len(objects[0])==0) & (len(cur_srl['target'])==0):
                    #no object and no target -> the action is applied to the current mixture ('body')
                    ls=7*['0']
                    ls[0]=str(cur_line_id)
                    ls[1]=cur_srl['action']
                    ls[2]='body'
                    prediction=add_action_line(prediction,ls)
                else:
                    count_obj=0
                    for obj_string in objects:
                        ls=7*['0']
                        ls[0]=str(cur_line_id)
                        ls[1]=cur_srl['action']
                        ls[2]='body'
                        if len(obj_string):
                            obj,q_obj=clear_obj_or_targ_plus_quantity(obj_string, ingrs_in_rec, measures_in_rec)
                            ls[2]=obj
                            ls[4]=q_obj
                        if len(cur_srl['target'])>0:
                            if count_obj==0:
                                tar,q_tar=clear_obj_or_targ_plus_quantity(cur_srl['target'], ingrs_in_rec, measures_in_rec)
                                ls[3]=tar
                                ls[5]=q_tar
                            else:
                                ls[3]='body'
                        else:
                            ls[3]='body'
                        prediction=add_action_line(prediction,ls)
                        count_obj+=1
            r+=1
            cur_line_id+=1
    return prediction
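In [ ]:
# Illustrative only: what create_instructions returns for a single direction.
# practNLPTools/SENNA labels the verb as 'V', and the A1/A2/AM-LOC arguments are
# mapped to 'object'/'target' above; the exact parse depends on the SENNA model,
# so the printed dicts are indicative, e.g.
# {'action': 'combine', 'object': 'the flour , sugar and salt', 'target': 'in a large bowl'}
for step in create_instructions('combine the flour , sugar and salt in a large bowl'):
    print step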
In [9]:
def find_quantity(one_ingr_string):
    one_ingr=one_ingr_string.split()
    quantity_num=[]
    meas=''
    for w in one_ingr:
        if w.replace('.','',1).replace('/','',1).replace('x','',1).isdigit():
            quantity_num.append(w)
        if w in measure_list:
            meas=' '+w
    quantity_num=' '.join(quantity_num)
    quantity=quantity_num+meas
    return quantity

def remove_text_from_brackets(string):
    return re.sub(r"[\(\[].*?[\)\]]", "", string).replace('  ',' ')

def divide_ingr_plus_measure(rec_id):
    ingr=[k for k in recipes[rec_id]['ingr'] if not k.endswith(':')]
    ingr_quant_list=[]
    for k in ingr:
        k=k.replace(',','')
        k=remove_text_from_brackets(k)
        quantity=find_quantity(k)
        head, sep, tail = k.partition(quantity+' ')
        #tail is the ingredient now - clear it a little (drop 'for ...', 'to taste', etc.)
        head1, sep1, tail1 = tail.partition('for ')
        head2, sep2, tail2 = head1.partition('to taste')
        ingr_quant_list.append([head2, quantity])
    return ingr_quant_list

import inflect
def make_singular_noun(word):
    p = inflect.engine()
    a=p.singular_noun(word, count=None)
    if a==False:
        return word
    else:
        return a

def clear_obj_or_targ_plus_quantity(target_temp, ingrs_in_rec, measures_in_rec):
    #input is a string (the SRL output for an object or a target)
    target_temp_list = target_temp.split()
    new_target=[]
    flag=None
    count_max=0
    idx=None   #index into the ingredient list (lines ending with ':' already removed)
    for i,ingr_line in enumerate(ingrs_in_rec):   #one ingredient line
        count=0
        for w_ingr in ingr_line.split():          #words of that ingredient
            for w in target_temp_list:            #words of the extracted target
                if make_singular_noun(w_ingr)==make_singular_noun(w):   #a target word matches an ingredient word
                    count+=1                      #count matching words for every ingredient
        if count_max<count:
            count_max=count
            idx=i
            flag=0
    if idx is None:
        #no ingredient matched - try the utensil list instead
        for i,ingr_line in enumerate(utensils_list):
            count=0
            for w_ingr in ingr_line.split():
                for w in target_temp_list:
                    if make_singular_noun(w_ingr)==make_singular_noun(w):
                        count+=1
            if count_max<count:
                count_max=count
                idx=i
                flag=1
    if flag==0:
        for w_ingr in (ingrs_in_rec[idx]).split():
            for w in target_temp_list:
                if w_ingr==w:
                    new_target.append(w)
    if flag==1:
        kk=utensils_list[idx]
        for w_ingr in kk.split():
            for w in target_temp_list:
                if w_ingr==w:
                    new_target.append(w)
    new_target=' '.join(new_target)
    quantity=find_quantity(remove_text_from_brackets(target_temp))
    if quantity=='':
        if flag==0:
            quantity=measures_in_rec[idx]
        if flag==1:
            quantity=0
    if new_target=='':
        return target_temp,'0'
    else:
        return new_target, quantity
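In [ ]:
# Illustrative only: pull the quantity out of one ingredient line and singularise
# a noun. Whether 'cups' is recognised as a measure depends on measure_list.txt,
# so the first result is only indicative (e.g. '2 1/2 cups' or just '2 1/2').
print find_quantity('2 1/2 cups all-purpose flour')
print make_singular_noun('tomatoes')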
In [10]:
### Map the recipe number (taken from the file name) to its index in the recipes list
lb=pd.read_csv('all_files.txt',header=None)
pairs=np.zeros(270000).astype(np.int32)
for i in range(len(lb)):
    rev=int(lb.values[i][0][:-4])   #file name without its 4-character extension
    pairs[rev]=i
fold='C:/Users/User/Dropbox (MIT)/NLP Final project/labeled recipes/'
In [38]:
#TOTAL EVALUATION over all hand-labeled recipes
files=glob.glob(fold+'*')
arr=np.zeros(3)
final=0
for fname in files:
    y_t=pd.read_csv(fname)
    v=int(fname[63:-4])   #recipe number parsed from the labeled-file path
    y_p=get_prediction(recipes,pairs[v])
    t_arr,t_final=calc_score(y_t,y_p)
    arr+=t_arr
    final+=t_final
    print v,t_arr,t_final
arr/=len(files)
final/=len(files)
print 'Avg:',arr,final
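In [ ]:
# Illustrative only: calc_score on two tiny hand-built tables, to make the two
# returned numbers concrete. Columns follow the labeling format used above.
cols=['line_id', 'action', 'object', 'target', 'no', 'nt', 'description']
toy_true=pd.DataFrame([['0','combine','flour','bowl','0','0','0'],
                       ['1','bake','body','oven','0','0','0']],columns=cols)
toy_pred=pd.DataFrame([['0','combine','flour','bowl','0','0','0'],
                       ['1','bake','body','pan','0','0','0']],columns=cols)
#first value: per-column accuracy over (action, object, target);
#second value: share of true rows with all three fields matched
print calc_score(toy_true,toy_pred)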
In [35]:
#1 File
v=6663
y_t=pd.read_csv(fold+'/l'+str(v)+'.txt')
y_p=get_prediction(recipes,pairs[v])
print v,calc_score(y_t,y_p)
#y_t
In [31]:
highlight_recipe(recipes,pairs[v])
In [36]:
y_t ### TRUE labels
Out[36]:
In [37]:
y_p ### PREDICTED labels
Out[37]:
In [ ]: