In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json, urllib2
import os, glob, sys, time
import nltk,re
from practnlptools.tools import Annotator
import itertools
from nltk.corpus import stopwords
%matplotlib inline

Helper functions

Data preprocessing


In [2]:
#Read 70k recipes from allrecipes.com into a list of dictionaries
def read_data():
    folder='recipes/'
    files=pd.read_csv('all_files.txt',header=None)[0].values
    k=0
    recipes=[]
    st=time.time()
    for filename in files:
        f=open(folder+filename,'r')
        r=json.load(f)
        recipes.append(r)
        k+=1
        if k%10000==0:
            print k
    return recipes

#Removing and replacing some noizy symbols
def clean_string(s):    
    sep_symbols=[';']
    for ss in sep_symbols:
        s=s.replace(ss,'.')
    for i in range(10):
        s=s.replace('..','.')
    bad_symbols=[')','(','!','-']
    for bs in bad_symbols:
        s=s.replace(bs,' ')
    s=s.replace(',',' , ')                  
    s=s.replace('  ',' ')
    s=s.replace('. ','.')
    return s

#Raw direction text -> List of single directions
def get_clean_directions(recipe):
    raw=recipe['directions']
    direction=''
    for dd in raw:
        direction=direction+dd+'.'
    direction=clean_string(direction).lower()
    s=direction.split('.')
    return s

Color functions


In [3]:
def bright(l,idx1,idx2,idx3,idx4):
    l2=[]
    for i in range(len(l)):
        if idx2[i]:
            l2.append(blue(l[i]))
        elif idx1[i]:
            l2.append(yellow(l[i]))
        elif idx3[i]:
            l2.append(purple(l[i]))
        elif idx4[i]:
            l2.append(gray(l[i]))
        else:
            l2.append(l[i])
    l2=' '.join(l2)
    return l2

def purple(string):
    return '\x1b[1;45m'+string+'\x1b[0m'

def yellow(string):
    return '\x1b[1;43m'+string+'\x1b[0m'

def blue(string):
    return '\x1b[1;46m'+string+'\x1b[0m'

def gray(string):
    return '\x1b[1;47m'+string+'\x1b[0m'
            
import re

Working with dictionaries


In [4]:
#cleaning and reading ingridiends and measures

def read_measure_list(path):
    measures=pd.read_csv(path,header=None)
    measure_list=measures[0].values
    return measure_list

def remove_stopwords(text_list):
    stop = stopwords.words('english')
    content = [w for w in text_list if w.lower() not in stop]
    return content

def remove_digits(text_list):
    content=[]
    for w in text_list:
        w = re.sub('[./]', ' ', w).split() 
        content.append(w)
    content = list(itertools.chain.from_iterable(content))
    content = [w for w in content if w.isdigit()==0]
    return content

def get_clean_text(text):
    return text.replace('(','').replace(')','').replace(',','').replace('-',' ').replace('/',' ').replace(';',' ').replace('  ',' ')

def ingr_words_func(ingr_list):
    recipe_words=[]
    for recipe in ingr_list:
        recipe=get_clean_text(recipe)
        recipe_words.append([element for element in recipe.lower().split()])
    recipe_words = list(itertools.chain.from_iterable(recipe_words))
    recipe_words=remove_stopwords(remove_digits(recipe_words))
    return recipe_words

#defining ingridients and measures
def define_ingr_measure(dirs_words, ingr_words):
    if_ingr=[0]*len(dirs_words)
    if_measure=[0]*len(dirs_words)
    for i,dirs_word in enumerate(dirs_words):
        for ingrs in ingr_words:
            if dirs_word==ingrs:
                if dirs_word not in measure_list:
                    if_ingr[i]=1
                else:
                    if_measure[i]=1
    return if_ingr,if_measure


def highlight_recipe(recipes,recipe_id):
    dirs=get_clean_directions(recipes[recipe_id])
    ingr_words=list(set(ingr_words_func(recipes[recipe_id]['ingr'])))
    r=0
    for d in dirs:
        if len(d)>0:
            d_words=np.array(d.split(' '))
            ingr_idx,measure_idx=np.array(define_ingr_measure(d_words, ingr_words))
            measure_idx=np.array([(word in set(measure_list)) for word in d_words]).astype(np.int32)
            action_idx=np.array([(word in actions_set) for word in d_words]).astype(np.int32)
            utensil_idx=np.array([(word in set(utensils_list)) for word in d_words]).astype(np.int32)
            colored_string=bright(d_words,action_idx,ingr_idx,measure_idx,utensil_idx)
            print r,colored_string
            #print create_instructions(d)
            print '_____________________________________'
            #print create_instructions(d)
            r+=1

Read Data and Dictionaries


In [7]:
%time recipes=read_data()
actions=pd.read_csv('action_dict_wordnet.txt',sep=' ')
actions_set=set(actions[:100].word.values)
measure_list=read_measure_list('measure_list.txt')
utensils_list=read_measure_list('utensils_list.txt')
annotator=Annotator()


10000
20000
30000
40000
50000
60000
70000
Wall time: 6.49 s

SRL


In [29]:
def obj_elements_list(strr):
    try:
        output_list=[]
        #make the list of correct ingrs (without 'or+word', 'and'+word = new list element etc.)
        ingr_in_one_dir_list=(strr.split(' , ')) 
        for i, ingr in enumerate(ingr_in_one_dir_list): #for all objects if it's list of ingrs
            ingr_without_and_list=ingr.split(' and ') #dealing with 'and' 
            for ingr_without_and in ingr_without_and_list:
                ingr_temp=ingr_without_and.split(' ')
                for ingr_temp_el in ingr_temp:
                    if ingr_temp_el=='or':#dealing with 'or'
                        idx=ingr_temp.index('or') 
                        ingr_temp = ingr_temp[:idx]
                ingr_temp = remove_stopwords(ingr_temp)
                output_list.append(' '.join(ingr_temp))
        return output_list
    except:
        return ['0']

def add_action_line(df,ls):
    cur_flow=pd.Series(ls,index=['line_id', 'action', 'object', 'target', 'no', 'nt', 'description'])
    return df.append(cur_flow,ignore_index=True)


def create_instructions(phrase,flag=1):
    annotated = annotator.getAnnotations('they '+phrase)['srl']
    annotated_steps = []
    #if (len(annotated) > 0) :
    for i in xrange(len(annotated)):
        annotated_step = dict()
        annotated_step['object']=''
        annotated_step['target']=''
        annotated_step['action'] = annotated[i]['V']
        if set(['A2']).issubset(annotated[i].keys()):
            annotated_step['target'] = annotated[i]['A2']
        if set(['AM-LOC']).issubset(annotated[i].keys()):
            annotated_step['target'] = annotated[i]['AM-LOC']
        if set(['A1']).issubset(annotated[i].keys()):
            annotated_step['object'] = annotated[i]['A1']
        annotated_steps.append(annotated_step)
    #if (len(annotated_steps)==0) & (flag):
    #    return create_instructions('they '+phrase,0)
    return annotated_steps


def line_score(lt,lp):
    sc=[0,0,0]
    ltt=lt.copy()
    if (ltt[2][:4]=='body') & (len(ltt[2])>4):
        ltt[2]='body'
    if (ltt[3][:4]=='body') & (len(ltt[3])>4):
        ltt[3]='body'    
    if (lt[1]==lp[1]):
        sc+=ltt[1:4]==lp[1:4]
    else:
        pass
    return sc

def calc_score(y_t,y_p):
    cur_t=y_t.values
    cur_p=y_p.values
    score=np.zeros((len(cur_t),3)).astype(np.uint8)
    for t in range(len(cur_t)):
        cur_score=[0,0,0]
        for p in range(len(cur_p)):
            temp_score=line_score(cur_t[t],cur_p[p])
            if sum(temp_score)>sum(cur_score):
                cur_score=temp_score
        score[t]=cur_score
    score_2=round(np.sum(np.sum(score,axis=1)==3)/(len(cur_t)+0.),2)
    score_1=np.round(np.sum(score,axis=0)/(len(cur_t)+0.),2)
    return score_1,score_2



def get_prediction(recipes,recipe_id):
    cols=['line_id', 'action', 'object', 'target', 'no', 'nt', 'description']
    prediction=pd.DataFrame(columns=cols)

    #recipe_id=6667
    dirs=get_clean_directions(recipes[recipe_id])
    ingr_words=ingr_words_func(recipes[recipe_id]['ingr'])
    r=0
    bbb=divide_ingr_plus_measure(recipe_id)
    ingrs_in_rec=(list(zip(*bbb)[0]))
    measures_in_rec=(list(zip(*bbb)[1]))
    cur_line_id=0
    for d in dirs:
        if len(d)>0:
            d_words=np.array(d.split(' '))
            ingr_idx,measure_idx=np.array(define_ingr_measure(d_words, ingr_words))
            action_idx=np.array([(word in actions_set) for word in d_words]).astype(np.int32)
            #colored_string=bright(d_words,action_idx,ingr_idx,measure_idx)
            #print r,colored_string
            srl=create_instructions(d)

            for t in range(len(srl)):
                cur_srl=srl[t]
                objects=obj_elements_list(cur_srl['object'])
                if (len(objects[0])==0) & (len(cur_srl['target'])==0):
                    ls=7*['0']
                    ls[0]=str(cur_line_id)
                    ls[1]=cur_srl['action']
                    ls[2]='body'
                    prediction=add_action_line(prediction,ls)

                else:
                    count_obj=0
                    for object in objects:
                        ls=7*['0']
                        ls[0]=str(cur_line_id)
                        ls[1]=cur_srl['action']
                        ls[2]='body'
                        if len(object):
                            obj,q_obj=clear_obj_or_targ_plus_quantity(object, ingrs_in_rec, measures_in_rec)
                            ls[2]=obj
                            ls[4]=q_obj
                        if len(cur_srl['target'])>0:
                            if count_obj==0:
                                tar,q_tar=clear_obj_or_targ_plus_quantity(cur_srl['target'], ingrs_in_rec, measures_in_rec)
                                ls[3]=tar
                                ls[5]=q_tar
                            else:
                                ls[3]='body'
                        else:
                            ls[3]='body'
                        prediction=add_action_line(prediction,ls)
                        count_obj+=1
            #print '_____________________________________'
            #print create_instructions(d)
            r+=1
            cur_line_id+=1
    return prediction

Helper funtions, advanced target processing


In [9]:
def find_quantity(one_ingr_string):
    one_ingr=one_ingr_string.split()
    quantity_num=[]
    meas=''
    for w in one_ingr:
        if w.replace('.','',1).replace('/','',1).replace('x','',1).isdigit()==1:
            quantity_num.append(w)
        if w in measure_list:
            meas=' '+w
    quantity_num=' '.join(quantity_num)
    quantity=quantity_num+meas
    return quantity

def remove_text_from_brackets(string): 
    return re.sub("[\(\[].*?[\)\]]", "", string).replace('  ',' ')

def divide_ingr_plus_measure(rec_id):
    ingr=[k for k in recipes[rec_id]['ingr'] if not k.endswith(':')]
    ingr_quant_list=[]
    for k in ingr:
        k.replace(',','')
        k=remove_text_from_brackets(k)
        quantity=find_quantity(k)
        head, sep, tail = k.partition(quantity+' ')
        #tail is ingr now - we need to clear it a little bit (from for, to taste... etc.)
        head1, sep1, tail1 = tail.partition('for ')
        head2, sep2, tail2 = head1.partition('to taste')    
        ingr_quant_list.append([head2, quantity])
    return ingr_quant_list


import inflect
def make_singular_noun(word):
    p = inflect.engine()
    a=p.singular_noun(word, count=None)
    if a==False:
        return word
    else:
        return a
    
    
def clear_obj_or_targ_plus_quantity(target_temp, ingrs_in_rec, measures_in_rec): 
    #function from string!!! (from the output of srl for object or target)
    target_temp_list = target_temp.split()
    new_target=[]
    flag=None
    count_max=0
    idx=None #!!!index элемента из листа без строчек, заканчивающихся на : !!!
    for i,ingr_line in enumerate(ingrs_in_rec): #строка ингридиента (один ингр)
        count=0
        for w_ingr in ingr_line.split(): #слова одного ингр
            for w in target_temp_list: #w-слова найденного таргета
                #if w_ingr==w:
                if make_singular_noun(w_ingr)==make_singular_noun(w):#если слово из таргета равно любому слову из ингр
                    count+=1 #считает количество совпадающих слов для каждого ингр
        if count_max<count:
            count_max=count
            idx=i
            flag=0
    if idx==None:
        for i,ingr_line in enumerate(utensils_list): #строка ингридиента (один ингр)
            count=0
            for w_ingr in ingr_line.split(): #слова одного ингр
                for w in target_temp_list: #w-слова найденного таргета
                    #if w_ingr==w:
                    if make_singular_noun(w_ingr)==make_singular_noun(w):#если слово из таргета равно любому слову из ингр
                        count+=1  
            if count_max<count:
                count_max=count
                idx=i
                flag=1
    #new_target=[m for m in utensils_list]
    if flag==0:
        for w_ingr in (ingrs_in_rec[idx]).split():
            for w in target_temp_list:
                if w_ingr==w:
                    new_target.append(w)
    if flag==1:
        kk=utensils_list[idx]
        for w_ingr in kk.split():
            for w in target_temp_list:
                if w_ingr==w:
                    new_target.append(w)        
    new_target=' '.join(new_target)

    quantity=find_quantity(remove_text_from_brackets(target_temp))
    if quantity=='':
        if flag==0:
            quantity=measures_in_rec[idx]
        if flag==1:
            quantity=0
    else:
        pass
    if new_target=='':
        return target_temp,'0'
    else:
        return new_target, quantity

Main

Prepare variables


In [10]:
###
lb=pd.read_csv('all_files.txt',header=None)
pairs=np.zeros(270000).astype(np.int32)
for i in range(len(lb)):
    rev=int(lb.values[i][0][:-4])
    pairs[rev]=i
fold='C:/Users/User/Dropbox (MIT)/NLP Final project/labeled recipes/'

Evaluate all labeled recipes


In [38]:
#TOTAL EVALUATION
files=glob.glob(fold+'*')
arr=[0,0,0]
final=0
for file in files:
    y_t=pd.read_csv(file)
    v=int(file[63:-4])
    y_p=get_prediction(recipes,pairs[v])
    t_arr,t_final=calc_score(y_t,y_p)
    arr+=t_arr
    final+=t_final
    print v,t_arr,t_final
arr/=len(files)
final/=len(files)
print 'Avg:',arr,final


13344 [ 0.96  0.68  0.6 ] 0.44
13345 [ 0.83  0.61  0.5 ] 0.5
13346 [ 0.79  0.62  0.62] 0.5
13347 [ 0.7  0.7  0.3] 0.3
13348 [ 0.81  0.69  0.62] 0.58
13349 [ 0.96  0.16  0.12] 0.08
13351 [ 0.77  0.55  0.23] 0.23
13352 [ 0.64  0.48  0.52] 0.4
13353 [ 0.68  0.58  0.32] 0.26
13354 [ 0.83  0.58  0.75] 0.5
13355 [ 0.7  0.4  0.6] 0.3
13356 [ 1.    0.78  0.78] 0.56
13357 [ 0.86  0.43  0.71] 0.29
13358 [ 0.66  0.45  0.62] 0.45
13359 [ 0.88  0.81  0.75] 0.69
13360 [ 0.7  0.7  0.5] 0.5
13361 [ 0.69  0.54  0.46] 0.31
13362 [ 1.   0.8  0.7] 0.7
13363 [ 0.88  0.5   0.5 ] 0.5
13364 [ 0.92  0.77  0.77] 0.69
13366 [ 0.64  0.57  0.36] 0.36
13367 [ 0.86  0.76  0.57] 0.57
13368 [ 0.6   0.56  0.2 ] 0.16
13369 [ 0.8  0.8  0.7] 0.7
13370 [ 0.59  0.5   0.41] 0.36
13371 [ 0.55  0.27  0.36] 0.18
13372 [ 0.86  0.71  0.71] 0.57
13376 [ 0.94  0.78  0.78] 0.67
13377 [ 0.75  0.75  0.75] 0.75
13378 [ 0.77  0.69  0.54] 0.54
13379 [ 0.73  0.67  0.6 ] 0.53
13380 [ 0.9   0.75  0.75] 0.6
13381 [ 0.92  0.75  0.67] 0.5
13382 [ 0.87  0.67  0.8 ] 0.6
13383 [ 0.85  0.6   0.55] 0.45
13384 [ 0.94  0.89  0.67] 0.67
13385 [ 0.59  0.41  0.35] 0.35
13386 [ 0.64  0.55  0.27] 0.18
13387 [ 0.64  0.27  0.36] 0.09
13388 [ 0.72  0.67  0.56] 0.5
13389 [ 0.89  0.33  0.78] 0.33
13390 [ 0.59  0.47  0.59] 0.47
13391 [ 0.83  0.83  0.67] 0.67
13392 [ 0.86  0.71  0.57] 0.57
13393 [ 0.93  0.71  0.64] 0.5
13394 [ 0.82  0.65  0.47] 0.29
6663 [ 0.79  0.57  0.57] 0.57
6665 [ 0.71  0.43  0.48] 0.38
7664 [ 1.    0.33  0.2 ] 0.13
7670 [ 0.93  0.73  0.6 ] 0.47
7671 [ 0.72  0.39  0.33] 0.28
7673 [ 0.83  0.17  0.11] 0.0
8665 [ 0.92  0.77  0.77] 0.69
8666 [ 0.87  0.53  0.53] 0.33
8667 [ 0.94  0.67  0.78] 0.56
8670 [ 0.73  0.47  0.67] 0.47
8671 [ 0.9  0.6  0.7] 0.6
8676 [ 0.75  0.44  0.25] 0.13
8682 [ 0.93  0.73  0.73] 0.6
Avg: [ 0.80271186  0.59288136  0.54864407] 0.443220338983

1 recipe example prediction & evaluation


In [35]:
#1 File
v=6663
y_t=pd.read_csv(fold+'/l'+str(v)+'.txt')
y_p=get_prediction(recipes,pairs[v])
print v,calc_score(y_t,y_p)
#y_t


6663 (array([ 0.79,  0.57,  0.57]), 0.57)

In [31]:
highlight_recipe(recipes,pairs[v])


0 combine parmesan cheese , pepper and garlic powder
_____________________________________
1 unfold pastry sheets onto cutting board
_____________________________________
2 brush lightly with egg white
_____________________________________
3 sprinkle each sheet with 1/4 of the cheese mixture
_____________________________________
4 lightly press into pastry , turn over
_____________________________________
5 repeat
_____________________________________
6 cut each sheet into 12 1 inch strips
_____________________________________
7 twist
_____________________________________
8 place on ungreased cookie sheet and bake in 350 degrees f 175 degrees c oven for 15 minutes or until golden brown
_____________________________________

In [36]:
y_t ### TRUE labels


Out[36]:
line_id action object target no nt description
0 0 combine cheese body 1/2 cup 0 0
1 0 combine pepper body 3/4 teaspoon 0 0
2 0 combine garlic powder body 1/2 teaspoon 0 0
3 1 unfold pastry sheets cutting board 1 (17.5 ounce) 0 0
4 2 brush pastry sheets egg 1 (17.5 ounce) 0 0
5 3 sprinkle body sheet 0 0 0
6 3 turn over body 0 0 0 0
7 4 press body pastry 0 0 0
8 4 turn over body 0 0 0 0
9 5 repeat body 0 0 0 0
10 6 cut body strips 0 0 0
11 7 twist body 0 0 0 0
12 8 place body sheet 0 0 0
13 8 bake body oven 0 350 f 0

In [37]:
y_p   ### PREDICTED labels


Out[37]:
line_id action object target no nt description
0 0 combine cheese body 1/2 cup 0 0
1 0 combine pepper body 3/4 teaspoon 0 0
2 0 combine garlic powder body 1/2 teaspoon 0 0
3 1 unfold pastry sheets cutting board 0 0 0
4 1 cutting board body 0 0 0
5 2 brush body egg white 0 1 0
6 3 sprinkle cheese body 1/4 0 0
7 4 press body pastry 0 0 0
8 4 turn over body 0 0 0 0
9 6 cut sheet body 0 0 0
10 8 place body sheet 0 350 175 degrees 0

In [ ]: