In [1]:
import pandas as pd

In [2]:
corpora_path = 'dialog-bAbI-tasks'

In [3]:
from gensim.models.word2vec import Word2Vec as w

w2v = w.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True)


Slow version of gensim.models.doc2vec is being used

In [3]:
vec_size = len(w2v['red']) 

vec_size


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-8323695c454c> in <module>()
----> 1 vec_size = len(w2v['red'])
      2 
      3 vec_size

NameError: name 'w2v' is not defined

In [4]:
import os 

files = os.listdir(corpora_path)

files


Out[4]:
['dialog-babi-task5-full-dialogs-trn.txt',
 'dialog-babi-task3-options-dev.txt',
 'dialog-babi-task2-API-refine-trn.txt',
 'dialog-babi-task3-options-trn.txt',
 'dialog-babi-task6-dstc2-tst.txt',
 'dialog-babi-task6-dstc2-trn.txt',
 'dialog-babi-task4-phone-address-trn.txt',
 'dialog-babi-task6-dstc2-candidates.txt',
 'dialog-babi-candidates.txt',
 'dialog-babi-task3-options-tst.txt',
 'dialog-babi-task4-phone-address-tst.txt',
 'dialog-babi-task4-phone-address-dev.txt',
 'dialog-babi-task1-API-calls-dev.txt',
 'dialog-babi-task2-API-refine-tst-OOV.txt',
 'dialog-babi-task3-options-tst-OOV.txt',
 'dialog-babi-task5-full-dialogs-tst-OOV.txt',
 'dialog-babi-task1-API-calls-trn.txt',
 'dialog-babi-kb-all.txt',
 'dialog-babi-task1-API-calls-tst.txt',
 'dialog-babi-task1-API-calls-tst-OOV.txt',
 'dialog-babi-task2-API-refine-dev.txt',
 'dialog-babi-task4-phone-address-tst-OOV.txt',
 'dialog-babi-task2-API-refine-tst.txt',
 'dialog-babi-task6-dstc2-dev.txt',
 'dialog-babi-task5-full-dialogs-dev.txt',
 'dialog-babi-task5-full-dialogs-tst.txt']

In [20]:
data_dict = dict()

for f in files:
    if 'candidates' in f: continue
    data_dict[f] = pd.read_csv(os.path.join(corpora_path, f), names=['text','bot'], delimiter='\t')

In [ ]:


In [21]:
task = 'task6'

task_data = [x for x in data_dict if task in x]

task_data


Out[21]:
['dialog-babi-task6-dstc2-trn.txt',
 'dialog-babi-task6-dstc2-tst.txt',
 'dialog-babi-task6-dstc2-dev.txt']

In [22]:
train_data = data_dict[[x for x in task_data if 'trn' in x][0]]
dev_data = data_dict[[x for x in task_data if 'dev' in x][0]]
test_data = data_dict[[x for x in task_data if 'tst' in x][0]]

train_data['o'] = 'trn'
dev_data['o'] = 'dev'
test_data['o'] = 'tst'

c = pd.concat((train_data, dev_data, test_data))
c.index = range(len(c))



#c = c[~c['text'].str.contains("<SILENCE>")] # get rid of <SILENCE> markers

c = c.fillna("<unk>")
c['ind'] = c.text.map(lambda x: x.split()[0]) # split out the index into another column
c['text'] = c.text.map(lambda x: x.split()[1:])

# I hate this
gid = []
j = 1
for i in c.ind:
    if i == '1': j+=1
    gid.append(j)

c['gid'] = gid

mask = []
has_api_call = 'True'
for i,r in c.iterrows():
    if r.ind == '1': has_api_call = 'True'
    if "api_call" in r.bot: 
        mask += ['True']
        has_api_call = 'False'
        continue
    mask += [has_api_call]
c['mask'] = mask
#c = c.drop(['ind'],axis=1)

In [23]:
d = c.copy()

In [24]:
c[:100]


Out[24]:
text bot o ind gid mask
0 [<SILENCE>] Hello , welcome to the Cambridge restaurant sy... trn 1 2 True
1 [i, want, a, moderately, priced, restaurant, i... api_call R_cuisine west moderate trn 2 2 True
2 [saint_johns_chop_house, R_post_code, saint_jo... <unk> trn 3 2 False
3 [saint_johns_chop_house, R_cuisine, british] <unk> trn 4 2 False
4 [saint_johns_chop_house, R_location, west] <unk> trn 5 2 False
5 [saint_johns_chop_house, R_phone, saint_johns_... <unk> trn 6 2 False
6 [saint_johns_chop_house, R_address, saint_john... <unk> trn 7 2 False
7 [saint_johns_chop_house, R_price, moderate] <unk> trn 8 2 False
8 [saint_johns_chop_house, R_rating, 3] <unk> trn 9 2 False
9 [prezzo, R_post_code, prezzo_post_code] <unk> trn 10 2 False
10 [prezzo, R_cuisine, italian] <unk> trn 11 2 False
11 [prezzo, R_location, west] <unk> trn 12 2 False
12 [prezzo, R_phone, prezzo_phone] <unk> trn 13 2 False
13 [prezzo, R_address, prezzo_address] <unk> trn 14 2 False
14 [prezzo, R_price, moderate] <unk> trn 15 2 False
15 [prezzo, R_rating, 9] <unk> trn 16 2 False
16 [<SILENCE>] prezzo is a nice restaurant in the west of tow... trn 17 2 False
17 [is, there, anything, else] You are looking for a restaurant is that right? trn 18 2 False
18 [give, me, a, different, restaurant] prezzo is a nice restaurant in the west of tow... trn 19 2 False
19 [goodbye] you are welcome trn 20 2 False
20 [<SILENCE>] Hello , welcome to the Cambridge restaurant sy... trn 1 3 True
21 [i, want, a, moderately, priced, restaurant, t... What kind of food would you like? trn 2 3 True
22 [canapes, food] What part of town do you have in mind? trn 3 3 True
23 [it, doesnt, matter] api_call canapes R_location moderate trn 4 3 True
24 [api_call, no, result] <unk> trn 5 3 False
25 [<SILENCE>] yippee_noodle_bar is a nice restaurant in the ... trn 6 3 False
26 [gastropub, food] Sorry there is no canapes restaurant in the mo... trn 7 3 False
27 [i, want, a, gastropub, restaurant] the cow pizza kitchen and bar is a nice restau... trn 8 3 False
28 [what, is, the, address] Sure , the cow pizza kitchen and bar is on the... trn 9 3 False
29 [what, is, the, phone, number] The phone number of the cow pizza kitchen and ... trn 10 3 False
... ... ... ... ... ... ...
70 [pizza_hut_city_centre, R_address, pizza_hut_c... <unk> trn 39 4 False
71 [pizza_hut_city_centre, R_price, cheap] <unk> trn 40 4 False
72 [pizza_hut_city_centre, R_rating, 10] <unk> trn 41 4 False
73 [<SILENCE>] pizza_hut_city_centre is a nice restaurant in ... trn 42 4 False
74 [may, i, have, the, address, and, phone, numbe... Sure , pizza_hut_city_centre is on pizza_hut_c... trn 43 4 False
75 [and, the, phone, number, please] The phone number of pizza_hut_city_centre is p... trn 44 4 False
76 [thank, you, good, bye] you are welcome trn 45 4 False
77 [<SILENCE>] Hello , welcome to the Cambridge restaurant sy... trn 1 5 True
78 [im, looking, for, canapes, food, in, any, pri... I am sorry but there is no canapes restaurant ... trn 2 5 True
79 [how, about, european] api_call european R_location R_price trn 3 5 True
80 [eraina, R_post_code, eraina_post_code] <unk> trn 4 5 False
81 [eraina, R_cuisine, european] <unk> trn 5 5 False
82 [eraina, R_location, centre] <unk> trn 6 5 False
83 [eraina, R_phone, eraina_phone] <unk> trn 7 5 False
84 [eraina, R_address, eraina_address] <unk> trn 8 5 False
85 [eraina, R_price, expensive] <unk> trn 9 5 False
86 [eraina, R_rating, 1] <unk> trn 10 5 False
87 [riverside_brasserie, R_post_code, riverside_b... <unk> trn 11 5 False
88 [riverside_brasserie, R_cuisine, european] <unk> trn 12 5 False
89 [riverside_brasserie, R_location, centre] <unk> trn 13 5 False
90 [riverside_brasserie, R_phone, riverside_brass... <unk> trn 14 5 False
91 [riverside_brasserie, R_address, riverside_bra... <unk> trn 15 5 False
92 [riverside_brasserie, R_price, moderate] <unk> trn 16 5 False
93 [riverside_brasserie, R_rating, 2] <unk> trn 17 5 False
94 [de_luca_cucina_and_bar, R_post_code, de_luca_... <unk> trn 18 5 False
95 [de_luca_cucina_and_bar, R_cuisine, european] <unk> trn 19 5 False
96 [de_luca_cucina_and_bar, R_location, centre] <unk> trn 20 5 False
97 [de_luca_cucina_and_bar, R_phone, de_luca_cuci... <unk> trn 21 5 False
98 [de_luca_cucina_and_bar, R_address, de_luca_cu... <unk> trn 22 5 False
99 [de_luca_cucina_and_bar, R_price, moderate] <unk> trn 23 5 False

100 rows × 6 columns

Find all possible entities and their attributes


In [ ]:


In [25]:
restaurants = c[c.bot.str.contains('<unk>')]
restaurants = restaurants.text.apply(lambda x: pd.Series(x))
restaurants.columns = ['rname', 'attr_key', 'attr_value' ]
restaurans = restaurants.drop_duplicates()
restaurants = restaurants[restaurants.rname != 'api_key']
restaurants = restaurants[restaurants.rname != 'ask']
restaurants = restaurants[restaurants.attr_key != 'no']

attrs = ['R_cuisine', 'R_location', 'R_price']



restaurants = restaurants[restaurants.attr_key.isin(attrs)]

restaurants.to_pickle('restaurants_props.pkl')


"number of restaurants:", len(set(restaurants.rname))


Out[25]:
('number of restaurants:', 104)

In [123]:
restaurants[restaurants.rname == 'cote']


Out[123]:
rname attr_key attr_value
2126 cote R_cuisine french
2127 cote R_location centre
2130 cote R_price expensive
2262 cote R_cuisine french
2263 cote R_location centre
2266 cote R_price expensive
2461 cote R_cuisine french
2462 cote R_location centre
2465 cote R_price expensive
3869 cote R_cuisine french
3870 cote R_location centre
3873 cote R_price expensive
4772 cote R_cuisine french
4773 cote R_location centre
4776 cote R_price expensive
5662 cote R_cuisine french
5663 cote R_location centre
5666 cote R_price expensive
5810 cote R_cuisine french
5811 cote R_location centre
5814 cote R_price expensive
7222 cote R_cuisine french
7223 cote R_location centre
7226 cote R_price expensive
8453 cote R_cuisine french
8454 cote R_location centre
8457 cote R_price expensive
11582 cote R_cuisine french
11583 cote R_location centre
11586 cote R_price expensive
... ... ... ...
141427 cote R_cuisine french
141428 cote R_location centre
141431 cote R_price expensive
142227 cote R_cuisine french
142228 cote R_location centre
142231 cote R_price expensive
146691 cote R_cuisine french
146692 cote R_location centre
146695 cote R_price expensive
146791 cote R_cuisine french
146792 cote R_location centre
146795 cote R_price expensive
147110 cote R_cuisine french
147111 cote R_location centre
147114 cote R_price expensive
148930 cote R_cuisine french
148931 cote R_location centre
148934 cote R_price expensive
149754 cote R_cuisine french
149755 cote R_location centre
149758 cote R_price expensive
151624 cote R_cuisine french
151625 cote R_location centre
151628 cote R_price expensive
152377 cote R_cuisine french
152378 cote R_location centre
152381 cote R_price expensive
153724 cote R_cuisine french
153725 cote R_location centre
153728 cote R_price expensive

408 rows × 3 columns


In [11]:
cols = list(set(restaurants.attr_key))
r = restaurants.pivot_table('attr_value', ['rname'], 'attr_key', aggfunc=lambda x: list(set(x))[0])
r = pd.get_dummies(data = r, columns = cols )
r['rname'] = r.index
c = r.columns.tolist()
c.insert(0, c.pop(c.index('rname')))
r = r.reindex(columns= c)

r.to_pickle('restaurants.pkl')

r[:10]


Out[11]:
rname R_cuisine_african R_cuisine_asian_oriental R_cuisine_bistro R_cuisine_british R_cuisine_chinese R_cuisine_european R_cuisine_french R_cuisine_fusion R_cuisine_gastropub ... R_cuisine_turkish R_cuisine_vietnamese R_price_cheap R_price_expensive R_price_moderate R_location_centre R_location_east R_location_north R_location_south R_location_west
rname
ali_baba ali_baba 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 1 0 0 0 0
anatolia anatolia 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 1 1 0 0 0 0
backstreet_bistro backstreet_bistro 0 0 1 0 0 0 0 0 0 ... 0 0 0 1 0 1 0 0 0 0
bangkok_city bangkok_city 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 1 0 0 0 0
bedouin bedouin 1 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 1 0 0 0 0
bloomsbury_restaurant bloomsbury_restaurant 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 1 0 0 0 0
caffe_uno caffe_uno 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 1 0 0 0 0
cambridge_lodge_restaurant cambridge_lodge_restaurant 0 0 0 0 0 1 0 0 0 ... 0 0 0 1 0 0 0 0 0 1
charlie_chan charlie_chan 0 0 0 0 1 0 0 0 0 ... 0 0 1 0 0 1 0 0 0 0
chiquito_restaurant_bar chiquito_restaurant_bar 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 1 0

10 rows × 33 columns


In [12]:
restaurants = r
restaurants[:10]


Out[12]:
rname R_cuisine_african R_cuisine_asian_oriental R_cuisine_bistro R_cuisine_british R_cuisine_chinese R_cuisine_european R_cuisine_french R_cuisine_fusion R_cuisine_gastropub ... R_cuisine_turkish R_cuisine_vietnamese R_price_cheap R_price_expensive R_price_moderate R_location_centre R_location_east R_location_north R_location_south R_location_west
rname
ali_baba ali_baba 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 1 0 0 0 0
anatolia anatolia 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 1 1 0 0 0 0
backstreet_bistro backstreet_bistro 0 0 1 0 0 0 0 0 0 ... 0 0 0 1 0 1 0 0 0 0
bangkok_city bangkok_city 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 1 0 0 0 0
bedouin bedouin 1 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 1 0 0 0 0
bloomsbury_restaurant bloomsbury_restaurant 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 1 0 0 0 0
caffe_uno caffe_uno 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 1 0 0 0 0
cambridge_lodge_restaurant cambridge_lodge_restaurant 0 0 0 0 0 1 0 0 0 ... 0 0 0 1 0 0 0 0 0 1
charlie_chan charlie_chan 0 0 0 0 1 0 0 0 0 ... 0 0 1 0 0 1 0 0 0 0
chiquito_restaurant_bar chiquito_restaurant_bar 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 1 0

10 rows × 33 columns

Find the suggesed restaurants


In [13]:
c = d.copy()

In [14]:
suggested = c[c.text.apply(str).str.contains('<SILENCE>') | c.text.apply(str).str.contains('api_call')]
suggested.loc[suggested.text.apply(str).str.contains('api_call'), 'bot'] = "no_result"
print(len(suggested[suggested.bot == 'no_result']))

suggested['target'] = suggested.bot.map(lambda x: x.split()[0])
possible_targets = list(set(restaurants.rname)) + ['no_result']

suggested = suggested[suggested.target.isin(possible_targets)]
#suggested = suggested[~suggested.bot.str.contains('api_call')]

suggested.drop(['text','bot', 'ind', 'mask'], axis=1, inplace=True)


# dropping duplicates means we only care about the first api_call in the dialogue
suggested = suggested.drop_duplicates(subset=['o','gid'])

len(suggested)


462
/home/casey/.local/lib/python3.5/site-packages/pandas/core/indexing.py:477: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
/home/casey/.local/lib/python3.5/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[14]:
3026

In [15]:
suggested


Out[15]:
o gid target
16 trn 2 prezzo
24 trn 3 no_result
34 trn 4 no_result
136 trn 5 michaelhouse_cafe
228 trn 6 the_nirala
467 trn 8 the_missing_sock
498 trn 9 no_result
710 trn 10 riverside_brasserie
936 trn 11 da_vinci_pizzeria
948 trn 12 the_lucky_star
986 trn 13 saint_johns_chop_house
1013 trn 14 the_missing_sock
1047 trn 15 the_gandhi
1267 trn 17 no_result
1299 trn 18 royal_spice
1321 trn 19 pizza_hut_cherry_hinton
1329 trn 20 no_result
1507 trn 21 charlie_chan
1527 trn 22 bangkok_city
1540 trn 23 pizza_hut_fen_ditton
1549 trn 24 no_result
1579 trn 25 no_result
1644 trn 26 no_result
1663 trn 27 prezzo
1682 trn 28 da_vinci_pizzeria
1703 trn 29 prezzo
1722 trn 30 prezzo
1742 trn 31 thanh_binh
1779 trn 32 la_margherita
1798 trn 33 frankie_and_bennys
... ... ... ...
152349 tst 3206 efes_restaurant
152370 tst 3207 thanh_binh
152390 tst 3208 cote
152415 tst 3209 thanh_binh
152577 tst 3210 royal_spice
152591 tst 3211 wagamama
152640 tst 3212 shanghai_family_restaurant
152668 tst 3213 the_nirala
152689 tst 3214 royal_spice
152709 tst 3215 pizza_hut_cherry_hinton
152730 tst 3216 la_margherita
152877 tst 3217 nandos
152984 tst 3218 peking_restaurant
153016 tst 3219 efes_restaurant
153039 tst 3220 prezzo
153251 tst 3221 la_tasca
153299 tst 3222 shiraz_restaurant
153313 tst 3223 the_missing_sock
153343 tst 3224 the_gardenia
153408 tst 3225 galleria
153449 tst 3226 yu_garden
153545 tst 3228 gourmet_burger_kitchen
153654 tst 3229 shanghai_family_restaurant
153672 tst 3230 sala_thong
153701 tst 3231 pizza_hut_cherry_hinton
153712 tst 3232 no_result
153737 tst 3233 restaurant_two_two
153901 tst 3234 tandoori_palace
153928 tst 3235 la_margherita
153965 tst 3236 taj_tandoori

3026 rows × 3 columns


In [16]:
c = d.copy()

In [17]:
import numpy as np

c = c[~c.text.apply(str).str.contains('<SILENCE>')]
c = c[~c.bot.apply(str).str.contains('<unk>')]
c.loc[~c.bot.apply(str).str.contains('api_call'), 'bot'] = ""

# need a bit of discoures history
#c['text1'] = c.text.shift(1)
#c['text2'] = c.text.shift(2)
#c['text3'] = c.text.shift(3)
#c.dropna(subset=['text', 'text1'], inplace=True)

#c['text'] = c.text2.map(list) + c.text1.map(list) + c.text.map(list)
#c['text'] = c.text1.map(list) + c.text.map(list)
#c.drop(['text1'], axis=1, inplace=True)

# dropping duplicates means we only care about the first api_call in the dialogue
#c = c.drop_duplicates(subset=['o','gid'])


#

#c = pd.merge(c, suggested, on=['o','gid'], how='left')

c = c[c['mask'].str.contains('True')]
c = c.groupby('gid').agg(sum)
c = c[c['mask'] == 'True']
c['gid'] = c.index
c = pd.merge(c, suggested, on=['o','gid'], how='left')



c.dropna(inplace=True)
len(c)


Out[17]:
414

In [18]:
c.to_pickle('utts_refs.pkl')

In [19]:
c[:5]


Out[19]:
text bot o ind mask gid target
0 [i, want, a, moderately, priced, restaurant, i... api_call R_cuisine west moderate trn 2 True 2 prezzo
2 [cheap, restaurant, in, the, north, part, of, ... api_call R_cuisine north cheap trn 2 True 11 da_vinci_pizzeria
3 [cheap, restaurant, in, the, south, part, of, ... api_call R_cuisine south cheap trn 2 True 12 the_lucky_star
4 [cheap, restaurant, serving, indian, food] api_call indian R_location cheap trn 2 True 15 the_gandhi
5 [thai, food] api_call thai R_location R_price trn 2 True 22 bangkok_city

In [ ]:

Grab only the utts where there is an api_call


In [133]:
# make the text (which is a list of words) into a single column of words
s = c.text.apply(lambda x: pd.Series(x)).stack().reset_index(level=1, drop=True)
s.name = 'word'
c = c.drop('text', axis=1).join(s)
c.dropna(subset=['bot'], inplace=True)


#c['w2v'] = c.text.map(lambda x: [w2v[i] for i in x if i in w2v])
c = c[~c.word.apply(str).str.contains('_')]
c['w2v'] = c.word.map(lambda x: w2v[x] if x in w2v else np.zeros(vec_size))
attr_df = c.bot.apply(lambda x: pd.Series(x.split()))
c['type'], c['loc'], c['price'] = attr_df[1], attr_df[2], attr_df[3]
c = pd.get_dummies(data = c, columns = ['type','loc','price'] )
    
data = c

len(data)


Out[133]:
3783

In [134]:
train_data = data[data.o == 'trn'].drop(['o'], axis=1)
dev_data = data[data.o == 'dev'].drop(['o'], axis=1)
test_data = data[data.o == 'tst'].drop(['o'], axis=1)

train_data.shape, test_data.shape, dev_data.shape


Out[134]:
((2138, 47), (401, 47), (1244, 47))

In [135]:
train_data[:3]


Out[135]:
bot ind mask gid target word w2v type_R_cuisine type_african type_austrian ... type_vietnamese loc_R_location loc_east loc_north loc_south loc_west price_R_price price_cheap price_expensive price_moderate
0 api_call R_cuisine west moderate 2 True 2 prezzo i [-0.225586, -0.0195312, 0.0908203, 0.237305, -... 1 0 0 ... 0 0 0 0 0 1 0 0 0 1
0 api_call R_cuisine west moderate 2 True 2 prezzo want [0.136719, 0.148438, 0.114746, 0.0698242, -0.1... 1 0 0 ... 0 0 0 0 0 1 0 0 0 1
0 api_call R_cuisine west moderate 2 True 2 prezzo a [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 1 0 0 ... 0 0 0 0 0 1 0 0 0 1

3 rows × 47 columns

Train


In [136]:
start_col = 'type_R_cuisine'

In [137]:
import numpy as np

y_train = train_data.ix[:,start_col:].as_matrix() 

X_train = train_data.w2v.as_matrix()
X_train = np.array(list(X_train), dtype=np.float) # needed to fit the regression model

X_train.shape, y_train.shape


Out[137]:
((2138, 300), (2138, 40))

In [138]:
from sklearn.linear_model import *
import numpy as np


model = Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=True, random_state=None, solver='auto', tol=0.01)

model.fit(X_train, y_train)


Out[138]:
Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.01)

Test: How well can it predict the 3 usual attributes?


In [139]:
# word level composition
def compute_target(utt):
    predictions = [model.predict(w.reshape(1,-1))[0] for w in utt.w2v]
    return compose(predictions)

In [140]:
#
# composition by union of vectors
#
def compose(predictions):
    p = predictions[0]
    for i in predictions[1:]:
        p = np.logical_or(i, p)
    return p

In [141]:
# attribute level composition
def compute_target(utt):
    predictions = utt.w2v.values
    p = compose(predictions)
    return model.predict(p.reshape(1,-1))

In [142]:
#
#  this is the composition function, it just sums vectors
#
def compose(predictions):
    p = predictions[0]
    for i in predictions[1:]:
        p = np.sum((i, p),axis=0)
    return p

In [143]:
targets = train_data.drop(['bot','word','w2v', 'target', 'gid','mask','ind'], 1)
targets = targets.drop_duplicates()
#targets = targets.ix[:,start_col:].as_matrix() 

train_data.shape, targets.shape

targets[:10]


Out[143]:
type_R_cuisine type_african type_austrian type_british type_canapes type_cantonese type_catalan type_chinese type_cuban type_european ... type_vietnamese loc_R_location loc_east loc_north loc_south loc_west price_R_price price_cheap price_expensive price_moderate
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 1
2 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 1 0 0
3 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 1 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 1 0 0
5 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 1 0 0 0
6 0 0 0 0 0 0 0 0 0 1 ... 0 1 0 0 0 0 1 0 0 0
11 1 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 1 0 0
12 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 1 0 0 0
13 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 1 0 0
15 1 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 1 0 0

10 rows × 40 columns


In [144]:
import operator
import scipy
import sklearn

eval_data = dev_data
gold=[]
guess=[]

indeces = list(set(eval_data.index))
print('num instances', len(indeces))

for eid in indeces:
    sub = eval_data[eval_data.index == eid] # grab the RE for this scene
    target = compute_target(sub) # compose the predictions of each word to a target vector
    distances = [(v, scipy.spatial.distance.cosine(target,v.ix[start_col:])) for i,v in targets.iterrows()]
    guess += [min(distances, key=operator.itemgetter(1))[0]] # which object has the shortest distance?
    gold += [sub.iloc[-1].ix[start_col:]] # all the rows in sub have the same matrix


num instances 128
/home/casey/.local/lib/python3.5/site-packages/scipy/spatial/distance.py:326: RuntimeWarning: invalid value encountered in double_scalars
  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))

In [145]:
#
guess=np.array(guess,dtype=np.float)
gold=np.array(gold,dtype=np.float)

sklearn.metrics.f1_score(gold, guess, average='micro', labels=np.array([0,1],dtype=np.float))


Out[145]:
0.89322916666666663

In [ ]:

Incremental Evaluation


In [146]:
from collections import defaultdict as dd

incr_results = dd(list)
filled_slots = dd(list)

for eid in indeces:
    pre_sub = eval_data[eval_data.index == eid] # grab the RE for this scene
    for i in range(1,len(pre_sub)):
        sub = pre_sub[:i]
        gold=[]
        guess=[]
        target = compute_target(sub) # compose the predictions of each word to a target vector
        distances = [(v, scipy.spatial.distance.cosine(target,v.ix[start_col:])) for i,v in targets.iterrows()]
        guess += [min(distances, key=operator.itemgetter(1))[0]] # which object has the shortest distance?
        gold += [sub.iloc[-1].ix[start_col:]] # all the rows in sub have the same matrix
        filled_slots[i].append(np.sum(guess))
        guess=np.array(guess,dtype=np.float)
        gold=np.array(gold,dtype=np.float)
        incr_results[i].append(sklearn.metrics.f1_score(gold, guess, average='micro', labels=np.array([0,1],dtype=np.float)))


/home/casey/.local/lib/python3.5/site-packages/scipy/spatial/distance.py:326: RuntimeWarning: invalid value encountered in double_scalars
  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))

In [147]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 

results = [(i,np.mean(incr_results[i])) for i in incr_results]

r = list(zip(*results))

plt.plot(r[0], r[1])


Out[147]:
[<matplotlib.lines.Line2D at 0x7f43e0091668>]

In [ ]:

Test: How well can it predict the restaurant using all of a restaurant's attributes?


In [148]:
rdata = data[['ind','gid','target','word','w2v','o']]

In [149]:
rdata[:10]


Out[149]:
ind gid target word w2v o
0 2 2 prezzo i [-0.225586, -0.0195312, 0.0908203, 0.237305, -... trn
0 2 2 prezzo want [0.136719, 0.148438, 0.114746, 0.0698242, -0.1... trn
0 2 2 prezzo a [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... trn
0 2 2 prezzo moderately [0.145508, 0.0294189, -0.289062, 0.279297, 0.0... trn
0 2 2 prezzo priced [-0.0708008, 0.128906, -0.324219, 0.205078, -0... trn
0 2 2 prezzo restaurant [-0.148438, -0.124023, 0.0412598, 0.208984, -0... trn
0 2 2 prezzo in [0.0703125, 0.0869141, 0.0878906, 0.0625, 0.06... trn
0 2 2 prezzo the [0.0800781, 0.10498, 0.0498047, 0.0534668, -0.... trn
0 2 2 prezzo west [-0.0927734, -0.12207, 0.140625, -0.0649414, 0... trn
0 2 2 prezzo part [0.0071106, 0.0563965, 0.0106812, 0.0825195, -... trn

In [150]:
restaurants[:5]


Out[150]:
rname R_price_cheap R_price_expensive R_price_moderate R_location_centre R_location_east R_location_north R_location_south R_location_west R_cuisine_african ... R_cuisine_lebanese R_cuisine_mediterranean R_cuisine_mexican R_cuisine_north_american R_cuisine_portuguese R_cuisine_seafood R_cuisine_spanish R_cuisine_thai R_cuisine_turkish R_cuisine_vietnamese
rname
ali_baba ali_baba 0 0 1 1 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
anatolia anatolia 0 0 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
backstreet_bistro backstreet_bistro 0 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
bangkok_city bangkok_city 0 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
bedouin bedouin 0 1 0 1 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 33 columns


In [158]:
num_cols = len(restaurants.columns) -1
start_col = 'R_price_cheap'
num_cols


Out[158]:
32

In [159]:
rdata = rdata[rdata.target.isin(restaurants.rname)]
rdata['attrvec'] = np.array(rdata.target.map(lambda x: np.array(restaurants[restaurants.rname == x].ix[0:,start_col:].as_matrix()[0])))

In [160]:
rdata[:5]


Out[160]:
ind gid target word w2v o attrvec
0 2 2 prezzo i [-0.225586, -0.0195312, 0.0908203, 0.237305, -... trn [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
0 2 2 prezzo want [0.136719, 0.148438, 0.114746, 0.0698242, -0.1... trn [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
0 2 2 prezzo a [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... trn [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
0 2 2 prezzo moderately [0.145508, 0.0294189, -0.289062, 0.279297, 0.0... trn [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
0 2 2 prezzo priced [-0.0708008, 0.128906, -0.324219, 0.205078, -0... trn [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...

In [161]:
train_data = rdata[rdata.o == 'trn'].drop(['o'], axis=1)
dev_data = rdata[rdata.o == 'dev'].drop(['o'], axis=1)
test_data = rdata[rdata.o == 'tst'].drop(['o'], axis=1)

train_data.shape, test_data.shape, dev_data.shape


Out[161]:
((2006, 6), (395, 6), (1175, 6))

In [162]:
import numpy as np

y_train = [x for x in train_data.attrvec] # this shold just work with as_matri(), but the shape is always wrong

X_train = train_data.w2v.as_matrix()
X_train = np.array(list(X_train), dtype=np.float) # needed to fit the regression model

X_train.shape


Out[162]:
(2006, 300)

In [ ]:


In [163]:
from sklearn.linear_model import *
import numpy as np


model = Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=True, random_state=None, solver='auto', tol=0.01)

model.fit(X_train, y_train)


Out[163]:
Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.01)

In [164]:
def compute_mrr(lst, target):
    i = 1.0
    for l in lst:
        if l == target: break
        i+=1
    return 1.0/i

In [165]:
import operator
import scipy
import sklearn

eval_data = dev_data
gold=[]
guess=[]

indeces = list(set(eval_data.index))
print('num instances', len(indeces))
mrr = 0.0
for eid in indeces:
    sub = rdata[rdata.index == eid] # grab the RE for this scene
    target = compute_target(sub) # compose the predictions of each word to a target vector
    distances = [(v['rname'], scipy.spatial.distance.cosine(target,v.ix[start_col:])) for i,v in restaurants.iterrows()]
    distances.sort(key=operator.itemgetter(1))
    guess += [distances[0][0]]
    d = list(zip(*distances))[0]
    mrr += compute_mrr(d, sub.iloc[-1].ix['target'])
    gold += [sub.iloc[-1].ix['target']] # all the rows in sub have the same matrix


num instances 119

In [166]:
sklearn.metrics.accuracy_score(gold, guess)


Out[166]:
0.47058823529411764

In [167]:
mrr / len(gold)


Out[167]:
0.6229480614059901

Find the restaurant incrementally


In [168]:
import operator
import scipy
import sklearn

eval_data = dev_data
gold=[]
guess=[]

indeces = list(set(eval_data.index))
print('num instances', len(indeces))
mrr = 0.0
for eid in indeces:
    pre_sub = rdata[rdata.index == eid] # grab the RE for this scene
    for i in range(1,len(pre_sub)):
        sub = pre_sub[:i]
        gold = []
        guess = []
        target = compute_target(sub) # compose the predictions of each word to a target vector
        distances = [(v['rname'], scipy.spatial.distance.cosine(target,v.ix[start_col:])) for i,v in restaurants.iterrows()]
        distances.sort(key=operator.itemgetter(1))
        guess += [distances[0][0]]
        d = list(zip(*distances))[0]
        mrr += compute_mrr(d, sub.iloc[-1].ix['target'])
        gold += [sub.iloc[-1].ix['target']] # all the rows in sub have the same matrix
        incr_results[i].append(sklearn.metrics.accuracy_score(gold, guess))


num instances 119

In [169]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 

results = [(i,np.mean(incr_results[i])) for i in incr_results]

r = list(zip(*results))

plt.plot(r[0], r[1])


Out[169]:
[<matplotlib.lines.Line2D at 0x7f43dca72e10>]

First closest word instead


In [170]:
#eval_data['w2v'] = eval_data.word.map(lambda x: w2v[w2v.most_similar([x], topn=1)[0][0]] if x in w2v else np.zeros(vec_size))

In [171]:
# this returns it back to normal:
#eval_data['w2v'] = eval_data.word.map(lambda x: w2v[x] if x in w2v else np.zeros(vec_size))

In [172]:
import pickle
# now you can save it to a file
with open('ridge_restaurant.pkl', 'wb') as f:
    pickle.dump(model, f)

In [ ]: