In [1]:
import pandas as pd
In [2]:
corpora_path = 'dialog-bAbI-tasks'
In [3]:
from gensim.models import KeyedVectors
# load the pretrained GoogleNews embeddings (KeyedVectors is the current loader;
# Word2Vec.load_word2vec_format was removed from gensim)
w2v = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
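# note (an assumption about your resources, not in the original run): if memory
# is tight, load_word2vec_format also accepts limit=N to read only the N most
# frequent vectors and cut load time/RAM.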
In [3]:
vec_size = len(w2v['red'])
vec_size
In [4]:
import os
files = os.listdir(corpora_path)
files
Out[4]:
In [20]:
data_dict = dict()
for f in files:
    if 'candidates' in f: continue  # skip the candidate-answer files
    data_dict[f] = pd.read_csv(os.path.join(corpora_path, f), names=['text','bot'], delimiter='\t')
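In [ ]:
# Format note (hedged, from the bAbI dialog release): each line of a task file
# is "turn-index user-utterance<TAB>bot-utterance", so read_csv above yields one
# row per turn, with the turn index still glued onto the user text.
# Toy illustration (invented utterances):
pd.DataFrame({'text': ['1 hello', '2 may i have italian food'],
              'bot': ['hello what can i help you with today', 'api_call italian north cheap']})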
In [ ]:
In [21]:
task = 'task6'
task_data = [x for x in data_dict if task in x]
task_data
Out[21]:
In [22]:
train_data = data_dict[[x for x in task_data if 'trn' in x][0]]
dev_data = data_dict[[x for x in task_data if 'dev' in x][0]]
test_data = data_dict[[x for x in task_data if 'tst' in x][0]]
train_data['o'] = 'trn'
dev_data['o'] = 'dev'
test_data['o'] = 'tst'
c = pd.concat((train_data, dev_data, test_data))
c.index = range(len(c))
#c = c[~c['text'].str.contains("<SILENCE>")] # get rid of <SILENCE> markers
c = c.fillna("<unk>")
c['ind'] = c.text.map(lambda x: x.split()[0]) # split out the index into another column
c['text'] = c.text.map(lambda x: x.split()[1:])
# assign a dialogue (group) id: the turn index restarts at '1' for each new dialogue
gid = []
j = 1
for i in c.ind:
    if i == '1': j += 1
    gid.append(j)
c['gid'] = gid
# mark each turn with 'True' until the dialogue's first api_call has been seen
mask = []
has_api_call = 'True'
for i, r in c.iterrows():
    if r.ind == '1': has_api_call = 'True'
    if "api_call" in r.bot:
        mask += ['True']
        has_api_call = 'False'
        continue
    mask += [has_api_call]
c['mask'] = mask
#c = c.drop(['ind'],axis=1)
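In [ ]:
# Hedged aside (equivalent, not part of the original run): the gid loop above
# can be written as one vectorized cumulative sum over the "new dialogue" marker.
assert (c.gid == (c.ind == '1').cumsum() + 1).all()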
In [23]:
d = c.copy()
In [24]:
c[:100]
Out[24]:
In [ ]:
In [25]:
restaurants = c[c.bot.str.contains('<unk>')]  # KB triples have no bot turn, so bot is '<unk>' after the fillna above
restaurants = restaurants.text.apply(lambda x: pd.Series(x))
restaurants.columns = ['rname', 'attr_key', 'attr_value']
restaurants = restaurants.drop_duplicates()
restaurants = restaurants[restaurants.rname != 'api_key']
restaurants = restaurants[restaurants.rname != 'ask']
restaurants = restaurants[restaurants.attr_key != 'no']
attrs = ['R_cuisine', 'R_location', 'R_price']
restaurants = restaurants[restaurants.attr_key.isin(attrs)]
restaurants.to_pickle('restaurants_props.pkl')
"number of restaurants:", len(set(restaurants.rname))
Out[25]:
In [123]:
restaurants[restaurants.rname == 'cote']
Out[123]:
In [11]:
cols = list(set(restaurants.attr_key))
# wide format: one row per restaurant, one column per attribute
r = restaurants.pivot_table('attr_value', ['rname'], 'attr_key', aggfunc=lambda x: list(set(x))[0])
r = pd.get_dummies(data=r, columns=cols)
r['rname'] = r.index
c = r.columns.tolist()  # note: reuses the name c; the corpus frame is restored from d below
c.insert(0, c.pop(c.index('rname')))  # move rname to the front
r = r.reindex(columns=c)
r.to_pickle('restaurants.pkl')
r[:10]
Out[11]:
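In [ ]:
# Hedged illustration (toy data, not from the corpus): the pivot_table +
# get_dummies pattern above turns long-format (rname, attr_key, attr_value)
# triples into one one-hot row per restaurant.
toy = pd.DataFrame({'rname': ['a', 'a', 'b', 'b'],
                    'attr_key': ['R_price', 'R_location'] * 2,
                    'attr_value': ['cheap', 'north', 'expensive', 'south']})
toy = toy.pivot_table('attr_value', ['rname'], 'attr_key', aggfunc=lambda x: list(set(x))[0])
pd.get_dummies(toy, columns=['R_location', 'R_price'])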
In [12]:
restaurants = r
restaurants[:10]
Out[12]:
In [13]:
c = d.copy()
In [14]:
suggested = c[c.text.apply(str).str.contains('<SILENCE>') | c.text.apply(str).str.contains('api_call')].copy()  # .copy() so the assignments below do not warn
suggested.loc[suggested.text.apply(str).str.contains('api_call'), 'bot'] = "no_result"
print(len(suggested[suggested.bot == 'no_result']))
suggested['target'] = suggested.bot.map(lambda x: x.split()[0])
possible_targets = list(set(restaurants.rname)) + ['no_result']
suggested = suggested[suggested.target.isin(possible_targets)]
#suggested = suggested[~suggested.bot.str.contains('api_call')]
suggested.drop(['text','bot', 'ind', 'mask'], axis=1, inplace=True)
# dropping duplicates means we only care about the first api_call in the dialogue
suggested = suggested.drop_duplicates(subset=['o','gid'])
len(suggested)
Out[14]:
In [15]:
suggested
Out[15]:
In [16]:
c = d.copy()
In [17]:
import numpy as np
c = c[~c.text.apply(str).str.contains('<SILENCE>')]
c = c[~c.bot.apply(str).str.contains('<unk>')]
c.loc[~c.bot.apply(str).str.contains('api_call'), 'bot'] = ""
# need a bit of discourse history
#c['text1'] = c.text.shift(1)
#c['text2'] = c.text.shift(2)
#c['text3'] = c.text.shift(3)
#c.dropna(subset=['text', 'text1'], inplace=True)
#c['text'] = c.text2.map(list) + c.text1.map(list) + c.text.map(list)
#c['text'] = c.text1.map(list) + c.text.map(list)
#c.drop(['text1'], axis=1, inplace=True)
# dropping duplicates means we only care about the first api_call in the dialogue
#c = c.drop_duplicates(subset=['o','gid'])
#
#c = pd.merge(c, suggested, on=['o','gid'], how='left')
c = c[c['mask'].str.contains('True')]  # keep only turns before the first api_call
c = c.groupby('gid').agg(sum)  # per dialogue: sum concatenates the token lists (and the mask strings)
c = c[c['mask'] == 'True']  # a k-row group's mask becomes 'True' * k, so this keeps dialogues with exactly one such turn
c['gid'] = c.index
c = pd.merge(c, suggested, on=['o','gid'], how='left')
c.dropna(inplace=True)
len(c)
Out[17]:
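In [ ]:
# Hedged aside (toy data, not part of the original run): groupby + sum on
# object columns concatenates, which is what the cell above relies on: token
# lists are joined per dialogue and the 'True' strings concatenate per row.
toy = pd.DataFrame({'gid': [1, 1, 2], 'text': [['a'], ['b'], ['c']], 'mask': ['True'] * 3})
toy.groupby('gid').agg(sum)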
In [18]:
c.to_pickle('utts_refs.pkl')
In [19]:
c[:5]
Out[19]:
In [ ]:
In [133]:
# make the text (which is a list of words) into a single column of words
s = c.text.apply(lambda x: pd.Series(x)).stack().reset_index(level=1, drop=True)
s.name = 'word'
c = c.drop('text', axis=1).join(s)
c.dropna(subset=['bot'], inplace=True)
#c['w2v'] = c.text.map(lambda x: [w2v[i] for i in x if i in w2v])
c = c[~c.word.apply(str).str.contains('_')]  # drop special tokens (e.g. R_... placeholders)
c['w2v'] = c.word.map(lambda x: w2v[x] if x in w2v else np.zeros(vec_size))
attr_df = c.bot.apply(lambda x: pd.Series(x.split()))  # bot turn is "api_call <cuisine> <location> <price>"
c['type'], c['loc'], c['price'] = attr_df[1], attr_df[2], attr_df[3]
c = pd.get_dummies(data = c, columns = ['type','loc','price'] )
data = c
len(data)
Out[133]:
In [134]:
train_data = data[data.o == 'trn'].drop(['o'], axis=1)
dev_data = data[data.o == 'dev'].drop(['o'], axis=1)
test_data = data[data.o == 'tst'].drop(['o'], axis=1)
train_data.shape, test_data.shape, dev_data.shape
Out[134]:
In [135]:
train_data[:3]
Out[135]:
In [136]:
start_col = 'type_R_cuisine'
In [137]:
import numpy as np
y_train = train_data.loc[:, start_col:].to_numpy()
X_train = train_data.w2v.to_numpy()
X_train = np.array(list(X_train), dtype=float)  # stack the per-word vectors into a 2-d matrix for the regression model
X_train.shape, y_train.shape
Out[137]:
In [138]:
from sklearn.linear_model import Ridge
import numpy as np
model = Ridge(alpha=0.1, normalize=True, tol=0.01)  # other arguments left at their defaults
# (on sklearn >= 1.2 the normalize flag is gone; standardize X with a StandardScaler pipeline instead)
model.fit(X_train, y_train)
Out[138]:
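In [ ]:
# Hedged sanity check (illustrative, not part of the original run): the ridge
# model maps a 300-d word embedding onto the one-hot attribute columns, so a
# cuisine word should put most of its weight on the cuisine slots.
model.predict(w2v['italian'].reshape(1, -1))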
In [139]:
# word-level composition: predict an attribute vector per word, then compose
def compute_target(utt):
    predictions = [model.predict(v.reshape(1, -1))[0] for v in utt.w2v]
    return compose(predictions)
In [140]:
#
# composition by union of vectors
#
def compose(predictions):
    p = predictions[0]
    for i in predictions[1:]:
        p = np.logical_or(i, p)
    return p
In [141]:
# attribute-level composition: compose the word vectors first, then predict once
# (note: this cell and the next override the word-level pair defined above)
def compute_target(utt):
    predictions = utt.w2v.values
    p = compose(predictions)
    return model.predict(p.reshape(1, -1))[0]  # [0] flattens to 1-d for the cosine below
In [142]:
#
# this is the composition function, it just sums vectors
#
def compose(predictions):
    p = predictions[0]
    for i in predictions[1:]:
        p = np.sum((i, p), axis=0)
    return p
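In [ ]:
# Hedged illustration (toy vectors, not part of the original run): the two
# composition strategies side by side. Union keeps any slot that fired for
# either word; summing accumulates evidence per slot.
a, b = np.array([1., 0., 0.]), np.array([1., 1., 0.])
np.logical_or(a, b), np.sum((a, b), axis=0)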
In [143]:
targets = train_data.drop(['bot', 'word', 'w2v', 'target', 'gid', 'mask', 'ind'], axis=1)
targets = targets.drop_duplicates()
#targets = targets.loc[:, start_col:].to_numpy()
train_data.shape, targets.shape
targets[:10]
Out[143]:
In [144]:
import operator
import scipy
import sklearn
eval_data = dev_data
gold = []
guess = []
indices = list(set(eval_data.index))
print('num instances', len(indices))
for eid in indices:
    sub = eval_data[eval_data.index == eid]  # grab the RE for this scene
    target = compute_target(sub)  # compose the per-word predictions into a target vector
    distances = [(v, scipy.spatial.distance.cosine(target, v.loc[start_col:])) for i, v in targets.iterrows()]
    guess += [min(distances, key=operator.itemgetter(1))[0]]  # which object has the shortest distance?
    gold += [sub.iloc[-1].loc[start_col:]]  # all the rows in sub share the same target vector
In [145]:
#
guess = np.array(guess, dtype=float)
gold = np.array(gold, dtype=float)
sklearn.metrics.f1_score(gold, guess, average='micro', labels=np.array([0, 1], dtype=float))
Out[145]:
In [ ]:
In [146]:
from collections import defaultdict as dd
incr_results = dd(list)
filled_slots = dd(list)
for eid in indices:
    pre_sub = eval_data[eval_data.index == eid]  # grab the RE for this scene
    for i in range(1, len(pre_sub)):
        sub = pre_sub[:i]  # incremental prefix: the first i words of the RE
        gold = []
        guess = []
        target = compute_target(sub)  # compose the per-word predictions into a target vector
        distances = [(v, scipy.spatial.distance.cosine(target, v.loc[start_col:])) for j, v in targets.iterrows()]
        guess += [min(distances, key=operator.itemgetter(1))[0]]  # which object has the shortest distance?
        gold += [sub.iloc[-1].loc[start_col:]]  # all the rows in sub share the same target vector
        filled_slots[i].append(np.sum(guess))  # how many slots the guess fills
        guess = np.array(guess, dtype=float)
        gold = np.array(gold, dtype=float)
        incr_results[i].append(sklearn.metrics.f1_score(gold, guess, average='micro', labels=np.array([0, 1], dtype=float)))
In [147]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
results = [(i, np.mean(incr_results[i])) for i in sorted(incr_results)]
r = list(zip(*results))
plt.xlabel('prefix length (words)')
plt.ylabel('mean micro F1')
plt.plot(r[0], r[1])
Out[147]:
In [ ]:
In [148]:
rdata = data[['ind','gid','target','word','w2v','o']]
In [149]:
rdata[:10]
Out[149]:
In [150]:
restaurants[:5]
Out[150]:
In [158]:
num_cols = len(restaurants.columns) -1
start_col = 'R_price_cheap'
num_cols
Out[158]:
In [159]:
rdata = rdata[rdata.target.isin(restaurants.rname)].copy()
rdata['attrvec'] = rdata.target.map(lambda x: restaurants.loc[restaurants.rname == x, start_col:].to_numpy()[0])
In [160]:
rdata[:5]
Out[160]:
In [161]:
train_data = rdata[rdata.o == 'trn'].drop(['o'], axis=1)
dev_data = rdata[rdata.o == 'dev'].drop(['o'], axis=1)
test_data = rdata[rdata.o == 'tst'].drop(['o'], axis=1)
train_data.shape, test_data.shape, dev_data.shape
Out[161]:
In [162]:
import numpy as np
y_train = np.array(list(train_data.attrvec))  # stacking by hand; .to_numpy() alone leaves an object array with the wrong shape
X_train = train_data.w2v.to_numpy()
X_train = np.array(list(X_train), dtype=float)  # needed to fit the regression model
X_train.shape
Out[162]:
In [ ]:
In [163]:
from sklearn.linear_model import Ridge
import numpy as np
model = Ridge(alpha=0.1, normalize=True, tol=0.01)  # same settings as the attribute model above
model.fit(X_train, y_train)
Out[163]:
In [164]:
def compute_mrr(lst, target):
    # reciprocal rank of target within the ranked list lst
    # (returns 1/(len(lst)+1) if target is absent)
    i = 1.0
    for l in lst:
        if l == target: break
        i += 1
    return 1.0 / i
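In [ ]:
# Quick check (illustrative restaurant names, not part of the original run):
# the target at rank 2 gives a reciprocal rank of 0.5.
compute_mrr(['prezzo', 'cote', 'nandos'], 'cote')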
In [165]:
import operator
import scipy
import sklearn
eval_data = dev_data
gold = []
guess = []
indices = list(set(eval_data.index))
print('num instances', len(indices))
mrr = 0.0
for eid in indices:
    sub = eval_data[eval_data.index == eid]  # grab the RE for this scene
    target = compute_target(sub)  # compose the per-word predictions into a target vector
    distances = [(v['rname'], scipy.spatial.distance.cosine(target, v.loc[start_col:])) for i, v in restaurants.iterrows()]
    distances.sort(key=operator.itemgetter(1))
    guess += [distances[0][0]]  # the closest restaurant
    d = list(zip(*distances))[0]
    mrr += compute_mrr(d, sub.iloc[-1].loc['target'])
    gold += [sub.iloc[-1].loc['target']]  # all the rows in sub share the same target
In [166]:
sklearn.metrics.accuracy_score(gold, guess)
Out[166]:
In [167]:
mrr / len(gold)  # mean reciprocal rank
Out[167]:
In [168]:
import operator
import scipy
import sklearn
eval_data = dev_data
indices = list(set(eval_data.index))
print('num instances', len(indices))
incr_results = dd(list)  # reset: the f1 loop above wrote into the same dict
mrr = 0.0
for eid in indices:
    pre_sub = eval_data[eval_data.index == eid]  # grab the RE for this scene
    for i in range(1, len(pre_sub)):
        sub = pre_sub[:i]  # incremental prefix: the first i words of the RE
        gold = []
        guess = []
        target = compute_target(sub)  # compose the per-word predictions into a target vector
        distances = [(v['rname'], scipy.spatial.distance.cosine(target, v.loc[start_col:])) for j, v in restaurants.iterrows()]
        distances.sort(key=operator.itemgetter(1))
        guess += [distances[0][0]]  # the closest restaurant
        d = list(zip(*distances))[0]
        mrr += compute_mrr(d, sub.iloc[-1].loc['target'])
        gold += [sub.iloc[-1].loc['target']]  # all the rows in sub share the same target
        incr_results[i].append(sklearn.metrics.accuracy_score(gold, guess))
In [169]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
results = [(i, np.mean(incr_results[i])) for i in sorted(incr_results)]
r = list(zip(*results))
plt.xlabel('prefix length (words)')
plt.ylabel('mean accuracy')
plt.plot(r[0], r[1])
Out[169]:
In [170]:
#eval_data['w2v'] = eval_data.word.map(lambda x: w2v[w2v.most_similar([x], topn=1)[0][0]] if x in w2v else np.zeros(vec_size))
In [171]:
# this returns it back to normal:
#eval_data['w2v'] = eval_data.word.map(lambda x: w2v[x] if x in w2v else np.zeros(vec_size))
In [172]:
import pickle
# now you can save it to a file
with open('ridge_restaurant.pkl', 'wb') as f:
    pickle.dump(model, f)
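In [ ]:
# Usage sketch (not part of the original run): reload the pickled model later.
with open('ridge_restaurant.pkl', 'rb') as f:
    model = pickle.load(f)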
In [ ]: