In [1]:
import imaginet.vendrov_provider as dp
import json
In [2]:
# Build the data provider for the 'coco' dataset under the given root path.
prov = dp.getDataProvider(
    dataset='coco',
    root='/home/gchrupala/reimaginet/',
)
In [3]:
from collections import Counter
def vocabulary(prov):
    """Count token occurrences over the 'train' and 'val' splits of ``prov``.

    Returns a ``Counter`` built from the ``'tokens'`` field of every sentence
    yielded by ``prov.iterSentences`` for those two splits.
    """
    train_sentences = prov.iterSentences(split='train')
    val_sentences = prov.iterSentences(split='val')
    counts = Counter()
    for sentences in (train_sentences, val_sentences):
        for sentence in sentences:
            counts.update(sentence['tokens'])
    return counts
In [4]:
Voc = vocabulary(prov)
import codecs
import subprocess  # used by check_output below; not imported in any visible cell

vocab_path = "/home/gchrupala/reimaginet/data/coco/vocab.txt"
# Freeze the word order once so the file handed to espeak and the keys zipped
# with its output are guaranteed to be in the same order.
words = list(Voc)

# Write vocabulary file: each word is followed by a blank line.
with open(vocab_path, "w") as f:
    for word in words:
        f.write(word)
        f.write("\n\n")

# Pass through espeak
raw = subprocess.check_output(['espeak', '-q', '--ipa=3', '-v', 'en-us', '-f', vocab_path])
# NOTE(review): `ttp` is not imported in any visible cell; import the module
# providing clean_phonemes before running this on a fresh kernel.
# The trailing element produced by the split is dropped with [:-1].
pron = [ ''.join(ttp.clean_phonemes(line.strip())) for line in raw.split("\n")][:-1]
# zip() would silently truncate and pair words with the wrong pronunciation if
# espeak did not emit exactly one line per word, so fail loudly instead.
if len(pron) != len(words):
    raise ValueError("espeak returned %d pronunciations for %d words; "
                     "cannot align vocabulary with IPA output" % (len(pron), len(words)))

# Write IPA file
mapping = dict(zip(words, pron))
with open("/home/gchrupala/reimaginet/data/coco/vocab.ipa.json", "w") as f:
    json.dump(mapping, f)
In [5]:
# Reload the word -> IPA mapping written above; the context manager makes sure
# the file handle is closed instead of being left to the garbage collector.
with open("/home/gchrupala/reimaginet/data/coco/vocab.ipa.json") as f:
    mapping = json.load(f)
In [6]:
def invert(d):
    """Invert a mapping, grouping together the keys that share a value.

    Returns a dict that maps each value of ``d`` to the list of keys that had
    that value, in the iteration order of ``d``. Values of ``d`` must be
    hashable because they become keys of the result.
    """
    inverse = {}
    # .items() rather than the Python-2-only .iteritems(), so the helper works
    # unchanged on both Python 2 and Python 3.
    for key, value in d.items():
        inverse.setdefault(value, []).append(key)
    return inverse
In [7]:
# Group words by pronunciation, keeping only words that occur more than 20
# times in the vocabulary counts. .items() replaces the Python-2-only
# .iteritems() so the cell also runs on Python 3.
frequent = dict((k, v) for k, v in mapping.items() if Voc[k] > 20)
hom = invert(frequent)
In [8]:
# Pronunciations shared by more than one word, as (pronunciation, [words]) pairs.
# .items() replaces the Python-2-only .iteritems() so the cell also runs on Python 3.
amb = [ (k, v) for k, v in hom.items() if len(v) > 1 ]
In [9]:
# Number of pronunciations that are shared by more than one word in `hom`.
len(amb)
Out[9]:
In [10]:
# Criteria:
## different meaning
## no respellings
## no function words
## at least 20 occurrences of each meaning
# Each non-blank line of homonym.txt is split on whitespace into a list of words.
# The context manager closes the file, and blank lines are skipped so that no
# empty entry reaches the H[0] / H[1] lookups in test_homonym.
with open("/home/gchrupala/reimaginet/data/coco/homonym.txt") as f:
    homonym = [ line.split() for line in f if line.strip() ]
In [11]:
import numpy
import imaginet.task as task
import imaginet.defn.audiovis_rhn as audiovis
In [12]:
# Materialise every sentence of the 'train' split, followed by the 'val' split.
sent = list(prov.iterSentences(split='train'))
sent.extend(prov.iterSentences(split='val'))
In [13]:
def matching(sent, word):
    """Lazily yield the items of ``sent`` whose ``'tokens'`` contain ``word``."""
    for candidate in sent:
        if word not in candidate['tokens']:
            continue
        yield candidate
In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.preprocessing import normalize
In [15]:
# Load the model stored in this archive via imaginet.task.
model = task.load(
    "/home/gchrupala/reimaginet/run-rhn-coco-9-resume/model.r.e9.zip"
)
In [16]:
def rer(lo, hi):
    """Relative reduction of (1 - score) when moving from ``lo`` to ``hi``.

    Computes ``(err_lo - err_hi) / err_lo`` where ``err = 1 - score``.
    Divides by ``1 - lo``, so ``lo == 1`` is not supported.
    """
    err_lo = 1 - lo
    err_hi = 1 - hi
    return (err_lo - err_hi) / err_lo
def audio(sent):
    """Return, for each item of ``sent``, the mean of its ``'audio'`` array over axis 0."""
    averaged = []
    for utterance in sent:
        averaged.append(utterance['audio'].mean(axis=0))
    return averaged
def embed(sent):
    """Pass the ``'audio'`` of every item of ``sent`` to ``audiovis.encode_sentences``.

    Uses the notebook-level ``model`` and returns whatever
    ``encode_sentences`` returns.
    """
    audio_inputs = []
    for utterance in sent:
        audio_inputs.append(utterance['audio'])
    return audiovis.encode_sentences(model, audio_inputs)
# Small memo of layer_states results, keyed by the joined raw text of a
# sentence list. It is emptied once it holds more than 5 entries.
CACHE = {}

def mean_layer(sent, layer=0):
    """Return the per-sentence mean of one layer's states.

    ``sent`` may be any iterable (it is materialised into a list). The
    activations come from ``audiovis.layer_states`` applied to the
    notebook-level ``model`` and are memoised in ``CACHE`` so that asking for
    another ``layer`` of the same sentences does not recompute them.

    Each state array is indexed as ``[:, layer, :]`` and averaged over axis 0;
    the result is a list with one array per sentence.
    """
    sent = list(sent)
    key = '\n'.join([ sent_i['raw'] for sent_i in sent ])
    if key not in CACHE:
        # Evict only on a miss. Clearing before the lookup threw away entries
        # that were about to be reused, forcing a redundant layer_states pass.
        if len(CACHE) > 5:
            CACHE.clear()
        CACHE[key] = audiovis.layer_states(model, [ sent_i['audio'] for sent_i in sent ])
    return [ datum[:,layer,:].mean(axis=0) for datum in CACHE[key] ]
In [22]:
def test_homonym(H, sent, features, C=1.0):
    """Cross-validate a classifier telling sentences with H[0] from those with H[1].

    ``features`` is called on the sentences matching each word and must return
    one feature vector per sentence. Sentences matching ``H[0]`` get label 0,
    those matching ``H[1]`` get label 1. The stacked features are L2-normalised
    and a ``LogisticRegression(C=C)`` is scored with a 10-fold
    ``StratifiedKFold``.

    Returns a dict with the two class sizes (``word1_count``, ``word2_count``),
    the majority-class proportion (``majority``) and an array holding the
    accuracy of each fold (``kfold_acc``).
    """
    feats_first = features(matching(sent, H[0]))
    feats_second = features(matching(sent, H[1]))
    n_first = len(feats_first)
    n_second = len(feats_second)

    X = normalize(numpy.vstack([feats_first, feats_second]), norm='l2')
    y = numpy.hstack([numpy.zeros(n_first), numpy.ones(n_second)])

    classifier = LogisticRegression(C=C)
    correct = []
    total = []
    for train_idx, test_idx in StratifiedKFold(y, n_folds=10):
        classifier.fit(X[train_idx], y[train_idx])
        y_te = y[test_idx]
        predicted = classifier.predict(X[test_idx])
        correct.append(sum(predicted == y_te))
        total.append(len(y_te))

    fold_accuracy = numpy.array(correct, dtype='float') / numpy.array(total, dtype='float')
    return {
        'word1_count': n_first,
        'word2_count': n_second,
        'majority': 1.0 * max(n_first, n_second) / len(y),
        'kfold_acc': fold_accuracy,
    }
In [18]:
from __future__ import print_function
In [19]:
# One row per cross-validation fold, for each homonym pair and each feature
# source: "input" rows use audio(), "output" rows use embed().
with open("ambigu-io.txt", "w") as out:
    print("word1 word2 io count1 count2 majority acc", file=out)
    for H in homonym:
        for io, features in (("input", audio), ("output", embed)):
            r = test_homonym(H, sent, features)
            for acc in r['kfold_acc']:
                print(" ".join(H), io, r['word1_count'], r['word2_count'], r['majority'], acc, file=out)
        out.flush()
In [20]:
# One row per cross-validation fold, for each homonym pair and each of the
# layers 0..4, using mean_layer() features of that layer.
with open("ambigu-layerwise.txt", "w") as out:
    print("word1 word2 layer count1 count2 majority acc", file=out)
    for H in homonym:
        pair = " ".join(H)
        for layer in range(5):
            feat = lambda x: mean_layer(x, layer=layer)
            r = test_homonym(H, sent, feat)
            for acc in r['kfold_acc']:
                print(pair, layer, r['word1_count'], r['word2_count'], r['majority'], acc, file=out)
        out.flush()
In [ ]:
##########################################################
In [29]:
# Print the raw text of every sentence whose tokens include 'grate'.
for s in matching(sent, 'grate'):
    print(s['raw'])
In [ ]: