In [1]:
import numpy
import imaginet.task as task
import imaginet.defn.audiovis_rhn as audiovis
import imaginet.defn.visual2_rhn as vis2
In [2]:
import imaginet.vendrov_provider as dp
prov = dp.getDataProvider(dataset='coco', root='/home/gchrupala/reimaginet/')
sent = list(prov.iterSentences(split='val'))
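A quick sanity check on what the provider yields is sketched below; only the 'audio' field and compatibility with words() are actually relied on later, so treat the other keys as dataset-dependent.
In [ ]:
print(len(sent))               # number of validation sentences
print(sorted(sent[0].keys()))  # available fields; 'audio' is used below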
In [ ]:
model_s = task.load("/home/gchrupala/reimaginet/run-rhn-coco-9-resume/model.r.e9.zip")
In [3]:
model_w = task.load("/home/gchrupala/reimaginet/run-rhn-coco-word-2/model.r.e14.zip")
In [4]:
from imaginet.simple_data import words
data_w = [ words(senti) for senti in sent]
embeddings_w = vis2.encode_sentences(model_w, data_w)
data_s = [ numpy.asarray(senti['audio'], dtype='float32') for senti in sent ]
embeddings_s = audiovis.encode_sentences(model_s, data_s)
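The per-layer analysis further down mean-pools recurrent activations over time. A minimal sketch of that pooling on a dummy array, with the (time, layer, dimension) shape implied by the x[:,layer,:] indexing used there; the sizes here are made up:
In [ ]:
x = numpy.random.randn(37, 5, 1024).astype('float32')  # dummy stand-in for one item of layer_states(...)
pooled = x[:, 2, :].mean(axis=0)  # average layer 2 over the 37 timesteps
print(pooled.shape)               # (1024,)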
In [6]:
from scipy.spatial.distance import cdist
from scipy.stats import spearmanr, pearsonr
In [7]:
# Full pairwise distance matrices over all validation sentences; left
# commented out in favor of the sampled-pair version below.
#D_s = cdist(embeddings_s, embeddings_s, metric='cosine')
#D_w = cdist(embeddings_w, embeddings_w, metric='cosine')
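Rather than materializing the full N×N matrices, the cells below sample 1000 row and 1000 column indices and correlate the flattened distance grids of the two embedding spaces. A self-contained sketch of that sampled-pair scheme on dummy data, using plain scipy.spatial.distance.cdist instead of imaginet's Cdist (assumed here to compute the same kind of pairwise cosine distances):
In [ ]:
numpy.random.seed(0)
A = numpy.random.randn(100, 8)            # stand-in "word" embeddings
B = A + 0.1 * numpy.random.randn(100, 8)  # correlated stand-in "speech" embeddings
J = numpy.random.choice(len(A), 50)
K = numpy.random.choice(len(A), 50)
d_a = cdist(A[J], A[K], metric='cosine').flatten()  # 50*50 sampled distances
d_b = cdist(B[J], B[K], metric='cosine').flatten()
print(pearsonr(d_a, d_b))  # (r, p-value): high r = similar distance structure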
In [18]:
data_s[1].shape
Out[18]:
In [22]:
from imaginet.evaluate import Cdist
numpy.random.seed(123)
# Sample 1000 row and 1000 column indices (with replacement); flattening the
# resulting 1000x1000 distance grid gives 1e6 sampled pairwise distances
J = numpy.random.choice(len(embeddings_w), 1000)
K = numpy.random.choice(len(embeddings_w), 1000)
dist = Cdist()
rhos = []
D_w = dist(embeddings_w[J], embeddings_w[K]).flatten()
for layer in range(5):
    # Mean-pool each layer's activations over time for the sampled utterances
    states_J = numpy.vstack([ x[:,layer,:].mean(axis=0)
                              for x in audiovis.layer_states(model_s, [ data_s[j] for j in J ]) ])
    states_K = numpy.vstack([ x[:,layer,:].mean(axis=0)
                              for x in audiovis.layer_states(model_s, [ data_s[k] for k in K ]) ])
    D_s = dist(states_J, states_K).flatten()
    rho = pearsonr(D_w, D_s)
    rhos.append(rho)
    print rho
# For comparison: distances between the final utterance embeddings
D_s = dist(embeddings_s[J], embeddings_s[K]).flatten()
rho = pearsonr(D_w, D_s)
rhos.append(rho)
print rho
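Note that despite the variable names these are Pearson's r values on the flattened distance vectors, not Spearman's rho (spearmanr is imported above if a rank correlation is preferred), and pearsonr returns an (r, p-value) pair, so rhos holds tuples. To pull out just the correlations:
In [ ]:
print([round(r, 3) for r, p in rhos])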
In [24]:
model_w = task.load("/home/gchrupala/reimaginet/run-rhn-flickr8k-word-2/model.r.e19.zip")
In [25]:
model_s = task.load("/home/gchrupala/reimaginet/run-rhn-human-11/model.r.e24.zip")
In [27]:
import imaginet.data_provider as dp
prov = dp.getDataProvider(dataset='flickr8k', root='/home/gchrupala/reimaginet/', audio_kind="human.max1K.accel3.ord.mfcc")
sent = list(prov.iterSentences(split='val'))
In [28]:
from imaginet.simple_data import words
data_w = [ words(senti) for senti in sent]
embeddings_w = vis2.encode_sentences(model_w, data_w)
data_s = [ numpy.asarray(senti['audio'], dtype='float32') for senti in sent ]
embeddings_s = audiovis.encode_sentences(model_s, data_s)
In [30]:
from imaginet.evaluate import Cdist
numpy.random.seed(123)
J = numpy.random.choice(len(embeddings_w), 1000)
K = numpy.random.choice(len(embeddings_w), 1000)
dist = Cdist()
rhos = []
D_w = dist(embeddings_w[J], embeddings_w[K]).flatten()
for layer in range(4):
    states_J = numpy.vstack([ x[:,layer,:].mean(axis=0)
                              for x in audiovis.layer_states(model_s, [ data_s[j] for j in J ]) ])
    states_K = numpy.vstack([ x[:,layer,:].mean(axis=0)
                              for x in audiovis.layer_states(model_s, [ data_s[k] for k in K ]) ])
    D_s = dist(states_J, states_K).flatten()
    rho = pearsonr(D_w, D_s)
    rhos.append(rho)
    print rho
D_s = dist(embeddings_s[J], embeddings_s[K]).flatten()
rho = pearsonr(D_w, D_s)
rhos.append(rho)
print rho
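The same per-layer procedure is run verbatim for both datasets; a hypothetical helper (not part of imaginet) could factor it out. The sketch below only reuses the calls made above: audiovis.layer_states(model, data) yielding per-utterance (time, layer, dim) arrays, a callable Cdist instance, and pearsonr:
In [ ]:
def layer_rsa(model_s, data_s, embeddings_w, embeddings_s, n_layers,
              n_pairs=1000, seed=123):
    # Correlate word-embedding distances with per-layer (and final)
    # speech-embedding distances over sampled index pairs.
    numpy.random.seed(seed)
    J = numpy.random.choice(len(embeddings_w), n_pairs)
    K = numpy.random.choice(len(embeddings_w), n_pairs)
    dist = Cdist()
    D_w = dist(embeddings_w[J], embeddings_w[K]).flatten()
    rhos = []
    for layer in range(n_layers):
        states_J = numpy.vstack([ x[:,layer,:].mean(axis=0)
                                  for x in audiovis.layer_states(model_s, [ data_s[j] for j in J ]) ])
        states_K = numpy.vstack([ x[:,layer,:].mean(axis=0)
                                  for x in audiovis.layer_states(model_s, [ data_s[k] for k in K ]) ])
        rhos.append(pearsonr(D_w, dist(states_J, states_K).flatten()))
    rhos.append(pearsonr(D_w, dist(embeddings_s[J], embeddings_s[K]).flatten()))
    return rhos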
In [ ]: