In [1]:
import numpy
import imaginet.task as task
import imaginet.defn.audiovis_rhn as audiovis
import imaginet.defn.visual2_rhn as vis2


Using gpu device 2: GeForce GTX 980 Ti (CNMeM is disabled, cuDNN 5005)

In [2]:
import imaginet.vendrov_provider as dp
prov = dp.getDataProvider(dataset='coco', root='/home/gchrupala/reimaginet/')
# All validation-split sentences, including their audio features
sent = list(prov.iterSentences(split='val'))

In [ ]:
model_s = task.load("/home/gchrupala/reimaginet/run-rhn-coco-9-resume/model.r.e9.zip")

In [3]:
model_w = task.load("/home/gchrupala/reimaginet/run-rhn-coco-word-2/model.r.e14.zip")

In [4]:
from imaginet.simple_data import words
# Tokenized captions for the word-level model
data_w = [words(senti) for senti in sent]
embeddings_w = vis2.encode_sentences(model_w, data_w)
# Acoustic feature matrices (MFCC frames) for the speech model
data_s = [numpy.asarray(senti['audio'], dtype='float32') for senti in sent]
embeddings_s = audiovis.encode_sentences(model_s, data_s)

In [6]:
from scipy.spatial.distance import cdist
from scipy.stats import spearmanr, pearsonr

In [7]:
# (Unused) full pairwise distance matrices over all validation sentences;
# the analysis below samples random index pairs instead.
#D_s = cdist(embeddings_s, embeddings_s)
#D_w = cdist(embeddings_w, embeddings_w, metric='cosine')

In [18]:
data_s[1].shape


Out[18]:
(383, 13)
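
Each spoken caption is a sequence of acoustic feature frames: here 383 time steps of what appear to be 13 MFCC coefficients each.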

In [22]:
from imaginet.evaluate import Cdist
numpy.random.seed(123)
# Sample 1000 caption indices twice (with replacement) to form random pairs
J = numpy.random.choice(len(embeddings_w), 1000)
K = numpy.random.choice(len(embeddings_w), 1000)
dist = Cdist()
rhos = []
# Distances between word-level embeddings of the sampled captions
D_w = dist(embeddings_w[J], embeddings_w[K]).flatten()
for layer in range(5):
    # Mean-pool each RHN layer's activations over time: one vector per utterance
    states_J = numpy.vstack([x[:, layer, :].mean(axis=0)
                             for x in audiovis.layer_states(model_s, [data_s[j] for j in J])])
    states_K = numpy.vstack([x[:, layer, :].mean(axis=0)
                             for x in audiovis.layer_states(model_s, [data_s[k] for k in K])])
    D_s = dist(states_J, states_K).flatten()
    # pearsonr returns (correlation, p-value)
    rho = pearsonr(D_w, D_s)
    rhos.append(rho)
    print(rho)
# Finally, the full utterance embeddings from the speech model
D_s = dist(embeddings_s[J], embeddings_s[K]).flatten()
rho = pearsonr(D_w, D_s)
rhos.append(rho)
print(rho)


(0.11358234, 0.0)
(0.21255182, 0.0)
(0.32369265, 0.0)
(0.43593171, 0.0)
(0.52159578, 0.0)
(0.63199985, 0.0)
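
The printed tuples are (correlation, p-value), one per RHN layer, with the last line for the full utterance embedding. Note that spearmanr is imported above but never used: despite the name rhos, these are Pearson correlations. For reference, the same distance-correlation step can be sketched with plain SciPy; treating Cdist as cosine distance is an assumption here:

from scipy.spatial.distance import cdist
from scipy.stats import pearsonr

def distance_correlation(emb_a, emb_b, J, K, metric='cosine'):
    # Flattened distance matrices over the sampled index pairs,
    # correlated across the two representation spaces
    D_a = cdist(emb_a[J], emb_a[K], metric=metric).flatten()
    D_b = cdist(emb_b[J], emb_b[K], metric=metric).flatten()
    return pearsonr(D_a, D_b)

For example, distance_correlation(embeddings_w, embeddings_s, J, K) should reproduce the final line above if Cdist is indeed cosine distance.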

In [24]:
model_w = task.load("/home/gchrupala/reimaginet/run-rhn-flickr8k-word-2/model.r.e19.zip")

In [25]:
model_s = task.load("/home/gchrupala/reimaginet/run-rhn-human-11/model.r.e24.zip")

In [27]:
import imaginet.data_provider as dp
# Flickr8K validation sentences, with MFCC features of human-spoken captions
prov = dp.getDataProvider(dataset='flickr8k', root='/home/gchrupala/reimaginet/', audio_kind="human.max1K.accel3.ord.mfcc")
sent = list(prov.iterSentences(split='val'))


Could not read file /home/gchrupala/reimaginet/data/flickr8k/dataset.ipa.jsonl.gz: IPA transcription not available
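
This warning appears to be harmless for the present analysis: only the MFCC features under senti['audio'] are used below, not the IPA transcriptions.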

In [28]:
from imaginet.simple_data import words
# Tokenized captions for the word-level model
data_w = [words(senti) for senti in sent]
embeddings_w = vis2.encode_sentences(model_w, data_w)
# Acoustic feature matrices (MFCC frames) for the speech model
data_s = [numpy.asarray(senti['audio'], dtype='float32') for senti in sent]
embeddings_s = audiovis.encode_sentences(model_s, data_s)

In [30]:
from imaginet.evaluate import Cdist
numpy.random.seed(123)
# Sample 1000 caption indices twice (with replacement) to form random pairs
J = numpy.random.choice(len(embeddings_w), 1000)
K = numpy.random.choice(len(embeddings_w), 1000)
dist = Cdist()
rhos = []
# Distances between word-level embeddings of the sampled captions
D_w = dist(embeddings_w[J], embeddings_w[K]).flatten()
for layer in range(4):
    # Mean-pool each RHN layer's activations over time: one vector per utterance
    states_J = numpy.vstack([x[:, layer, :].mean(axis=0)
                             for x in audiovis.layer_states(model_s, [data_s[j] for j in J])])
    states_K = numpy.vstack([x[:, layer, :].mean(axis=0)
                             for x in audiovis.layer_states(model_s, [data_s[k] for k in K])])
    D_s = dist(states_J, states_K).flatten()
    # pearsonr returns (correlation, p-value)
    rho = pearsonr(D_w, D_s)
    rhos.append(rho)
    print(rho)
# Finally, the full utterance embeddings from the speech model
D_s = dist(embeddings_s[J], embeddings_s[K]).flatten()
rho = pearsonr(D_w, D_s)
rhos.append(rho)
print(rho)


(0.043831281, 0.0)
(0.10815942, 0.0)
(0.15827923, 0.0)
(0.16111711, 0.0)
(0.36036298, 0.0)
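
In both runs the correlation between speech-side and word-side pairwise distances increases monotonically with layer depth, and the full utterance embedding correlates most strongly: 0.63 for the COCO model versus 0.36 for the Flickr8K model trained on human speech.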
