In [1]:
import numpy
import imaginet.task as task
import imaginet.defn.audiovis_rhn as audiovis
import imaginet.defn.audiovis4 as a4
Load the models: the Flickr8K model uses MFCC features with acceleration (mfcc_accel), while the COCO models use plain MFCC features (mfcc).
In [2]:
model_f = task.load("/home/gchrupala/reimaginet/examples/audioviz/human-mfcc-rhn-flickr8k.zip")
In [18]:
# RHN
model_c = task.load("/home/gchrupala/reimaginet/examples/audioviz/rhn-mfcc-coco.zip")
# GRU
model_g = task.load("/home/gchrupala/reimaginet/examples/audioviz/mfcc-coco.zip")
In [19]:
def read_men():
    # Each line of the MEN dataset holds two words and a human
    # similarity rating.
    records = []
    for line in open("/home/gchrupala/reimaginet/data/MEN/MEN_dataset_natural_form_full"):
        word1, word2, score = line.split()
        records.append((word1, word2, float(score)))
    return records
MEN = read_men()
Synthesize speech for all the words in MEN.
In [20]:
import imaginet.tts as tts
def synthesize(text):
    return tts.decodemp3(tts.speak(text))

def speak(data):
    # Collect the unique vocabulary from the word pairs, then synthesize
    # speech for each word.
    voc = set()
    for (w1, w2, _) in data:
        voc.add(w1)
        voc.add(w2)
    voc = list(voc)
    speech = [synthesize(word) for word in voc]
    return (voc, speech)
In [21]:
import cPickle as pickle
Synthesize and save for reuse. This is slow, so we'll comment it out and use pre-synthesized speech.
In [22]:
# voc_men, speech_men = speak(MEN)
#
# pickle.dump(voc_men, open("/home/gchrupala/reimaginet/data/MEN/voc.pkl", "wb"),
#             protocol=pickle.HIGHEST_PROTOCOL)
# pickle.dump(speech_men, open("/home/gchrupala/reimaginet/data/MEN/speech.pkl", "wb"),
#             protocol=pickle.HIGHEST_PROTOCOL)
In [23]:
voc_men = pickle.load(open("/home/gchrupala/reimaginet/data/MEN/voc.pkl", "rb"))
speech_men = pickle.load(open("/home/gchrupala/reimaginet/data/MEN/speech.pkl", "rb"))
In [24]:
mfcc_men = [ tts.extract_mfcc(audio) for audio in speech_men ]
In [25]:
mfcc_accel_men = tts.add_accel(mfcc_men)
In [26]:
mfcc_men[1].shape
Out[26]:
add_accel appends first-order and second-order differences, i.e. the rate of change and the acceleration of the MFCC coefficients.
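For illustration, here is a minimal sketch of what such delta features look like, using plain first differences with zero padding; the exact window and padding used by tts.add_accel may differ.
In [ ]:
import numpy as np

def add_deltas(mfcc):
    # mfcc has shape (frames, coefficients).
    # First-order difference (rate of change), zero-padded to keep the frame count.
    delta = np.vstack([mfcc[:1] * 0, np.diff(mfcc, axis=0)])
    # Second-order difference (acceleration) of the deltas.
    accel = np.vstack([delta[:1] * 0, np.diff(delta, axis=0)])
    # Stack along the feature axis, tripling the per-frame dimensionality.
    return np.hstack([mfcc, delta, accel])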
In [27]:
mfcc_accel_men[1].shape
Out[27]:
In [28]:
embeddings_f = audiovis.encode_sentences(model_f, mfcc_accel_men)
In [29]:
embeddings_c = audiovis.encode_sentences(model_c, mfcc_men)
embeddings_g = audiovis.encode_sentences(model_g, mfcc_men)
In [30]:
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr
def correlation(voc, emb, ratings):
    # Compare human similarity ratings with cosine similarities between
    # the embeddings, using Spearman's rank correlation.
    REP = dict(zip(voc, emb))
    sim = [1 - cosine(REP[w1], REP[w2]) for (w1, w2, _) in ratings]
    score = [s for (_, _, s) in ratings]
    return spearmanr(score, sim)
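As a quick sanity check, the helper can be exercised on a tiny made-up vocabulary; the vectors and ratings below are purely illustrative, not real data.
In [ ]:
toy_voc = ["cat", "dog", "car"]
toy_emb = [numpy.array([1.0, 0.0]),
           numpy.array([0.9, 0.1]),
           numpy.array([0.0, 1.0])]
toy_ratings = [("cat", "dog", 45.0), ("cat", "car", 5.0), ("dog", "car", 8.0)]
# The cosine similarities rank the pairs in the same order as the ratings,
# so Spearman's rho comes out at 1.0.
print(correlation(toy_voc, toy_emb, toy_ratings))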
In [31]:
print(correlation(voc_men, embeddings_c, MEN))
print(correlation(voc_men, embeddings_g, MEN))
In [32]:
correlation(voc_men, embeddings_f, MEN)
Out[32]: