In [1]:
import numpy
import imaginet.task as task
import imaginet.defn.audiovis_rhn as audiovis
import imaginet.defn.audiovis4 as a4


Using gpu device 2: GeForce GTX 980 Ti (CNMeM is disabled, cuDNN 5005)

Load three models:

  • RHN model trained on Flickr8K human speech, with MFCC features plus first and second differences (mfcc_accel)
  • RHN model trained on COCO synthetic speech, with plain MFCC features (mfcc)
  • GRU model trained on COCO synthetic speech, with plain MFCC features (mfcc)

In [2]:
model_f = task.load("/home/gchrupala/reimaginet/examples/audioviz/human-mfcc-rhn-flickr8k.zip")

In [18]:
# RHN
model_c = task.load("/home/gchrupala/reimaginet/examples/audioviz/rhn-mfcc-coco.zip")

# GRU
model_g = task.load("/home/gchrupala/reimaginet/examples/audioviz/mfcc-coco.zip")

Synthesize speech

Read MEN data


In [19]:
def read_men():
    # Each line of the MEN file holds a word pair and a human
    # similarity rating: "word1 word2 score".
    records = []
    with open("/home/gchrupala/reimaginet/data/MEN/MEN_dataset_natural_form_full") as f:
        for line in f:
            word1, word2, score = line.split()
            records.append((word1, word2, float(score)))
    return records
MEN = read_men()

Synthesize speech for all the words in MEN


In [20]:
import imaginet.tts as tts

def synthesize(text):
    # Run text-to-speech on the text and decode the returned MP3 into audio.
    return tts.decodemp3(tts.speak(text))

def speak(data):
    # Collect the unique words of the word-pair dataset and synthesize
    # speech for each of them.
    voc = set()
    for (w1,w2,_) in data:
        voc.add(w1)
        voc.add(w2)
    voc = list(voc)
    speech = [ synthesize(word) for word in voc ]
    return (voc, speech)



In [21]:
import cPickle as pickle

Synthesize and save for reuse. This is slow, so we'll comment it out and use pre-synthesized speech instead.


In [22]:
# voc_men, speech_men = speak(MEN)
#
# pickle.dump(voc_men, open("/home/gchrupala/reimaginet/data/MEN/voc.pkl","wb"),
#             protocol=pickle.HIGHEST_PROTOCOL)
# pickle.dump(speech_men, open("/home/gchrupala/reimaginet/data/MEN/speech.pkl","wb"),
#             protocol=pickle.HIGHEST_PROTOCOL)

In [23]:
voc_men = pickle.load(open("/home/gchrupala/reimaginet/data/MEN/voc.pkl","rb"))
speech_men = pickle.load(open("/home/gchrupala/reimaginet/data/MEN/speech.pkl","rb"))

Extract mfcc and mfcc_accel features


In [24]:
mfcc_men = [ tts.extract_mfcc(audio) for audio in speech_men ]

In [25]:
mfcc_accel_men = tts.add_accel(mfcc_men)

In [26]:
mfcc_men[1].shape


Out[26]:
(69, 13)

mfcc_accel adds first- and second-order differences, i.e. the rate of change and the acceleration of the MFCC coefficients, growing each frame's feature vector from 13 to 37 dimensions.


In [27]:
mfcc_accel_men[1].shape


Out[27]:
(69, 37)
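
To make this concrete, here is a minimal numpy sketch of stacking rate-of-change (delta) and acceleration (delta-delta) features onto the MFCCs. It is illustrative only, not the actual tts.add_accel implementation: this naive version keeps all 13 coefficients in each block and so yields 39 columns per frame, whereas tts.add_accel produces 37, so its exact recipe differs slightly.

import numpy as np

def add_accel_sketch(mfcc):
    # mfcc: array of shape (frames, 13)
    delta = np.gradient(mfcc, axis=0)    # first differences along time: rate of change
    accel = np.gradient(delta, axis=0)   # second differences: acceleration
    # Concatenate per frame: 13 + 13 + 13 = 39 columns.
    return np.hstack([mfcc, delta, accel])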

Compute word embeddings


In [28]:
embeddings_f = audiovis.encode_sentences(model_f, mfcc_accel_men)

In [29]:
embeddings_c = audiovis.encode_sentences(model_c, mfcc_men)
embeddings_g = audiovis.encode_sentences(model_g, mfcc_men)

Compute correlations with MEN judgments


In [30]:
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr
def correlation(voc, emb, ratings):
    # Map each word to its embedding, score each pair by cosine
    # similarity, and correlate the similarities with the human ratings.
    REP = dict(zip(voc, emb))
    sim = [ 1-cosine(REP[w1],REP[w2]) for (w1,w2,_) in ratings ]
    score = [s for (_,_,s) in ratings]
    return spearmanr(score, sim)
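
As a quick sanity check on this evaluation protocol (an illustrative extra, not part of the original analysis), randomly generated embeddings should correlate with the MEN ratings at roughly zero; the dimension 512 below is arbitrary:

rng = numpy.random.RandomState(0)
random_emb = [ rng.randn(512) for _ in voc_men ]
print(correlation(voc_men, random_emb, MEN))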

In [31]:
print(correlation(voc_men, embeddings_c, MEN))
print(correlation(voc_men, embeddings_g, MEN))


SpearmanrResult(correlation=0.24975627253539079, pvalue=6.813055683222531e-44)
SpearmanrResult(correlation=0.23208217877191567, pvalue=5.6366169686918221e-38)

In [32]:
correlation(voc_men, embeddings_f, MEN)


Out[32]:
SpearmanrResult(correlation=0.052302840127719075, pvalue=0.0041633549540002783)
