In [2]:
import numpy
import imaginet.task as task
import imaginet.defn.audiovis_rhn as audiovis


Using gpu device 2: GeForce GTX 980 Ti (CNMeM is disabled, cuDNN 5005)

In [ ]:
import imaginet.vendrov_provider as dp
# NOTE(review): this cell carries no execution count, yet the next cell reads
# `prov` -- on a fresh kernel it has to be run first. The dataset root below is
# a hard-coded absolute path.
prov = dp.getDataProvider(dataset='coco', root='/home/gchrupala/reimaginet/')

In [3]:
sent = list(prov.iterSentences(split='val'))

In [4]:
model = task.load("/home/gchrupala/reimaginet/run-rhn-coco-9-resume/model.r.e9.zip")

In [5]:
import imaginet.tts as tts

def synthesize(text):
    """Run `text` through `tts.speak` and decode the result.

    The value returned by `tts.speak(text)` is handed to `tts.decodemp3`,
    and whatever that call returns is passed back unchanged.
    """
    spoken = tts.speak(text)
    return tts.decodemp3(spoken)

def speak(data):
    """Synthesize every distinct word that occurs in `data`.

    `data` is iterated as 3-tuples: the first two items of each tuple are
    collected as words and the third item is ignored.

    Returns a pair `(voc, speech)` where `voc` is the list of distinct words
    and `speech[i]` is `synthesize(voc[i])`.
    """
    words = set()
    for first, second, _ in data:
        words.update((first, second))
    voc = list(words)
    speech = list(map(synthesize, voc))
    return (voc, speech)

In [6]:
import pydub

In [7]:
import funktional.context as context
import theano

In [8]:
from funktional.layer import softmax_time
def _make_attention(model):
    """Compile a Theano function that returns the model's attention weights.

    Inside a `context.context(training=False)` block the task inputs are run
    through `Encode`, then through `Attn.Regress1`, `Attn.activation` and
    `Attn.Regress2` in that order, and finally through `softmax_time`.

    Returns the `theano.function` mapping `model.task.inputs` to that result.
    """
    with context.context(training=False):
        # Local alias; named `net` so it does not shadow the `task` module
        # that the notebook imports at the top.
        net = model.task
        encoded = net.Encode(*net.inputs)
        hidden = net.Attn.activation(net.Attn.Regress1(encoded))
        scores = net.Attn.Regress2(hidden)
        alpha = softmax_time(scores)
    return theano.function(net.inputs, alpha)

In [9]:
attention = _make_attention(model)

In [10]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [11]:
s = 'A baby sits on a bed laughing with a laptop computer open'

In [12]:
sound = tts.from_mp3(tts.speak(s))

In [13]:
# Attention weights for sentence `s`: the features extracted from the
# synthesized audio are passed as a one-element list, and [0,:,0] keeps index 0
# on the first and last axes (presumably batch and a singleton channel -- TODO confirm).
alpha = attention([tts.extract_mfcc(synthesize(s))])[0,:,0]

In [36]:
def plot_attention(text, bounds=None, marks=None, rotation=None, step_ms=30):
    """Plot the attention weights computed for the synthesized `text`.

    `text` is synthesized, converted with `tts.extract_mfcc`, and passed to
    `attention` as a one-element list; index 0 on the first and last axes of
    the result is plotted on the current axes.

    Parameters:
        text: sentence to synthesize and analyse.
        bounds: optional sequence of tick positions; each value is multiplied
            by 1000 before use (presumably seconds -> milliseconds, so that
            it lines up with the x axis -- confirm against the TextGrid units).
        marks: optional tick labels, one per entry of `bounds`. Ticks are only
            set when both `bounds` and `marks` are given.
        rotation: passed through to `xticks` as the label rotation.
        step_ms: x-axis spacing between consecutive attention weights.
            Defaults to 30, the value that used to be hard-coded.

    Returns None; the function only draws.
    """
    alpha = attention([tts.extract_mfcc(synthesize(text))])[0, :, 0]
    if bounds is not None and marks is not None:
        xticks(numpy.asarray(bounds) * 1000, marks, rotation=rotation)
    # arange * step (rather than range(0, n*step, step)) also supports
    # non-integer spacings and yields the same positions for the default.
    plot(numpy.arange(len(alpha)) * step_ms, alpha)

In [15]:
mfcc = tts.extract_mfcc(synthesize(s))
mfcc.shape


Out[15]:
(433, 13)

In [16]:
sound[50:100]


Out[16]:

In [17]:
alpha.shape[0]*30


Out[17]:
4380

In [18]:
mfcc.shape[0]*10


Out[18]:
4330

In [19]:
plot(range(0,len(alpha)*30, 30), alpha)


Out[19]:
[<matplotlib.lines.Line2D at 0x7f98d01c0b50>]

In [20]:
sound.export('baby.wav', format='wav')


Out[20]:
<open file 'baby.wav', mode 'wb+' at 0x7f98d00d8270>

In [21]:
alpha.repeat(30)[1380-40:1380+100]


Out[21]:
array([ 0.00952864,  0.00952864,  0.00952864,  0.00952864,  0.00952864,
        0.00952864,  0.00952864,  0.00952864,  0.00952864,  0.00952864,
        0.02235831,  0.02235831,  0.02235831,  0.02235831,  0.02235831,
        0.02235831,  0.02235831,  0.02235831,  0.02235831,  0.02235831,
        0.02235831,  0.02235831,  0.02235831,  0.02235831,  0.02235831,
        0.02235831,  0.02235831,  0.02235831,  0.02235831,  0.02235831,
        0.02235831,  0.02235831,  0.02235831,  0.02235831,  0.02235831,
        0.02235831,  0.02235831,  0.02235831,  0.02235831,  0.02235831,
        0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,
        0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,
        0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,
        0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,
        0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,
        0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,
        0.02096539,  0.02096539,  0.02096539,  0.02096539,  0.02096539,
        0.02096539,  0.02096539,  0.02096539,  0.02096539,  0.02096539,
        0.02096539,  0.02096539,  0.02096539,  0.02096539,  0.02096539,
        0.02096539,  0.02096539,  0.02096539,  0.02096539,  0.02096539,
        0.02096539,  0.02096539,  0.02096539,  0.02096539,  0.02096539,
        0.02096539,  0.02096539,  0.02096539,  0.02096539,  0.02096539,
        0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,
        0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,
        0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,
        0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,
        0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,
        0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,
        0.0190598 ,  0.0190598 ,  0.0190598 ,  0.0190598 ,  0.0190598 ,
        0.0190598 ,  0.0190598 ,  0.0190598 ,  0.0190598 ,  0.0190598 ], dtype=float32)

In [22]:
alpha.repeat(30).argmax()


Out[22]:
1380

In [23]:
import textgrid

In [24]:
grid = textgrid.TextGrid.fromFile("baby.TextGrid")

In [25]:
tier = grid.tiers[0]

In [26]:
bounds = [ iv.minTime for iv in tier ]

In [27]:
marks = [ iv.mark for iv in tier ]

In [33]:
def from_grid(path):
    """Read interval boundaries and labels from the first tier of a TextGrid.

    The file at `path` is loaded with `textgrid.TextGrid.fromFile` and only
    `tiers[0]` is used.

    Returns `(min_bounds, max_bounds, marks)`: two numpy arrays holding each
    interval's `minTime` and `maxTime`, and a list of each interval's `mark`.
    """
    first_tier = textgrid.TextGrid.fromFile(path).tiers[0]
    starts, ends, labels = [], [], []
    for interval in first_tier:
        starts.append(interval.minTime)
        ends.append(interval.maxTime)
        labels.append(interval.mark)
    return numpy.array(starts), numpy.array(ends), labels

In [46]:
x=range(0,len(alpha)*30, 30)

In [47]:
# Wide, short figure of the attention weights `alpha` against the axis `x`.
figure(figsize=(15,1))
plot(x,alpha)
# Hide the y-axis ticks and their labels.
yticks([],[])
# Label the x axis with the TextGrid marks; bounds are scaled by 1000
# (presumably seconds -> milliseconds, to match the 30-unit spacing of `x` -- TODO confirm).
xticks(numpy.array(bounds)*1000, marks)


Out[47]:
([<matplotlib.axis.XTick at 0x7f24d5261090>,
  <matplotlib.axis.XTick at 0x7f24d52f6d90>,
  <matplotlib.axis.XTick at 0x7f24d50e0290>,
  <matplotlib.axis.XTick at 0x7f24d50e08d0>,
  <matplotlib.axis.XTick at 0x7f24d50eb050>,
  <matplotlib.axis.XTick at 0x7f24d50eb790>,
  <matplotlib.axis.XTick at 0x7f24d50ebed0>,
  <matplotlib.axis.XTick at 0x7f24d50f6650>,
  <matplotlib.axis.XTick at 0x7f24d50f6d90>,
  <matplotlib.axis.XTick at 0x7f24d507f510>,
  <matplotlib.axis.XTick at 0x7f24d507fc50>,
  <matplotlib.axis.XTick at 0x7f24d50893d0>,
  <matplotlib.axis.XTick at 0x7f24d5089b10>],
 <a list of 13 Text xticklabel objects>)

In [48]:
# Same attention curve on a fresh figure, this time with the default x ticks.
figure(figsize=(15,1))
# Remove the y-axis ticks.
yticks([])
# x positions are 0, 30, 60, ... -- one per attention weight.
plot(range(0,len(alpha)*30,30),alpha)


Out[48]:
[<matplotlib.lines.Line2D at 0x7f24d50e0390>]

In [49]:
plot_attention("a frame on the wall")



In [50]:
plot_attention("a dog is nice")



In [51]:
plot_attention("a dog is black")



In [52]:
plot_attention("a dog is brown")



In [53]:
plot_attention("a cat sat on the floor")



In [54]:
sound = tts.from_mp3(tts.speak("a cat sat on the floor"))

In [28]:
sound = tts.from_mp3(tts.speak('while in the immediate foreground juts a gnarled tree branch the majority of the view consists of a an expanse of short grass dotted with a few longer tufts and a number of scattered grazing sheep'))

In [29]:
sound


Out[29]:

In [30]:
sound.export("sheep.wav", format="wav")


Out[30]:
<open file 'sheep.wav', mode 'wb+' at 0x7f98d00d89c0>

In [46]:
# Interval start times, end times and labels from the first tier of sheep.TextGrid.
bounds_min, bounds_max, marks = from_grid("sheep.TextGrid")
# Midpoint of each interval (start plus half the duration). Note this rebinds
# `bounds` and `marks`, which earlier cells derived from baby.TextGrid.
bounds = ((bounds_max - bounds_min) / 2.0) + bounds_min

In [58]:
?savefig

In [67]:
sheep_text ='while in the immediate foreground juts a gnarled tree branch the majority of the view consists of a an expanse of short grass dotted with a few longer tufts and a number of scattered grazing sheep'
# Set the font size before the figure and its tick labels are created, so the
# setting is in effect on the first run of this cell regardless of when
# matplotlib resolves rcParams (previously it was updated after plotting).
matplotlib.rcParams.update({'font.size': 18})
figure(figsize=(20,2), dpi=300)
# Tick labels are the word marks placed at `bounds`, rotated for legibility.
plot_attention(sheep_text, bounds, marks, rotation=45)
yticks([])
# NOTE(review): matplotlib documents pad_inches as applying only together with
# bbox_inches='tight', so it is probably a no-op here -- confirm the intended layout.
savefig("sheep.pdf", pad_inches=20)
#savefig("sheep.png", width=10, height=2)



In [61]:
bounds_min


Out[61]:
array([  0.        ,   0.31107205,   0.55372163,   1.16933261,
         1.87930732,   2.21182712,   2.87237321,   3.16445141,
         3.74411431,   4.60237487,   5.2359599 ,   5.78866173,
         6.6469223 ,   7.51866339,   7.84219617,   8.13876789,
         8.5611579 ,   9.19923644,   9.70250965,  10.05749701,
        10.36305574,  10.95170566,  11.51339451,  12.02565474,  12.48848636])

In [44]:
bounds_max


Out[44]:
array([  0.31107205,   0.55372163,   1.16933261,   1.87930732,
         2.21182712,   2.87237321,   3.16445141,   3.74411431,
         4.60237487,   5.2359599 ,   5.78866173,   6.6469223 ,
         7.51866339,   7.84219617,   8.13876789,   8.5611579 ,
         9.19923644,   9.70250965,  10.05749701,  10.36305574,
        10.95170566,  11.51339451,  12.02565474,  12.48848636,  12.84      ])

In [45]:
(bounds_max - bounds_min) / 2


Out[45]:
array([ 0.15553602,  0.12132479,  0.30780549,  0.35498735,  0.1662599 ,
        0.33027305,  0.1460391 ,  0.28983145,  0.42913028,  0.31679251,
        0.27635092,  0.42913028,  0.43587055,  0.16176639,  0.14828586,
        0.21119501,  0.31903927,  0.25163661,  0.17749368,  0.15277937,
        0.29432496,  0.28084443,  0.25613012,  0.23141581,  0.17575682])

In [ ]: