notebook.community

Edit and run



In [2]:

    
import numpy
import imaginet.task as task
import imaginet.defn.audiovis_rhn as audiovis









    



Using gpu device 2: GeForce GTX 980 Ti (CNMeM is disabled, cuDNN 5005)



In [ ]:

    
import imaginet.vendrov_provider as dp
# Build the data provider for the 'coco' dataset.
# NOTE(review): `root` is an absolute, user-specific path, so this cell only
# works on a machine with the same directory layout; consider a configurable
# data directory instead.
prov = dp.getDataProvider(dataset='coco', root='/home/gchrupala/reimaginet/')



In [3]:

    
sent = list(prov.iterSentences(split='val'))



In [4]:

    
# Load the saved model file with `imaginet.task.load`.
# NOTE(review): absolute, user-specific path; the notebook cannot be re-run
# elsewhere without editing it.
model = task.load("/home/gchrupala/reimaginet/run-rhn-coco-9-resume/model.r.e9.zip")



In [5]:

    
import imaginet.tts as tts

def synthesize(text):
    """Run `text` through `tts.speak` and decode the result with `tts.decodemp3`.

    The value returned by `tts.decodemp3` is handed back unchanged.
    """
    spoken = tts.speak(text)
    return tts.decodemp3(spoken)

def speak(data):
    """Collect the distinct words in `data` and synthesize each of them.

    Args:
        data: iterable of 3-tuples ``(w1, w2, _)``; the third element is
            ignored.

    Returns:
        A pair ``(voc, speech)``: ``voc`` lists every distinct word once, in
        order of first appearance, and ``speech[i]`` is ``synthesize(voc[i])``.
    """
    seen = set()
    voc = []
    for (w1, w2, _) in data:
        for word in (w1, w2):
            # Record words in first-seen order so the vocabulary (and the
            # aligned `speech` list) is reproducible rather than following
            # the arbitrary iteration order of a set.
            if word not in seen:
                seen.add(word)
                voc.append(word)
    speech = [synthesize(word) for word in voc]
    return (voc, speech)



In [6]:

    
import pydub



In [7]:

    
import funktional.context as context
import theano



In [8]:

    
from funktional.layer import softmax_time
def _make_attention(model):
    """Compile a Theano function returning the attention values of `model`.

    The symbolic graph is built inside ``context.context(training=False)``:
    the task's inputs go through ``Encode``, then ``Attn.Regress1``,
    ``Attn.activation`` and ``Attn.Regress2``, and the result is passed to
    ``softmax_time``.  The compiled function takes ``model.task.inputs`` as
    its arguments.
    """
    with context.context(training=False):
        net = model.task  # local alias; does not reuse the name `task`
        encoded = net.Encode(*net.inputs)
        hidden = net.Attn.activation(net.Attn.Regress1(encoded))
        scores = net.Attn.Regress2(hidden)
        alpha = softmax_time(scores)
    return theano.function(net.inputs, alpha)



In [9]:

    
attention = _make_attention(model)



In [10]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [11]:

    
s = 'A baby sits on a bed laughing with a laptop computer open'



In [12]:

    
sound = tts.from_mp3(tts.speak(s))



In [13]:

    
# Feed a one-element batch (the MFCC features extracted for sentence `s`) to
# `attention`. The [0,:,0] index keeps the first batch item, every position
# along the second axis, and the first entry of the last axis.
alpha = attention([tts.extract_mfcc(synthesize(s))])[0,:,0]



In [36]:

    
def plot_attention(text, bounds=None, marks=None, rotation=None, frame_ms=30):
    """Plot the attention values obtained for a synthesized sentence.

    Args:
        text: sentence that is synthesized, converted to MFCC features and
            passed to `attention`.
        bounds: optional tick positions; they are multiplied by 1000 before
            being placed on the x axis (assumed to be seconds on a
            millisecond axis -- TODO confirm).
        marks: optional tick labels, one per entry of `bounds`.
        rotation: rotation forwarded to `xticks` for the tick labels.
        frame_ms: spacing on the x axis between consecutive attention values.
            Defaults to 30, the value that used to be hard-coded.

    Ticks are only set when both `bounds` and `marks` are given. Nothing is
    returned; the curve is drawn on the current pylab axes.
    """
    alpha = attention([tts.extract_mfcc(synthesize(text))])[0,:,0]
    if bounds is not None and marks is not None:
        xticks(numpy.array(bounds)*1000, marks, rotation=rotation)
    # x positions 0, frame_ms, 2*frame_ms, ... -- one per attention value.
    plot(numpy.arange(len(alpha)) * frame_ms, alpha)



In [15]:

    
mfcc = tts.extract_mfcc(synthesize(s))
mfcc.shape









    Out[15]:





(433, 13)



In [16]:

    
sound[50:100]









    Out[16]:



In [17]:

    
alpha.shape[0]*30









    Out[17]:





4380



In [18]:

    
mfcc.shape[0]*10









    Out[18]:





4330



In [19]:

    
plot(range(0,len(alpha)*30, 30), alpha)









    Out[19]:





[<matplotlib.lines.Line2D at 0x7f98d01c0b50>]



In [20]:

    
sound.export('baby.wav', format='wav')









    Out[20]:





<open file 'baby.wav', mode 'wb+' at 0x7f98d00d8270>



In [21]:

    
alpha.repeat(30)[1380-40:1380+100]









    Out[21]:





array([ 0.00952864,  0.00952864,  0.00952864,  0.00952864,  0.00952864,
        0.00952864,  0.00952864,  0.00952864,  0.00952864,  0.00952864,
        0.02235831,  0.02235831,  0.02235831,  0.02235831,  0.02235831,
        0.02235831,  0.02235831,  0.02235831,  0.02235831,  0.02235831,
        0.02235831,  0.02235831,  0.02235831,  0.02235831,  0.02235831,
        0.02235831,  0.02235831,  0.02235831,  0.02235831,  0.02235831,
        0.02235831,  0.02235831,  0.02235831,  0.02235831,  0.02235831,
        0.02235831,  0.02235831,  0.02235831,  0.02235831,  0.02235831,
        0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,
        0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,
        0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,
        0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,
        0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,
        0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,  0.0246903 ,
        0.02096539,  0.02096539,  0.02096539,  0.02096539,  0.02096539,
        0.02096539,  0.02096539,  0.02096539,  0.02096539,  0.02096539,
        0.02096539,  0.02096539,  0.02096539,  0.02096539,  0.02096539,
        0.02096539,  0.02096539,  0.02096539,  0.02096539,  0.02096539,
        0.02096539,  0.02096539,  0.02096539,  0.02096539,  0.02096539,
        0.02096539,  0.02096539,  0.02096539,  0.02096539,  0.02096539,
        0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,
        0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,
        0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,
        0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,
        0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,
        0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,  0.0199234 ,
        0.0190598 ,  0.0190598 ,  0.0190598 ,  0.0190598 ,  0.0190598 ,
        0.0190598 ,  0.0190598 ,  0.0190598 ,  0.0190598 ,  0.0190598 ], dtype=float32)



In [22]:

    
alpha.repeat(30).argmax()









    Out[22]:





1380



In [23]:

    
import textgrid



In [24]:

    
grid = textgrid.TextGrid.fromFile("baby.TextGrid")



In [25]:

    
tier = grid.tiers[0]



In [26]:

    
bounds = [ iv.minTime for iv in tier ]



In [27]:

    
marks = [ iv.mark for iv in tier ]



In [33]:

    
def from_grid(path):
    """Read a TextGrid file and unpack the intervals of its first tier.

    Returns a triple ``(min_bounds, max_bounds, marks)``: two numpy arrays
    with each interval's ``minTime`` and ``maxTime``, and a list with each
    interval's ``mark``.
    """
    intervals = list(textgrid.TextGrid.fromFile(path).tiers[0])
    starts = numpy.array([interval.minTime for interval in intervals])
    ends = numpy.array([interval.maxTime for interval in intervals])
    labels = [interval.mark for interval in intervals]
    return starts, ends, labels



In [46]:

    
x=range(0,len(alpha)*30, 30)



In [47]:

    
# Wide, short figure showing the attention curve `alpha` against `x`.
figure(figsize=(15,1))
plot(x,alpha)
yticks([],[])  # no tick positions and no labels on the y axis
# Tick positions are `bounds` scaled by 1000 to line up with the values in
# `x`; the tick labels are taken from `marks`.
xticks(numpy.array(bounds)*1000, marks)









    Out[47]:





([<matplotlib.axis.XTick at 0x7f24d5261090>,
  <matplotlib.axis.XTick at 0x7f24d52f6d90>,
  <matplotlib.axis.XTick at 0x7f24d50e0290>,
  <matplotlib.axis.XTick at 0x7f24d50e08d0>,
  <matplotlib.axis.XTick at 0x7f24d50eb050>,
  <matplotlib.axis.XTick at 0x7f24d50eb790>,
  <matplotlib.axis.XTick at 0x7f24d50ebed0>,
  <matplotlib.axis.XTick at 0x7f24d50f6650>,
  <matplotlib.axis.XTick at 0x7f24d50f6d90>,
  <matplotlib.axis.XTick at 0x7f24d507f510>,
  <matplotlib.axis.XTick at 0x7f24d507fc50>,
  <matplotlib.axis.XTick at 0x7f24d50893d0>,
  <matplotlib.axis.XTick at 0x7f24d5089b10>],
 <a list of 13 Text xticklabel objects>)



In [48]:

    
figure(figsize=(15,1))
yticks([])
plot(range(0,len(alpha)*30,30),alpha)









    Out[48]:





[<matplotlib.lines.Line2D at 0x7f24d50e0390>]



In [49]:

    
plot_attention("a frame on the wall")



In [50]:

    
plot_attention("a dog is nice")



In [51]:

    
plot_attention("a dog is black")



In [52]:

    
plot_attention("a dog is brown")



In [53]:

    
plot_attention("a cat sat on the floor")



In [54]:

    
sound = tts.from_mp3(tts.speak("a cat sat on the floor"))



In [28]:

    
sound = tts.from_mp3(tts.speak('while in the immediate foreground juts a gnarled tree branch the majority of the view consists of a an expanse of short grass dotted with a few longer tufts and a number of scattered grazing sheep'))



In [29]:

    
sound









    Out[29]:



In [30]:

    
sound.export("sheep.wav", format="wav")









    Out[30]:





<open file 'sheep.wav', mode 'wb+' at 0x7f98d00d89c0>



In [46]:

    
# Interval start times, end times and labels from the first tier of
# sheep.TextGrid.
# NOTE(review): this rebinds `bounds` and `marks`, which earlier cells set
# from baby.TextGrid -- later cells see the sheep values.
bounds_min, bounds_max, marks = from_grid("sheep.TextGrid")
# Midpoint of every interval (start plus half the interval length); passed
# to plot_attention as the tick positions.
bounds = ((bounds_max - bounds_min) / 2.0) + bounds_min



In [58]:

    
?savefig



In [67]:

    
sheep_text ='while in the immediate foreground juts a gnarled tree branch the majority of the view consists of a an expanse of short grass dotted with a few longer tufts and a number of scattered grazing sheep'
# Set the font size before the figure and its tick labels are created, so it
# applies to this figure no matter when matplotlib resolves font sizes (and
# not only when the cell is run a second time).
matplotlib.rcParams.update({'font.size': 18})
figure(figsize=(20,2), dpi=300)
plot_attention(sheep_text, bounds, marks, rotation=45)
yticks([])
# `pad_inches` is only honoured together with bbox_inches='tight', so the old
# pad_inches=20 had no effect. A tight bounding box keeps the rotated tick
# labels inside the saved PDF.
savefig("sheep.pdf", bbox_inches='tight')



In [61]:

    
bounds_min









    Out[61]:





array([  0.        ,   0.31107205,   0.55372163,   1.16933261,
         1.87930732,   2.21182712,   2.87237321,   3.16445141,
         3.74411431,   4.60237487,   5.2359599 ,   5.78866173,
         6.6469223 ,   7.51866339,   7.84219617,   8.13876789,
         8.5611579 ,   9.19923644,   9.70250965,  10.05749701,
        10.36305574,  10.95170566,  11.51339451,  12.02565474,  12.48848636])



In [44]:

    
bounds_max









    Out[44]:





array([  0.31107205,   0.55372163,   1.16933261,   1.87930732,
         2.21182712,   2.87237321,   3.16445141,   3.74411431,
         4.60237487,   5.2359599 ,   5.78866173,   6.6469223 ,
         7.51866339,   7.84219617,   8.13876789,   8.5611579 ,
         9.19923644,   9.70250965,  10.05749701,  10.36305574,
        10.95170566,  11.51339451,  12.02565474,  12.48848636,  12.84      ])



In [45]:

    
(bounds_max - bounds_min) / 2









    Out[45]:





array([ 0.15553602,  0.12132479,  0.30780549,  0.35498735,  0.1662599 ,
        0.33027305,  0.1460391 ,  0.28983145,  0.42913028,  0.31679251,
        0.27635092,  0.42913028,  0.43587055,  0.16176639,  0.14828586,
        0.21119501,  0.31903927,  0.25163661,  0.17749368,  0.15277937,
        0.29432496,  0.28084443,  0.25613012,  0.23141581,  0.17575682])



In [ ]: