In [2]:
import numpy
import imaginet.task as task
import imaginet.defn.audiovis_rhn as audiovis
In [ ]:
import imaginet.vendrov_provider as dp
prov = dp.getDataProvider(dataset='coco', root='/home/gchrupala/reimaginet/')
In [3]:
sent = list(prov.iterSentences(split='val'))
In [4]:
model = task.load("/home/gchrupala/reimaginet/run-rhn-coco-9-resume/model.r.e9.zip")
In [5]:
import imaginet.tts as tts
def synthesize(text):
    """Return the result of passing `tts.speak(text)` through `tts.decodemp3`."""
    mp3_data = tts.speak(text)
    return tts.decodemp3(mp3_data)
def speak(data):
    """Synthesize every distinct word occurring in `data`.

    Parameters
    ----------
    data : iterable of 3-tuples ``(w1, w2, _)``
        Only the first two items of each tuple are used; the third is ignored.

    Returns
    -------
    (voc, speech) : tuple of two parallel lists
        ``voc`` holds each distinct word once, in order of first appearance;
        ``speech[i]`` is ``synthesize(voc[i])``.
    """
    seen = set()
    voc = []
    for (w1, w2, _) in data:
        for word in (w1, w2):
            # Keep first-appearance order instead of relying on set iteration
            # order, so the returned lists are reproducible from run to run.
            if word not in seen:
                seen.add(word)
                voc.append(word)
    speech = [synthesize(word) for word in voc]
    return (voc, speech)
In [6]:
import pydub
In [7]:
import funktional.context as context
import theano
In [8]:
from funktional.layer import softmax_time
def _make_attention(model):
    """Compile a Theano function that maps the model task's inputs to attention weights.

    The graph is built with the funktional context set to ``training=False``:
    the task's encoder output is passed through ``Attn.Regress1``,
    ``Attn.activation`` and ``Attn.Regress2``, and the result is normalised
    with ``softmax_time``.
    """
    with context.context(training=False):
        net = model.task
        encoded = net.Encode(*net.inputs)
        hidden = net.Attn.activation(net.Attn.Regress1(encoded))
        scores = net.Attn.Regress2(hidden)
        alpha = softmax_time(scores)
    return theano.function(net.inputs, alpha)
In [9]:
# Compile the attention function for the loaded model (see _make_attention).
attention = _make_attention(model)
In [10]:
# Later cells call plot, figure, xticks, yticks and savefig unqualified;
# those names come from this %pylab magic.
%pylab inline
In [11]:
# Example sentence used by the cells below.
s = 'A baby sits on a bed laughing with a laptop computer open'
In [12]:
# Synthesized audio for the sentence (exported to baby.wav in a later cell).
sound = tts.from_mp3(tts.speak(s))
In [13]:
# Attention output for the sentence's MFCC features, reduced to one dimension
# by taking index 0 on the first and last axes.
# NOTE(review): presumably the result is (batch, time, 1) - confirm.
alpha = attention([tts.extract_mfcc(synthesize(s))])[0,:,0]
In [36]:
def plot_attention(text, bounds=None, marks=None, rotation=None):
    """Plot the attention weights computed for the synthesized `text`.

    Parameters
    ----------
    text : sentence to synthesize and run through `attention`.
    bounds, marks : optional tick positions and tick labels. They are applied
        only when both are given; positions are `bounds` multiplied by 1000.
        NOTE(review): presumably bounds are seconds and the x axis is in
        milliseconds - confirm.
    rotation : passed through to `xticks` for the tick labels.

    Draws on the current pylab figure and returns None.
    """
    step = 30  # x-axis spacing between consecutive attention weights
    features = tts.extract_mfcc(synthesize(text))
    weights = attention([features])[0, :, 0]
    have_labels = bounds is not None and marks is not None
    if have_labels:
        xticks(numpy.array(bounds) * 1000, marks, rotation=rotation)
    positions = range(0, len(weights) * step, step)
    plot(positions, weights)
In [15]:
# MFCC features for the example sentence; their shape is the cell output.
mfcc = tts.extract_mfcc(synthesize(s))
mfcc.shape
Out[15]:
In [16]:
# Display a slice of the synthesized sound object.
sound[50:100]
Out[16]:
In [17]:
# Number of attention steps times 30, to compare with the next cell
# (number of MFCC frames times 10).
# NOTE(review): presumably both are durations in ms (30 ms per attention
# step, 10 ms per MFCC frame) - confirm.
alpha.shape[0]*30
Out[17]:
In [18]:
mfcc.shape[0]*10
Out[18]:
In [19]:
# Attention weights against an x axis that advances by 30 per weight.
plot(range(0,len(alpha)*30, 30), alpha)
Out[19]:
In [20]:
# Write the synthesized sentence to baby.wav.
sound.export('baby.wav', format='wav')
Out[20]:
In [21]:
# Repeat every weight 30 times and show the window from 40 before to
# 100 after index 1380.
alpha.repeat(30)[1380-40:1380+100]
Out[21]:
In [22]:
# Index of the largest weight in the repeated array.
alpha.repeat(30).argmax()
Out[22]:
In [23]:
import textgrid
In [24]:
# Load the TextGrid file for the baby sentence.
# NOTE(review): presumably an alignment of baby.wav produced outside this notebook - confirm.
grid = textgrid.TextGrid.fromFile("baby.TextGrid")
In [25]:
# Only the first tier is used.
tier = grid.tiers[0]
In [26]:
# minTime of every interval in the tier.
bounds = [ iv.minTime for iv in tier ]
In [27]:
# mark of every interval, parallel to `bounds`.
marks = [ iv.mark for iv in tier ]
In [33]:
def from_grid(path):
    """Read the first tier of the TextGrid file at `path`.

    Returns a 3-tuple ``(min_bounds, max_bounds, marks)``: numpy arrays of the
    intervals' ``minTime`` and ``maxTime`` values, and a list of their
    ``mark`` values, all in tier order.
    """
    first_tier = textgrid.TextGrid.fromFile(path).tiers[0]
    starts = numpy.array([interval.minTime for interval in first_tier])
    ends = numpy.array([interval.maxTime for interval in first_tier])
    labels = [interval.mark for interval in first_tier]
    return starts, ends, labels
In [46]:
# x positions for the attention weights: 0, 30, 60, ... (one per entry of alpha).
x=range(0,len(alpha)*30, 30)
In [47]:
# Wide, short figure of the attention weights with the y ticks hidden and
# the x ticks placed at bounds*1000, labelled with the interval marks.
figure(figsize=(15,1))
plot(x,alpha)
yticks([],[])
xticks(numpy.array(bounds)*1000, marks)
Out[47]:
In [48]:
# The same curve without the custom x ticks.
figure(figsize=(15,1))
yticks([])
plot(range(0,len(alpha)*30,30),alpha)
Out[48]:
In [49]:
# Attention plots for a handful of short example sentences (no tick labels).
plot_attention("a frame on the wall")
In [50]:
plot_attention("a dog is nice")
In [51]:
plot_attention("a dog is black")
In [52]:
plot_attention("a dog is brown")
In [53]:
plot_attention("a cat sat on the floor")
In [54]:
# NOTE(review): in file order this value of `sound` is replaced by the next
# cell before any visible cell uses it.
sound = tts.from_mp3(tts.speak("a cat sat on the floor"))
In [28]:
# Synthesized audio for the long "sheep" sentence.
sound = tts.from_mp3(tts.speak('while in the immediate foreground juts a gnarled tree branch the majority of the view consists of a an expanse of short grass dotted with a few longer tufts and a number of scattered grazing sheep'))
In [29]:
# Display the sound object.
sound
Out[29]:
In [30]:
# Write the sheep sentence audio to sheep.wav.
sound.export("sheep.wav", format="wav")
Out[30]:
In [46]:
# Interval start times, end times and labels from the sheep TextGrid.
bounds_min, bounds_max, marks = from_grid("sheep.TextGrid")
# Midpoint of every interval: its start plus half its length.
bounds = bounds_min + (bounds_max - bounds_min) / 2.0
In [58]:
# NOTE(review): leftover IPython help lookup for savefig; nothing in the
# visible cells depends on it.
?savefig
In [67]:
sheep_text ='while in the immediate foreground juts a gnarled tree branch the majority of the view consists of a an expanse of short grass dotted with a few longer tufts and a number of scattered grazing sheep'
# Set the font size before the figure is created: rcParams only affect
# artists created afterwards, so updating them after plotting (as before)
# changed nothing until the cell was run a second time.
matplotlib.rcParams.update({'font.size': 18})
figure(figsize=(20,2), dpi=300)
# Attention curve with one rotated tick label per interval, placed at the
# interval midpoints held in `bounds`.
plot_attention(sheep_text, bounds, marks, rotation=45)
yticks([])
# pad_inches is only honoured together with bbox_inches='tight'; the tight
# bounding box also keeps the rotated tick labels inside the saved page.
savefig("sheep.pdf", bbox_inches='tight')
In [61]:
# Display the interval start times returned by from_grid.
bounds_min
Out[61]:
In [44]:
# Display the interval end times returned by from_grid.
bounds_max
Out[44]:
In [45]:
# Half the length of every interval.
(bounds_max - bounds_min) / 2
Out[45]:
In [ ]: