In [1]:
import scipy.io.wavfile as wav
import csv
import numpy
import imaginet.tts
import python_speech_features as features
In [24]:
# Read the wav2capt table: one space-delimited row per line, first column is
# the wav file name. The `with` block closes the file handle once all rows
# have been materialised.
with open("flickr_audio/wav2capt.txt") as wavmap_file:
    wavmap = list(csv.reader(wavmap_file, delimiter=' '))
In [55]:
import time
import timeout_decorator
@timeout_decorator.timeout(5)
def readwav(f):
    """Parse `f` with ``wav.read`` and hand back its result unchanged.

    The call is wrapped in ``timeout_decorator.timeout(5)``; callers in this
    notebook catch ``timeout_decorator.TimeoutError`` when the limit is hit.
    """
    parsed = wav.read(f)
    return parsed
def feats(path):
    """Return ``features.mfcc`` output for one file under ``flickr_audio/wavs/``.

    path: file name relative to ``flickr_audio/wavs/``.

    If reading the file raises ``timeout_decorator.TimeoutError``, a message
    is printed and the features of a 10-sample all-zero signal at rate 16000
    are returned instead.
    """
    try:
        # Binary mode: wav content is raw bytes. The `with` block closes the
        # handle even when readwav() times out.
        with open("flickr_audio/wavs/{}".format(path), "rb") as wav_file:
            (rate, sig) = readwav(wav_file)
    except timeout_decorator.TimeoutError:
        print("Timed out {}".format(path))
        # Placeholder signal so a feature matrix is still returned.
        rate = 16000
        sig = numpy.zeros(10)
    return features.mfcc(sig, rate)
In [56]:
# Process wavmap in 40 consecutive slices of 1000 rows. Each slice is turned
# into features via feats() and written to its own numbered .npy file; the
# slice index is printed as a progress marker.
for i in range(40):
    print(i)
    start = i * 1000
    batch = wavmap[start:start + 1000]
    mfcc = numpy.array([feats(item[0]) for item in batch])
    numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.mfcc.{}.npy".format(i), mfcc)
In [66]:
# Load the 40 numbered batch files and join them with hstack.
# NOTE(review): these are read relative to the working directory, whereas the
# batch files are written with an absolute path -- presumably the notebook is
# run from that directory; confirm.
chunks = []
for i in range(0, 40):
    chunks.append(numpy.load("dataset.human.mfcc.{}.npy".format(i)))
mfcc = numpy.hstack(chunks)
In [67]:
# Write the joined `mfcc` array to a single .npy file.
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.mfcc.npy", mfcc)
In [72]:
# Length of the first axis of every entry of `mfcc`
# (presumably the number of frames per recording -- TODO confirm).
size = [row.shape[0] for row in mfcc]
In [73]:
# Mean of the per-entry lengths in `size`.
numpy.mean(size)
Out[73]:
In [74]:
# Median of the per-entry lengths in `size`.
numpy.median(size)
Out[74]:
In [75]:
# Smallest per-entry length in `size`.
numpy.min(size)
Out[75]:
In [76]:
# Largest per-entry length in `size`.
numpy.max(size)
Out[76]:
In [77]:
# Cap every entry at its first 3000 rows before joining the batches.
# The cap has to be applied per entry (row[:3000]): slicing the loaded batch
# itself with [:3000] only selects the first 3000 *entries* of a batch, which
# leaves a 1000-entry batch unchanged and truncates nothing.
mfcc = numpy.hstack([
    numpy.array([row[:3000] for row in numpy.load("dataset.human.mfcc.{}.npy".format(i))])
    for i in range(0, 40)
])
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max3K.mfcc.npy", mfcc)
In [84]:
Out[84]:
In [85]:
import imaginet.data_provider as dp
In [88]:
# Create the flickr8k data provider (audio_kind=None presumably skips loading
# audio features -- TODO confirm against imaginet.data_provider).
# NOTE(review): root is '/home/gchrupala/reimaginet/', while the .npy files in
# this notebook are written under '/home/gchrupala/repos/reimaginet/' --
# confirm that both locations are intended.
prov = dp.getDataProvider('flickr8k', root='/home/gchrupala/reimaginet/', audio_kind=None)
In [91]:
# Show the 'filename' field of the first image record.
prov.dataset['images'][0]['filename']
Out[91]:
In [92]:
import csv
In [101]:
# Read the space-delimited wav2capt table into a numpy array of strings.
# The `with` block closes the file handle once the rows have been read.
with open("flickr_audio/wav2capt.txt") as wav2capt_file:
    wav2cap = numpy.array(list(csv.reader(wav2capt_file, delimiter=' ')))
In [102]:
# Display the table to inspect its layout.
wav2cap
Out[102]:
In [106]:
# Caption key (second and third column concatenated) -> wav name (first column).
cap2wav = {row[1] + row[2]: row[0] for row in wav2cap}
In [107]:
# Caption key -> caption text, read from the tab-delimited token file.
# dict() consumes the reader directly, and the `with` block closes the file
# handle afterwards.
with open("Flickr8k.token.txt") as token_file:
    cap2text = dict(csv.reader(token_file, delimiter='\t'))
In [111]:
# Wav name (first column of wav2cap) -> its row index in wav2cap.
wav2ID = {w: i for (i, w) in enumerate(wav2cap[:, 0])}
In [117]:
# Check consistency
for img in prov.dataset['images']:
for (i,sent) in enumerate(img['sentences']):
cap = "{}#{}".format(img['filename'],i)
if cap2text[cap] != sent['raw']:
print cap2text[cap], sent['raw']
else:
pass
In [119]:
def genmfcc():
    """Yield one entry of `mfcc` per sentence, in provider order.

    For sentence number i of each image, the key "<filename>#<i>" is looked
    up in cap2wav to get a wav name, which wav2ID turns into an index into
    the global `mfcc` array.
    """
    for img in prov.dataset['images']:
        for i, _sent in enumerate(img['sentences']):
            key = "{}#{}".format(img['filename'], i)
            wav_name = cap2wav[key]
            yield mfcc[wav2ID[wav_name]]
In [120]:
# Materialise everything genmfcc() yields into one array.
MFCC = numpy.array(list(genmfcc()))
In [124]:
# Save the reordered features (the "ord" variant of the max3K file).
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max3K.ord.mfcc.npy", MFCC)
In [125]:
import scipy
In [131]:
# `import scipy` on its own does not guarantee that the stats subpackage is
# loaded, so import it explicitly rather than relying on some other module
# having pulled it in.
import scipy.stats

# Flatten all sentences in provider order, then correlate the length of each
# MFCC entry with the character length of the matching raw sentence.
sent = [s for img in prov.dataset['images'] for s in img['sentences']]
scipy.stats.pearsonr([len(row) for row in MFCC], [len(s['raw']) for s in sent])
Out[131]:
In [132]:
# Same correlation, but using `mfcc` (the array before reordering through
# genmfcc) against the sentence lengths, for comparison.
scipy.stats.pearsonr([len(row) for row in mfcc], [ len(s['raw']) for s in sent ])
Out[132]:
In [133]:
# Keep at most the first 1000 rows of every entry of MFCC, then save the result.
max1k = numpy.array([row[:1000] for row in MFCC])
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max1K.ord.mfcc.npy", max1k)
The first coefficient of each frame (index 0) is the log of the frame energy; indices 1-12 are the cepstral coefficients.
In [200]:
# Check the shape produced by horizontally stacking the first entry with itself.
numpy.hstack([MFCC[0],MFCC[0]]).shape
Out[200]:
In [33]:
def delta(v, N=2, offset=1):
    """Compute regression-style delta features along the rows of `v`.

    For every row t and every column c >= offset:

        d[t, c - offset] = sum_{n=1..N} n * (v[t+n, c] - v[t-n, c])
                           / (2 * sum_{n=1..N} n**2)

    Row indices that fall outside the array are clamped to the first or
    last row.

    v: 2-D array, one row per time step.
    N: number of neighbouring rows used on each side; must be >= 1.
    offset: number of leading columns to leave out (the default of 1 skips
        column 0).

    Returns an array with the shape and dtype of ``v[:, offset:]``.
    Raises ValueError if N < 1, because the normaliser would be zero.
    """
    if N < 1:
        raise ValueError("N must be >= 1, got {}".format(N))
    cols = v[:, offset:]
    d = numpy.zeros_like(cols)
    last = v.shape[0] - 1
    t = numpy.arange(v.shape[0])
    # The normaliser depends only on N, so it is computed once.
    Z = 2 * sum(n**2 for n in range(1, N+1))
    # Handle all rows at once: integer-array indexing with clamped row
    # indices replaces the per-row Python loop.
    num = sum(n * (cols[numpy.minimum(t + n, last)] - cols[numpy.maximum(t - n, 0)])
              for n in range(1, N+1))
    # Assign into `d` so the result keeps the dtype of the input slice.
    d[...] = num / Z
    return d
In [34]:
def add_delta(data):
    """Append delta features to every item of `data`.

    Each item is stacked horizontally with ``delta(item, N=2, offset=1)``;
    the augmented items are returned wrapped in a numpy array.
    """
    augmented = []
    for utt in data:
        augmented.append(numpy.hstack([utt, delta(utt, N=2, offset=1)]))
    return numpy.array(augmented)
def add_accel(data):
    """Append delta and delta-of-delta features to every item of `data`.

    For each item the result is the horizontal stack of: the item itself,
    ``delta(item, N=2, offset=1)``, and ``delta(..., offset=0)`` applied to
    those deltas. The augmented items are returned wrapped in a numpy array.
    """
    augmented = []
    for row in data:
        # Compute the first-order deltas once and reuse them, instead of
        # calling delta() twice with identical arguments.
        first_order = delta(row, N=2, offset=1)
        second_order = delta(first_order, offset=0)
        augmented.append(numpy.hstack([row, first_order, second_order]))
    return numpy.array(augmented)
In [35]:
# Load the max1K "ord" features and write two augmented copies:
# one produced by add_delta() and one produced by add_accel().
data = numpy.load("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max1K.ord.mfcc.npy")
with_delta = add_delta(data)
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max1K.delta3.ord.mfcc.npy", with_delta)
with_accel = add_accel(data)
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max1K.accel3.ord.mfcc.npy", with_accel)
In [11]:
# Load a previously saved "accel2" feature file.
# NOTE(review): the cells visible in this notebook only write delta3/accel3
# files; accel2 presumably comes from an earlier run -- confirm.
accel2 = numpy.load("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max1K.accel2.ord.mfcc.npy")
In [12]:
# Shape of the first entry of the loaded array.
accel2[0].shape
Out[12]:
In [9]:
# 5x5 array of zeros (appears to be scratch data for trying out slicing).
a = numpy.zeros((5,5))
In [10]:
# Display `a`.
a
Out[10]:
In [11]:
# 5x3 array of ones (not used by any other cell visible here).
b = numpy.ones((5,3))
In [13]:
# Slice out the first column of `a`; using :1 keeps the result two-dimensional.
a[:,:1]
Out[13]:
In [32]:
# Try delta() with its default arguments on the first entry of `data`.
delta(data[0])
Out[32]:
In [ ]: