In [1]:
import scipy.io.wavfile as wav
import csv
import numpy
import imaginet.tts
import python_speech_features as features


Using gpu device 0: GeForce GTX 980 Ti (CNMeM is disabled, cuDNN 5005)

In [24]:
wavmap = list(csv.reader(open("flickr_audio/wav2capt.txt"), delimiter=' '))

In [55]:
import time
import timeout_decorator

@timeout_decorator.timeout(5)
def readwav(f):
    return wav.read(f)

def feats(path):
    try:
        (rate,sig) = readwav(open("flickr_audio/wavs/{}".format(path)))
    except timeout_decorator.TimeoutError:
        print "Timed out {}".format(path)
        rate = 16000
        sig = numpy.zeros(10)
    return features.mfcc(sig, rate)

In [56]:
for i in range(0,40):
    print i
    mfcc = numpy.array([  feats(item[0]) for item in wavmap[i*1000:i*1000+1000] ])
    numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.mfcc.{}.npy".format(i), mfcc)


20
Timed out 2865703567_52de2444f2_0.wav
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39

In [66]:
mfcc = numpy.hstack([ numpy.load("dataset.human.mfcc.{}.npy".format(i)) for i in range(0, 40) ])

In [67]:
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.mfcc.npy", mfcc)

In [72]:
size = [row.shape[0] for row in mfcc]

In [73]:
numpy.mean(size)


Out[73]:
412.11077499999999

In [74]:
numpy.median(size)


Out[74]:
389.0

In [75]:
numpy.min(size)


Out[75]:
1

In [76]:
numpy.max(size)


Out[76]:
32350

In [77]:
mfcc = numpy.hstack([ numpy.load("dataset.human.mfcc.{}.npy".format(i))[:3000] for i in range(0, 40) ])
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max3K.mfcc.npy", mfcc)

In [84]:



Out[84]:
39997

In [85]:
import imaginet.data_provider as dp

In [88]:
prov = dp.getDataProvider('flickr8k', root='/home/gchrupala/reimaginet/', audio_kind=None)


Could not read file /home/gchrupala/reimaginet/data/flickr8k/dataset.ipa.jsonl.gz: IPA transcription not available
Could not read file /home/gchrupala/reimaginet/data/flickr8k/dataset.None.npy: audio features not available

In [91]:
prov.dataset['images'][0]['filename']


Out[91]:
u'2513260012_03d33305cf.jpg'

In [92]:
import csv

In [101]:
wav2cap = numpy.array(list(csv.reader(open("flickr_audio/wav2capt.txt"), delimiter=' ')))

In [102]:
wav2cap


Out[102]:
array([['2571096893_694ce79768_1.wav', '2571096893_694ce79768.jpg', '#1'],
       ['2571096893_694ce79768_2.wav', '2571096893_694ce79768.jpg', '#2'],
       ['2570559405_dc93007f76_4.wav', '2570559405_dc93007f76.jpg', '#4'],
       ..., 
       ['534669139_1a4f8ab9d5_3.wav', '534669139_1a4f8ab9d5.jpg', '#3'],
       ['534875358_6ea30d3091_1.wav', '534875358_6ea30d3091.jpg', '#1'],
       ['534875358_6ea30d3091_0.wav', '534875358_6ea30d3091.jpg', '#0']], 
      dtype='|S27')

In [106]:
cap2wav = {}
for row in wav2cap:
    cap2wav[row[1]+row[2]] = row[0]

In [107]:
cap2text = dict(list(csv.reader(open("Flickr8k.token.txt"), delimiter='\t')))

In [111]:
wav2ID = dict([ (w,i) for (i,w) in enumerate(wav2cap[:,0]) ])

In [117]:
# Check consistency
for img in prov.dataset['images']:
    for (i,sent) in enumerate(img['sentences']):
        cap = "{}#{}".format(img['filename'],i)
        if cap2text[cap] != sent['raw']:
            print cap2text[cap], sent['raw']
        else:
            pass

In [119]:
def genmfcc():
    for img in prov.dataset['images']:
        for (i,sent) in enumerate(img['sentences']):
            cap = "{}#{}".format(img['filename'],i)
            yield mfcc[wav2ID[cap2wav[cap]]]

In [120]:
MFCC = numpy.array(list(genmfcc()))

In [124]:
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max3K.ord.mfcc.npy", MFCC)

In [125]:
import scipy

In [131]:
sent = [s for img in prov.dataset['images'] for s in img['sentences'] ]
scipy.stats.pearsonr([len(row) for row in MFCC], [ len(s['raw']) for s in sent ])


Out[131]:
(0.47461496238223072, 0.0)

In [132]:
scipy.stats.pearsonr([len(row) for row in mfcc], [ len(s['raw']) for s in sent ])


Out[132]:
(-0.003074679304299016, 0.53860881075333633)

In [133]:
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max1K.ord.mfcc.npy", 
           numpy.array([row[:1000] for row in MFCC]))

First dimension is the log of frame energy. Indices 1-12 correspond to cepstral coeffs.


In [200]:
numpy.hstack([MFCC[0],MFCC[0]]).shape


Out[200]:
(306, 26)

Delta coefficients


In [33]:
def delta(v, N=2, offset=1):
    d = numpy.zeros_like(v[:, offset:])
    for t in range(0, d.shape[0]):
        Z = 2 * sum(n**2 for n in range(1, N+1))
        d[t,:] = sum(n * (v[min(t+n, v.shape[0]-1), offset:]-v[max(t-n, 0), offset:]) for n in range(1,N+1)) / Z
    return d

In [34]:
def add_delta(data):
    return numpy.array( [ numpy.hstack([row, delta(row,N=2, offset=1)]) for row in data ])
def add_accel(data):
    return numpy.array( [ numpy.hstack([row, delta(row, N=2, offset=1), delta(delta(row, N=2, offset=1), offset=0)]) for row in data ])

In [35]:
data = numpy.load("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max1K.ord.mfcc.npy")
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max1K.delta3.ord.mfcc.npy", add_delta(data))
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max1K.accel3.ord.mfcc.npy", add_accel(data))

In [11]:
accel2 = numpy.load("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max1K.accel2.ord.mfcc.npy")

In [12]:
accel2[0].shape


Out[12]:
(306, 39)

In [9]:
a = numpy.zeros((5,5))

In [10]:
a


Out[10]:
array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])

In [11]:
b = numpy.ones((5,3))

In [13]:
a[:,:1]


Out[13]:
array([[ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.]])

In [32]:
delta(data[0])


Out[32]:
array([[ 0.61958529, -0.2840682 , -1.53657403, ..., -1.42725674,
        -0.14426044, -2.76910033],
       [ 1.01140882,  0.05759722, -1.42712091, ..., -2.0242593 ,
         1.27695695, -2.68136078],
       [ 0.3469407 ,  1.1368279 ,  0.56448641, ..., -2.60394163,
         2.15386007, -1.74528471],
       ..., 
       [-2.04674055,  1.4516821 ,  2.85773237, ..., -3.41858184,
         3.78537138, -2.74750143],
       [-0.94632305,  1.30881118,  2.54623265, ..., -5.04907538,
         2.2213168 , -0.92555301],
       [-0.34780258,  0.33054613,  1.70352806, ..., -4.09286694,
         0.85026177, -0.04489378]])

In [ ]: