notebook.community

Edit and run



In [1]:

    
import scipy.io.wavfile as wav
import csv
import numpy
import imaginet.tts
import python_speech_features as features









    



Using gpu device 0: GeForce GTX 980 Ti (CNMeM is disabled, cuDNN 5005)



In [24]:

    
wavmap = list(csv.reader(open("flickr_audio/wav2capt.txt"), delimiter=' '))



In [55]:

    
import time
import timeout_decorator

@timeout_decorator.timeout(5)
def readwav(f):
    return wav.read(f)

def feats(path):
    try:
        (rate,sig) = readwav(open("flickr_audio/wavs/{}".format(path)))
    except timeout_decorator.TimeoutError:
        print "Timed out {}".format(path)
        rate = 16000
        sig = numpy.zeros(10)
    return features.mfcc(sig, rate)



In [56]:

    
for i in range(0,40):
    print i
    mfcc = numpy.array([  feats(item[0]) for item in wavmap[i*1000:i*1000+1000] ])
    numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.mfcc.{}.npy".format(i), mfcc)









    



20
Timed out 2865703567_52de2444f2_0.wav
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39



In [66]:

    
mfcc = numpy.hstack([ numpy.load("dataset.human.mfcc.{}.npy".format(i)) for i in range(0, 40) ])



In [67]:

    
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.mfcc.npy", mfcc)



In [72]:

    
size = [row.shape[0] for row in mfcc]



In [73]:

    
numpy.mean(size)









    Out[73]:





412.11077499999999



In [74]:

    
numpy.median(size)









    Out[74]:





389.0



In [75]:

    
numpy.min(size)









    Out[75]:





1



In [76]:

    
numpy.max(size)









    Out[76]:





32350



In [77]:

    
mfcc = numpy.hstack([ numpy.load("dataset.human.mfcc.{}.npy".format(i))[:3000] for i in range(0, 40) ])
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max3K.mfcc.npy", mfcc)



In [84]:









    Out[84]:





39997



In [85]:

    
import imaginet.data_provider as dp



In [88]:

    
prov = dp.getDataProvider('flickr8k', root='/home/gchrupala/reimaginet/', audio_kind=None)









    



Could not read file /home/gchrupala/reimaginet/data/flickr8k/dataset.ipa.jsonl.gz: IPA transcription not available
Could not read file /home/gchrupala/reimaginet/data/flickr8k/dataset.None.npy: audio features not available



In [91]:

    
prov.dataset['images'][0]['filename']









    Out[91]:





u'2513260012_03d33305cf.jpg'



In [92]:

    
import csv



In [101]:

    
wav2cap = numpy.array(list(csv.reader(open("flickr_audio/wav2capt.txt"), delimiter=' ')))



In [102]:

    
wav2cap









    Out[102]:





array([['2571096893_694ce79768_1.wav', '2571096893_694ce79768.jpg', '#1'],
       ['2571096893_694ce79768_2.wav', '2571096893_694ce79768.jpg', '#2'],
       ['2570559405_dc93007f76_4.wav', '2570559405_dc93007f76.jpg', '#4'],
       ..., 
       ['534669139_1a4f8ab9d5_3.wav', '534669139_1a4f8ab9d5.jpg', '#3'],
       ['534875358_6ea30d3091_1.wav', '534875358_6ea30d3091.jpg', '#1'],
       ['534875358_6ea30d3091_0.wav', '534875358_6ea30d3091.jpg', '#0']], 
      dtype='|S27')



In [106]:

    
cap2wav = {}
for row in wav2cap:
    cap2wav[row[1]+row[2]] = row[0]



In [107]:

    
cap2text = dict(list(csv.reader(open("Flickr8k.token.txt"), delimiter='\t')))



In [111]:

    
wav2ID = dict([ (w,i) for (i,w) in enumerate(wav2cap[:,0]) ])



In [117]:

    
# Check consistency
for img in prov.dataset['images']:
    for (i,sent) in enumerate(img['sentences']):
        cap = "{}#{}".format(img['filename'],i)
        if cap2text[cap] != sent['raw']:
            print cap2text[cap], sent['raw']
        else:
            pass



In [119]:

    
def genmfcc():
    for img in prov.dataset['images']:
        for (i,sent) in enumerate(img['sentences']):
            cap = "{}#{}".format(img['filename'],i)
            yield mfcc[wav2ID[cap2wav[cap]]]



In [120]:

    
MFCC = numpy.array(list(genmfcc()))



In [124]:

    
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max3K.ord.mfcc.npy", MFCC)



In [125]:

    
import scipy



In [131]:

    
sent = [s for img in prov.dataset['images'] for s in img['sentences'] ]
scipy.stats.pearsonr([len(row) for row in MFCC], [ len(s['raw']) for s in sent ])









    Out[131]:





(0.47461496238223072, 0.0)



In [132]:

    
scipy.stats.pearsonr([len(row) for row in mfcc], [ len(s['raw']) for s in sent ])









    Out[132]:





(-0.003074679304299016, 0.53860881075333633)



In [133]:

    
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max1K.ord.mfcc.npy", 
           numpy.array([row[:1000] for row in MFCC]))

First dimension is the log of frame energy. Indices 1-12 correspond to cepstral coeffs.



In [200]:

    
numpy.hstack([MFCC[0],MFCC[0]]).shape









    Out[200]:





(306, 26)

Delta coefficients



In [33]:

    
def delta(v, N=2, offset=1):
    d = numpy.zeros_like(v[:, offset:])
    for t in range(0, d.shape[0]):
        Z = 2 * sum(n**2 for n in range(1, N+1))
        d[t,:] = sum(n * (v[min(t+n, v.shape[0]-1), offset:]-v[max(t-n, 0), offset:]) for n in range(1,N+1)) / Z
    return d



In [34]:

    
def add_delta(data):
    return numpy.array( [ numpy.hstack([row, delta(row,N=2, offset=1)]) for row in data ])
def add_accel(data):
    return numpy.array( [ numpy.hstack([row, delta(row, N=2, offset=1), delta(delta(row, N=2, offset=1), offset=0)]) for row in data ])



In [35]:

    
data = numpy.load("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max1K.ord.mfcc.npy")
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max1K.delta3.ord.mfcc.npy", add_delta(data))
numpy.save("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max1K.accel3.ord.mfcc.npy", add_accel(data))



In [11]:

    
accel2 = numpy.load("/home/gchrupala/repos/reimaginet/data/flickr8k/dataset.human.max1K.accel2.ord.mfcc.npy")



In [12]:

    
accel2[0].shape









    Out[12]:





(306, 39)



In [9]:

    
a = numpy.zeros((5,5))



In [10]:

    
a









    Out[10]:





array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])



In [11]:

    
b = numpy.ones((5,3))



In [13]:

    
a[:,:1]









    Out[13]:





array([[ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.]])



In [32]:

    
delta(data[0])









    Out[32]:





array([[ 0.61958529, -0.2840682 , -1.53657403, ..., -1.42725674,
        -0.14426044, -2.76910033],
       [ 1.01140882,  0.05759722, -1.42712091, ..., -2.0242593 ,
         1.27695695, -2.68136078],
       [ 0.3469407 ,  1.1368279 ,  0.56448641, ..., -2.60394163,
         2.15386007, -1.74528471],
       ..., 
       [-2.04674055,  1.4516821 ,  2.85773237, ..., -3.41858184,
         3.78537138, -2.74750143],
       [-0.94632305,  1.30881118,  2.54623265, ..., -5.04907538,
         2.2213168 , -0.92555301],
       [-0.34780258,  0.33054613,  1.70352806, ..., -4.09286694,
         0.85026177, -0.04489378]])



In [ ]: