notebook.community

Edit and run



In [1666]:

    
# load this data by running ./setup
filename = 'African Drum Music-wXV39pybgJU-sm.wav'



In [1667]:

    
from scipy.io.wavfile import read, write
from librosa.core import stft, istft

# load wav data
_, y = read(filename)



In [1668]:

    
# convert timeseries wav data into spectrogram data
D = stft(y)



In [1681]:

    
x = numpy.array([D.real, D.imag])



In [1687]:

    
# x is now shape 2, freq, samples
x.shape









    Out[1687]:





(2, 1025, 5169)



In [1689]:

    
# x is the domain that the neural network can operate in
# NOTE: not completely true as we'll learn later



In [1690]:

    
# and to convert back into complex for conversion back to wav
D2 = x[0,:,:] + (x[1,:,:] * 1j)



In [1692]:

    
# D and D2 are the same
sum(D - D2) == 0









    Out[1692]:





True



In [1693]:

    
# and then back to timeseries wav
back_y = istft(D2)



In [1694]:

    
# convert back_y into int16 or whatever the file format has.  otherwise back_y is float
back_y = np.array(back_y, dtype=y.dtype)



In [1695]:

    
# write test-out for listening
write('test-out.wav', 44100, back_y)



In [1696]:

    
# a measure of how different y and y->fft->back_y are
sum(abs(y - back_y))/len(y)









    Out[1696]:





0



In [1697]:

    
# and now for some sanity checks of the data strucutres produced above



In [1698]:

    
# the shape of the input and output are the same
print y.shape
print back_y.shape









    



(2646016,)
(2646016,)



In [1699]:

    
# y is roughly 44100 samples/second * 60 seconds.  I'm not sure why its off by 16 ...
y.shape[0], 44100 * 60









    Out[1699]:





(2646016, 2646000)



In [1700]:

    
# NOTE: the default FFT window size is 2048 and hop_length is 2048/4



In [1702]:

    
# these two are *almost* the same.  Anyone know why?
print D.shape[1]
print len(y)/2048. * 4.



In [1710]:

    
# FYI: windows are roughly 11ms long
# (length of input / (samples/second)) * (milliseconds/second) / (samples/window) => milliseconds/window
float(len(y)) / 44100 * 1000 / 5169









    Out[1710]:





11.607731246235517



In [1712]:

    
# FYI: D is roughly twice the size in bytes due to complex num
float(D.shape[0] * D.shape[1]) / len(y)









    Out[1712]:





2.002340499830689