In [1666]:
# load this data by running ./setup
filename = 'African Drum Music-wXV39pybgJU-sm.wav'

In [1667]:
from scipy.io.wavfile import read, write
from librosa.core import stft, istft

# load wav data
_, y = read(filename)

In [1668]:
# convert timeseries wav data into spectrogram data
D = stft(y)

In [1681]:
x = numpy.array([D.real, D.imag])

In [1687]:
# x is now shape 2, freq, samples
x.shape


Out[1687]:
(2, 1025, 5169)

In [1689]:
# x is the domain that the neural network can operate in
# NOTE: not completely true as we'll learn later

In [1690]:
# and to convert back into complex for conversion back to wav
D2 = x[0,:,:] + (x[1,:,:] * 1j)

In [1692]:
# D and D2 are the same
sum(D - D2) == 0


Out[1692]:
True

In [1693]:
# and then back to timeseries wav
back_y = istft(D2)

In [1694]:
# convert back_y into int16 or whatever the file format has.  otherwise back_y is float
back_y = np.array(back_y, dtype=y.dtype)

In [1695]:
# write test-out for listening
write('test-out.wav', 44100, back_y)

In [1696]:
# a measure of how different y and y->fft->back_y are
sum(abs(y - back_y))/len(y)


Out[1696]:
0

In [1697]:
# and now for some sanity checks of the data strucutres produced above

In [1698]:
# the shape of the input and output are the same
print y.shape
print back_y.shape


(2646016,)
(2646016,)

In [1699]:
# y is roughly 44100 samples/second * 60 seconds.  I'm not sure why its off by 16 ...
y.shape[0], 44100 * 60


Out[1699]:
(2646016, 2646000)

In [1700]:
# NOTE: the default FFT window size is 2048 and hop_length is 2048/4

In [1702]:
# these two are *almost* the same.  Anyone know why?
print D.shape[1]
print len(y)/2048. * 4.


5169
5168.0

In [1710]:
# FYI: windows are roughly 11ms long
# (length of input / (samples/second)) * (milliseconds/second) / (samples/window) => milliseconds/window
float(len(y)) / 44100 * 1000 / 5169


Out[1710]:
11.607731246235517

In [1712]:
# FYI: D is roughly twice the size in bytes due to complex num
float(D.shape[0] * D.shape[1]) / len(y)


Out[1712]:
2.002340499830689