In [1666]:
# load this data by running ./setup
filename = 'African Drum Music-wXV39pybgJU-sm.wav'
In [1667]:
from scipy.io.wavfile import read, write
from librosa.core import stft, istft
import numpy as np
# load wav data (sr is the sample rate, 44100 Hz for this file)
sr, y = read(filename)
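In [ ]:
# a minimal guard, assuming the wav could be multi-channel: scipy's read returns
# shape (n_samples, n_channels) for stereo files, while stft below expects a 1-D
# mono signal, so mix the channels down if needed (likely a no-op for this file)
if y.ndim > 1:
    y = y.mean(axis=1).astype(y.dtype)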
In [1668]:
# convert timeseries wav data into spectrogram data
D = stft(y)
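In [ ]:
# D is a complex-valued matrix: one row per frequency bin, one column per STFT frame
D.dtype, D.shape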
In [1681]:
# split the complex spectrogram into separate real and imaginary planes
x = np.array([D.real, D.imag])
In [1687]:
# x now has shape (2, frequency bins, frames)
x.shape
Out[1687]:
In [1689]:
# x is a form (plain real-valued arrays) that a neural network can operate on
# NOTE: not completely true, as we'll learn later
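In [ ]:
# a minimal sketch (my assumption, not the author's model) of how x could be fed
# to a network: treat real/imag as 2 channels and add a batch dimension, giving
# the usual (batch, channels, height, width) layout
nn_input = x[np.newaxis, :, :, :]
nn_input.shape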
In [1690]:
# and to convert x back into a complex array so it can be turned back into wav data
D2 = x[0,:,:] + (x[1,:,:] * 1j)
In [1692]:
# D and D2 are the same
(D == D2).all()
Out[1692]:
In [1693]:
# and then back to timeseries wav
back_y = istft(D2)
In [1694]:
# convert back_y to the file's original dtype (int16 here); istft returns floats
back_y = np.array(back_y, dtype=y.dtype)
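In [ ]:
# an optional, slightly safer variant (assuming y has an integer dtype): round and
# clip to the dtype's range before casting, since a bare cast truncates toward zero
# and can wrap around on overflow
info = np.iinfo(y.dtype)
back_y_safe = np.clip(np.round(istft(D2)), info.min, info.max).astype(y.dtype)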
In [1695]:
# write test-out.wav for listening
write('test-out.wav', sr, back_y)
In [1696]:
# mean absolute difference per sample between y and its stft->istft round trip back_y
np.abs(y.astype(float) - back_y.astype(float)).sum() / len(y)
Out[1696]:
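In [ ]:
# the same error expressed relative to the signal's own peak amplitude, as an
# extra (hand-rolled) sanity check: values near 0 mean a near-perfect round trip
np.abs(y.astype(float) - back_y.astype(float)).mean() / np.abs(y.astype(float)).max()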
In [1697]:
# and now for some sanity checks of the data structures produced above
In [1698]:
# the shapes of the input and output are the same
print(y.shape)
print(back_y.shape)
In [1699]:
# y is roughly 44100 samples/second * 60 seconds. I'm not sure why it's off by 16 ...
y.shape[0], 44100 * 60
Out[1699]:
In [1700]:
# NOTE: the default FFT window size is 2048 and hop_length is 2048/4
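In [ ]:
# the same transform with those defaults spelled out explicitly; it should match D
D_explicit = stft(y, n_fft=2048, hop_length=512)
D_explicit.shape == D.shape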
In [1702]:
# these two are *almost* the same. Anyone know why?
print(D.shape[1])
print(len(y) / 2048. * 4.)
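In [ ]:
# a likely explanation (assuming librosa's default center=True and hop_length=512):
# the signal is padded by n_fft/2 on each side before framing, so the frame count
# works out to 1 + len(y) // hop_length rather than len(y) / hop_length
1 + len(y) // 512 == D.shape[1]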
In [1710]:
# FYI: successive STFT frames are spaced roughly 11.6 ms apart (the hop); each
# 2048-sample window itself spans about 46 ms
# (length of input / (samples/second)) * (milliseconds/second) / (number of frames) => milliseconds/frame
float(len(y)) / 44100 * 1000 / D.shape[1]
Out[1710]:
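In [ ]:
# the same figures computed directly from the STFT parameters:
# hop duration and full window duration, in milliseconds
512. / 44100 * 1000, 2048. / 44100 * 1000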
In [1712]:
# FYI: D holds roughly twice as many values as y (1025 frequency bins per frame,
# one frame every 512 samples), and each value is complex, so the byte footprint
# grows even more than this element-count ratio suggests
float(D.shape[0] * D.shape[1]) / len(y)
Out[1712]:
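In [ ]:
# the actual byte footprint ratio (assuming D is complex64 at 8 bytes/value and
# y is int16 at 2 bytes/sample), roughly 4x the element-count ratio above
float(D.nbytes) / y.nbytes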