Spectrograms

Exploring the spectral features learned by wave-based neural networks.



In [1]:

    
import os

import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from scipy.signal import spectrogram


# Use the ggplot look for every figure in this notebook.
plt.style.use('ggplot')
# Render matplotlib figures inline, below the cell that creates them.
%matplotlib inline

Generating synthetic inputs: pure cosine tones on a grid of frequencies



In [2]:

    
# Generating artificial utterances.
# Make a grid of frequencies: 10 Hz to 2000 Hz inclusive, in 10 Hz steps.
frequencies = np.arange(10, 2010,10)
RATE = int(16 * 1e3)  # Sampling rate in Hz (16 kHz)
LENGTH = 3  # Duration of each tone in seconds

# Display the grid.
frequencies









    Out[2]:





array([  10,   20,   30,   40,   50,   60,   70,   80,   90,  100,  110,
        120,  130,  140,  150,  160,  170,  180,  190,  200,  210,  220,
        230,  240,  250,  260,  270,  280,  290,  300,  310,  320,  330,
        340,  350,  360,  370,  380,  390,  400,  410,  420,  430,  440,
        450,  460,  470,  480,  490,  500,  510,  520,  530,  540,  550,
        560,  570,  580,  590,  600,  610,  620,  630,  640,  650,  660,
        670,  680,  690,  700,  710,  720,  730,  740,  750,  760,  770,
        780,  790,  800,  810,  820,  830,  840,  850,  860,  870,  880,
        890,  900,  910,  920,  930,  940,  950,  960,  970,  980,  990,
       1000, 1010, 1020, 1030, 1040, 1050, 1060, 1070, 1080, 1090, 1100,
       1110, 1120, 1130, 1140, 1150, 1160, 1170, 1180, 1190, 1200, 1210,
       1220, 1230, 1240, 1250, 1260, 1270, 1280, 1290, 1300, 1310, 1320,
       1330, 1340, 1350, 1360, 1370, 1380, 1390, 1400, 1410, 1420, 1430,
       1440, 1450, 1460, 1470, 1480, 1490, 1500, 1510, 1520, 1530, 1540,
       1550, 1560, 1570, 1580, 1590, 1600, 1610, 1620, 1630, 1640, 1650,
       1660, 1670, 1680, 1690, 1700, 1710, 1720, 1730, 1740, 1750, 1760,
       1770, 1780, 1790, 1800, 1810, 1820, 1830, 1840, 1850, 1860, 1870,
       1880, 1890, 1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980,
       1990, 2000])



In [9]:

    
# Root directory for the generated test data. The trailing slash matters: other
# cells build paths by plain string concatenation (e.g. datadir + 'waves/...').
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
datadir = '/afs/inf.ed.ac.uk/user/s10/s1003970/diss/waveform-asr/tests/test_data/'



In [12]:

    
def make_wav(freq, rate=None, length=None, out_dir=None):
    """Synthesise a pure cosine tone, write it as a float32 WAV file, return it.

    Parameters
    ----------
    freq : int or float
        Tone frequency in Hz. Also used (via ``str.format``) as the file stem.
    rate : int, optional
        Sampling rate in Hz. Defaults to the notebook-level ``RATE``.
    length : int or float, optional
        Duration in seconds. Defaults to the notebook-level ``LENGTH``.
    out_dir : str, optional
        Directory that receives ``<freq>.wav``. Defaults to ``datadir + 'waves'``.
        The directory is created if it does not exist yet.

    Returns
    -------
    numpy.ndarray
        One-dimensional float32 array with ``rate * length`` samples in [-1, 1].
    """
    # Resolve the defaults at call time so later edits to the notebook
    # constants are picked up without redefining this function.
    rate = RATE if rate is None else rate
    length = LENGTH if length is None else length
    if out_dir is None:
        out_dir = os.path.join(datadir, 'waves')
    os.makedirs(out_dir, exist_ok=True)

    # Build the phase in float64. In float32 the product index * freq * 2 * pi
    # reaches ~6e8 for the highest tones, where the spacing between
    # representable values is tens of units, which visibly distorts the wave.
    sample_index = np.arange(int(rate * length), dtype=np.float64)
    phase = 2 * np.pi * freq * sample_index / rate

    # Only the final samples are stored as float32 (IEEE-float WAV).
    data = np.cos(phase).astype(np.float32)

    name = os.path.join(out_dir, '{}.wav'.format(freq))
    wavfile.write(filename=name,
                  rate=rate,
                  data=data)
    return data



In [13]:

    
# Write a 100 Hz test tone to disk and keep its samples for the cells below.
data = make_wav(100)



In [14]:

    
from IPython.display import Audio

# Play the tone back at the rate it was synthesised with. Using RATE instead of
# repeating the literal keeps playback correct if the sampling rate is changed.
Audio(data, rate=RATE)









    Out[14]:



In [15]:

    
# Spectrogram of the test tone with 'density' scaling. `fs` must match the rate
# used for synthesis, so it is taken from RATE rather than a repeated literal.
freqs, times, sgram = spectrogram(data, fs=RATE, scaling='density')
# sgram has one row per entry of `freqs` and one column per entry of `times`.
sgram.shape, freqs.shape, times.shape









    Out[15]:





((129, 214), (129,), (214,))



In [16]:

    
# Rows of `sgram` are frequency bins and columns are time frames. With imshow's
# default origin the lowest frequency would be drawn at the top, so the image is
# flipped with origin='lower' and the axes are scaled to seconds and Hz.
fig, ax = plt.subplots(figsize=(16, 5))
im = ax.imshow(sgram, aspect='auto', origin='lower',
               extent=[times[0], times[-1], freqs[0], freqs[-1]])
ax.set_xlabel('Time (s)')
ax.set_ylabel('Frequency (Hz)')
ax.set_title('Spectrogram of the test tone')
fig.colorbar(im, ax=ax)
plt.show()



In [17]:

    
import librosa

# Pass the signal by keyword: librosa >= 0.10 accepts `y` only as a keyword
# argument, and older releases accept the keyword form as well.
mel = librosa.feature.melspectrogram(y=data, sr=RATE)

# Clamp to the smallest positive float before taking the log so that empty mel
# bands become a large negative number instead of -inf (which imshow cannot colour).
log_mel = np.log(np.maximum(mel, np.finfo(mel.dtype).tiny))

fig, ax = plt.subplots(figsize=(16, 5))
im = ax.imshow(log_mel, aspect='auto', origin='lower')
ax.set_xlabel('Frame')
ax.set_ylabel('Mel band')
ax.set_title('Log mel spectrogram of the test tone')
fig.colorbar(im, ax=ax)
plt.show()



In [22]:

    
# Display the frequency grid again.
frequencies









    Out[22]:





array([  10,   20,   30,   40,   50,   60,   70,   80,   90,  100,  110,
        120,  130,  140,  150,  160,  170,  180,  190,  200,  210,  220,
        230,  240,  250,  260,  270,  280,  290,  300,  310,  320,  330,
        340,  350,  360,  370,  380,  390,  400,  410,  420,  430,  440,
        450,  460,  470,  480,  490,  500,  510,  520,  530,  540,  550,
        560,  570,  580,  590,  600,  610,  620,  630,  640,  650,  660,
        670,  680,  690,  700,  710,  720,  730,  740,  750,  760,  770,
        780,  790,  800,  810,  820,  830,  840,  850,  860,  870,  880,
        890,  900,  910,  920,  930,  940,  950,  960,  970,  980,  990,
       1000, 1010, 1020, 1030, 1040, 1050, 1060, 1070, 1080, 1090, 1100,
       1110, 1120, 1130, 1140, 1150, 1160, 1170, 1180, 1190, 1200, 1210,
       1220, 1230, 1240, 1250, 1260, 1270, 1280, 1290, 1300, 1310, 1320,
       1330, 1340, 1350, 1360, 1370, 1380, 1390, 1400, 1410, 1420, 1430,
       1440, 1450, 1460, 1470, 1480, 1490, 1500, 1510, 1520, 1530, 1540,
       1550, 1560, 1570, 1580, 1590, 1600, 1610, 1620, 1630, 1640, 1650,
       1660, 1670, 1680, 1690, 1700, 1710, 1720, 1730, 1740, 1750, 1760,
       1770, 1780, 1790, 1800, 1810, 1820, 1830, 1840, 1850, 1860, 1870,
       1880, 1890, 1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980,
       1990, 2000])



In [18]:

    
# Write one WAV file per grid frequency; the returned samples are not needed here.
for freq in frequencies:
    make_wav(freq)



In [ ]:

    
# Manual file-shuffling steps written as shell-style commands. The cell has no
# execution count in the saved notebook, and every relative path below depends
# on the current working directory.
# Copy the generated tone WAVs under metadata/tones/ (apparently where the
# feature-extraction cell reads them from — TODO confirm).
cp -r s1003970/diss/waveform-asr/tests/test_data/waves/ metadata/tones/
# Copy the saved .npy files into the tone_reps/ directory.
cp *.npy ~/s1003970/diss/waveform-asr/tests/test_data/tone_reps/

# Run the model script; presumably this is what defines `network`, which the
# next cell uses — TODO confirm.
cd diss/waveform-asr/experiments
%run hmm-wavenet.py



In [ ]:

    
# Where the tone WAVs are read from and where the extracted features are written.
outdir = '/home/s1003970/metadata/tones/'
indir = '/home/s1003970/metadata/tones/waves/'

import tensorflow as tf
from scipy.io import wavfile

# NOTE(review): `network` is not defined anywhere in this notebook; presumably it
# comes from the hmm-wavenet.py script run in the previous cell — confirm.
sess = tf.InteractiveSession(graph=network.graph)
sess.run(network.init)
tones = os.listdir(indir)
network.restore_model('final')

for tone in tones:
    print(tone)
    # wavfile.read returns (rate, samples); only the samples are fed to the model.
    wav = wavfile.read(indir + tone)[1]
    feats = sess.run(network.stacks_output,
                     feed_dict={network.inputs: wav})

    # Drop singleton dimensions and save once per tone. np.save appends '.npy',
    # so '10.wav' is stored as '10.wav.npy'. (The original repeated these steps
    # verbatim, writing every file twice.)
    feats = feats.squeeze()
    new_name = outdir + tone
    np.save(file=new_name, arr=feats)

Constant tones as input



In [19]:

    
# Directory with the saved per-tone representations, loaded below as '<freq>.wav.npy'.
tonedir = datadir + 'tone_reps/'



In [20]:

    
import os

# Names of the entries currently present in `tonedir`.
tones = os.listdir(tonedir)



In [21]:

    
# Show which entries were found in `tonedir`.
tones









    Out[21]:





[]



In [202]:

    
# Load the saved representation of the 10 Hz tone.
# NOTE(review): presumably indexed as rep[time step, channel] — confirm.
freq = 10
rep_path = tonedir + '{}.wav.npy'.format(freq)
rep = np.load(rep_path)

# Heat map of the whole representation, transposed so that the first axis of
# `rep` runs along x.
heat_fig, heat_ax = plt.subplots(figsize=(16, 5))
heat_img = heat_ax.imshow(rep.T, aspect='auto')
heat_fig.colorbar(heat_img, ax=heat_ax)
plt.show()

# Every value in row 20000 of the representation.
line_fig, line_ax = plt.subplots(figsize=(16, 5))
line_ax.plot(rep[20000, :])
plt.show()



In [193]:

    
# Compare the saved representations of two tones. The variables are named by
# role (a / b) rather than by a frequency baked into the name: the original
# `rep200` / `rep300` actually held the 150 Hz and 120 Hz tones.
freq_a, freq_b = 150, 120
rep_a = np.load(tonedir + '{}.wav.npy'.format(freq_a))
rep_b = np.load(tonedir + '{}.wav.npy'.format(freq_b))

# One heat map per tone, each titled with the frequency that was really loaded.
for tone_freq, tone_rep in ((freq_a, rep_a), (freq_b, rep_b)):
    fig, ax = plt.subplots(figsize=(16, 5))
    im = ax.imshow(tone_rep.T, aspect='auto')
    ax.set_title('Representation of the {} Hz tone'.format(tone_freq))
    fig.colorbar(im, ax=ax)
plt.show()

# Row 20000 of both representations on shared axes, labelled so the two curves
# can be told apart.
fig, ax = plt.subplots(figsize=(16, 5))
ax.plot(rep_a[20000, :], label='{} Hz'.format(freq_a))
ax.plot(rep_b[20000, :], label='{} Hz'.format(freq_b))
ax.legend()
plt.show()

Loading output files



In [203]:

    
# Number of tones in the frequency grid.
len(frequencies)









    Out[203]:





200



In [204]:

    
# One row per grid frequency and 64 columns, filled in the next cell with one row
# of each saved representation (presumably 64 is the number of network output
# channels — TODO confirm).
senses = np.zeros([len(frequencies), 64])



In [206]:

    
# Row of each saved representation that is copied into `senses`.
SAMPLE_INDEX = 20000

# Row k of `senses` holds the slice for frequencies[k]; enumerate keeps the row
# counter in step with the frequency without manual bookkeeping.
for row, freq in enumerate(frequencies):
    rep = np.load(tonedir + '{}.wav.npy'.format(freq))
    senses[row, :] = rep[SAMPLE_INDEX, :]



In [209]:

    
# Sanity check: expect (number of frequencies, 64).
senses.shape









    Out[209]:





(200, 64)



In [223]:

    
# Values of one column of `senses` against the input tone frequency. Rows of
# `senses` follow the order of `frequencies`, so the grid is used as the x axis
# instead of the bare row index.
COLUMN = 12
fig, ax = plt.subplots(figsize=(16, 5))
ax.plot(frequencies, senses[:, COLUMN])
ax.set_xlabel('Tone frequency (Hz)')
ax.set_ylabel('Value in column {}'.format(COLUMN))
plt.show()



In [212]:

    
# Heat map of the full `senses` matrix: one row per tone, one column per slot.
senses_fig, senses_ax = plt.subplots(figsize=(16, 5))
senses_img = senses_ax.imshow(senses, aspect='auto')
senses_fig.colorbar(senses_img, ax=senses_ax)
plt.show()



In [ ]:



In [ ]:



In [ ]: