In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import IPython.display as ipd
import numpy as np
import scipy.io.wavfile as wavfile
from scipy.fftpack import dct
In [2]:
wav_file = 'mngu0_s1_0001.wav'
sample_rate, sig = wavfile.read(wav_file)
fig, ax = plt.subplots(facecolor='white')
ax.plot(sig)
ymin, ymax = ax.get_ylim()
# Play audio
ipd.Audio(sig, rate=sample_rate)
Out[2]:
In [3]:
pre_emp = 0.97
sig_emp = np.append(sig[0], sig[1:] - pre_emp*sig[:-1])
In [4]:
fig, ax = plt.subplots(facecolor='white')
ax.plot(sig_emp)
ax.set_ylim(ymin, ymax)
# Play audio
ipd.Audio(sig_emp, rate=sample_rate)
Out[4]:
In [5]:
# Set frame time
frame_size = 0.025
frame_shift = 0.01
# Get samples
frame_len = int(frame_size * sample_rate) # 400
frame_step = int(frame_shift * sample_rate) # 160
sig_len = len(sig_emp) # 57862
# Get number of frames
num_frames = int(np.ceil(np.abs(sig_len - frame_len) / frame_step)) # 359.13 -> 360
In [6]:
# Pad signal
pad_sig_len = num_frames * frame_step + frame_len # 58000
pad = np.zeros((pad_sig_len - sig_len)) # 138
sig_pad = np.append(sig_emp, pad) # 58000
In [7]:
# Get within-frame sample indices
idx1 = np.tile(np.arange(0, frame_len), (num_frames, 1)) # 360 x 400
idx1
Out[7]:
In [8]:
# Get vectors of frame step increments
idx2 = np.tile(np.arange(0, num_frames*frame_step, frame_step), ((frame_len, 1))).T
idx2
Out[8]:
In [9]:
# Get total indices divided by each frame
indices = idx1 + idx2
indices
Out[9]:
In [10]:
# Get frames divided by each frame based on indices
frames = sig_pad[indices.astype(np.int32, copy=False)]
frames
Out[10]:
In [11]:
frames *= np.hamming(frame_len)
frames
Out[11]:
In [12]:
NFFT = 512
mag_frames = np.absolute(np.fft.rfft(frames, n=NFFT)) # frames x (NFFT//2+1); 360 x 257
# pow_frames = ((1/NFFT) * ((mag_frames)**2)) # 360 x 257
pow_frames = (((mag_frames)**2)) # 360 x 257
In [13]:
# Magnitude
fig, arr = plt.subplots(1, 2, facecolor='white', figsize=(14,6))
im1 = arr[0].imshow(mag_frames.T, aspect='auto', origin='lower')
arr[0].set_title('Magnitude plot')
divider = make_axes_locatable(arr[0])
cax = divider.append_axes('right', size='5%', pad=0.05)
cbar = fig.colorbar(im1, cax=cax)
# Power spectrogram
im2 = arr[1].imshow(pow_frames.T, aspect='auto', origin='lower')
arr[1].set_title('Power spectrogram')
divider = make_axes_locatable(arr[1])
cax = divider.append_axes('right', size='5%', pad=0.05)
cbar = fig.colorbar(im2, cax=cax)
In [14]:
nfilt = 40
low_freq_mel = 0
# Convert Hz to Mel
high_freq_mel = (2595 * np.log10(1 + (sample_rate/2)/700))
# Get mel points (equally spaced)
mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
# Convert Mel to Hz
hz_points = (700 * (10**(mel_points/2595) - 1))
hz_points
Out[14]:
In [15]:
high_freq_mel
Out[15]:
In [16]:
# Get frequency bins -> ??
bins = np.floor((NFFT + 1)*hz_points / sample_rate)
bins
Out[16]:
$H_m(k)$: $m$th Mel filter bank of the $k$th bin
In [17]:
# Filter banks
fbank = np.zeros((nfilt, int(np.floor(NFFT/2 + 1))))
# For each filter bank
for m in range(1, nfilt + 1): # 1, 2, ... 40
f_m_minus = int(bins[m-1]) # left
f_m = int(bins[m]) # center
f_m_plus = int(bins[m+1]) # right
# For left bins within filter bank
for k in range(f_m_minus, f_m):
fbank[m-1, k] = (k - bins[m-1]) / (bins[m] - bins[m-1])
# For right bins within filter bank
for k in range(f_m, f_m_plus):
fbank[m-1, k] = (bins[m+1] - k) / (bins[m+1] - bins[m])
In [18]:
fig, arr = plt.subplots(1,2, facecolor='white', figsize=(14, 6))
arr[1].imshow(fbank.T, aspect='auto', origin='lower')
arr[1].set_xlabel('Number of filter banks = 40')
arr[1].set_ylabel('(NFFT//2 + 1) = 257')
arr[0].imshow(20*np.log10(pow_frames).T, aspect='auto', origin='lower')
arr[0].set_xlabel('Number of frames = 360')
arr[0].set_ylabel('(NFFT//2 + 1) = 257')
arr[0].set_title('Log spectrogram')
Out[18]:
In [20]:
# Finalize
filter_banks = np.dot(pow_frames, fbank.T) # convolution!
# Add eps for numerical stability
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
filter_banks = 20 * np.log10(filter_banks) # to dB
In [21]:
filter_banks.shape
Out[21]:
In [20]:
fig, ax = plt.subplots(facecolor='white', figsize=(12,6))
ax.imshow(filter_banks.T, aspect='auto', origin='lower')
Out[20]:
In [ ]: