In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from scipy.signal import lfilter
import librosa
import librosa.display
import IPython.display as ipd
Consider two different wave files
In [2]:
wave_filename = '../audio/33711__acclivity__excessiveexposure.wav'
Calculate the prediction gain as $$ G_{\mathsf{p}} = \frac{\sigma_x^2}{\sigma_x^2-\boldsymbol{\varphi}_{XX}^{\mathsf{T}}\boldsymbol{R}_{XX}^{-1}\boldsymbol{\varphi}_{XX}} $$ where $$ \boldsymbol{\varphi}_{XX} := \begin{pmatrix} \varphi_{XX}[1] \\ \varphi_{XX}[2] \\ \vdots\\ \varphi_{XX}[n] \end{pmatrix} $$ and $$ \boldsymbol{R}_{XX} := \begin{pmatrix} \varphi_{XX}[0] & \varphi_{XX}[-1] & \cdots & \varphi_{XX}[1-n] \\ \varphi_{XX}[1] & \varphi_{XX}[0] & \cdots & \varphi_{XX}[2-n] \\ \vdots & \vdots & \ddots & \vdots\\ \varphi_{XX}[n-1] & \varphi_{XX}[n-2] & \cdots & \varphi_{XX}[0] \end{pmatrix} $$ with $$ \varphi_{XX}[\lambda] = \mathbb{E}\left\{X[k]X[k-\lambda]\right\} $$ being the auto-correlation of $X$.
In [3]:
# helper function calculating the correlation coefficient as defined in the lecture
def auto_correlation(x, k):
if k >= 0:
x_shift = np.roll(x,k)
x_shift[:k] = 0
else:
x_shift = np.roll(x,k)
x_shift[k:] = 0
return np.correlate(x,x_shift)[0] / len(x)
# calculate prediction gain
def get_prediction_gain(x, n):
if n == 0:
return 1
phi_XX = np.array([auto_correlation(x,k) for k in np.arange(1,n+1)])
# inefficient way to calculate R_XX as many terms can be reused due to symmetric of the auto-correlation function
R_XX = np.array([[auto_correlation(x,i-j) for j in range(n)] for i in range(n)])
sigmaq_x = np.mean(np.square(x))
sigmaq_d = sigmaq_x - phi_XX.T @ np.linalg.inv(R_XX) @ phi_XX
Gp = sigmaq_x / sigmaq_d
return Gp
In [4]:
# compute prediction gains for two different audio files
x, sampling_rate = librosa.load(wave_filename, sr=8000)
n_range = range(31)
Gp = [get_prediction_gain(x,n) for n in n_range]
In [5]:
# select a prediction filter of 10 coefficients
n = 10
phi_XX = np.array([auto_correlation(x,k) for k in np.arange(1,n+1)])
# inefficient way to calculate R_XX as many terms can be reused due to symmetric of the auto-correlation function
R_XX = np.array([[auto_correlation(x,i-j) for j in range(n)] for i in range(n)])
# optimal prediction coefficients
aopt = np.linalg.inv(R_XX) @ phi_XX
# generate prediction signal
xh = lfilter(np.concatenate(([0], aopt)), 1, x)
# prediction error
d = x - xh
Playback original signal
In [6]:
# playback original audio
ipd.Audio(x, rate=sampling_rate)
Out[6]:
Playback predicted signal
In [7]:
# playback original audio
ipd.Audio(xh, rate=sampling_rate)
Out[7]:
Playback prediction error
In [8]:
# playback original audio
ipd.Audio(d, rate=sampling_rate)
Out[8]:
Plot waveforms
In [9]:
x_max = np.max([np.max(x), -np.min(x)])
plt.figure(figsize=(10, 11))
plt.subplot(3,1,1)
librosa.display.waveplot(x, sr=sampling_rate)
plt.title('Original')
plt.ylim((-x_max*1.2,+x_max*1.2))
plt.subplot(3,1,2)
librosa.display.waveplot(xh, sr=sampling_rate)
plt.title('Predicted signal')
plt.ylim((-x_max*1.2,+x_max*1.2))
plt.subplot(3,1,3)
librosa.display.waveplot(d, sr=sampling_rate)
plt.title('Prediction error')
plt.ylim((-x_max*2.2,+x_max*2.2))
plt.tight_layout()
We extract blocks of the signal using a rectangular window $w[k]$ of length $N$
$$
\tilde{x}[k] = x[k]\cdot w[k_0-k]
$$
The signal $\tilde{x}[k]$ attains non-zero values only within the time interval
$$
k_1 = k_0-N+1 \leq k \leq k_0
$$
The short-term auto-correlation for finite-energy signals is defined as ($\forall\lambda \in\{0,1,\ldots, n\}$)
$$
r_\lambda = \sum_{k=k_1}^{k_0}\tilde{x}[k]\tilde{x}[k+\lambda] = \sum_{k=k_1}^{k_0-\lambda}x[k]x[k+\lambda]
$$
Due to symmetry ($r_{-\lambda} = r_\lambda$), the normal equations can be rewritten as
$$
\begin{pmatrix}
r_1 \\
r_2 \\
r_2 \\
\vdots \\
r_n
\end{pmatrix} = \begin{pmatrix}
r_0 & r_1 & r_2 & \cdots & r_{n-1} \\
r_1 & r_2 & r_3 & \cdots & r_{n-2} \\
r_2 & r_3 & r_4 & \cdots & r_{n-3} \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
r_{n-1} & r_{n-2} & r_{n-3} & \cdots & r_{0} \\
\end{pmatrix}\begin{pmatrix}
a_1 \\
a_2 \\
a_3 \\
\vdots \\
a_n
\end{pmatrix}
$$
The solution of this system yields optimal predictor coefficients $\boldsymbol{a}_{\text{opt}}$ for this block.
In [15]:
# helper function calculating the auto-correlation coefficient r
def get_r(x, l):
x_shift = np.roll(x,l)
x_shift[:l] = 0
return np.correlate(x,x_shift)[0]
def get_prediction_coefficients(x, n):
r = np.array([get_r(x,k) for k in np.arange(1,n+1)])
R = np.array([np.concatenate(([get_r(x,j) for j in np.arange(i,0,-1)], [get_r(x,j) for j in np.arange(0,n-i)])) for i in range(n)])
a_opt = np.linalg.inv(R) @ r
return a_opt
In [23]:
# block prediction with N samples per block and predictor order n
def block_prediction(x, n, N):
if n==0:
return x
frames = int(np.floor(len(x) / N))
# prediction error
d = np.zeros_like(x)
# no adaptation for first frame
a = np.zeros(n)
# filter memory
memory = np.zeros(n)
for k0 in range(frames):
x_part = x[np.arange(N)+k0*N]
# compute prediction coefficients on current block
a = get_prediction_coefficients(x_part, n)
xh_part, memory = lfilter(np.concatenate(([0], a)), 1, x_part, zi=memory)
d[np.arange(len(xh_part)) + k0*N] = x_part - xh_part
return d
In [24]:
# compute prediction gains for two different audio files, 20 ms blocks
N = int(sampling_rate * 0.02)
Gp_block = []
for n in n_range:
d = block_prediction(x, n, N)
Gp_block.append(np.mean(np.square(x)) / np.mean(np.square(d)))
In [31]:
plt.figure(figsize=(8, 5))
plt.plot(n_range, 10*np.log10(Gp), ms=5, marker='o')
plt.plot(n_range, 10*np.log10(Gp_block), '--',ms=5, marker='o')
plt.xlim((0,30))
plt.ylim((0,15))
plt.xlabel('$n$')
plt.grid(True)
plt.ylabel('$10\log_{10}(G_{\mathsf{p}})$')
plt.legend(['Fixed predictor coefficients', 'Block adaptation ($N=%d$)' % N], loc=4)
plt.savefig('figure_DST_6.11.pdf',bbox_inches='tight')
Generate prediction error and predicted signal
In [32]:
n = 30
d = block_prediction(x, n, N)
xh = x-d
Playback original signal
In [33]:
ipd.Audio(x, rate=sampling_rate)
Out[33]:
Playback predicted signal
In [35]:
ipd.Audio(xh, rate=sampling_rate)
Out[35]:
Playback prediction error
In [36]:
ipd.Audio(d, rate=sampling_rate)
Out[36]:
Plot waveforms
In [37]:
x_max = np.max([np.max(x), -np.min(x)])
plt.figure(figsize=(10, 11))
plt.subplot(3,1,1)
librosa.display.waveplot(x, sr=sampling_rate)
plt.title('Original')
plt.ylim((-x_max*1.2,+x_max*1.2))
plt.subplot(3,1,2)
librosa.display.waveplot(xh, sr=sampling_rate)
plt.title('Predicted signal')
plt.ylim((-x_max*1.2,+x_max*1.2))
plt.subplot(3,1,3)
librosa.display.waveplot(d, sr=sampling_rate)
plt.title('Prediction error')
plt.ylim((-x_max*2.2,+x_max*2.2))
plt.tight_layout()
In [ ]: