In [45]:
import numpy as np
from numpy.random import randn, choice, multinomial
from scipy import stats
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style()
In [3]:
class TooLittleSampleCoverage(Exception):
    """Raised if there were too few samples to hit the target distribution."""
    pass

def target(x, val=0.25):
    # Return an array of zeros, except where the corresponding
    # component of x lies between 0.0 and 4.0
    y = np.zeros_like(x)
    y[np.logical_and(x >= 0.0, x <= 4.0)] = val
    return y

def imp_sample_exact(N, loc=0.0, scale=1.0):
    # Sample from the proposal
    samples = stats.norm.rvs(loc, scale, N)
    # Calculate the exact weights
    weights = target(samples) / \
        stats.norm.pdf(samples, loc=loc, scale=scale)
    return samples, weights

def imp_sample_prop(N, loc=0.0, scale=1.0):
    # Sample from the proposal
    samples = stats.norm.rvs(loc, scale, N)
    # Calculate the unnormalized weights
    weights = target(samples, val=1) / \
        stats.norm.pdf(samples, loc=loc, scale=scale)
    # Normalize the weights
    if np.sum(weights) == 0.0:
        raise TooLittleSampleCoverage
    weights_normalized = weights / np.sum(weights)
    return samples, weights, weights_normalized
Plot the resulting distribution as a weighted histogram
In [23]:
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
# Self-normalized
samples, weights, weights_normalized = imp_sample_prop(500000, loc=-3, scale=2)
axs[0].hist(samples, bins=50, weights=weights_normalized,
            density=True, range=(0, 4));
samples, weights, weights_normalized = imp_sample_prop(100000, loc=2, scale=2)
axs[1].hist(samples, bins=50, weights=weights_normalized,
            density=True, range=(0, 4));
If we instead use the exact weights, computed with the normalized target density rather than the unnormalized one:
In [13]:
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
# Self-normalized
samples, weights = imp_sample_exact(10000, loc=-3, scale=2)
axs[0].hist(samples, bins=20, weights=weights,
            density=True, range=(0, 4));
samples, weights = imp_sample_exact(10000, loc=2, scale=2)
axs[1].hist(samples, bins=20, weights=weights,
            density=True, range=(0, 4));
The situation is not much better in this case. If the proposal covers the target distribution badly, then the resulting histogram will give a distorted image of the target distribution.
In [17]:
# Number of successful repetitions
M = 8000
# Values of N
ns = [10, 20, 50, 100]
exp_vals = np.zeros((M, len(ns)))
for i, N in enumerate(ns):
    j = 0
    while j < M:
        try:
            samples, weights = imp_sample_exact(N)
            exp_vals[j, i] = np.mean(samples * weights)
            j += 1
        except TooLittleSampleCoverage:
            pass
fig, ax = plt.subplots()
ax.boxplot(exp_vals - 2, labels=ns);
ax.set_xlabel('$N$');
Out[17]:
The variance decreases with $N$, while the bias is zero even for small $N$.
It will be shown that $$\widehat{Z} = \frac{1}{N} \sum_{i = 1}^N \widetilde{W}^i\quad\text{where}\quad\widetilde{W}^i = \frac{\widetilde{\pi}(X_i)}{q(X_i)}$$ is an estimator for the normalizing constant of the target distribution $\pi(x)$.
We can write the normalizing constant as $$Z = \int \widetilde{\pi}(x)\,\mathrm{d}x = \int \frac{\widetilde{\pi}(x)}{q(x)} q(x)\,\mathrm{d}x$$
Approximate the proposal distribution by the empirical measure of its samples, i.e. $$q(x)\,\mathrm{d}x \approx \frac{1}{N} \sum_{i = 1}^N \delta_{x^i}(\mathrm{d}x)\quad\text{with}\quad \omega(x) = \frac{\widetilde{\pi}(x)}{q(x)},$$ where the $x^i$ are sampled from $q(x)$. This leads to $$Z \approx \frac{1}{N} \sum_{i = 1}^{N} \omega(x^i) =: \widehat{Z}.$$
Since the unnormalized target equals $1$ on $[0, 4]$, the theoretical value of the normalizing constant is $Z = \int_0^4 1\,\mathrm{d}x = 4$.
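As a quick sanity check (an added sketch using the imp_sample_prop function defined above; the exact value varies from run to run), a single estimate $\widehat{Z}$ with the standard normal proposal already lands near $4$:
samples, weights, weights_normalized = imp_sample_prop(10000, loc=0, scale=1)
# Z-hat: plain average of the unnormalized importance weights
np.mean(weights)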
In [18]:
# Number of successful repetitions
M = 8000
# Values of N
ns = [10, 20, 50, 100, 200]
Z_vals = np.zeros((M, len(ns)))
for i, N in enumerate(ns):
    j = 0
    while j < M:
        try:
            samples, weights, weights_normalized = imp_sample_prop(N, loc=0, scale=1)
            Z_vals[j, i] = np.mean(weights)
            j += 1
        except TooLittleSampleCoverage:
            pass
fig, ax = plt.subplots()
ax.boxplot(Z_vals - 4, labels=ns);
ax.set_xlabel('$N$');
Out[18]:
The variance decreases with $N$, while the bias appears to be zero even for small $N$.
It remains to check the influence of the proposal. Do this by moving the mean value of the proposal and by changing its standard deviation.
In [27]:
# Number of repetitions
M = 8000
# Values of mu
mus = [-1, 0, 1, 2, 3, 4, 5]
Z_vals = np.zeros((M, len(mus)))
for i, mu in enumerate(mus):
    j = 0
    while j < M:
        try:
            samples, weights, weights_normalized = imp_sample_prop(100, loc=mu, scale=1)
            Z_vals[j, i] = np.mean(weights)
            j += 1
        except TooLittleSampleCoverage:
            pass
fig, axs = plt.subplots(1, 2, figsize=(15, 5))
axs[0].boxplot(Z_vals - 4, labels=mus);
axs[0].set_title(r"Proposal with mean value $\mu$ and std. dev. $\sigma = 1$")
axs[0].set_xlabel(r"$\mu$");
axs[1].boxplot(Z_vals[:, 1:-1] - 4, labels=mus[1:-1])
axs[1].set_title(r"Proposal with mean value $\mu$ and std. dev. $\sigma = 1$ (no extremes)")
axs[1].set_xlabel(r"$\mu$");
fig.tight_layout()
Moving the Gaussian proposal toward the midpoint of the uniform distribution reduces the variance to almost zero, while moving it away from the midpoint increases the variance substantially.
In [32]:
# Number of repetitions
M = 8000
# Values of sigma
sigmas = [0.5, 0.7, 1.0, 2.0]
Z_vals = np.zeros((M, len(sigmas)))
for i, sigma in enumerate(sigmas):
    j = 0
    while j < M:
        try:
            samples, weights, weights_normalized = imp_sample_prop(100, loc=2, scale=sigma)
            Z_vals[j, i] = np.mean(weights)
            j += 1
        except TooLittleSampleCoverage:
            pass
fig, ax = plt.subplots()
ax.boxplot(Z_vals - 4, labels=sigmas);
ax.set_title(r"Proposal with mean value $\mu = 2$ and std. dev. $\sigma$")
ax.set_xlabel(r"$\sigma$");
A higher standard deviation of the proposal also leads to lower variance in the estimator, presumably because the heavier tails cover the support of the uniform target better.
In [41]:
# Number of repetitions
M = 8000
# Values of N
ns = [10, 20, 30, 50, 100]
exp_vals = np.zeros((M, len(ns)))
for i, N in enumerate(ns):
    j = 0
    while j < M:
        try:
            samples, weights, weights_normalized = imp_sample_prop(N, scale=3)
            exp_vals[j, i] = np.sum(samples * weights_normalized)
            j += 1
        except TooLittleSampleCoverage:
            pass
fig, axs = plt.subplots(2, 1, figsize=(10, 10))
axs[0].boxplot(exp_vals - 2, labels=ns);
axs[0].set_xlabel('$N$');
axs[0].set_title('Summary of the runs for different $N$')
axs[1].plot(ns, exp_vals.mean(axis=0) - 2, 'o-')
axs[1].set_xlabel('$N$')
axs[1].set_ylabel('Bias')
axs[1].set_title('Bias of the estimator for the expected value');
fig.tight_layout()
g) The theoretical derivation is in the lecture notes. By considering $$ \pi(x) = \frac{\widetilde{\pi}(x)}{Z} $$ we get the estimator $$ \widehat{I}^N(\phi) = \frac{1}{N \cdot Z} \sum_{i = 1}^N \widetilde{w}^i \phi(x^i) $$ with $\phi(x) = x$ for the mean value. Considering that $Z$ can be approximated by $$ \widehat{Z} = \frac{1}{N} \sum_{i = 1}^N \widetilde{w}^i $$ as has been seen above, replacing $Z$ with $\widehat{Z}$ in the estimator $\widehat{I}^N$ above leads to $$ \widehat{I}^N(\phi) = \sum_{i = 1}^N w^i \phi(x^i)\quad\text{where}\quad w^i = \frac{\widetilde{w}^i}{\sum_{j = 1}^N \widetilde{w}^j}. $$ The difference between the two estimators is therefore essentially that the weights are normalized to sum to 1.
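As a small numerical illustration (an added sketch reusing the imp_sample_prop function from above, not part of the original derivation), the two forms of the estimator are algebraically identical once $Z$ is replaced by $\widehat{Z}$:
samples, weights, weights_normalized = imp_sample_prop(10000, scale=3)
Z_hat = np.mean(weights)
# Estimator with unnormalized weights, divided by N * Z-hat
est_unnorm = np.sum(weights * samples) / (len(samples) * Z_hat)
# Self-normalized estimator
est_selfnorm = np.sum(weights_normalized * samples)
(est_unnorm, est_selfnorm)  # identical up to floating point, both close to 2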
Consider importance sampling in a $D$-dimensional space. Let the proposal $q(x) = N(x; 0, I_D)$ be the $D$-dimensional normal distribution and the target $\pi(x) = U(x; [-0.5, 0.5]^D)$. Exact evaluation of the target is allowed.
This means $$\pi(x) = \frac{1}{(0.5 - (-0.5))^D} \prod_{i = 1}^D 1_{[-0.5, 0.5]}(x_i) = \prod_{i = 1}^D 1_{[-0.5, 0.5]}(x_i)$$
In [43]:
def multivariate_uniform_pdf(x, a=-0.5, b=0.5):
    # Indicator of the hypercube [a, b]^D (the density equals 1 since the volume is 1)
    if np.all(x >= a) and np.all(x <= b):
        return 1
    else:
        return 0

def imp_sample(N, D):
    # Sample from the proposal
    samples = stats.multivariate_normal.rvs(np.zeros((D,)), np.identity(D), N)
    # Calculate exact weights
    weights = np.apply_along_axis(multivariate_uniform_pdf, 1, samples) / \
        stats.multivariate_normal.pdf(samples, mean=np.zeros((D,)),
                                      cov=np.identity(D))
    return samples, weights
Create a histogram in two dimensions to see if the code works.
In [100]:
N = 8000
D = 2
fig, ax = plt.subplots()
samples, weights = imp_sample(N, D)
ax.hist2d(samples[:, 0], samples[:, 1], bins=50, weights=weights);
Iterate over the dimension and see how the proportion of non-zero weights develops.
In [46]:
# Number of samples
N = 10000
proportion = []
probability = []
for D in range(2, 15):
    samples, weights = imp_sample(N, D)
    # Simulated proportion
    proportion.append(len(weights[weights != 0.0]) / len(weights))
    # Theoretical proportion
    probability.append(
        stats.mvn.mvnun(-0.5*np.ones((D,)), 0.5*np.ones((D,)),
                        np.zeros((D,)), np.identity(D))[0])
proportion = np.array(proportion)
probability = np.array(probability)
fig, ax = plt.subplots()
ax.plot(range(2, 15), proportion, 'o-');
ax.plot(range(2, 15), probability, 'x-r');
ax.set_xlabel('D');
The proportion of non-zero weights converges to zero very rapidly as the dimension grows; the decay is fast enough to suggest an exponential decrease.
Theoretically the weights are $$\omega(x^i) = \frac{\prod_{j = 1}^D 1_{[-0.5, 0.5]}(x^i_j)}{\frac{1}{(2\pi)^{D/2}} \exp\left(-\frac{1}{2} \|x^i\|^2\right)}.$$
As the dimension increases, the probability that all components of a sample lie inside the interval $[-0.5, 0.5]$ becomes smaller and smaller, so the number of zero weights increases.
For a single component $x_j$ of a sample from the $D$-dimensional standard normal distribution, $$P(-0.5 \leq x_j \leq 0.5) = \Phi(0.5) - \Phi(-0.5) = 2 \cdot \Phi(0.5) - 1$$ where $\Phi$ denotes the standard normal CDF.
Since the components are independent, the probability of a whole sample falling into the hypercube factorizes: $$P\left(x \in [-0.5, 0.5]^D\right) = \prod_{j = 1}^D P(-0.5 \leq x_j \leq 0.5) = \left(2 \cdot \Phi(0.5) - 1\right)^D \rightarrow 0\quad\text{for}\quad D \rightarrow \infty$$ since $2 \cdot \Phi(0.5) - 1 < 1$. This explains why the decrease is exponentially fast.
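To make the exponential rate concrete, here is a short added check that evaluates $(2\Phi(0.5) - 1)^D$ for a few dimensions; the values agree with the theoretical curve computed with stats.mvn.mvnun above:
p1 = 2 * stats.norm.cdf(0.5) - 1   # per-component probability, roughly 0.383
for D in [2, 5, 10, 14]:
    print(D, p1**D)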
In [122]:
stats.norm.cdf(0.5)
Out[122]:
In [70]:
# Set dimension
D = 1000
# Generate 10 samples from the proposal
samples = stats.multivariate_normal.rvs(cov=4 * np.eye(D), size=10)
# Calculate corresponding pdf values
pdf_target = np.prod(stats.norm.pdf(samples), axis=1)
pdf_proposal = np.prod(stats.norm.pdf(samples, scale=2), axis=1)
# Compute weights
weights = pdf_target / pdf_proposal
# and normalize them
weights_normalized = weights / np.sum(weights)
weights_normalized
Out[70]:
The division fails since all values in pdf_proposal (and in pdf_target as well) are numerically zero, even though they are theoretically positive. This is a consequence of the high dimension: every factor $\mathcal{N}(x_k;\,0, \sigma^2)$ in the product is well below $1$, so the product of $1000$ such factors underflows to zero in double precision.
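A minimal illustration of the underflow (an added aside): a product of many factors below one quickly drops under the smallest positive double, roughly $10^{-308}$:
np.prod(np.full(1000, 0.2))   # returns 0.0, since 0.2**1000 is far smaller than ~1e-308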
In [84]:
# Set dimension
D = 1000
# Generate 10 samples from the proposal
samples = stats.multivariate_normal.rvs(cov=4 * np.eye(D), size=10)
# Calculate corresponding log-pdf values
pdf_target = np.sum(stats.norm.logpdf(samples), axis=1)
pdf_proposal = np.sum(stats.norm.logpdf(samples, scale=2), axis=1)
# Compute log weights
logweights = pdf_target - pdf_proposal
# Retrieve normalized, non-log-transformed weights
weights_normalized = np.exp(logweights) / np.sum(np.exp(logweights))
(logweights, weights_normalized)
Out[84]:
Working in the log domain mitigates the division-by-zero problem, but occasionally there is still a problem in the normalization step, since many of the weights are still zero after exponentiation. These weights are theoretically non-zero, but they are of the form $\exp(x)$ with $x$ around $-800$ or lower, which underflows in double-precision floating-point arithmetic. The corresponding normalized weights therefore end up being zero, and if every weight underflows, the normalization itself divides by zero again.
In [94]:
# Set dimension
D = 1000
# Generate 10 samples from the proposal
samples = stats.multivariate_normal.rvs(cov=4 * np.eye(D), size=10)
# Calculate corresponding log-pdf values
pdf_target = np.sum(stats.norm.logpdf(samples), axis=1)
pdf_proposal = np.sum(stats.norm.logpdf(samples, scale=2), axis=1)
# Compute log weights
logweights = pdf_target - pdf_proposal
# Maximum of the log weights
logweights_max = np.max(logweights)
# Shift the weights
logweights_shifted = logweights - logweights_max
# Retrieve normalized, non-log-transformed weights
weights_normalized = np.exp(logweights_shifted) / \
    np.sum(np.exp(logweights_shifted))
(logweights_shifted, weights_normalized)
Out[94]:
Shifting the log-weights by their maximum brings them into an order of magnitude that floating-point arithmetic can represent, so the largest weight becomes $\exp(0) = 1$ and the normalization is carried out with much better precision.
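The same normalization can also be written with scipy.special.logsumexp, which computes $\log \sum_i \exp(\cdot)$ in a numerically stable way; a short equivalent sketch (added here, reusing the logweights array from the cell above):
from scipy.special import logsumexp

weights_normalized_lse = np.exp(logweights - logsumexp(logweights))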
Consider the stochastic volatility model $$ \begin{align} x_t\,|\,x_{t - 1} &\sim N(x_t; \phi \cdot x_{t - 1}, \sigma^2) \\ y_t\,|\,x_t &\sim N(y_t; 0, \beta^2 \cdot \exp(x_t)) \end{align} $$ with parameter vector $\theta = (\phi, \sigma, \beta)$.
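Before working with real data, a brief added simulation sketch (the variable names are illustrative, using the parameter values $\theta = (0.98, 0.16, 0.70)$ given further below) shows the kind of data the model generates:
phi_, sigma_, beta_ = 0.98, 0.16, 0.70
T = 500
x_sim = np.zeros(T + 1)
y_sim = np.zeros(T)
for t in range(T):
    # Latent log-volatility follows an AR(1) process
    x_sim[t + 1] = phi_ * x_sim[t] + sigma_ * randn()
    # Observations are zero mean with state-dependent variance
    y_sim[t] = beta_ * np.exp(x_sim[t + 1] / 2) * randn()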
In [7]:
path = '..\\..\\..\\..\\course_material\\exercise_sheets\\'
data = pd.read_csv(path + 'seOMXlogreturns2012to2014.csv',
header=None, names=['logreturn'])
ys = data.logreturn.values
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(ys, 'o-', markersize=4);
To determine the initial distribution, we can use the fact that the model is assumed to be known: run the state process on its own for a long time and then sample from the resulting state values. This gives (approximately) the unconditional, stationary distribution of $x_t$ in the absence of measurements.
In [107]:
n0 = 2000000
x0s = np.zeros((n0 + 1,))
for t in range(n0):
    x0s[t + 1] = 0.98 * x0s[t] + 0.16 * randn()
As expected, the mean of this process is approximately zero.
In [108]:
np.mean(x0s[1000:])
Out[108]:
The standard deviation is, however, substantially larger than the standard deviation $\sigma = 0.16$ of the state noise.
In [109]:
np.std(x0s[1000:])
Out[109]:
This supports the hypothesis that the state should be initialised with a larger standard deviation. Indeed, the stationary standard deviation of the AR(1) state process is $\sigma / \sqrt{1 - \phi^2} = 0.16 / \sqrt{1 - 0.98^2} \approx 0.80$, which matches the simulation.
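A one-line numerical check of this value (added for convenience):
0.16 / np.sqrt(1 - 0.98**2)   # approximately 0.80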
In [106]:
fig, ax = plt.subplots(figsize=(12,4))
ax.hist(x0s[1000:], density=True, bins=60);
x_grid = np.arange(-4, 4, 0.1);
pdf = 1 / (np.sqrt(2 * np.pi) * np.std(x0s[1000:])) * np.exp(-0.5 * x_grid**2 / np.std(x0s[1000:])**2);
ax.plot(x_grid, pdf);
Assume the parameter vector is given as $\theta = (0.98, 0.16, 0.70)$. Assume for simplicity that $x_0 \sim \mathcal{N}(0, 0.8^2)$.
In [110]:
theta = [0.98, 0.16, 0.70]
def bpf(ys, n=200):
    # Length of data
    tmax = len(ys)
    # Pre-allocate
    xs = np.zeros((tmax + 1, n))
    ancs = np.zeros((tmax + 1, n), dtype=np.int32)
    ws = np.zeros((tmax + 1, n))
    wsnorm = np.zeros((tmax + 1, n))
    # Initialise
    xs[0, :] = 0.8 * randn(n)
    ws[0, :] = 1 / n * np.ones((n,))
    wsnorm[0, :] = ws[0, :]
    ancs[0, :] = range(n)
    for t in range(tmax):
        # Propagate using the proposal (the state dynamics)
        xs[t + 1, :] = theta[0] * xs[t, ancs[t, :]] + theta[1] * randn(n)
        # Reweight with the observation likelihood; multiplying by the previous
        # weights is not necessary since they are always 1 / n after resampling
        tmp = ys[t] * np.exp(-xs[t + 1, :] / 2) / theta[2]
        ws[t + 1, :] = np.exp(-xs[t + 1, :] / 2) / (np.sqrt(2 * np.pi) * theta[2]) * \
            np.exp(-0.5 * tmp * tmp)
        # Normalize weights
        wsnorm[t + 1, :] = ws[t + 1, :] / np.sum(ws[t + 1, :])
        # Resample
        ancs[t + 1, :] = choice(range(n), size=n, replace=True, p=wsnorm[t + 1, :])
    return xs, ancs, ws, wsnorm
Execute the bootstrap particle filter
In [111]:
xs, ancs, ws, wsnorm = bpf(ys)
In [112]:
means = np.sum(wsnorm * xs, axis=1)
fig, axs = plt.subplots(2, 1, figsize=(12, 8))
axs[0].plot(range(1, len(ys) + 1), ys, 'or', markersize=3);
axs[0].fill_between(range(len(ys) + 1), -theta[2] * np.exp(means / 2), theta[2] * np.exp(means / 2),
                    alpha=0.4);
axs[0].set_xlabel('Time');
axs[0].set_ylabel('$y_t$');
axs[1].plot(means, '-k');
axs[1].set_xlabel('Time');
axs[1].set_ylabel('$x_t$');
When there is more variability in the time series, the estimated volatility is higher as well, and vice versa. The estimate therefore seems reasonable.
In [43]:
%timeit bpf(ys, n=500)
In [69]:
%timeit bpf(ys, n=10000)
In [38]:
ws = np.random.rand(1000)
ws = ws / np.sum(ws)
In [39]:
%timeit multinomial(1, ws, size=1000).argmax(axis=1)
In [40]:
ns = np.array(range(1000))
%timeit choice(ns, size=1000, p=ws, replace=True)
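For comparison, another common way to draw the same categorical sample (an added sketch, not part of the original timing study) is inverse-CDF sampling with np.searchsorted on the cumulative weights:
# Index i is returned with probability ws[i]
%timeit np.searchsorted(np.cumsum(ws), np.random.rand(1000))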