In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
Exercise 1 (20 points)

Implement a function that returns $n$ samples from a multivariate Gaussian distribution in C++ and wrap it for use in Python using pybind11. Use only standard C++ and the Eigen library. The function signature in Python is
def mvnorm(mu, Sigma, n):
    """Returns n random samples from a multivariate Gaussian distribution.

    mu is a mean vector
    Sigma is a covariance matrix

    Returns an n by p matrix, where p is the dimension of the distribution.
    """
In [4]:
%%file rng.cpp
<%
cfg['compiler_args'] = ['-std=c++11']
cfg['include_dirs'] = ['eigen']
setup_pybind11(cfg)
%>
#include <pybind11/pybind11.h>
#include <pybind11/eigen.h>
#include <Eigen/Cholesky>
#include <random>
namespace py = pybind11;
Eigen::MatrixXd mvn(Eigen::VectorXd mu, Eigen::MatrixXd sigma, int n) {
    std::default_random_engine generator;
    std::normal_distribution<double> distribution(0, 1);

    // Cholesky factor A such that A * A^T = sigma
    Eigen::MatrixXd A(sigma.llt().matrixL());

    int p = mu.size();
    Eigen::MatrixXd Z(n, p);
    for (int i=0; i<n; i++) {
        // vector of independent standard normal draws
        Eigen::VectorXd v(p);
        for (int j=0; j<p; j++) {
            v[j] = distribution(generator);
        }
        // transform to N(mu, sigma)
        Z.row(i) = mu + A * v;
    }
    return Z;
}

PYBIND11_PLUGIN(rng) {
    pybind11::module m("rng", "auto-compiled c++ extension");
    m.def("mvn", &mvn);
    return m.ptr();
}
In [7]:
import cppimport
import numpy as np
rng = cppimport.imp("rng")
mu = 4.0*np.ones(2)
sigma = np.array([[1,0.6], [0.6, 1]])
n = 1000
x, y = rng.mvn(mu, sigma, n).T
sns.jointplot(x, y, kind='scatter')
pass
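As a sanity check (not part of the required C++ solution), here is a minimal NumPy sketch of the same Cholesky-based sampler; mvnorm_np is a hypothetical helper used only for comparison against the compiled version.

def mvnorm_np(mu, Sigma, n):
    """NumPy reference: draw z ~ N(0, I), then return mu + L z with L the Cholesky factor of Sigma."""
    L = np.linalg.cholesky(Sigma)
    z = np.random.standard_normal((n, len(mu)))
    return mu + z @ L.T

samples = mvnorm_np(mu, sigma, n)
print(samples.mean(axis=0))  # should be close to mu
print(np.cov(samples.T))     # should be close to sigma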
Exercise 2 (20 points)

Write a function count_runs(xs) that counts the success runs (consecutive 1s) of each length k in a sequence of Bernoulli trials. For example, if the trials were [0, 1, 0, 1, 1, 0, 0, 0, 0, 1], the function should return Counter({1: 2, 2: 1}).
In [4]:
from collections import Counter
def count_runs(xs):
    """Count number of success runs of length k."""
    ys = []
    count = 0
    for x in xs:
        if x == 1:
            count += 1
        else:
            if count:
                ys.append(count)
            count = 0
    if count:
        ys.append(count)
    return Counter(ys)
In [5]:
count_runs([0, 1, 0, 1, 1, 0, 0, 0, 0, 1])
Out[5]:
Counter({1: 2, 2: 1})
In [6]:
def count_runs_alt(x):
    """Returns the counts for runs of each length k observed in x.

    This works but is slower.
    """
    return Counter(len(s) for s in ''.join(map(str, x)).split('0') if s)
In [7]:
count_runs_alt([0, 1, 0, 1, 1, 0, 0, 0, 0, 1])
Out[7]:
Counter({1: 2, 2: 1})
In [8]:
%timeit count_runs([0, 1, 0, 1, 1, 0, 0, 0, 0, 1])
In [9]:
%timeit count_runs_alt([0, 1, 0, 1, 1, 0, 0, 0, 0, 1])
In [10]:
def run_prob(expts, n, k, p):
    """Estimate the probability of at least one success run of length >= k in n Bernoulli(p) trials."""
    xxs = np.random.choice([0, 1], (expts, n), p=(1-p, p))
    # max(..., default=0) guards against a sequence with no successes at all
    return sum(max(d.keys(), default=0) >= k for d in map(count_runs, xxs))/expts
In [11]:
run_prob(expts=100000, n=100, k=5, p=0.5)
Out[11]:
In [12]:
run_prob(expts=100000, n=100, k=7, p=0.7)
Out[12]:
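As a rough cross-check on the simulated values (not required), here is a sketch of an exact calculation; run_prob_exact is a hypothetical helper that uses dynamic programming over the length of the current success run.

def run_prob_exact(n, k, p):
    """Exact probability of at least one success run of length >= k in n Bernoulli(p) trials."""
    state = np.zeros(k)   # state[j] = P(trailing run has length j and no run >= k has occurred yet)
    state[0] = 1.0
    hit = 0.0             # P(a run of length >= k has already occurred)
    for _ in range(n):
        new = np.zeros(k)
        new[0] = (1 - p) * state.sum()   # a failure resets the run
        new[1:] = p * state[:-1]         # a success extends the run
        hit += p * state[-1]             # a run of length k-1 reaches length k
        state = new
    return hit

run_prob_exact(n=100, k=5, p=0.5), run_prob_exact(n=100, k=7, p=0.7)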
Exercise 3 (20 points).

Consider an unbiased random walk of length $n$ as simulated with a sequence of -1 or +1 values. If we start from 0, plot the distribution of last return times for 100,000 simulations with $n = 100$. The last return time is the last time the cumulative sum of the random walk is zero; this may be the starting point if the walk never returns to zero in 100 steps.

Do a maximum likelihood fit of a beta distribution to the set of last return times using the beta.fit() function from scipy.stats. Set the lower bound (loc) = 0 and the upper bound (scale) = 100 for plotting. Superimpose the fitted beta PDF on the normalized histogram of last return times.
In [13]:
n = 100
k = 100000
returns = np.zeros(k).astype('int')
for i in range(k):
    x = np.random.choice([-1, 1], n)
    y = np.r_[0, np.cumsum(x)]
    # index of the last zero in the walk (0 if it never returns)
    returns[i] = np.nonzero(y == 0)[0][-1]

plt.hist(returns, density=True)
pass
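An equivalent vectorized version of the same simulation (a sketch, not required; it trades memory for speed):

x = np.random.choice([-1, 1], (k, n))
y = np.c_[np.zeros((k, 1), dtype=int), np.cumsum(x, axis=1)]
# position of the last zero in each walk: search from the right
returns_vec = n - np.argmax(y[:, ::-1] == 0, axis=1)
plt.hist(returns_vec, density=True)
pass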
In [14]:
from scipy.stats import beta
In [15]:
a, b, loc, scale = beta.fit(returns)

x = np.linspace(0, 100, 100)
plt.plot(x, beta(a=a, b=b, loc=0, scale=100).pdf(x), linestyle='dashed', color='blue')
plt.hist(returns, histtype='step', density=True, linewidth=1)
pass
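A side note (not required): for an unbiased walk the distribution of the last return time is governed by the arcsine law, so the fitted shape parameters should come out close to $a = b = 0.5$. A minimal sketch comparing the fit against Beta(0.5, 0.5):

x = np.linspace(0.5, 99.5, 200)
plt.plot(x, beta(a=a, b=b, loc=0, scale=100).pdf(x), label='fitted beta')
plt.plot(x, beta(a=0.5, b=0.5, loc=0, scale=100).pdf(x), linestyle='dotted', label='Beta(0.5, 0.5)')
plt.legend()
pass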
Exercise 4 (20 points)

The Cauchy distribution is given by

$$ f(x) = \frac{1}{\pi (1 + x^2)}, \ \ -\infty \lt x \lt \infty $$

Use Monte Carlo integration to estimate the tail probability $P(X > 2)$, both by sampling directly from the Cauchy distribution and after a change of variables, and examine how the estimates converge as the number of samples grows.
In [16]:
from scipy import stats
In [17]:
# Direct Monte Carlo estimate: sample from the Cauchy distribution and count exceedances
n = 1000
sum(stats.cauchy().rvs(n) > 2)/n
Out[17]:
In [18]:
# After the change of variables u = 2/x, P(X > 2) is the integral of 2/(pi*(u^2 + 4)) over (0, 1)
x = stats.uniform().rvs(n)
np.mean(2/(np.pi*(x**2+4)))
Out[18]:
In [19]:
# Check (not required)
1 - stats.cauchy.cdf(2)
Out[19]:
In [20]:
n = 1000
reps = 1000
samples = stats.cauchy().rvs((n, reps))

# running estimates for multiple independent Monte Carlo sequences
ys = np.zeros((n, reps))
for k in range(1, n+1):
    ys[k-1] = np.sum(samples[:k, :] > 2, axis=0)/k

# pointwise 95% band across the replicate sequences
lower, upper = np.percentile(ys, [2.5, 97.5], axis=1)

plt.plot(np.arange(1, n+1)[:, None], ys, c='grey', alpha=0.02)
plt.plot(np.arange(1, n+1), ys[:, 0], c='red', linewidth=1)  # one path
plt.plot(np.arange(1, n+1), upper, 'b', np.arange(1, n+1), lower, 'b')
pass
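The running estimates above can also be computed without the Python loop; a small equivalent sketch using a cumulative sum:

ys_vec = np.cumsum(samples > 2, axis=0) / np.arange(1, n+1)[:, None]
assert np.allclose(ys, ys_vec)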
In [21]:
n = 1000
reps = 1000
samples = stats.uniform().rvs(n)
samples = 2/(np.pi*(samples**2+4))

# generate bootstrap samples and their running means
xb = np.random.choice(samples, (n, reps), replace=True)
yb = 1/np.arange(1, n+1)[:, None] * np.cumsum(xb, axis=0)
lower, upper = np.percentile(yb, [2.5, 97.5], axis=1)

plt.plot(np.arange(1, n+1)[:, None], yb, c='grey', alpha=0.02)
plt.plot(np.arange(1, n+1), yb[:, 0], c='red', linewidth=1)
plt.plot(np.arange(1, n+1), upper, 'b', np.arange(1, n+1), lower, 'b')
pass
Exercise 5 (20 points).

Estimate the following integral using Monte Carlo integration:

$$ \int_{-\infty}^{\infty} x^2 \frac{1}{2}e^{-|x|} dx $$

Hint: See notes on importance sampling and figure.
In [22]:
# Use importance sampling with a normal distribution as the proposal
def p(x):
    """Double exponential (Laplace) density."""
    return 0.5*np.exp(-np.abs(x))

n = 1000000
x = stats.norm(0, 2).rvs(n)
# importance-weighted average of x^2 with weights p(x)/q(x)
np.mean(x**2 * p(x)/stats.norm(0, 2).pdf(x))
Out[22]:
In [23]:
# Check (not required)
from sympy import symbols, integrate, exp, oo
x = symbols('x')
integrate(x**2 * exp(-x), (x, 0, oo))
Out[23]:
2
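An alternative estimate (a sketch, not required): since the integral is just $E[X^2]$ under the double exponential density, one can also sample directly from scipy's Laplace distribution and average $x^2$.

y = stats.laplace().rvs(n)
np.mean(y**2)  # should be close to 2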
In [ ]: