In the forward propagation of uncertainty, we have a known model $F$ for a system of interest. We model its input $X$ as a random variable and wish to understand the output random variable $$Y = F(X),$$ also denoted $Y \vert X$ and read "$Y$ given $X$".
This is closely related to sensitivity analysis (how random variations in $X$ influence the variation in $Y$).
In inverse problems, $F$ is a forward model, but $Y$ is observed data, and we want to find the input $X$ such that $F(X) = Y$; that is, we want $X \vert Y$ instead of $Y \vert X$.
Inverse problems are typically ill-posed in the usual sense, so we need expert knowledge (a prior) about what a good solution $X$ might be.
The Bayesian perspective then becomes the method of choice, but it requires representing high-dimensional distributions: $$p(X \vert Y) = \frac{p(Y \vert X)\, p(X)}{p(Y)}.$$
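To make the formula concrete, here is a minimal sketch (not part of the original notebook) of Bayes' rule for a one-dimensional inverse problem evaluated on a grid; the forward model, the observation, and the noise level are made up for illustration only.
import numpy as np

xs = np.linspace(-3, 3, 1001)                           # grid for the input X
prior = np.exp(-0.5*xs**2)/np.sqrt(2.0*np.pi)           # prior: X ~ N(0, 1)
F = lambda x: x**3 + x                                  # hypothetical forward model
y_obs, noise = 2.0, 0.5                                 # hypothetical observation and noise std
likelihood = np.exp(-0.5*((y_obs - F(xs))/noise)**2)    # p(Y | X), up to a constant
posterior = prior*likelihood
posterior /= np.sum(posterior)*(xs[1] - xs[0])          # normalization plays the role of p(Y)
print("posterior mean of X given Y:", np.sum(xs*posterior)*(xs[1] - xs[0]))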
Consider a set of orthogonal polynomials $\{p_n\}$ with respect to a weight function $w$: $$ \langle p_n,\, p_m \rangle = \int_a^b p_n(x)\,p_m(x)\,w(x)\,dx=\delta_{nm}h_n. $$
In [23]:
import numpy as np
import matplotlib.pyplot as plt
from numpy.polynomial import Chebyshev as T
from numpy.polynomial.hermite import hermval
%matplotlib inline

def p_cheb(x, n):
    r"""
    RETURNS T_n(x),
    the value of the (non-normalized) Chebyshev polynomial:
    $\int_{-1}^{1} \frac1{\sqrt{1-x^2}}T_m(x)T_n(x)\,dx = \frac\pi2\delta_{nm}$ for $n\ge1$ (and $\pi$ for $n=m=0$)
    """
    return T.basis(n)(x)

def p_herm(x, n):
    r"""
    RETURNS He_n(x),
    the value of the (non-normalized) probabilists' Hermite polynomial,
    obtained from the physicists' polynomial via $He_n(x) = 2^{-n/2} H_n(x/\sqrt2)$
    """
    cf = np.zeros(n + 1)
    cf[n] = 1
    return (2**(-float(n)*0.5))*hermval(x/np.sqrt(2.0), cf)

def system_mat(pnts, maxn, poly):
    """
    RETURNS the system (Vandermonde-like) matrix A with A[j, i] = p_i(pnts[j])
    """
    A = np.empty((pnts.size, maxn), dtype=float)
    for i in range(maxn):
        A[:, i] = poly(pnts, i)
    return A
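As a quick sanity check (not part of the original notebook), the orthogonality relation above can be verified numerically for the Chebyshev weight $w(x)=1/\sqrt{1-x^2}$ using Gauss–Chebyshev quadrature and the p_cheb helper just defined.
# Gauss–Chebyshev quadrature: ∫_{-1}^{1} f(x)/sqrt(1-x^2) dx ≈ (π/K) Σ_k f(x_k),
# where x_k = cos((2k+1)π/(2K)); exact for polynomials of degree < 2K
K = 200
xq = np.cos((2.0*np.arange(K) + 1.0)*np.pi/(2.0*K))

def cheb_inner(n, m):
    return np.pi/K*np.sum(p_cheb(xq, n)*p_cheb(xq, m))

print(cheb_inner(2, 3))   # ≈ 0:    orthogonality
print(cheb_inner(3, 3))   # ≈ π/2:  the norm h_3 of T_3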
In [24]:
x = np.linspace(-1, 1, 1000)
data = []
for i in range(5):
    data.append(x)
    data.append(p_cheb(x, i))
plt.plot(*data)
plt.legend(["power = {}".format(i) for i in range(len(data)//2)]);
In [25]:
def complex_func(x):
    return np.sin(2.0*x*np.pi)*np.cos(0.75*(x + 0.3)*np.pi)

plt.plot(x, complex_func(x));
Now, let's approximate the function with polynomials, taking different numbers of terms $n$ (polynomial powers $0,\dots,n-1$) and the corresponding number of node points: $$ f(x)\approx\hat f(x)=\sum_{i=0}^{n-1}\alpha_i p_i(x). $$
In [26]:
n = 6
M = n
nodes = np.linspace(-1, 1, M)
RH = complex_func(nodes)
A = system_mat(nodes, n, p_cheb)
if n == M:
    alpha = np.linalg.solve(A, RH)
else:
    alpha = np.linalg.lstsq(A, RH, rcond=None)[0]
print("α = {}".format(alpha))
In [27]:
def calc_apprximant(poly, alpha, x):
    """
    RETURNS the values of the approximant sum_i alpha_i p_i(x) at the points x
    """
    n = len(alpha)
    y = np.zeros_like(x)
    for i in range(n):
        y[...] += poly(x, i)*alpha[i]
    return y
In [28]:
y = complex_func(x)
approx_y = calc_apprximant(p_cheb, alpha, x)
plt.plot(x, y, x, approx_y, nodes, RH, 'ro');
An approximate value of the error: $$ \varepsilon= \|f-\hat f\|_\infty\approx\max_{x\in \mathcal X}\bigl|f(x)-\hat f(x)\bigr| $$
In [29]:
epsilon = np.linalg.norm(y - approx_y, np.inf)
print("ε = {}".format(epsilon))
If we take another set of polynomials, the resulting approximation will be the same (the coefficients $\alpha$ will, of course, be different).
In [30]:
A = system_mat(nodes, n, p_herm)
if n == M:
    alpha = np.linalg.solve(A, RH)
else:
    alpha = np.linalg.lstsq(A, RH, rcond=None)[0]
print("α = {}".format(alpha))
approx_y = calc_apprximant(p_herm, alpha, x)
plt.plot(x, y, x, approx_y, nodes, RH, 'ro')
epsilon = np.linalg.norm(y - approx_y, np.inf)
print("ε = {}".format(epsilon))
Now, what will change if we take another set of node points?
In [31]:
# Chebyshev nodes: the roots of T_M
nodes = np.cos((2.0*np.arange(M) + 1.0)/M*0.5*np.pi)
RH = complex_func(nodes)
A = system_mat(nodes, n, p_herm)
alpha = np.linalg.solve(A, RH)
print("α = {}".format(alpha))
approx_y = calc_apprximant(p_herm, alpha, x)
plt.plot(x, y, x, approx_y, nodes, RH, 'ro')
epsilon_cheb = np.linalg.norm(y - approx_y, np.inf)
print("ε_cheb = {}".format(epsilon_cheb))
In [32]:
# All in one. We can play with the maximum polynomial power
from ipywidgets import interact, fixed, widgets

def plot_approx(f, n, distrib='unif', poly='cheb'):
    def make_nodes(n, distrib='unif'):
        return {'unif': lambda: np.linspace(-1, 1, n),
                'cheb': lambda: np.cos((2.0*np.arange(n) + 1.0)/n*0.5*np.pi)}[distrib[:4].lower()]
    poly_f = {'cheb': p_cheb, 'herm': p_herm}[poly[:4].lower()]
    # solve for the coefficients
    nodes = make_nodes(n, distrib)()
    RH = f(nodes)
    A = system_mat(nodes, n, poly_f)
    alpha = np.linalg.solve(A, RH)
    # calculate the values of the approximant
    x = np.linspace(-1, 1, 2**10)
    y = f(x)
    approx_y = calc_apprximant(poly_f, alpha, x)
    # plot
    plt.figure(figsize=(14, 6.5))
    plt.plot(x, y, x, approx_y, nodes, RH, 'ro')
    plt.show()
    # calculate the error
    epsilon = np.linalg.norm(y - approx_y, np.inf)
    print("ε = {}".format(epsilon))
In [33]:
interact(plot_approx,
         f=fixed(complex_func),
         n=widgets.IntSlider(min=1, max=15, step=1, value=4, continuous_update=True, description='# of terms (n)'),
         distrib=widgets.ToggleButtons(options=['Uniform', 'Chebyshev roots'], description='Points distr.'),
         poly=widgets.ToggleButtons(options=['Chebyshev polynomials', 'Hermite polynomials'], description='Poly. type'));
Let the input $x$ be random with a known probability density function $\rho$.
We want to know the statistical properties of the output $f(x)$.
How can we find them using the polynomial expansion?
Assume the function $f$ is analytic: $$ f(x)=\sum_{i=0}^\infty \alpha_i p_i(x). $$ The mean value of $f$ is $$ \mathbb E f = \int_a^bf(\tau)\rho(\tau)\,d\tau = \sum_{i=0}^\infty \int_a^b\alpha_i p_i(\tau)\rho(\tau)\,d\tau. $$
If the set of orthogonal polynomials $\{p_n\}$ has the same weight function as $\rho$ and the first polynomial is a constant, $p_0(x)=h_0$, then by orthogonality $\int_a^b p_i(\tau)\rho(\tau)\,d\tau=0$ for $i\ge1$, and (since $\rho$ integrates to one) $\mathbb Ef=\alpha_0h_0$. Usually $h_0=1$, and we get the simple relation $$ \mathbb Ef = \alpha_0. $$
The variance is equal to $$ \text{Var } f=\mathbb E\bigl(f-\mathbb E f\bigr)^2= \int_a^b \left(\sum_{i=1}^\infty\alpha_ip_i(\tau)\right)^2\rho(\tau)\,d\tau; $$ note that the summation begins with $1$. Assuming we can interchange the sum and the integral, $$ \text{Var } f= \sum_{i=1}^\infty\sum_{j=1}^\infty\int_a^b \!\!\alpha_ip_i(\tau)\,\alpha_jp_j(\tau)\,\rho(\tau)\,d\tau = \sum_{i=1}^\infty \alpha_i^2h_i. $$ The formula is especially simple if all the norms $\{h_i\}$ are equal to $1$: $$ \text{Var } f = \sum_{i=1}^\infty \alpha_i^2. $$
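As a quick sanity check of these formulas (an added example, not from the original notes): take $x\sim\mathcal N(0,1)$ and $f(x)=x^2$. In probabilists' Hermite polynomials, $x^2=\mathrm{He}_2(x)+\mathrm{He}_0(x)$, so $\alpha_0=\alpha_2=1$ and all other coefficients vanish. With $h_0=1$ and $h_2=2!=2$, the formulas give $\mathbb Ef=\alpha_0h_0=1$ and $\text{Var }f=\alpha_2^2h_2=2$, which are indeed the mean and variance of a $\chi^2_1$ random variable.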
Let us check the formulas for the mean and variance against values computed with the Monte Carlo method.
First, consider the case of a normal distribution of the input, $x\sim\mathcal N(0,1)$, $\rho(x)=\frac1{\sqrt{2\pi}}\exp(-x^2/2)$, so we take the (probabilists') Hermite polynomials.
In [34]:
# Scale the function a little
scale = 5.0
big_x = np.random.randn(int(1e6))
big_y = complex_func(big_x/scale)
mean = np.mean(big_y)
var = np.std(big_y)**2
print ("mean = {}, variance = {}".format(mean, var))
In [35]:
from math import factorial

def p_herm_snorm(n):
    """
    Squared norm h_n = n! of the probabilists' Hermite polynomial He_n
    (weight w = exp(-x^2/2)/sqrt(2*pi))
    """
    return factorial(n)

n = 15
M = n
nodes = np.linspace(-scale, scale, M)
RH = complex_func(nodes/scale)
A = system_mat(nodes, n, p_herm)
if n == M:
    alpha = np.linalg.solve(A, RH)
else:
    # weight the least-squares problem by the Gaussian weight exp(-x^2/2)
    W = np.diag(np.exp(-nodes**2*0.5))
    alpha = np.linalg.lstsq(W.dot(A), W.dot(RH), rcond=None)[0]
h = np.array([p_herm_snorm(i) for i in range(len(alpha))])
var = np.sum(alpha[1:]**2*h[1:])
print("mean = {}, variance = {}".format(alpha[0]*h[0], var))
Note that the exact values are $$ \mathbb E f = -0.16556230699\ldots, \qquad \text{Var }f= 0.23130350880\ldots, $$ so the method based on the polynomial expansion is more accurate than Monte Carlo.
In [36]:
ex = 2
x = np.linspace(-scale - ex, scale + ex, 10000)
y = complex_func(x/scale)
approx_y = calc_apprximant(p_herm, alpha, x)
plt.plot(x, y, x, approx_y, nodes, RH, 'ro');
The model described above is a special case of a linear model: we fix a basis set and obtain
$$f(x) \approx \sum_{k=1}^M c_k \phi_k(x).$$
For $x \in \mathbb{R}^d$, which basis set should we choose?
Why are tensor-product polynomials bad for large $d$? Because the number of basis functions grows exponentially: a tensor product of degree-$n$ polynomials in each of the $d$ variables has $(n+1)^d$ terms.
What are the alternatives to a tensor-product basis? (A size comparison with one common alternative, the total-degree basis, is sketched below.)
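A minimal sketch (not from the original notebook) of the curse of dimensionality: it compares the number of tensor-product basis functions, $(n+1)^d$, with the size of a total-degree basis, $\binom{n+d}{d}$.
from math import comb

n = 5                                   # maximal polynomial degree per variable
for d in (1, 2, 5, 10):
    tensor = (n + 1)**d                 # tensor-product basis: all powers up to n in each variable
    total = comb(n + d, d)              # total-degree basis: all monomials of total degree <= n
    print("d = {:2d}: tensor-product {:>12d}, total-degree {:>6d}".format(d, tensor, total))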
Let $A \in \mathbb{R}^{n \times r}$, $n \gg r$.
Let $\widehat{A}$ be the $r\times r$ submatrix of maximum volume (i.e., with maximal $|\det \widehat{A}|$).
Then all entries of $A \widehat{A}^{-1}$ are at most $1$ in modulus.
As a simple consequence, we have
$$E_{D} \leq (r+1)\, E_{\mathrm{best}},$$
where $E_D$ is the error of the approximation built on the maximum-volume (optimal-design) points and $E_{\mathrm{best}}$ is the best possible approximation error in the Chebyshev norm.
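A minimal sketch (not from the original notebook) of the classical greedy maxvol iteration: starting from some non-singular set of $r$ rows, it repeatedly swaps in the row that maximizes the entries of $A\widehat A^{-1}$ until all of them are at most (approximately) one in modulus. Here it is used to pick interpolation nodes from a fine grid via the system_mat helper defined earlier; the tolerance and grid size are arbitrary choices.
def maxvol_rows(A, tol=1.01, max_iters=200):
    """Greedy search for r rows of the tall matrix A (n x r) whose submatrix
    has locally maximal volume. Assumes the first r rows are non-singular."""
    n, r = A.shape
    rows = np.arange(r)
    for _ in range(max_iters):
        B = A.dot(np.linalg.inv(A[rows]))              # coefficient matrix A * Ahat^{-1}
        i, j = np.unravel_index(np.argmax(np.abs(B)), B.shape)
        if abs(B[i, j]) <= tol:                        # all entries ≈ at most 1: done
            break
        rows[j] = i                                    # swap increases |det Ahat| by |B[i, j]|
    return np.sort(rows)

grid = np.linspace(-1, 1, 201)
rows = maxvol_rows(system_mat(grid, 7, p_cheb))
print("maxvol-selected nodes:", grid[rows])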
Given the training data $\mathcal{D}$, we wish to make predictions for new inputs $\mathbf{x}_*$ within the domain.
To solve this problem we must make assumptions about the characteristics of $f(\mathbf{x})$. There are two common approaches.
The first approach is to restrict the class of functions that we consider (e.g., to consider only linear functions).
The second approach is (speaking rather loosely) to give a prior probability to every possible function, where higher probabilities are given to functions that we consider more likely.
Typically, there is some knowledge about a function of interest $f(\mathbf{x})$ even before observing it anywhere.
For example, $f(\mathbf{x})$ may be known to never exceed, or never fall below, certain values, to be periodic, or to be translation invariant.
Such knowledge is called prior knowledge.
Prior knowledge may be precise (e.g., $f(\mathbf{x})$ is twice differentiable), or it may be vague (e.g., the probability that the periodicity is $T$ is $p(T)$). When we deal with vague prior knowledge, we refer to it as a prior belief.
Prior beliefs about $f(\mathbf{x})$ can be modeled by a probability measure on the space $\mathcal{F}$ of functions from the input domain to $\mathbb{R}$. A GP is a great way to represent such a probability measure.
A Gaussian process is a collection of random variables, any finite number of which have a joint Gaussian distribution (in other words, a GP is a generalization of the multivariate Gaussian distribution to infinite dimensions).
A GP defines a probability measure on $\mathcal{F}$. When we say that $f(\mathbf{x})$ is a GP, we mean that it is a random variable whose realizations are entire functions of $\mathbf{x}$.
Analytically, it can be written as $$ f(\mathbf{x}) \sim \mbox{GP}\left(m(\mathbf{x}), k(\mathbf{x},\mathbf{x'}) \right), $$ where $m(\mathbf{x})$ is the mean function and $k(\mathbf{x},\mathbf{x}')$ is the covariance function.
Let $\mathbf{x}_{1:n}=\{\mathbf{x}_1,\dots,\mathbf{x}_n\}$ be $n$ points in $\mathbb{R}^d$. Let $\mathbf{f}\in\mathbb{R}^n$ be the outputs of $f(\mathbf{x})$ on each one of the elements of $\mathbf{x}_{1:n}$, $$ \mathbf{f} = \left( \begin{array}{c} f(\mathbf{x}_1)\\ \vdots\\ f(\mathbf{x}_n) \end{array} \right). $$ The fact that $f(\mathbf{x})$ is a GP with mean and covariance function $m(\mathbf{x})$ and $k(\mathbf{x},\mathbf{x'})$ means that the vector of outputs $\mathbf{f}$ at the arbitrary inputs is the following multivariate-normal: $$ \mathbf{f} \sim \mathcal{N}\bigl(\mathbf{m}(\mathbf{x}_{1:n}), \mathbf{K}(\mathbf{x}_{1:n}, \mathbf{x}_{1:n})\bigr), $$ with mean vector: $$ \mathbf{m}(\mathbf{x}_{1:n}) = \left( \begin{array}{c} m(\mathbf{x}_1)\\ \vdots\\ m(\mathbf{x}_n) \end{array} \right), $$ and covariance matrix: $$ \mathbf{K}(\mathbf{x}_{1:n},\mathbf{x}_{1:n}) = \left( \begin{array}{ccc} k(\mathbf{x}_1,\mathbf{x}_1) & \dots &k(\mathbf{x}_1, \mathbf{x}_n)\\ \vdots& \ddots& \vdots\\ k(\mathbf{x}_n, \mathbf{x}_1)& \dots &k(\mathbf{x}_n, \mathbf{x}_n) \end{array} \right). $$
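To connect this definition with something concrete, here is a minimal sketch (not from the original notebook) that draws a few samples from a zero-mean GP prior on a fine grid, i.e. directly from the multivariate normal $\mathcal{N}(\mathbf{0}, \mathbf{K})$; it uses a squared exponential covariance (introduced below) with a made-up length scale.
import numpy as np
import scipy.spatial as SP
import matplotlib.pyplot as plt

ell = 0.7                                                 # length scale (chosen for illustration)
xs = np.linspace(-5, 5, 200)[:, None]                     # grid x_{1:n}
K = np.exp(-0.5*SP.distance.cdist(xs, xs, 'sqeuclidean')/ell**2)
L = np.linalg.cholesky(K + 1e-10*np.eye(len(xs)))         # small jitter for numerical stability
samples = L.dot(np.random.randn(len(xs), 3))              # three independent draws f ~ N(0, K)
plt.plot(xs, samples);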
Now that we have defined a GP, let us talk about how we encode our prior beliefs into it.
We do so through the mean and covariance functions.
For any point $\mathbf{x}\in\mathbb{R}^d$, $m(\mathbf{x})$ is the expected value of the random variable $f(\mathbf{x})$: $$ m(\mathbf{x}) = \mathbb{E}[f(\mathbf{x})]. $$
The mean function can be an arbitrary function; essentially, it tracks generic trends in the response as the input is varied.
In practice, we try to make a choice of the mean function that is easy to work with. Common choices include a zero mean, $m(\mathbf{x})=0$ (used in the example below), or a constant.
The squared exponential (SE) is a widely used covariance function. It has the form $$ k(\mathbf{x}, \mathbf{x}') = v\exp\left\{-\frac{1}{2}\sum_{i=1}^d\frac{(x_i - x_i')^2}{\ell_i^2}\right\}, $$ where $v>0$ is the signal variance and $\ell_i>0$ is the length scale along the $i$-th input dimension.
In [42]:
# 1-D example
from ipywidgets import interactive, interact, widgets
import matplotlib.pyplot as plt
import numpy as np
import scipy.spatial as SP

# define the squared exponential kernel and plot k(x, 0)
def k(length_scale):
    x = np.arange(0., 5., 0.1)
    plt.figure(figsize=(10, 7))
    plt.ylim([0, 1.05])
    plt.xlabel('$x$', fontsize=16)
    plt.ylabel('$k(x,0)$', fontsize=16)
    plt.plot(x, np.exp(-.5 * x**2/length_scale**2), 'b-')
    plt.show()

controls = {r'length_scale': widgets.FloatSlider(
    min=0.01, max=5.0, step=0.1, value=1., continuous_update=False, description=r'$\ell$')}
In [43]:
from ipywidgets import interactive
import matplotlib.pyplot as plt
import numpy as np
import scipy.spatial as SP

def GP(length_scale, Test, Training, sigma):
    """Simple GP regression. It assumes a zero-mean GP prior."""
    np.random.seed(100)

    # This is the true unknown function we are trying to approximate
    def f(x): return np.sin(0.9*x.flatten())

    # Define the squared exponential kernel
    def kernel(a, b):
        sqdist = SP.distance.cdist(a, b, 'sqeuclidean')
        return np.exp(-.5 * sqdist/(length_scale**2))

    N = Training   # number of training points
    n = Test       # number of test points
    s = sigma      # noise level (also used as jitter on the diagonal)

    # Sample some input points and noisy versions of the function evaluated at
    # these points.
    X = np.random.uniform(-5, 5, size=(N, 1))
    y = f(X) + s*np.random.randn(N)
    K = kernel(X, X)
    L = np.linalg.cholesky(K + s*np.eye(N))

    # points we're going to make predictions at
    Xtest = np.linspace(-5, 5, n)[:, None]

    # compute the posterior mean at the test points
    Lk = np.linalg.solve(L, kernel(X, Xtest))
    mu = np.dot(Lk.T, np.linalg.solve(L, y))

    # compute the posterior variance at the test points
    K_ = kernel(Xtest, Xtest)
    s2 = np.diag(K_) - np.sum(Lk**2, axis=0)
    sd = np.sqrt(s2)

    # plots
    plt.figure(figsize=(9, 7))
    plt.clf()
    plt.plot(X, y, 'r+', ms=18, label="Training points")
    plt.plot(Xtest, f(Xtest), 'b-', label="Function")
    plt.gca().fill_between(Xtest.flat, mu - sd, mu + sd,
                           color="#dddddd", label="Confidence interval")
    plt.plot(Xtest, mu, 'r--', lw=2, label="Approximation")
    plt.title(r'Mean prediction plus-minus one s.d.')
    plt.xlabel('$x$', fontsize=16)
    plt.ylabel('$f(x)$', fontsize=16)
    plt.axis([-5, 5, -3, 3])
    plt.legend()
    print("Error (inf. norm) = ", np.linalg.norm(f(Xtest) - mu, ord=np.inf)/np.linalg.norm(f(Xtest), ord=np.inf))
    plt.show()

controls = {r'sigma': widgets.FloatSlider(min=5e-4, max=5e-1, step=1e-3, value=1e-3, continuous_update=True, description=r'$\sigma$'),
            r'length_scale': widgets.FloatSlider(min=0.1, max=2.0, step=0.05, value=0.7, continuous_update=True, description=r'$\ell$'),
            r'Training': widgets.IntSlider(min=1, max=50, step=1, value=10, continuous_update=True, description=r'$N$ of $f$ evals'),
            r'Test': widgets.IntSlider(min=1, max=100, step=1, value=50, continuous_update=True, description=r'$N$ of GP samples')}
In [44]:
interact(GP, **controls);