In [1]:
from __future__ import division
# scientific
%matplotlib inline
from matplotlib import pyplot as plt;
import numpy as np;
# ipython
import IPython;
# python
import os;
#####################################################
# image processing
import PIL;
import PIL.Image;
import PIL.ImageChops;
# trim and scale images
def trim(im, percent=100):
    print("trim:", percent);
    # difference against the background color taken from the top-left pixel
    bg = PIL.Image.new(im.mode, im.size, im.getpixel((0,0)))
    diff = PIL.ImageChops.difference(im, bg)
    diff = PIL.ImageChops.add(diff, diff, 2.0, -100)
    bbox = diff.getbbox()
    if bbox:
        x = im.crop(bbox)
        return x.resize(((x.size[0]*percent)//100,
                         (x.size[1]*percent)//100),
                        PIL.Image.ANTIALIAS);
#####################################################
# daft (rendering PGMs)
import daft;
# set to False to load PGMs from static images
RENDER_PGMS = False;
# decorator for pgm rendering
def pgm_render(pgm_func):
    def render_func(path, percent=100, render=None, *args, **kwargs):
        print("render_func:", percent);
        # render
        render = render if (render is not None) else RENDER_PGMS;
        if render:
            print("rendering");
            # render the PGM and save to disk
            pgm = pgm_func(*args, **kwargs);
            pgm.render();
            pgm.figure.savefig(path, dpi=300);
            # trim
            img = trim(PIL.Image.open(path), percent);
            img.save(path, 'PNG');
        else:
            print("not rendering");
            # error if the pre-rendered image is missing
            if not os.path.isfile(path):
                raise IOError("Error: Graphical model image %s not found. "
                              "You may need to set RENDER_PGMS=True." % path);
        # display
        return IPython.display.Image(filename=path);
    return render_func;
######################################################
Why do we care?
Suppose we have a model $P$ with parameters $\theta$.
A statistic $T(\D)$ is sufficient for $\theta$ if no other statistic calculated from the same sample provides any additional information about the parameter.
That is, if $T(\D_1) = T(\D_2)$, our estimate of $\theta$ given $\D_1$ or $\D_2$ will be the same.
Suppose $X \sim \mathcal{N}(\mu, \sigma^2)$ and we observe $\mathcal{D} = (x_1, \dots, x_n)$. Let $$ \hat\mu = \frac{1}{n} \sum_{i=1}^n x_i \qquad \hat{\sigma}^2 = \frac{1}{n} \sum_{i=1}^n (x_i - \hat\mu)^2 $$
Then $T(\mathcal{D}) = (\hat\mu, \hat{\sigma}^2)$ is sufficient for $\theta=(\mu, \sigma^2)$.
(we are sweeping some details under the rug)
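As a quick numerical illustration (a minimal numpy sketch; the two datasets below are synthetic and constructed only so that they share the same $(\hat\mu, \hat\sigma^2)$), samples with identical sufficient statistics yield identical Gaussian log-likelihoods for any candidate $(\mu, \sigma^2)$:
In [ ]:
import numpy as np

def gauss_loglik(data, mu, sigma2):
    # sum_i log N(x_i | mu, sigma2)
    return np.sum(-0.5*np.log(2*np.pi*sigma2) - (data - mu)**2 / (2*sigma2))

np.random.seed(0)
d1 = np.random.normal(size=5)
# rescale and shift a second sample so it has the same mean and (biased) variance as d1
d2 = np.random.normal(size=5)
d2 = (d2 - d2.mean()) / d2.std() * d1.std() + d1.mean()

for mu, sigma2 in [(0.0, 1.0), (0.3, 4.0)]:
    print(np.isclose(gauss_loglik(d1, mu, sigma2), gauss_loglik(d2, mu, sigma2)))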
DEF: $p(x | \theta)$ has exponential family form if: $$ \begin{align} p(x | \theta) &= \frac{1}{Z(\theta)} h(x) \exp\left[ \eta(\theta)^T \phi(x) \right] \\ &= h(x) \exp\left[ \eta(\theta)^T \phi(x) - A(\theta) \right] \end{align} $$
The Bernoulli distribution can be written as $$ \begin{align} \mathrm{Ber}(x | \mu) &= \mu^x (1-\mu)^{1-x} \\ &= \exp\left[ x \log \mu + (1-x) \log (1-\mu) \right] \\ &= \exp\left[ \eta(\mu)^T \phi(x) \right] \end{align} $$
where $\eta(\mu) = (\log\mu, \log(1-\mu))$ and $\phi(x) = (x, 1-x)$
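A quick check of this algebra (a minimal sketch; the value of $\mu$ is arbitrary):
In [ ]:
import numpy as np

mu = 0.3
eta = np.array([np.log(mu), np.log(1 - mu)])     # natural parameters eta(mu)
for x in (0, 1):
    phi = np.array([x, 1 - x])                   # sufficient statistics phi(x)
    direct = mu**x * (1 - mu)**(1 - x)           # Ber(x | mu)
    expfam = np.exp(eta @ phi)                   # exp[eta(mu)^T phi(x)]
    print(x, np.isclose(direct, expfam))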
Derivatives of the log-partition function $A(\theta)$ yield cumulants of the sufficient statistics (Exercise!)
This guarantees that $A(\theta)$ is convex!
For the multivariate case, we have
$$ \frac{\partial^2 A}{\partial\theta_i \, \partial\theta_j} = E[\phi_i(x)\phi_j(x)] - E[\phi_i(x)] \, E[\phi_j(x)] $$ and hence $$ \nabla^2 A(\theta) = \mathrm{Cov}[\phi(x)] $$
Since covariance matrices are positive semidefinite, $A(\theta)$ is convex, as required.
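To see the cumulant property concretely, here is a small sketch for the Bernoulli in the equivalent one-parameter natural form $\eta = \log\frac{\mu}{1-\mu}$, $\phi(x) = x$, $A(\eta) = \log(1 + e^\eta)$; finite differences of $A$ recover the mean and variance of $\phi(X)$:
In [ ]:
import numpy as np

# Bernoulli in minimal natural form: phi(x) = x, A(eta) = log(1 + exp(eta))
A = lambda eta: np.log1p(np.exp(eta))
eta, h = 0.7, 1e-4
mu = 1.0 / (1.0 + np.exp(-eta))                     # E[phi(X)] = sigmoid(eta)

dA  = (A(eta + h) - A(eta - h)) / (2*h)             # numerical first derivative
d2A = (A(eta + h) - 2*A(eta) + A(eta - h)) / h**2   # numerical second derivative

print(np.isclose(dA, mu, atol=1e-6))                # matches the mean of phi(X)
print(np.isclose(d2A, mu*(1 - mu), atol=1e-4))      # matches the variance of phi(X)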
For data $\D = (x_1, \dots, x_N)$, the likelihood is $$ p(\D|\theta) = \left[ \prod_{k=1}^N h(x_k) \right] Z(\theta)^{-N} \exp\left[ \eta(\theta)^T \left(\sum_{k=1}^N \phi(x_k) \right) \right] $$
The sufficient statistics are now $\phi(\D) = \sum_{k=1}^N \phi(x_k)$.
For natural parameters $\theta$ and data $\D = (x_1, \dots, x_N)$, $$ \log p(\D|\theta) = \theta^T \phi(\D) - N A(\theta) $$
Since $-A(\theta)$ is concave and $\theta^T\phi(\D)$ is linear in $\theta$, the log-likelihood is concave, so any stationary point is a global maximum.
Strict convexity of the log-partition function requires that we are working with a "regular exponential family". More on this can be found in These Notes.
To find the maximum, recall $\nabla_\theta A(\theta) = E_\theta[\phi(x)]$, so \begin{align*} \nabla_\theta \log p(\D | \theta) & = \nabla_\theta(\theta^T \phi(\D) - N A(\theta)) \\ & = \phi(\D) - N E_\theta[\phi(X)] = 0 \end{align*} Which gives $$E_\theta[\phi(X)] = \frac{\phi(\D)}{N} = \frac{1}{N} \sum_{k=1}^N \phi(x_k)$$
At the MLE $\hat\theta_{MLE}$, the empirical average of sufficient statistics equals their expected value.
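A small numerical illustration of this moment-matching condition (a sketch for the Gaussian, whose sufficient statistics can be taken as $\phi(x) = (x, x^2)$; the data below are synthetic):
In [ ]:
import numpy as np

np.random.seed(1)
data = np.random.normal(loc=2.0, scale=3.0, size=10000)

# empirical average of the sufficient statistics phi(x) = (x, x^2)
empirical = np.array([data.mean(), (data**2).mean()])

# MLE: mu_hat = sample mean, sigma2_hat = (biased) sample variance
mu_hat, sigma2_hat = data.mean(), data.var()

# expected sufficient statistics under the fitted model:
#   E[X] = mu_hat,  E[X^2] = sigma2_hat + mu_hat^2
model = np.array([mu_hat, sigma2_hat + mu_hat**2])

print(np.allclose(empirical, model))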
Exact Bayesian analysis is considerably simplified if the prior is conjugate to the likelihood.
This requires the likelihood to have finite sufficient statistics
Note: We will release some notes on conjugate priors + exponential families. It's hard to learn this from slides alone and needs a bit more description.
Posterior: $$ p(\theta|\mathcal{D}) = p(\theta|\nu_N, \tau_N) = p(\theta| \nu_0 +N, \tau_0 +s_N) $$
Note that the posterior hyper-parameters are obtained by adding. Hence,
$$ \begin{align} p(\eta|\mathcal{D}) &\propto \exp[\eta^T (\nu_0 \bar\tau_0 + N \bar s) - (\nu_0 + N) A(\eta) ] \\ &= p(\eta|\nu_0 + N, \frac{\nu_0 \bar\tau_0 + N \bar s}{\nu_0 + N}) \end{align}$$where $\bar s = \frac 1 N \sum_{i=1}^{N}\phi(x_i)$.
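As a concrete instance (a minimal Beta-Bernoulli sketch; the prior pseudo-counts and the true parameter below are arbitrary choices for illustration), the posterior hyper-parameters are obtained by adding the observed counts of the sufficient statistic to the prior pseudo-counts:
In [ ]:
import numpy as np

np.random.seed(2)
data = np.random.binomial(1, 0.7, size=50)   # Bernoulli observations
N, s = len(data), data.sum()                 # N and the sufficient statistic sum_i x_i

a0, b0 = 2.0, 2.0                            # Beta(a0, b0) prior on mu (pseudo-counts)
aN, bN = a0 + s, b0 + (N - s)                # conjugate update: just add counts

print("posterior: Beta(%g, %g)" % (aN, bN))
print("posterior mean:", aN / (aN + bN))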
"I basically know of two principles for treating complicated systems in simple ways: the first is the principle of modularity and the second is the principle of abstraction. I am an apologist for computational probability in machine learning because I believe that probability theory implements these two principles in deep and intriguing ways — nameley through factorization and through averaging. Exploiting these two mechanisms as fully as possible seems to me to be the way forward in machine learning" – Michael Jordan (qtd. in MLAPP)
Suppose we observe multiple correlated variables $x=(x_1, \dots, x_n)$.
How can we compactly represent the joint distribution $p(x|\theta)$?
What is the joint distribution of three independent coin flips?
Assuming independence, $P(X_1, X_2, X_3) = P(X_1)P(X_2)P(X_3)$
Exploiting the independence structure of a joint distribution leads to more concise representations.
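For instance (a minimal sketch, using made-up head-probabilities for three independent but possibly biased coins), the full joint table has $2^3 - 1 = 7$ free parameters, while the independent factorization needs only 3:
In [ ]:
import numpy as np
from itertools import product

p = [0.5, 0.6, 0.9]   # P(X_k = 1) for each coin (made-up values)

# full joint table built from the product of the marginals
joint = {xs: np.prod([pk if x else 1 - pk for x, pk in zip(xs, p)])
         for xs in product([0, 1], repeat=3)}

print(sum(joint.values()))   # sums to 1.0
print(len(joint) - 1, "free parameters for the full table vs", len(p), "under independence")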
In Naive Bayes, we assumed the features $X_1, \dots, X_N$ were independent given the class label $C$: $$ P(x_1, \dots, x_N, c) = P(c) \prod_{k=1}^N P(x_k | c) $$
This greatly simplified the learning procedure: each conditional $P(X_k \mid c)$ can be estimated separately from the others.
The key to efficiently representing large joint distributions is to make conditional independence assumptions of the form $$ X \perp Y \mid Z \iff p(X,Y|Z) = p(X|Z)p(Y|Z) $$
Once $z$ is known, learning $x$ provides no additional information about $y$, and vice versa.
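To make this concrete, here is a small numpy sketch: we construct a joint $p(x, y, z) = p(z)\,p(x \mid z)\,p(y \mid z)$ (the table sizes and random conditionals are arbitrary) and verify that $p(x, y \mid z) = p(x \mid z)\,p(y \mid z)$ for every $z$:
In [ ]:
import numpy as np

np.random.seed(3)
pz   = np.array([0.4, 0.6])                       # p(z)
px_z = np.random.dirichlet(np.ones(3), size=2)    # p(x | z), rows indexed by z
py_z = np.random.dirichlet(np.ones(2), size=2)    # p(y | z), rows indexed by z

# joint built so that X and Y are conditionally independent given Z
pxyz = np.einsum('z,zx,zy->xyz', pz, px_z, py_z)

# check p(x, y | z) == p(x | z) p(y | z) for every z
pxy_z = pxyz / pxyz.sum(axis=(0, 1), keepdims=True)
print(np.allclose(pxy_z, np.einsum('zx,zy->xyz', px_z, py_z)))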
An effective way to represent these assumptions is with a graph.
A Bayesian Network $\mathcal{G}$ is a directed acyclic graph whose nodes represent random variables $X_1, \dots, X_n$.
Examples will come shortly...
Every Bayesian Network $\G$ encodes a set $\I_\ell(\G)$ of local independence assumptions:
For each variable $X_k$, we have $(X_k \perp \NonDesc_\G(X_k) \mid \Parents_\G(X_k))$
Every node $X_k$ is conditionally independent of its nondescendants given its parents.
In [22]:
@pgm_render
def pgm_naive_bayes():
    pgm = daft.PGM([4,3], origin=[-2,0], node_unit=0.8, grid_unit=2.0);
    # nodes
    pgm.add_node(daft.Node("c", r"$C$", -0.25, 2));
    pgm.add_node(daft.Node("x1", r"$X_1$", -1, 1));
    pgm.add_node(daft.Node("x2", r"$X_2$", -0.5, 1));
    pgm.add_node(daft.Node("dots", r"$\cdots$", 0, 1, plot_params={ 'ec' : 'none' }));
    pgm.add_node(daft.Node("xN", r"$X_N$", 0.5, 1));
    # edges
    pgm.add_edge("c", "x1", head_length=0.08);
    pgm.add_edge("c", "x2", head_length=0.08);
    pgm.add_edge("c", "xN", head_length=0.08);
    return pgm;
In [23]:
%%capture
pgm_naive_bayes("images/naive-bayes.png");
Out[23]:
A Bayesian network $\G$ over variables $X_1, \dots, X_N$ encodes a set of conditional independencies.
There are many distributions $P$ satisfying the independencies in $\G$.
If $P$ satisfies the independence assertions made by $\G$, we say that $\G$ is an I-map (independence map) for $P$.
Any distribution satisfying $\G$ shares common structure.
We can factorize any joint distribution via the Chain Rule for Probability: $$ \begin{align} P(X_1, \dots, X_N) &= P(X_1) P(X_2, \dots, X_N | X_1) \\ &= P(X_1) P(X_2 | X_1) P(X_3, \dots, X_N | X_1, X_2) \\ &= \prod_{k=1}^N P(X_k | X_1, \dots, X_{k-1}) \end{align} $$
Here, the ordering of variables is arbitrary. This works for any permutation.
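A quick numerical check of the chain rule (a minimal sketch on a random joint distribution over three binary variables):
In [ ]:
import numpy as np

np.random.seed(4)
P = np.random.rand(2, 2, 2)
P /= P.sum()                                 # arbitrary joint P(x1, x2, x3)

# chain-rule factors: P(x1), P(x2 | x1), P(x3 | x1, x2)
P1    = P.sum(axis=(1, 2))
P2_1  = P.sum(axis=2) / P1[:, None]
P3_12 = P / P.sum(axis=2, keepdims=True)

recon = P1[:, None, None] * P2_1[:, :, None] * P3_12
print(np.allclose(recon, P))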
In [24]:
@pgm_render
def pgm_topological_order():
    pgm = daft.PGM([4, 4], origin=[-4, 0])
    # Nodes
    pgm.add_node(daft.Node("x1", r"$1$", -3.5, 2))
    pgm.add_node(daft.Node("x2", r"$2$", -2.5, 1.3))
    pgm.add_node(daft.Node("x3", r"$3$", -2.5, 2.7))
    pgm.add_node(daft.Node("x4", r"$4$", -1.5, 1.6))
    pgm.add_node(daft.Node("x5", r"$5$", -1.5, 2.3))
    pgm.add_node(daft.Node("x6", r"$6$", -0.5, 1.3))
    pgm.add_node(daft.Node("x7", r"$7$", -0.5, 2.7))
    # Add in the edges.
    pgm.add_edge("x1", "x4", head_length=0.08)
    pgm.add_edge("x1", "x5", head_length=0.08)
    pgm.add_edge("x2", "x4", head_length=0.08)
    pgm.add_edge("x3", "x4", head_length=0.08)
    pgm.add_edge("x3", "x5", head_length=0.08)
    pgm.add_edge("x4", "x6", head_length=0.08)
    pgm.add_edge("x4", "x7", head_length=0.08)
    pgm.add_edge("x5", "x7", head_length=0.08)
    return pgm;
In [25]:
%%capture
pgm_topological_order("images/topological-order.png")
Out[25]:
In a topological ordering of $\G$, every parent of $X_k$ appears before $X_k$, and the remaining predecessors, call them $\mathcal{Z}$, are nondescendants of $X_k$. Therefore, $\{ X_1, \dots, X_{k-1} \} = \Parents_\G(X_k) \cup \mathcal{Z}$
Since $\G$ is an I-map for $P$ and $\mathcal{Z} \subseteq \NonDesc_\G(X_k)$, we have $$\begin{align} & (X_k \perp \NonDesc_\G(X_k) \mid \Parents_\G(X_k)) \\ \implies & (X_k \perp \mathcal{Z} \mid \Parents_\G(X_k)) \end{align} $$
We have just shown $(X_k \perp \mathcal{Z} \mid \Parents_\G(X_k))$, therefore $$ P(X_k \mid X_1, \dots, X_{k-1}) = P(X_k \mid \Parents_\G(X_k)) $$
Remember: $X_k$ is conditionally independent of its nondescendants given its parents!
We just proved that for any $P$ satisfying $\G$, $$ P(X_1, \dots, X_N) = \prod_{k=1}^N P(X_k \mid \Parents_\G(X_k)) $$
It suffices to store conditional probability tables $P(X_k | \Parents_\G(X_k))$!
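For binary variables the savings are easy to quantify. A small sketch, using the parent sets of the seven-node example graph above (each CPT needs one free parameter per configuration of the parents):
In [ ]:
# parents of each node in the seven-node example graph above
parents = {1: [], 2: [], 3: [], 4: [1, 2, 3], 5: [1, 3], 6: [4], 7: [4, 5]}

full_joint = 2**len(parents) - 1                           # one big table over all variables
factored   = sum(2**len(pa) for pa in parents.values())    # one parameter per parent configuration

print(full_joint, "vs", factored)                          # 127 vs 21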
In [26]:
@pgm_render
def pgm_fully_connected_a():
    pgm = daft.PGM([4, 4], origin=[0, 0])
    # nodes
    pgm.add_node(daft.Node("a", r"$A$", 2, 3.5))
    pgm.add_node(daft.Node("b", r"$B$", 1.3, 2.5))
    pgm.add_node(daft.Node("c", r"$C$", 2.7, 2.5))
    # add in the edges
    pgm.add_edge("a", "b", head_length=0.08)
    pgm.add_edge("a", "c", head_length=0.08)
    pgm.add_edge("b", "c", head_length=0.08)
    return pgm;
In [27]:
%%capture
pgm_fully_connected_a("images/fully-connected-a.png")
Out[27]:
In [28]:
@pgm_render
def pgm_fully_connected_b():
    pgm = daft.PGM([8, 4], origin=[0, 0])
    # nodes
    pgm.add_node(daft.Node("a1", r"$A$", 2, 3.5))
    pgm.add_node(daft.Node("b1", r"$B$", 1.5, 2.8))
    pgm.add_node(daft.Node("c1", r"$C$", 2.5, 2.8))
    # add in the edges
    pgm.add_edge("a1", "b1", head_length=0.08)
    pgm.add_edge("a1", "c1", head_length=0.08)
    pgm.add_edge("b1", "c1", head_length=0.08)
    # nodes
    pgm.add_node(daft.Node("a2", r"$A$", 4, 3.5))
    pgm.add_node(daft.Node("b2", r"$B$", 3.5, 2.8))
    pgm.add_node(daft.Node("c2", r"$C$", 4.5, 2.8))
    # add in the edges
    pgm.add_edge("b2", "c2", head_length=0.08)
    pgm.add_edge("b2", "a2", head_length=0.08)
    pgm.add_edge("c2", "a2", head_length=0.08)
    return pgm;
In [29]:
%%capture
pgm_fully_connected_b("images/fully-connected-b.png")
Out[29]:
In [30]:
@pgm_render
def pgm_markov_chain():
    pgm = daft.PGM([6, 6], origin=[0, 0])
    # Nodes
    pgm.add_node(daft.Node("x1", r"$\mathbf{x}_1$", 2, 2.5))
    pgm.add_node(daft.Node("x2", r"$\mathbf{x}_2$", 3, 2.5))
    pgm.add_node(daft.Node("ellipsis", r" . . . ", 3.7, 2.5, offset=(0, 0), plot_params={"ec" : "none"}))
    pgm.add_node(daft.Node("ellipsis_end", r"", 3.7, 2.5, offset=(0, 0), plot_params={"ec" : "none"}))
    pgm.add_node(daft.Node("xN", r"$\mathbf{x}_N$", 4.5, 2.5))
    # Add in the edges.
    pgm.add_edge("x1", "x2", head_length=0.08)
    pgm.add_edge("x2", "ellipsis", head_length=0.08)
    pgm.add_edge("ellipsis_end", "xN", head_length=0.08)
    return pgm;
In [31]:
%%capture
pgm_markov_chain("images/markov-chain.png")
Out[31]:
In [32]:
@pgm_render
def pgm_hmm():
    pgm = daft.PGM([7, 7], origin=[0, 0])
    # Nodes
    pgm.add_node(daft.Node("Y1", r"$Y_1$", 1, 3.5))
    pgm.add_node(daft.Node("Y2", r"$Y_2$", 2, 3.5))
    pgm.add_node(daft.Node("Y3", r"$\dots$", 3, 3.5, plot_params={'ec':'none'}))
    pgm.add_node(daft.Node("Y4", r"$Y_N$", 4, 3.5))
    pgm.add_node(daft.Node("x1", r"$X_1$", 1, 2.5, observed=True))
    pgm.add_node(daft.Node("x2", r"$X_2$", 2, 2.5, observed=True))
    pgm.add_node(daft.Node("x3", r"$\dots$", 3, 2.5, plot_params={'ec':'none'}))
    pgm.add_node(daft.Node("x4", r"$X_N$", 4, 2.5, observed=True))
    # Add in the edges.
    pgm.add_edge("Y1", "Y2", head_length=0.08)
    pgm.add_edge("Y2", "Y3", head_length=0.08)
    pgm.add_edge("Y3", "Y4", head_length=0.08)
    pgm.add_edge("Y1", "x1", head_length=0.08)
    pgm.add_edge("Y2", "x2", head_length=0.08)
    pgm.add_edge("Y4", "x4", head_length=0.08)
    return pgm;
In [33]:
%%capture
pgm_hmm("images/hmm.png")
Out[33]:
In [34]:
@pgm_render
def pgm_plate_example():
    pgm = daft.PGM([4,3], origin=[-2,0], node_unit=0.8, grid_unit=2.0);
    # nodes
    pgm.add_node(daft.Node("lambda", r"$\lambda$", -0.25, 2));
    pgm.add_node(daft.Node("t1", r"$\theta_1$", -1, 1.3));
    pgm.add_node(daft.Node("t2", r"$\theta_2$", -0.5, 1.3));
    pgm.add_node(daft.Node("dots1", r"$\cdots$", 0, 1.3, plot_params={ 'ec' : 'none' }));
    pgm.add_node(daft.Node("tN", r"$\theta_N$", 0.5, 1.3));
    pgm.add_node(daft.Node("x1", r"$X_1$", -1, 0.6));
    pgm.add_node(daft.Node("x2", r"$X_2$", -0.5, 0.6));
    pgm.add_node(daft.Node("dots2", r"$\cdots$", 0, 0.6, plot_params={ 'ec' : 'none' }));
    pgm.add_node(daft.Node("xN", r"$X_N$", 0.5, 0.6));
    pgm.add_node(daft.Node("LAMBDA", r"$\lambda$", 1.5, 2));
    pgm.add_node(daft.Node("THETA", r"$\theta_k$", 1.5, 1.3));
    pgm.add_node(daft.Node("XX", r"$X_k$", 1.5, 0.6));
    # edges
    pgm.add_edge("lambda", "t1", head_length=0.08);
    pgm.add_edge("lambda", "t2", head_length=0.08);
    pgm.add_edge("lambda", "tN", head_length=0.08);
    pgm.add_edge("t1", "x1", head_length=0.08);
    pgm.add_edge("t2", "x2", head_length=0.08);
    pgm.add_edge("tN", "xN", head_length=0.08);
    pgm.add_edge("LAMBDA", "THETA", head_length=0.08);
    pgm.add_edge("THETA", "XX", head_length=0.08);
    # plate around the collapsed representation
    pgm.add_plate(daft.Plate([1.1, 0.4, 0.8, 1.2], label=r"$\qquad\quad\; K$",
                             shift=-0.1))
    return pgm;
In [35]:
%%capture
pgm_plate_example("images/plate-example.png")
Out[35]: