In contrast with the Normal distribution, which is targetted to full populations, we can use the t distribution for smaller samples, where we do not know the standard deviation.
In the bayesian settings, it just so happen that the posterior probability of a function with normal likelihood is often a t -distribution.
To remember a bit about bayesian inference:
$p(x)=\int p(x|y)p(y)$
$p(x|y) = \mathcal{N}(x|\mu, \tau^{-1})$
$p(y) = Gam(\tau|a,b)$
The resulting marginal distribution is a Stuent-t distribution, so we can think of it as a sum of normal distributions for an infinite number of variances.
Some plots on the Student-t distribution
In [1]:
import numpy as np
from scipy.stats import t as student_t
from matplotlib import pyplot as plt
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# Define the distribution parameters to be plotted
mu = 0
k_values = [1E10, 2, 1, 0.5]
linestyles = ['-', '--', ':', '-.']
x = np.linspace(-10, 10, 1000)
#------------------------------------------------------------
# plot the distributions
fig, ax = plt.subplots(figsize=(5, 3.75))
for k, ls in zip(k_values, linestyles):
dist = student_t(k, 0)
if k >= 1E10:
label = r'$\mathrm{t}(k=\infty)$'
else:
label = r'$\mathrm{t}(k=%.1f)$' % k
plt.plot(x, dist.pdf(x), ls=ls, c='black', label=label)
plt.xlim(-5, 10)
plt.ylim(0.0, 0.45)
plt.xlabel('$x$')
plt.ylabel(r'$p(x|k)$')
plt.title("Student's $t$ Distribution")
plt.legend()
plt.show()
print 'Amigo'
In [2]:
# Author: Jake VanderPlas
# License: BSD
# The figure produced by this code is published in the textbook
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
# For more information, see http://astroML.github.com
# To report a bug or issue, use the following forum:
# https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from scipy.stats import f as fisher_f
from matplotlib import pyplot as plt
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# Define the distribution parameters to be plotted
mu = 0
d1_values = [1, 5, 2, 10]
d2_values = [1, 2, 5, 50]
linestyles = ['-', '--', ':', '-.']
x = np.linspace(0, 5, 1001)[1:]
fig, ax = plt.subplots(figsize=(5, 3.75))
for (d1, d2, ls) in zip(d1_values, d2_values, linestyles):
dist = fisher_f(d1, d2, mu)
plt.plot(x, dist.pdf(x), ls=ls, c='black',
label=r'$d_1=%i,\ d_2=%i$' % (d1, d2))
plt.xlim(0, 4)
plt.ylim(0.0, 1.0)
plt.xlabel('$x$')
plt.ylabel(r'$p(x|d_1, d_2)$')
plt.title("Fisher's Distribution")
plt.legend()
plt.show()
In [3]:
# Author: Jake VanderPlas
# License: BSD
# The figure produced by this code is published in the textbook
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
# For more information, see http://astroML.github.com
# To report a bug or issue, use the following forum:
# https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from scipy.stats import gamma
from matplotlib import pyplot as plt
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# plot the distributions
k_values = [1, 2, 3, 5]
theta_values = [2, 1, 1, 0.5]
linestyles = ['-', '--', ':', '-.']
x = np.linspace(1E-6, 10, 1000)
#------------------------------------------------------------
# plot the distributions
fig, ax = plt.subplots(figsize=(5, 3.75))
for k, t, ls in zip(k_values, theta_values, linestyles):
dist = gamma(k, 0, t)
plt.plot(x, dist.pdf(x), ls=ls, c='black',
label=r'$k=%.1f,\ \theta=%.1f$' % (k, t))
plt.xlim(0, 10)
plt.ylim(0, 0.45)
plt.xlabel('$x$')
plt.ylabel(r'$p(x|k,\theta)$')
plt.title('Gamma Distribution')
plt.legend(loc=0)
plt.show()
In [4]:
# Author: Jake VanderPlas
# License: BSD
# The figure produced by this code is published in the textbook
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
# For more information, see http://astroML.github.com
# To report a bug or issue, use the following forum:
# https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from scipy.stats import beta
from matplotlib import pyplot as plt
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# Define the distribution parameters to be plotted
alpha_values = [0.5, 1.5, 3.0, 0.5]
beta_values = [0.5, 1.5, 3.0, 1.5]
linestyles = ['-', '--', ':', '-.']
x = np.linspace(0, 1, 1002)[1:-1]
#------------------------------------------------------------
# plot the distributions
fig, ax = plt.subplots(figsize=(5, 3.75))
for a, b, ls in zip(alpha_values, beta_values, linestyles):
dist = beta(a, b)
plt.plot(x, dist.pdf(x), ls=ls, c='black',
label=r'$\alpha=%.1f,\ \beta=%.1f$' % (a, b))
plt.xlim(0, 1)
plt.ylim(0, 3)
plt.xlabel('$x$')
plt.ylabel(r'$p(x|\alpha,\beta)$')
plt.title('Beta Distribution')
plt.legend(loc=0)
plt.show()
In [5]:
# Author: Jake VanderPlas
# License: BSD
# The figure produced by this code is published in the textbook
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
# For more information, see http://astroML.github.com
# To report a bug or issue, use the following forum:
# https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from scipy.stats import dweibull
from matplotlib import pyplot as plt
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# Define the distribution parameters to be plotted
k_values = [0.5, 1, 2, 2]
lam_values = [1, 1, 1, 2]
linestyles = ['-', '--', ':', '-.', '--']
mu = 0
x = np.linspace(-10, 10, 1000)
#------------------------------------------------------------
# plot the distributions
fig, ax = plt.subplots(figsize=(5, 3.75))
for (k, lam, ls) in zip(k_values, lam_values, linestyles):
dist = dweibull(k, mu, lam)
plt.plot(x, dist.pdf(x), ls=ls, c='black',
label=r'$k=%.1f,\ \lambda=%i$' % (k, lam))
plt.xlim(0, 5)
plt.ylim(0, 0.6)
plt.xlabel('$x$')
plt.ylabel(r'$p(x|k,\lambda)$')
plt.title('Weibull Distribution')
plt.legend()
plt.show()
The central limit theorem, tells us, that subject to certain conditions, the sum of a set of andom variables has a distribution that becomes increasingly Gaussian as the number of the terms in the sum increases.
There are many things that can go in the "subject to certain conditions".
In [6]:
# Author: Jake VanderPlas
# License: BSD
# The figure produced by this code is published in the textbook
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
# For more information, see http://astroML.github.com
# To report a bug or issue, use the following forum:
# https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import norm
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# Generate the uniform samples
N = [2, 3, 10]
np.random.seed(42)
x = np.random.random((max(N), 1E6))
#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(5, 5))
fig.subplots_adjust(hspace=0.05)
for i in range(len(N)):
ax = fig.add_subplot(3, 1, i + 1)
# take the mean of the first N[i] samples
x_i = x[:N[i], :].mean(0)
# histogram the data
ax.hist(x_i, bins=np.linspace(0, 1, 101),
histtype='stepfilled', alpha=0.5, normed=True)
# plot the expected gaussian pdf
mu = 0.5
sigma = 1. / np.sqrt(12 * N[i])
dist = norm(mu, sigma)
x_pdf = np.linspace(-0.5, 1.5, 1000)
ax.plot(x_pdf, dist.pdf(x_pdf), '-k')
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.001, None)
ax.xaxis.set_major_locator(plt.MultipleLocator(0.2))
ax.yaxis.set_major_locator(plt.MaxNLocator(5))
ax.text(0.99, 0.95, r"$N = %i$" % N[i],
ha='right', va='top', transform=ax.transAxes)
if i == len(N) - 1:
ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%.4f'))
ax.set_xlabel(r'$x$')
else:
ax.xaxis.set_major_formatter(plt.NullFormatter())
ax.set_ylabel('$p(x)$')
plt.show()
In [7]:
# Author: Jake VanderPlas
# License: BSD
# The figure produced by this code is published in the textbook
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
# For more information, see http://astroML.github.com
# To report a bug or issue, use the following forum:
# https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.patches import Ellipse
from astroML.stats.random import bivariate_normal
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# Define the mean, principal axes, and rotation of the ellipse
mean = np.array([0, 0])
sigma_1 = 2
sigma_2 = 1
alpha = np.pi / 4
#------------------------------------------------------------
# Draw 10^5 points from a multivariate normal distribution
#
# we use the bivariate_normal function from astroML. A more
# general function for this is numpy.random.multivariate_normal(),
# which requires the user to specify the full covariance matrix.
# bivariate_normal() generates this covariance matrix for the
# given inputs.
np.random.seed(0)
x, cov = bivariate_normal(mean, sigma_1, sigma_2, alpha, size=100000,
return_cov=True)
sigma_x = np.sqrt(cov[0, 0])
sigma_y = np.sqrt(cov[1, 1])
sigma_xy = cov[0, 1]
#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(111)
# plot a 2D histogram/hess diagram of the points
H, bins = np.histogramdd(x, bins=2 * [np.linspace(-4.5, 4.5, 51)])
ax.imshow(H, origin='lower', cmap=plt.cm.binary, interpolation='nearest',
extent=[bins[0][0], bins[0][-1], bins[1][0], bins[1][-1]])
# draw 1, 2, 3-sigma ellipses over the distribution
for N in (1, 2, 3):
ax.add_patch(Ellipse(mean, N * sigma_1, N * sigma_2,
angle=alpha * 180. / np.pi, lw=1,
ec='k', fc='none'))
kwargs = dict(ha='left', va='top', transform=ax.transAxes)
ax.text(0.02, 0.98, r"$\sigma_1 = %i$" % sigma_1, **kwargs)
ax.text(0.02, 0.93, r"$\sigma_2 = %i$" % sigma_2, **kwargs)
ax.text(0.02, 0.88, r"$\alpha = \pi / %i$" % (np.pi / alpha), **kwargs)
ax.text(0.15, 0.98, r"$\sigma_x = %.2f$" % sigma_x, **kwargs)
ax.text(0.15, 0.93, r"$\sigma_y = %.2f$" % sigma_y, **kwargs)
ax.text(0.15, 0.88, r"$\sigma_{xy} = %.2f$" % sigma_xy, **kwargs)
ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
plt.show()
In [8]:
# Author: Jake VanderPlas
# License: BSD
# The figure produced by this code is published in the textbook
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
# For more information, see http://astroML.github.com
# To report a bug or issue, use the following forum:
# https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats, interpolate
from astroML.plotting import hist
from astroML.density_estimation import EmpiricalDistribution
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# Create a distribution and clone it
Ndata = 1000
Nclone = 100000
np.random.seed(0)
# generate an 'observed' bimodal distribution with 10000 values
dists = (stats.norm(-1.3, 0.5), stats.norm(1.3, 0.5))
fracs = (0.6, 0.4)
x = np.hstack((d.rvs(f * Ndata) for d, f in zip(dists, fracs)))
# We can clone the distribution easily with this function
x_cloned = EmpiricalDistribution(x).rvs(Nclone)
# compute the KS test to check if they're the same
D, p = stats.ks_2samp(x, x_cloned)
print "KS test: D = %.2g; p = %.2g" % (D, p)
#------------------------------------------------------------
# For the sake of this example, we need to calculate some
# of the partial steps used by EmpiricalDistribution
# create a cumulative distribution
x.sort()
Px_cuml = np.linspace(0, 1, Ndata)
# set up an interpolation of the inverse cumulative distribution
tck = interpolate.splrep(Px_cuml, x)
# sample evenly along the cumulative distribution, and interpolate
Px_cuml_sample = np.linspace(0, 1, 10 * Ndata)
x_sample = interpolate.splev(Px_cuml_sample, tck)
#------------------------------------------------------------
# Plot the cloned distribution and the procedure for obtaining it
fig = plt.figure(figsize=(5, 5))
fig.subplots_adjust(hspace=0.3, left=0.1, right=0.95,
bottom=0.08, top=0.92)
indices = np.linspace(0, Ndata - 1, 20).astype(int)
# plot a histogram of the input
ax = fig.add_subplot(221)
hist(x, bins='knuth', ax=ax,
histtype='stepfilled', ec='k', fc='#AAAAAA')
ax.set_ylim(0, 300)
ax.set_title('Input data distribution')
ax.set_xlabel('$x$')
ax.set_ylabel('$N(x)$')
# plot the cumulative distribution
ax = fig.add_subplot(222)
ax.scatter(x[indices], Px_cuml[indices], lw=0, c='k', s=9)
ax.plot(x, Px_cuml, '-k')
ax.set_xlim(-3, 3)
ax.set_ylim(-0.05, 1.05)
ax.set_title('Cumulative Distribution')
ax.set_xlabel('$x$')
ax.set_ylabel('$p(<x)$')
# plot the inverse cumulative distribution and spline fit
ax = fig.add_subplot(223)
ax.scatter(Px_cuml[indices], x[indices], lw=0, c='k', s=9)
ax.plot(Px_cuml_sample, x_sample, '-k')
ax.arrow(0.7, -3, 0, 3.5, width=0.015, fc='gray', ec='gray',
head_width=0.05, head_length=0.4)
ax.arrow(0.7, 0.9, -0.69, 0, width=0.1, fc='gray', ec='gray',
head_width=0.3, head_length=0.06)
ax.set_xlim(-0.05, 1.05)
ax.set_ylim(-3, 3)
ax.set_title('Inverse Cuml. Distribution')
ax.set_xlabel('$p(<x)$')
ax.set_ylabel('$x$')
# plot the resulting cloned distribution
ax = fig.add_subplot(224)
hist(x, bins='knuth', ax=ax,
histtype='stepfilled', normed=True,
ec='#AAAAAA', fc='#DDDDDD',
label='input data')
hist(x_cloned, bins='knuth', ax=ax,
histtype='step', normed=True,
color='k', label='cloned data')
ax.set_title('Cloned Distribution')
ax.set_xlabel('$x$')
ax.set_ylabel('$p(x)dx$')
ax.text(0.75, 0.95, "KS test:\nD = %.2f\np = %.2f" % (D, p),
ha='left', va='top', transform=ax.transAxes)
plt.show()