In [9]:
import numpy as np
import os
import itertools
from matplotlib import pyplot as plt
from matplotlib.colors import LogNorm
import matplotlib as mpl
%matplotlib inline
from scipy import stats
from scipy import linalg
from scipy.spatial import cKDTree
from scipy.stats import gaussian_kde
from astroML.datasets import fetch_great_wall
from astroML.plotting import hist
from astroML.density_estimation import KNeighborsDensity
from sklearn.neighbors import KernelDensity
from sklearn.mixture import GMM
from sklearn.mixture import DPGMM
from sklearn.mixture import VBGMM
from sklearn.grid_search import GridSearchCV
Given a set of points in $D$ dimensions (where the dimensions could be spatial coordinates or other properties), density estimation involves inferring the PDF from which the data set was drawn. Finding structure in this data is called "clustering".
Non-parametric estimates rely on density kernels. Specifically, the density estimate at a point $\vec{x}$, built from $N$ data points $\vec{x}_i$, is \begin{equation} \hat{f}_N(\vec{x})=\frac{1}{N h^D}\sum_{i=1}^{N} K\!\left(\frac{d(\vec{x},\vec{x}_i)}{h}\right)\,, \end{equation} where $d(\vec{x},\vec{x}_i)$ is some suitable estimate of the "distance" between points $\vec{x}$ and $\vec{x}_i$ and $h$ is called the bandwidth (compare to the bin width of a histogram).
The kernel has the following totally reasonable properties: $\int_{-\infty}^{\infty} K(u)\, du = 1$ (properly normalized), $\int_{-\infty}^{\infty} u\, K(u)\, du = 0$ (mean of zero), $\int_{-\infty}^{\infty} u^2 K(u)\, du > 0$ (variance greater than zero), and $K(u) \ge 0 \; \forall u$ (kernel is non-negative).
While smoothing properties of the kernel function may lead to a better estimate of the density, we still need to choose $h$ just like we had to choose a bin width.
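As a concrete illustration of the formula above, here is a minimal hand-rolled 1-D KDE with a Gaussian kernel (a sketch added for clarity, not part of the original analysis; the toy sample and bandwidth are arbitrary, and it relies on the numpy import from the first cell):
# Minimal 1-D KDE: f_hat(x) = 1/(N h) * sum_i K((x - x_i)/h), with a Gaussian K
def kde_gaussian_1d(x_eval, x_data, h):
    u = (x_eval[:, None] - x_data[None, :]) / h        # scaled distances to every data point
    K = np.exp(-0.5 * u ** 2) / np.sqrt(2 * np.pi)     # Gaussian kernel K(u)
    return K.sum(axis=1) / (len(x_data) * h)           # sum over points, normalize by N h

x_toy = np.random.normal(0, 1, size=100)               # arbitrary toy sample
x_eval = np.linspace(-5, 5, 500)
f_hat = kde_gaussian_1d(x_eval, x_toy, h=0.3)
print("integral of f_hat = {0:.3f}".format(np.trapz(f_hat, x_eval)))   # should be close to 1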
In [10]:
# Author: Jake VanderPlas
# License: BSD
# The figure produced by this code is published in the textbook
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
# For more information, see http://astroML.github.com
# To report a bug or issue, use the following forum:
# https://groups.google.com/forum/#!forum/astroml-general
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
#from astroML.plotting import setup_text_plots
#setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# Draw the random data
np.random.seed(1)
x = np.concatenate([np.random.normal(-0.5, 0.3, size=14),
np.random.normal(1, 0.3, size=7)])
#------------------------------------------------------------
# First figure: silly histogram binning
fig1 = plt.figure(figsize=(10, 6))
fig1.subplots_adjust(left=0.12, right=0.95, wspace=0.05,
bottom=0.15, top=0.9, hspace=0.05)
FC = '#6666FF'
XLIM = (-2, 2.9)
YLIM = (-0.09, 1.1)
ax = fig1.add_subplot(121)
bins = np.linspace(-1.8, 2.7, 13)
ax.hist(x, bins=bins, normed=True,
histtype='stepfilled', fc='k', alpha=0.3)
ax.plot(XLIM, [0, 0], '-k', lw=1)
ax.plot(x, 0 * x - 0.05, '+k')
ax.set_xlim(XLIM)
ax.set_ylim(YLIM)
ax.set_xlabel('$x$')
ax.set_ylabel('$p(x)$')
ax = fig1.add_subplot(122)
ax.yaxis.set_major_formatter(plt.NullFormatter())
ax.hist(x, bins=bins + 0.25, normed=True,
histtype='stepfilled', fc='k', alpha=0.3)
ax.plot(XLIM, [0, 0], '-k', lw=1)
ax.plot(x, 0 * x - 0.05, '+k')
ax.set_xlim(XLIM)
ax.set_ylim(YLIM)
ax.set_xlabel('$x$')
#------------------------------------------------------------
# Second figure: transition to KDE
fig2 = plt.figure(figsize=(10, 10))
fig2.subplots_adjust(left=0.12, right=0.95, wspace=0.05,
bottom=0.1, top=0.95, hspace=0.05)
ax = fig2.add_subplot(221)
ax.xaxis.set_major_formatter(plt.NullFormatter())
binwidth = bins[1] - bins[0]
x_plot = np.linspace(-4, 4, 1000)
y_plot = (abs(x_plot - x[:, None]) <= 0.5 * binwidth).astype(float)
y_plot /= (binwidth * len(x))
ax.fill(x_plot, y_plot.sum(0), ec='k', lw=1, fc='k', alpha=0.3)
ax.plot(x_plot, y_plot.T, '-k', lw=1)
ax.plot(x, 0 * x - 0.05, '+k')
ax.set_xlim(XLIM)
ax.set_ylim(YLIM)
ax.set_ylabel('$p(x)$')
ax = fig2.add_subplot(222)
ax.xaxis.set_major_formatter(plt.NullFormatter())
ax.yaxis.set_major_formatter(plt.NullFormatter())
binwidth = bins[1] - bins[0]
x_plot = np.linspace(-4, 4, 1000)
y_plot = binwidth * stats.norm.pdf(x_plot, x[:, None], 0.1)
y_plot /= (binwidth * len(x))
ax.fill(x_plot, y_plot.sum(0), ec='k', lw=1, fc='k', alpha=0.3)
ax.plot(x_plot, y_plot.T, '-k', lw=1)
ax.plot(x, 0 * x - 0.05, '+k')
ax.set_xlim(XLIM)
ax.set_ylim(YLIM)
ax = fig2.add_subplot(223)
binwidth = bins[1] - bins[0]
x_plot = np.linspace(-4, 4, 1000)
y_plot = binwidth * stats.norm.pdf(x_plot, x[:, None], 0.7)
y_plot /= (binwidth * len(x))
ax.fill(x_plot, y_plot.sum(0), ec='k', lw=1, fc='k', alpha=0.3)
ax.plot(x_plot, 4 * y_plot.T, '-k', lw=1)
ax.plot(x, 0 * x - 0.05, '+k')
ax.set_xlim(XLIM)
ax.set_ylim(YLIM)
ax.set_ylabel('$p(x)$')
ax.set_xlabel('$x$')
ax = fig2.add_subplot(224)
ax.yaxis.set_major_formatter(plt.NullFormatter())
binwidth = bins[1] - bins[0]
x_plot = np.linspace(-4, 4, 1000)
y_plot = binwidth * stats.norm.pdf(x_plot, x[:, None], 0.2)
y_plot /= (binwidth * len(x))
ax.fill(x_plot, y_plot.sum(0), ec='k', lw=1, fc='k', alpha=0.3)
ax.plot(x_plot, y_plot.T, '-k', lw=1)
ax.plot(x, 0 * x - 0.05, '+k')
ax.set_xlim(XLIM)
ax.set_ylim(YLIM)
ax.set_xlabel('$x$')
plt.show()
In [11]:
#------------------------------------------------------------
# Compute Kernels.
x = np.linspace(-5, 5, 10000)
dx = x[1] - x[0]
gauss = (1. / np.sqrt(2 * np.pi)) * np.exp(-0.5 * x ** 2)
exp = 0.5 * np.exp(-abs(x))
tophat = 0.5 * np.ones_like(x)
tophat[abs(x) > 1] = 0
#------------------------------------------------------------
# Plot the kernels
fig = plt.figure(figsize=(10, 7.5))
ax = fig.add_subplot(111)
ax.plot(x, gauss, '-', c='black', lw=3, label='Gaussian')
ax.plot(x, exp, '-', c='#666666', lw=2, label='Exponential')
ax.plot(x, tophat, '-', c='#999999', lw=1, label='Top-hat')
ax.legend(loc=1)
ax.set_xlabel('$u$')
ax.set_ylabel('$K(u)$')
ax.set_xlim(-5, 5)
ax.set_ylim(0, 0.6001)
plt.show()
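As a quick numerical sanity check of the normalization property discussed above (a short sketch reusing the x, gauss, exp and tophat arrays from this cell), each kernel should integrate to one over this grid:
# Each kernel should integrate to ~1 over a sufficiently wide grid
for name, K in [('Gaussian', gauss), ('Exponential', exp), ('Top-hat', tophat)]:
    print("{0}: integral over the grid = {1:.4f}".format(name, np.trapz(K, x)))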
In [12]:
#------------------------------------------------------------
# Fetch the great wall data
X = fetch_great_wall()
#------------------------------------------------------------
# Create the grid on which to evaluate the results
Nx = 50
Ny = 125
xmin, xmax = (-375, -175)
ymin, ymax = (-300, 200)
#------------------------------------------------------------
# Evaluate for several models
Xgrid = np.vstack(map(np.ravel, np.meshgrid(np.linspace(xmin, xmax, Nx),
np.linspace(ymin, ymax, Ny)))).T
kernels = ['gaussian', 'tophat', 'exponential']
dens = []
bw = 3
kde1 = KernelDensity(bandwidth=bw, kernel='gaussian')
log_dens1 = kde1.fit(X).score_samples(Xgrid)
dens1 = X.shape[0] * np.exp(log_dens1).reshape((Ny, Nx))
kde2 = KernelDensity(bandwidth=bw, kernel='tophat')
log_dens2 = kde2.fit(X).score_samples(Xgrid)
dens2 = X.shape[0] * np.exp(log_dens2).reshape((Ny, Nx))
kde3 = KernelDensity(bandwidth=bw, kernel='exponential')
log_dens3 = kde3.fit(X).score_samples(Xgrid)
dens3 = X.shape[0] * np.exp(log_dens3).reshape((Ny, Nx))
#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(20, 10))
fig.subplots_adjust(left=0.12, right=0.95, bottom=0.2, top=0.9,
hspace=0.01, wspace=0.01)
# First plot: scatter the points
ax1 = plt.subplot(221, aspect='equal')
ax1.scatter(X[:, 1], X[:, 0], s=1, lw=0, c='k')
ax1.text(0.95, 0.9, "input", ha='right', va='top',
transform=ax1.transAxes,
bbox=dict(boxstyle='round', ec='k', fc='w'))
# Second plot: gaussian kernel
ax2 = plt.subplot(222, aspect='equal')
ax2.imshow(dens1.T, origin='lower', norm=LogNorm(),
extent=(ymin, ymax, xmin, xmax), cmap=plt.cm.binary)
ax2.text(0.95, 0.9, "Gaussian ($h$={0})".format(bw), ha='right', va='top',
transform=ax2.transAxes,
bbox=dict(boxstyle='round', ec='k', fc='w'))
# Third plot: top-hat kernel
ax3 = plt.subplot(223, aspect='equal')
ax3.imshow(dens2.T, origin='lower', norm=LogNorm(),
extent=(ymin, ymax, xmin, xmax), cmap=plt.cm.binary)
ax3.text(0.95, 0.9, "top-hat ($h$={0})".format(bw), ha='right', va='top',
transform=ax3.transAxes,
bbox=dict(boxstyle='round', ec='k', fc='w'))
ax3.images[0].set_clim(0.01, 0.8)
# Fourth plot: exponential kernel
ax4 = plt.subplot(224, aspect='equal')
ax4.imshow(dens3.T, origin='lower', norm=LogNorm(),
extent=(ymin, ymax, xmin, xmax), cmap=plt.cm.binary)
ax4.text(0.95, 0.9, "exponential ($h$={0})".format(bw), ha='right', va='top',
transform=ax4.transAxes,
bbox=dict(boxstyle='round', ec='k', fc='w'))
for ax in [ax1, ax2, ax3, ax4]:
    ax.set_xlim(ymin, ymax - 0.01)
    ax.set_ylim(xmin, xmax)
for ax in [ax1, ax2]:
    ax.xaxis.set_major_formatter(plt.NullFormatter())
for ax in [ax3, ax4]:
    ax.set_xlabel('$y$ (Mpc)')
for ax in [ax2, ax4]:
    ax.yaxis.set_major_formatter(plt.NullFormatter())
for ax in [ax1, ax3]:
    ax.set_ylabel('$x$ (Mpc)')
plt.show()
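Beyond a smoothed map, the fitted KDE is a generative model: sklearn's KernelDensity can draw new points from the estimated density (the sample method is supported for the Gaussian and top-hat kernels). A sketch reusing kde1 from the cell above; the sample size and seed are arbitrary choices:
# Draw a mock catalogue from the Gaussian-kernel estimate and compare to the input by eye
X_mock = kde1.sample(n_samples=X.shape[0], random_state=0)
fig, (axa, axb) = plt.subplots(1, 2, figsize=(20, 5), sharex=True, sharey=True)
axa.scatter(X[:, 1], X[:, 0], s=1, lw=0, c='k')
axa.set_title('input')
axb.scatter(X_mock[:, 1], X_mock[:, 0], s=1, lw=0, c='k')
axb.set_title('sampled from the Gaussian KDE ($h$={0})'.format(bw))
plt.show()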
In [13]:
# Import Ursa Minor data
#filename = os.path.abspath(os.path.join('Downloads','umi_alpha_sigcut_metalstars_258.dat'))
filename = 'umi_alpha_sigcut_metalstars_258.dat'
print "reading {0}".format(filename)
lines = np.loadtxt(filename, comments="#", delimiter=None, unpack=False, skiprows=1)
#Columns: ra, dec, [Fe/H], [Fe/H] error, vel los, vel los error, [alpha/Fe], [alpha/Fe] error, membership (all 1)
raUMI=227.2833
decUMI=67.2225
distUMI=73
# Create an array with projected distances from the center and metallicities
# (small-angle approximation; no cos(dec) factor is applied to the RA offset)
dist = lines[:, 0:2] - [raUMI, decUMI]
dist2 = np.power(dist, 2)
dist = np.sqrt(dist2[:, 0] + dist2[:, 1]) * distUMI * np.pi / 180
X = np.transpose([dist, lines[:, 2]])
# Create the grid on which to evaluate the results
Nx = 20
Ny = 20
xmin, xmax = (0.01, 2.41)
ymin, ymax = (-4.1, -0.1)
#------------------------------------------------------------
# Evaluate for several models
Xgrid = np.vstack(map(np.ravel, np.meshgrid(np.linspace(xmin, xmax, Nx),
np.linspace(ymin, ymax, Ny)))).T
Dgrid = np.linspace(xmin, xmax, Nx)[:, np.newaxis]
# use grid search cross-validation to optimize the bandwidth
params = {'bandwidth': np.linspace(0.05, 0.3, 25)}
grid = GridSearchCV(KernelDensity(kernel='exponential'), param_grid=params)
grid.fit(X)
kde3 = grid.best_estimator_
#kde3 = KernelDensity(0.1, kernel='exponential')
print("Exponential kernel; best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
params = {'bandwidth': np.linspace(0.05, 0.3, 25)}
grid = GridSearchCV(KernelDensity(kernel='gaussian'), param_grid=params)
grid.fit(X)
kde1 = grid.best_estimator_
#kde1 = KernelDensity(bandwidth=0.2, kernel='gaussian')
print("Gaussian kernel; best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
log_dens1 = kde1.fit(X).score_samples(Xgrid)
dens1 = X.shape[0] * np.exp(log_dens1).reshape((Ny, Nx))
kde2 = KernelDensity(bandwidth=0.1, kernel='tophat')
log_dens2 = kde2.fit(X).score_samples(Xgrid)
dens2 = X.shape[0] * np.exp(log_dens2).reshape((Ny, Nx))
log_dens3 = kde3.fit(X).score_samples(Xgrid)
dens3 = X.shape[0] * np.exp(log_dens3).reshape((Ny, Nx))
#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(20, 10))
fig.subplots_adjust(left=0.12, right=0.95, bottom=0.2, top=0.9,
hspace=0.01, wspace=0.01)
# First plot: scatter the points
ax1 = plt.subplot(221, aspect='equal')
ax1.scatter(X[:, 1], X[:, 0], s=1, lw=0, c='k')
ax1.text(0.95, 0.9, "input", ha='right', va='top',
transform=ax1.transAxes,
bbox=dict(boxstyle='round', ec='k', fc='w'))
# Second plot: gaussian kernel
ax2 = plt.subplot(222, aspect='equal')
ax2.imshow(dens1.T, origin='lower', norm=LogNorm(),
extent=(ymin, ymax, xmin, xmax), cmap=plt.get_cmap('gist_rainbow_r'))
ax2.text(0.95, 0.9, "Gaussian", ha='right', va='top',
transform=ax2.transAxes,
bbox=dict(boxstyle='round', ec='k', fc='w'))
# Third plot: top-hat kernel
ax3 = plt.subplot(223, aspect='equal')
ax3.imshow(dens2.T, origin='lower', norm=LogNorm(),
extent=(ymin, ymax, xmin, xmax), cmap=plt.get_cmap('Dark2'))
ax3.text(0.95, 0.9, "top-hat", ha='right', va='top',
transform=ax3.transAxes,
bbox=dict(boxstyle='round', ec='k', fc='w'))
ax3.images[0].set_clim(0.01, 0.8)
# Fourth plot: exponential kernel
ax4 = plt.subplot(224, aspect='equal')
ax4.imshow(dens3.T, origin='lower', norm=LogNorm(),
extent=(ymin, ymax, xmin, xmax), cmap=plt.get_cmap('Spectral'))
ax4.text(0.95, 0.9, "exponential", ha='right', va='top',
transform=ax4.transAxes,
bbox=dict(boxstyle='round', ec='k', fc='w'))
for ax in [ax1, ax2, ax3, ax4]:
    ax.set_xlim(ymin, ymax - 0.01)
    ax.set_ylim(xmin, xmax)
for ax in [ax1, ax2]:
    ax.xaxis.set_major_formatter(plt.NullFormatter())
for ax in [ax3, ax4]:
    ax.set_xlabel('[Fe/H]')
for ax in [ax2, ax4]:
    ax.yaxis.set_major_formatter(plt.NullFormatter())
for ax in [ax1, ax3]:
    ax.set_ylabel('Radius (deg)')
plt.show()
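It is worth verifying that the optimal bandwidth is not pinned at an edge of the search range. A sketch using the grid_scores_ attribute of the (old-style) sklearn.grid_search.GridSearchCV interface imported above; grid here refers to the last search performed, i.e. the Gaussian kernel:
# Mean cross-validation log-likelihood as a function of bandwidth
bws = [gs.parameters['bandwidth'] for gs in grid.grid_scores_]
scores = [gs.mean_validation_score for gs in grid.grid_scores_]
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(bws, scores, '-o')
ax.set_xlabel('bandwidth')
ax.set_ylabel('mean CV log-likelihood')
plt.show()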
Let us look at the 1-D distribution in radius and cut on metallicity to see if there is any evidence for multiple populations.
In [14]:
# Cost function for cross validation (only works for 1-dim data)
def bestBW(data, params, kernel='gaussian'):
    param_name, param_array = params.items()[0]
    dmax = 10 * np.amax(data)
    nint = 1000
    ndata = len(data)
    Xint = np.linspace(0, dmax, nint)[:, np.newaxis]
    sum1 = [[param_value, 0] for param_value in param_array]
    sum2 = [0 for j in range(len(param_array))]
    for j, param_value in enumerate(param_array):
        # first term of the cost: integral of fhat^2 over a fine grid
        kde = KernelDensity(kernel=kernel, bandwidth=param_value).fit(data)
        sum2[j] = np.sum(np.exp(2 * kde.score_samples(Xint))) * dmax / (nint - 1.0)
    for i in range(ndata):
        # second term: leave-one-out density of each point under the fit to the rest
        dd = np.delete(data, i, 0)
        for j, param_value in enumerate(param_array):
            # score_samples returns the log density, so exponentiate it
            sum1[j][1] += np.exp(KernelDensity(kernel=kernel, bandwidth=param_value).fit(dd).score_samples(data[i:i + 1])[0])
    for j in range(len(param_array)):
        sum1[j][1] = -2.0 * sum1[j][1] / ndata + sum2[j]
    # print sum1
    return sum1[np.argmin(sum1, axis=0)[1]][0]
# Metal-rich ([Fe/H] > -2) and metal-poor ([Fe/H] < -3) subsamples of radii
distMR = dist[lines[:, 2] > -2][:, np.newaxis]
distMP = dist[lines[:, 2] < -3][:, np.newaxis]
fig = plt.figure(figsize=(20, 8))
ax = fig.add_subplot(121)
# plot a standard histogram in the background, with alpha transparency
hist(dist, bins='knuth', histtype='stepfilled',alpha=0.2, normed=True, label='Knuth rule')
# plot an adaptive-width histogram on top
hist(dist, bins='blocks', ax=ax, color='black',histtype='step', normed=True, label='Bayesian blocks')
ax.legend(prop=dict(size=12))
plt.show()
params = {'bandwidth': np.linspace(0.01, 0.3, 30)}
grid = GridSearchCV(KernelDensity(kernel='gaussian'), param_grid=params)
grid.fit(distMR)
kdeMR = grid.best_estimator_
#kdeMR = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(distMR)
print("Gaussian kernel; metal-rich; best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
print "best fit bandwidth from Eq. 6.8 cost = ",bestBW(distMR,params,kernel='gaussian')
grid = GridSearchCV(KernelDensity(kernel='gaussian'), param_grid=params)
grid.fit(distMP)
kdeMP = grid.best_estimator_
#kdeMP = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(distMP)
print("Gaussian kernel; metal-poor; best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
densMR = np.exp(kdeMR.score_samples(Dgrid))
densMP = np.exp(kdeMP.score_samples(Dgrid))
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(Dgrid[:, 0], densMR, '-', label='metal-rich ([Fe/H] > -2)')
ax.plot(Dgrid[:, 0], densMP, '-', label='metal-poor ([Fe/H] < -3)')
ax.set_xlabel('Radius (deg)')
ax.set_ylabel('$p(R)$')
ax.legend()
plt.show()
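For completeness (an optional check reusing bestBW and distMP from this cell), the Eq. 6.8 cost can also be minimized for the metal-poor subsample and compared with the GridSearchCV result printed above:
print("best fit bandwidth from Eq. 6.8 cost (metal-poor) = {0}".format(
    bestBW(distMP, params, kernel='gaussian')))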
Let us try a Gaussian mixture model (without errors) to see how well the expectation-maximization (EM) method recovers the two populations hinted at above.
In [15]:
X=np.transpose([dist,lines[:,2]])
for ncomp in range(1, 4):
    model = GMM(n_components=ncomp, covariance_type='full')
    model.fit(X)
    print "ncomp = {0} fit means".format(ncomp), model.means_
    # print model.covars_
    print "ncomp = {0} fit aic, bic values".format(ncomp), model.aic(X), model.bic(X)
plt.figure(figsize=(15,15))
splot= plt.subplot(2,1,1)
ncomp= 2
color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])
clf= GMM(n_components=ncomp, covariance_type='full')
clf.fit(X)
# Assign each data point to a component of the mixture
Y_ = clf.predict(X)
for i, (mean, covar, color) in enumerate(zip(
        clf.means_, clf._get_covars(), color_iter)):
    v, w = linalg.eigh(covar)
    u = w[0] / linalg.norm(w[0])
    # Scatter plot for points assigned to each component of the mixture
    plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], 5, color=color)
    # Plot an ellipse to show the Gaussian component
    angle = np.arctan(u[1] / u[0])
    angle = 180 * angle / np.pi  # convert to degrees
    ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color)
    ell.set_clip_box(splot.bbox)
    ell.set_alpha(0.5)
    splot.add_artist(ell)
plt.xlim(0, 2.0)
plt.ylim(-4.5, 0.0)
plt.show()
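EM also provides soft memberships: each star gets a posterior probability of belonging to each Gaussian component, which is often more informative than the hard predict labels used for the colors above. A short sketch reusing the two-component clf fit from this cell:
# Posterior membership probabilities and component summaries for the 2-component fit
probs = clf.predict_proba(X)        # shape (n_stars, ncomp)
for i in range(ncomp):
    print("component {0}: weight = {1:.2f}, mean radius = {2:.2f}, mean [Fe/H] = {3:.2f}".format(
        i, clf.weights_[i], clf.means_[i, 0], clf.means_[i, 1]))
ambiguous = np.sum(np.max(probs, axis=1) < 0.8)
print("{0} of {1} stars have maximum membership probability below 0.8".format(ambiguous, len(X)))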
We finish up with an estimate of the density in radius and metallicity for the Ursa Minor data using a nearest-neighbors estimator.
In [16]:
X=np.transpose([dist,lines[:,2]])
knn5 = KNeighborsDensity('bayesian', 5)
dens_k5 = knn5.fit(X).eval(Xgrid).reshape((Ny, Nx))
knn40 = KNeighborsDensity('bayesian', 40)
dens_k40 = knn40.fit(X).eval(Xgrid).reshape((Ny, Nx))
fig = plt.figure(figsize=(20, 20))
# First panel: KNN, k=5
ax3 = plt.subplot(221, aspect='equal')
ax3.imshow(dens_k5.T, origin='lower', norm=LogNorm(),
extent=(ymin, ymax, xmin, xmax), cmap=plt.get_cmap('Spectral'))
ax3.text(0.95, 0.9, "$k$-neighbors $(k=5)$", ha='right', va='top',
transform=ax3.transAxes,
bbox=dict(boxstyle='round', ec='k', fc='w'))
# Second panel: KNN, k=40
ax4 = plt.subplot(222, aspect='equal')
ax4.imshow(dens_k40.T, origin='lower', norm=LogNorm(),
extent=(ymin, ymax, xmin, xmax), cmap=plt.get_cmap('Spectral'))
ax4.text(0.95, 0.9, "$k$-neighbors $(k=40)$", ha='right', va='top',
transform=ax4.transAxes,
bbox=dict(boxstyle='round', ec='k', fc='w'))
plt.show()
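For reference, the simplest $k$-nearest-neighbor estimate (a sketch, not the 'bayesian' weighting used by astroML's KNeighborsDensity above) is $\hat{f}(\vec{x}) = k/(N\,V_D\,d_k^D)$, where $d_k$ is the distance to the $k$-th nearest data point; with scipy's cKDTree (imported in the first cell) this takes only a few lines in 2-D:
# Simple k-NN density in 2-D: f_hat(x) = k / (N * pi * d_k^2)
def knn_density_2d(X_data, X_eval, k):
    tree = cKDTree(X_data)
    d, _ = tree.query(X_eval, k=k)     # distances to the k nearest data points
    d_k = d[:, -1]                     # keep only the k-th distance
    return k / (len(X_data) * np.pi * d_k ** 2)

dens_simple = knn_density_2d(X, Xgrid, k=5).reshape((Ny, Nx))
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(dens_simple.T, origin='lower', norm=LogNorm(),
          extent=(ymin, ymax, xmin, xmax), cmap=plt.get_cmap('Spectral'))
ax.set_xlabel('[Fe/H]')
ax.set_ylabel('Radius (deg)')
plt.show()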