In [9]:
import numpy as np
import os
import itertools

from matplotlib import pyplot as plt
from matplotlib.colors import LogNorm
import matplotlib as mpl

%matplotlib inline 

from scipy import stats
from scipy import linalg
from scipy.spatial import cKDTree
from scipy.stats import gaussian_kde

from astroML.datasets import fetch_great_wall
from astroML.plotting import hist
from astroML.density_estimation import KNeighborsDensity

from sklearn.neighbors import KernelDensity
from sklearn.mixture import GMM
from sklearn.mixture import DPGMM
from sklearn.mixture import VBGMM
from sklearn.grid_search import GridSearchCV

Given a set of points in D-dimensions (where dimensions could be spatial or other properties), density estimation involves inferring the PDF of the data set. Finding structure in this data is called ``clustering".

Non-parameteric estimates rely on density kernels. Specifically, the density estimate of a set of points $\vec{x}$ is \begin{equation} \hat{f}_N(\vec{x})=\frac{1}{N h^D}\sum_{i=1}^{N} K(\frac{d(\vec{x},\vec{x}_i)}{h})\,, \end{equation} where $d(\vec{x},\vec{x}_i)$ is some suitable estimate of the ``distance" between points $\vec{x}$ and $\vec{x}_i$ and $h$ is called the bandwidth (compare to bin width).

The kernel has the following totally reasonable properties: $\int_0^\infty K(u) du =1$ (normalized properly), $\int_0^\infty u K(u) du =1$ (mean is zero) and $\int_0^\infty u^2 K(u) du >0$ (variance is greater than 0) and $K(u) > 0 \; \forall u$ (kernel is positive definite).

While smoothing properties of the kernel function may lead to a better estimate of the density, we still need to choose $h$ just like we had to choose a bin width.


In [10]:
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
#from astroML.plotting import setup_text_plots
#setup_text_plots(fontsize=8, usetex=True)


#------------------------------------------------------------
# Draw the random data
np.random.seed(1)
x = np.concatenate([np.random.normal(-0.5, 0.3, size=14),
                    np.random.normal(1, 0.3, size=7)])


#------------------------------------------------------------
# First figure: silly histogram binning
fig1 = plt.figure(figsize=(10, 6))
fig1.subplots_adjust(left=0.12, right=0.95, wspace=0.05,
                     bottom=0.15, top=0.9, hspace=0.05)

FC = '#6666FF'
XLIM = (-2, 2.9)
YLIM = (-0.09, 1.1)

ax = fig1.add_subplot(121)
bins = np.linspace(-1.8, 2.7, 13)
ax.hist(x, bins=bins, normed=True,
        histtype='stepfilled', fc='k', alpha=0.3)
ax.plot(XLIM, [0, 0], '-k', lw=1)
ax.plot(x, 0 * x - 0.05, '+k')
ax.set_xlim(XLIM)
ax.set_ylim(YLIM)
ax.set_xlabel('$x$')
ax.set_ylabel('$p(x)$')

ax = fig1.add_subplot(122)
ax.yaxis.set_major_formatter(plt.NullFormatter())
ax.hist(x, bins=bins + 0.25, normed=True,
        histtype='stepfilled', fc='k', alpha=0.3)
ax.plot(XLIM, [0, 0], '-k', lw=1)
ax.plot(x, 0 * x - 0.05, '+k')
ax.set_xlim(XLIM)
ax.set_ylim(YLIM)
ax.set_xlabel('$x$')

#------------------------------------------------------------
# First figure: transition to KDE
fig2 = plt.figure(figsize=(10, 10))
fig2.subplots_adjust(left=0.12, right=0.95, wspace=0.05,
                     bottom=0.1, top=0.95, hspace=0.05)

ax = fig2.add_subplot(221)
ax.xaxis.set_major_formatter(plt.NullFormatter())
binwidth = bins[1] - bins[0]
x_plot = np.linspace(-4, 4, 1000)
y_plot = (abs(x_plot - x[:, None]) <= 0.5 * binwidth).astype(float)
y_plot /= (binwidth * len(x))
ax.fill(x_plot, y_plot.sum(0), ec='k', lw=1, fc='k', alpha=0.3)
ax.plot(x_plot, y_plot.T, '-k', lw=1)
ax.plot(x, 0 * x - 0.05, '+k')
ax.set_xlim(XLIM)
ax.set_ylim(YLIM)
ax.set_ylabel('$p(x)$')

ax = fig2.add_subplot(222)
ax.xaxis.set_major_formatter(plt.NullFormatter())
ax.yaxis.set_major_formatter(plt.NullFormatter())
binwidth = bins[1] - bins[0]
x_plot = np.linspace(-4, 4, 1000)
y_plot = binwidth * stats.norm.pdf(x_plot, x[:, None], 0.1)
y_plot /= (binwidth * len(x))
ax.fill(x_plot, y_plot.sum(0), ec='k', lw=1, fc='k', alpha=0.3)
ax.plot(x_plot, y_plot.T, '-k', lw=1)
ax.plot(x, 0 * x - 0.05, '+k')
ax.set_xlim(XLIM)
ax.set_ylim(YLIM)

ax = fig2.add_subplot(223)
binwidth = bins[1] - bins[0]
x_plot = np.linspace(-4, 4, 1000)
y_plot = binwidth * stats.norm.pdf(x_plot, x[:, None], 0.7)
y_plot /= (binwidth * len(x))
ax.fill(x_plot, y_plot.sum(0), ec='k', lw=1, fc='k', alpha=0.3)
ax.plot(x_plot, 4 * y_plot.T, '-k', lw=1)
ax.plot(x, 0 * x - 0.05, '+k')
ax.set_xlim(XLIM)
ax.set_ylim(YLIM)
ax.set_ylabel('$p(x)$')
ax.set_xlabel('$x$')

ax = fig2.add_subplot(224)
ax.yaxis.set_major_formatter(plt.NullFormatter())
binwidth = bins[1] - bins[0]
x_plot = np.linspace(-4, 4, 1000)
y_plot = binwidth * stats.norm.pdf(x_plot, x[:, None], 0.2)
y_plot /= (binwidth * len(x))
ax.fill(x_plot, y_plot.sum(0), ec='k', lw=1, fc='k', alpha=0.3)
ax.plot(x_plot, y_plot.T, '-k', lw=1)
ax.plot(x, 0 * x - 0.05, '+k')
ax.set_xlim(XLIM)
ax.set_ylim(YLIM)
ax.set_xlabel('$x$')

plt.show()



In [11]:
#------------------------------------------------------------
# Compute Kernels.
x = np.linspace(-5, 5, 10000)
dx = x[1] - x[0]

gauss = (1. / np.sqrt(2 * np.pi)) * np.exp(-0.5 * x ** 2)

exp = 0.5 * np.exp(-abs(x))

tophat = 0.5 * np.ones_like(x)
tophat[abs(x) > 1] = 0

#------------------------------------------------------------
# Plot the kernels
fig = plt.figure(figsize=(10, 7.5))
ax = fig.add_subplot(111)

ax.plot(x, gauss, '-', c='black', lw=3, label='Gaussian')
ax.plot(x, exp, '-', c='#666666', lw=2, label='Exponential')
ax.plot(x, tophat, '-', c='#999999', lw=1, label='Top-hat')

ax.legend(loc=1)

ax.set_xlabel('$u$')
ax.set_ylabel('$K(u)$')

ax.set_xlim(-5, 5)
ax.set_ylim(0, 0.6001)
plt.show()



In [12]:
#------------------------------------------------------------
# Fetch the great wall data
X = fetch_great_wall()

#------------------------------------------------------------
# Create  the grid on which to evaluate the results
Nx = 50
Ny = 125
xmin, xmax = (-375, -175)
ymin, ymax = (-300, 200)

#------------------------------------------------------------
# Evaluate for several models
Xgrid = np.vstack(map(np.ravel, np.meshgrid(np.linspace(xmin, xmax, Nx),
                                            np.linspace(ymin, ymax, Ny)))).T

kernels = ['gaussian', 'tophat', 'exponential']
dens = []

bw = 3

kde1 = KernelDensity(bandwidth=bw, kernel='gaussian')
log_dens1 = kde1.fit(X).score_samples(Xgrid)
dens1 = X.shape[0] * np.exp(log_dens1).reshape((Ny, Nx))

kde2 = KernelDensity(bandwidth=bw, kernel='tophat')
log_dens2 = kde2.fit(X).score_samples(Xgrid)
dens2 = X.shape[0] * np.exp(log_dens2).reshape((Ny, Nx))

kde3 = KernelDensity(bandwidth=bw, kernel='exponential')
log_dens3 = kde3.fit(X).score_samples(Xgrid)
dens3 = X.shape[0] * np.exp(log_dens3).reshape((Ny, Nx))

#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(20, 10))
fig.subplots_adjust(left=0.12, right=0.95, bottom=0.2, top=0.9,
                    hspace=0.01, wspace=0.01)

# First plot: scatter the points
ax1 = plt.subplot(221, aspect='equal')
ax1.scatter(X[:, 1], X[:, 0], s=1, lw=0, c='k')
ax1.text(0.95, 0.9, "input", ha='right', va='top',
         transform=ax1.transAxes,
         bbox=dict(boxstyle='round', ec='k', fc='w'))

# Second plot: gaussian kernel
ax2 = plt.subplot(222, aspect='equal')
ax2.imshow(dens1.T, origin='lower', norm=LogNorm(),
           extent=(ymin, ymax, xmin, xmax), cmap=plt.cm.binary)
ax2.text(0.95, 0.9, "Gaussian ($h$={0})".format(bw), ha='right', va='top',
         transform=ax2.transAxes,
         bbox=dict(boxstyle='round', ec='k', fc='w'))

# Third plot: top-hat kernel
ax3 = plt.subplot(223, aspect='equal')
ax3.imshow(dens2.T, origin='lower', norm=LogNorm(),
           extent=(ymin, ymax, xmin, xmax), cmap=plt.cm.binary)
ax3.text(0.95, 0.9, "top-hat ($h$={0})".format(bw), ha='right', va='top',
         transform=ax3.transAxes,
         bbox=dict(boxstyle='round', ec='k', fc='w'))
ax3.images[0].set_clim(0.01, 0.8)

# Fourth plot: exponential kernel
ax4 = plt.subplot(224, aspect='equal')
ax4.imshow(dens3.T, origin='lower', norm=LogNorm(),
           extent=(ymin, ymax, xmin, xmax), cmap=plt.cm.binary)
ax4.text(0.95, 0.9, "exponential ($h$={0})".format(bw), ha='right', va='top',
         transform=ax4.transAxes,
         bbox=dict(boxstyle='round', ec='k', fc='w'))

for ax in [ax1, ax2, ax3, ax4]:
    ax.set_xlim(ymin, ymax - 0.01)
    ax.set_ylim(xmin, xmax)

for ax in [ax1, ax2]:
    ax.xaxis.set_major_formatter(plt.NullFormatter())

for ax in [ax3, ax4]:
    ax.set_xlabel('$y$ (Mpc)')

for ax in [ax2, ax4]:
    ax.yaxis.set_major_formatter(plt.NullFormatter())

for ax in [ax1, ax3]:
    ax.set_ylabel('$x$ (Mpc)')

plt.show()



In [13]:
# Import Ursa Minor data
#filename = os.path.abspath(os.path.join('Downloads','umi_alpha_sigcut_metalstars_258.dat'))
filename = 'umi_alpha_sigcut_metalstars_258.dat'
print "reading {0}".format(filename)
lines = np.loadtxt(filename, comments="#", delimiter=None, unpack=False, skiprows=1)
#Columns: ra, dec, [Fe/H], [Fe/H] error, vel los, vel los error, [alpha/Fe], [alpha/Fe] error, membership (all 1)
raUMI=227.2833
decUMI=67.2225
distUMI=73

# Create an array with distances and metallicities
dist=lines[:,0:2]-[raUMI,decUMI]
dist2=np.power(dist,2)
dist=np.multiply(np.power(dist2[:,1]+dist2[:,0],0.5),distUMI*3.14156/180)
X=np.transpose([dist,lines[:,2]])
# Create  the grid on which to evaluate the results
Nx = 20
Ny = 20
xmin, xmax = (0.01, 2.41)
ymin, ymax = (-4.1, -0.1)

#------------------------------------------------------------
# Evaluate for several models
Xgrid = np.vstack(map(np.ravel, np.meshgrid(np.linspace(xmin, xmax, Nx),
                                            np.linspace(ymin, ymax, Ny)))).T

Dgrid = np.linspace(xmin, xmax, Nx)[:, np.newaxis]

# use grid search cross-validation to optimize the bandwidth
params = {'bandwidth': np.linspace(0.05, 0.3, 25)}
grid = GridSearchCV(KernelDensity(kernel='exponential'), param_grid=params)
grid.fit(X)
kde3 = grid.best_estimator_ 
#kde3 = KernelDensity(0.1, kernel='exponential')
print("Exponential kernel; best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

params = {'bandwidth': np.linspace(0.05, 0.3, 25)}
grid = GridSearchCV(KernelDensity(kernel='gaussian'), param_grid=params)
grid.fit(X)
kde1 = grid.best_estimator_ 
#kde1 = KernelDensity(bandwidth=0.2, kernel='gaussian')
print("Gaussian kernel; best bandwidth: {0}".format(grid.best_estimator_.bandwidth)) 

log_dens1 = kde1.fit(X).score_samples(Xgrid)
dens1 = X.shape[0] * np.exp(log_dens1).reshape((Ny, Nx))

kde2 = KernelDensity(0.1, kernel='tophat')
log_dens2 = kde2.fit(X).score_samples(Xgrid)
dens2 = X.shape[0] * np.exp(log_dens2).reshape((Ny, Nx))

log_dens3 = kde3.fit(X).score_samples(Xgrid)
dens3 = X.shape[0] * np.exp(log_dens3).reshape((Ny, Nx))

#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(20, 10))
fig.subplots_adjust(left=0.12, right=0.95, bottom=0.2, top=0.9,
                    hspace=0.01, wspace=0.01)

# First plot: scatter the points
ax1 = plt.subplot(221, aspect='equal')
ax1.scatter(X[:, 1], X[:, 0], s=1, lw=0, c='k')
ax1.text(0.95, 0.9, "input", ha='right', va='top',
         transform=ax1.transAxes,
         bbox=dict(boxstyle='round', ec='k', fc='w'))

# Second plot: gaussian kernel
ax2 = plt.subplot(222, aspect='equal')
ax2.imshow(dens1.T, origin='lower', norm=LogNorm(),
           extent=(ymin, ymax, xmin, xmax), cmap=plt.get_cmap('gist_rainbow_r'))
ax2.text(0.95, 0.9, "Gaussian", ha='right', va='top',
         transform=ax2.transAxes,
         bbox=dict(boxstyle='round', ec='k', fc='w'))

# Third plot: top-hat kernel
ax3 = plt.subplot(223, aspect='equal')
ax3.imshow(dens2.T, origin='lower', norm=LogNorm(),
           extent=(ymin, ymax, xmin, xmax), cmap=plt.get_cmap('Dark2'))
ax3.text(0.95, 0.9, "top-hat", ha='right', va='top',
         transform=ax3.transAxes,
         bbox=dict(boxstyle='round', ec='k', fc='w'))
ax3.images[0].set_clim(0.01, 0.8)

# Fourth plot: exponential kernel
ax4 = plt.subplot(224, aspect='equal')
ax4.imshow(dens3.T, origin='lower', norm=LogNorm(),
           extent=(ymin, ymax, xmin, xmax), cmap=plt.get_cmap('Spectral'))
ax4.text(0.95, 0.9, "exponential", ha='right', va='top',
         transform=ax4.transAxes,
         bbox=dict(boxstyle='round', ec='k', fc='w'))

for ax in [ax1, ax2, ax3, ax4]:
    ax.set_xlim(ymin, ymax - 0.01)
    ax.set_ylim(xmin, xmax)

for ax in [ax1, ax2]:
    ax.xaxis.set_major_formatter(plt.NullFormatter())

for ax in [ax3, ax4]:
    ax.set_xlabel('[Fe/H]')

for ax in [ax2, ax4]:
    ax.yaxis.set_major_formatter(plt.NullFormatter())

for ax in [ax1, ax3]:
    ax.set_ylabel('Radius (deg)')

plt.show()


reading umi_alpha_sigcut_metalstars_258.dat
Exponential kernel; best bandwidth: 0.1125
Gaussian kernel; best bandwidth: 0.195833333333

Let us look at the 1-D distribution in radius and cut on metallicity to see if there is any evidence for multiple populations.


In [14]:
# Cost function for cross validation (only works for 1-dim data)
def bestBW(data,params,kernel='gaussian'):
    param_name, param_array = params.items()[0]
    dmax=10*np.amax(data)
    nint=1000
    ndata=len(data)
    Xint=np.linspace(0,dmax,nint)[:, np.newaxis]
    sum1=[[param_value,0] for i, param_value in enumerate(param_array)]
    sum2=[0 for j in range(len(param_array))]
    for j, param_value in enumerate(param_array):
        kde=KernelDensity(kernel=kernel, bandwidth=param_value).fit(data)
        sum2[j]=np.sum(np.exp(2*kde.score_samples(Xint)))*dmax/(nint-1.0)
        
    for i in range(ndata):
        dd=np.delete(data,i,0)
        for j, param_value in enumerate(param_array): 
            sum1[j][1]+=KernelDensity(kernel=kernel, bandwidth=param_value).fit(dd).score_samples(data[i])[0]
            
    for j in range(len(param_array)):
        sum1[j][1]=-2.0*sum1[j][1]/ndata+sum2[j]
        
#    print sum1
    return sum1[np.argmin(sum1,axis=0)[1]][0]


distMR=dist[np.all([lines[:,2]>-2],axis=0)][:, np.newaxis]
distMP=dist[np.all([lines[:,2]<-3],axis=0)][:, np.newaxis]

fig = plt.figure(figsize=(20, 8))
ax = fig.add_subplot(121)
# plot a standard histogram in the background, with alpha transparency
hist(dist, bins='knuth', histtype='stepfilled',alpha=0.2, normed=True, label='Knuth rule')
# plot an adaptive-width histogram on top
hist(dist, bins='blocks', ax=ax, color='black',histtype='step', normed=True, label='Bayesian blocks')
ax.legend(prop=dict(size=12))
plt.show()

params = {'bandwidth': np.linspace(0.01, 0.3, 30)}  
grid = GridSearchCV(KernelDensity(kernel='gaussian'), param_grid=params)
grid.fit(distMR)
kdeMR = grid.best_estimator_ 
#kdeMR = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(distMR)    
print("Gaussian kernel; metal-rich; best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
print "best fit bandwidth from Eq. 6.8 cost = ",bestBW(distMR,params,kernel='gaussian') 
grid = GridSearchCV(KernelDensity(kernel='gaussian'), param_grid=params)
grid.fit(distMP)
kdeMP = grid.best_estimator_ 
#kdeMP = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(distMP)
print("Gaussian kernel; metal-poor; best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

densMR = np.exp(kdeMR.score_samples(Dgrid))
densMP = np.exp(kdeMP.score_samples(Dgrid))

fig, ax = plt.subplots(figsize=(15,8))
ax.plot(Dgrid[:, 0], densMR, '-')
ax.plot(Dgrid[:, 0], densMP, '-')
plt.show()


Gaussian kernel; metal-rich; best bandwidth: 0.16
best fit bandwidth from Eq. 6.8 cost =  0.23
Gaussian kernel; metal-poor; best bandwidth: 0.2

Let us try a Gaussian Mixture Model (without errors) to see how well the Expectation Maximization method recovers the two populations hinted at above.


In [15]:
X=np.transpose([dist,lines[:,2]])
for ncomp in range(1,4):
    model = GMM(n_components=ncomp, covariance_type='full')
    model.fit(X)
    print "ncomp = {0} fit means".format(ncomp),model.means_
#    print model.covars_
    print "ncomp = {0} fit aic, bic values".format(ncomp),model.aic(X),model.bic(X)

plt.figure(figsize=(15,15))
splot= plt.subplot(2,1,1)
ncomp= 2
color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])
clf= GMM(n_components=ncomp, covariance_type='full')
clf.fit(X)
# Assign each data point to a component of the mixture
Y_ = clf.predict(X)
for i, (mean, covar, color) in enumerate(zip(
            clf.means_, clf._get_covars(), color_iter)):
        v, w = linalg.eigh(covar)
        u = w[0] / linalg.norm(w[0])
        # Scatter plot for points assigned to each component of the mixture
        plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], 5, color=color)
        # Plot an ellipse to show the Gaussian component
        angle = np.arctan(u[1] / u[0])
        angle = 180 * angle / np.pi  # convert to degrees
        ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color)
        ell.set_clip_box(splot.bbox)
        ell.set_alpha(0.5)
        splot.add_artist(ell)
        
plt.xlim(0, 2.0)
plt.ylim(-4.5, 0.0)
plt.show()


ncomp = 1 fit means [[ 0.58173965 -2.204154  ]]
ncomp = 1 fit aic, bic values 1640.19918925 1663.43763112
ncomp = 2 fit means [[ 0.87109571 -2.32988861]
 [ 0.39926441 -2.12486258]]
ncomp = 2 fit aic, bic values 1492.97027309 1544.0948452
ncomp = 3 fit means [[ 0.66632814 -2.057714  ]
 [ 0.30262351 -2.21193426]
 [ 0.99652447 -2.4333687 ]]
ncomp = 3 fit aic, bic values 1461.50715463 1540.51785698

We finish up with an estimate of the density in radius and metallicity for the Ursa Minor data using the nearest neighbors.


In [16]:
X=np.transpose([dist,lines[:,2]])

knn5 = KNeighborsDensity('bayesian', 5)
dens_k5 = knn5.fit(X).eval(Xgrid).reshape((Ny, Nx))

knn40 = KNeighborsDensity('bayesian', 40)
dens_k40 = knn40.fit(X).eval(Xgrid).reshape((Ny, Nx))

fig = plt.figure(figsize=(20, 20))
ax3 = plt.subplot(221, aspect='equal')
ax3.imshow(dens_k5.T, origin='lower', norm=LogNorm(),
           extent=(ymin, ymax, xmin, xmax), cmap=plt.get_cmap('Spectral'))
ax3.text(0.95, 0.9, "$k$-neighbors $(k=5)$", ha='right', va='top',
         transform=ax3.transAxes,
         bbox=dict(boxstyle='round', ec='k', fc='w'))

# Fourth plot: KNN, k=40
ax4 = plt.subplot(222, aspect='equal')
ax4.imshow(dens_k40.T, origin='lower', norm=LogNorm(),
           extent=(ymin, ymax, xmin, xmax), cmap=plt.get_cmap('Spectral'))
ax4.text(0.95, 0.9, "$k$-neighbors $(k=40)$", ha='right', va='top',
         transform=ax4.transAxes,
         bbox=dict(boxstyle='round', ec='k', fc='w'))

plt.show()