In [1]:
%matplotlib inline
We're looking for ways to find clusters in data in an "unsupervised" way, without any prior information about the number and properties of the clusters.
A simple approach is to partition the data into a small number of subsets, or "clusters", by minimizing the sum-of-squares objective function
$\sum_{k=1}^{K}\sum_{i\in C_{k}} \| x_{i}-\mu_{k} \|^2$
where $\mu_{k}$ is the mean of the points in set $C_{k}$. The procedure picks a centroid $\mu_{k}$ for each cluster, assigns every point to its nearest centroid, recomputes each centroid as the mean of its assigned points, and repeats until the assignments stop changing. Below is a minimal sketch of one such update step, followed by the full figure code using the same stellar metallicity data as figure 6.6, with $K = 4$ (so we're "supervising" in that we choose the number of clusters):
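A minimal sketch of a single assignment-and-update step (pure NumPy; lloyd_step is an illustrative name, not part of astroML or scikit-learn, and it assumes no cluster ends up empty):
import numpy as np

def lloyd_step(X, centroids):
    # assign each point to its nearest centroid
    dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
    labels = dists.argmin(axis=1)
    # move each centroid to the mean of its assigned points
    new_centroids = np.array([X[labels == k].mean(axis=0)
                              for k in range(len(centroids))])
    return labels, new_centroids
Repeating this step until the labels stop changing is exactly the iteration described above; scikit-learn's KMeans, used in the full code below, adds smarter initialization and multiple restarts.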
In [8]:
# Author: Jake VanderPlas
# License: BSD
# The figure produced by this code is published in the textbook
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
# For more information, see http://astroML.github.com
# To report a bug or issue, use the following forum:
# https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.patches import Ellipse
from scipy.stats import norm
from sklearn.cluster import KMeans
from sklearn import preprocessing
from astroML.datasets import fetch_sdss_sspp
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# Get data
data = fetch_sdss_sspp(cleaned=True)
X = np.vstack([data['FeH'], data['alphFe']]).T
# truncate dataset for speed
X = X[::5]
#------------------------------------------------------------
# Compute a 2D histogram of the input
H, FeH_bins, alphFe_bins = np.histogram2d(data['FeH'], data['alphFe'], 50)
#------------------------------------------------------------
# Compute the KMeans clustering
n_clusters = 4
scaler = preprocessing.StandardScaler()
clf = KMeans(n_clusters=n_clusters)
clf.fit(scaler.fit_transform(X))
#------------------------------------------------------------
# Visualize the results
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111)
# plot density
ax.imshow(H.T, origin='lower', interpolation='nearest', aspect='auto',
extent=[FeH_bins[0], FeH_bins[-1],
alphFe_bins[0], alphFe_bins[-1]],
cmap=plt.cm.binary)
# plot cluster centers
cluster_centers = scaler.inverse_transform(clf.cluster_centers_)
ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1],
s=40, c='w', edgecolors='k')
# plot cluster boundaries
FeH_centers = 0.5 * (FeH_bins[1:] + FeH_bins[:-1])
alphFe_centers = 0.5 * (alphFe_bins[1:] + alphFe_bins[:-1])
Xgrid = np.meshgrid(FeH_centers, alphFe_centers)
Xgrid = np.array(Xgrid).reshape((2, 50 * 50)).T
H = clf.predict(scaler.transform(Xgrid)).reshape((50, 50))
for i in range(n_clusters):
Hcp = H.copy()
flag = (Hcp == i)
Hcp[flag] = 1
Hcp[~flag] = 0
ax.contour(FeH_centers, alphFe_centers, Hcp, [-0.5, 0.5],
linewidths=1, colors='k')
ax.xaxis.set_major_locator(plt.MultipleLocator(0.3))
ax.set_xlim(-1.101, 0.101)
ax.set_ylim(alphFe_bins[0], alphFe_bins[-1])
ax.set_xlabel(r'$\rm [Fe/H]$', fontsize=20)
ax.set_ylabel(r'$\rm [\alpha/Fe]$', fontsize=20)
plt.show()
This cleanly partitions the data into four roughly equal regions, but it doesn't really capture the clustering of the points. Try again with $K = 10$:
In [7]:
setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# Get data
data = fetch_sdss_sspp(cleaned=True)
X = np.vstack([data['FeH'], data['alphFe']]).T
# truncate dataset for speed
X = X[::5]
#------------------------------------------------------------
# Compute a 2D histogram of the input
H, FeH_bins, alphFe_bins = np.histogram2d(data['FeH'], data['alphFe'], 50)
#------------------------------------------------------------
# Compute the KMeans clustering
n_clusters = 10
scaler = preprocessing.StandardScaler()
clf = KMeans(n_clusters=n_clusters)
clf.fit(scaler.fit_transform(X))
#------------------------------------------------------------
# Visualize the results
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111)
# plot density
ax.imshow(H.T, origin='lower', interpolation='nearest', aspect='auto',
extent=[FeH_bins[0], FeH_bins[-1],
alphFe_bins[0], alphFe_bins[-1]],
cmap=plt.cm.binary)
# plot cluster centers
cluster_centers = scaler.inverse_transform(clf.cluster_centers_)
ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1],
s=40, c='w', edgecolors='k')
# plot cluster boundaries
FeH_centers = 0.5 * (FeH_bins[1:] + FeH_bins[:-1])
alphFe_centers = 0.5 * (alphFe_bins[1:] + alphFe_bins[:-1])
Xgrid = np.meshgrid(FeH_centers, alphFe_centers)
Xgrid = np.array(Xgrid).reshape((2, 50 * 50)).T
H = clf.predict(scaler.transform(Xgrid)).reshape((50, 50))
for i in range(n_clusters):
Hcp = H.copy()
flag = (Hcp == i)
Hcp[flag] = 1
Hcp[~flag] = 0
ax.contour(FeH_centers, alphFe_centers, Hcp, [-0.5, 0.5],
linewidths=1, colors='k')
ax.xaxis.set_major_locator(plt.MultipleLocator(0.3))
ax.set_xlim(-1.101, 0.101)
ax.set_ylim(alphFe_bins[0], alphFe_bins[-1])
ax.set_xlabel(r'$\rm [Fe/H]$', fontsize=20)
ax.set_ylabel(r'$\rm [\alpha/Fe]$', fontsize=20)
plt.show()
And with $K=2$, which we know matches the two populations actually present in this data:
In [6]:
setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# Get data
data = fetch_sdss_sspp(cleaned=True)
X = np.vstack([data['FeH'], data['alphFe']]).T
# truncate dataset for speed
X = X[::5]
#------------------------------------------------------------
# Compute a 2D histogram of the input
H, FeH_bins, alphFe_bins = np.histogram2d(data['FeH'], data['alphFe'], 50)
#------------------------------------------------------------
# Compute the KMeans clustering
n_clusters = 2
scaler = preprocessing.StandardScaler()
clf = KMeans(n_clusters=n_clusters)
clf.fit(scaler.fit_transform(X))
#------------------------------------------------------------
# Visualize the results
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111)
# plot density
ax.imshow(H.T, origin='lower', interpolation='nearest', aspect='auto',
extent=[FeH_bins[0], FeH_bins[-1],
alphFe_bins[0], alphFe_bins[-1]],
cmap=plt.cm.binary)
# plot cluster centers
cluster_centers = scaler.inverse_transform(clf.cluster_centers_)
ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1],
s=40, c='w', edgecolors='k')
# plot cluster boundaries
FeH_centers = 0.5 * (FeH_bins[1:] + FeH_bins[:-1])
alphFe_centers = 0.5 * (alphFe_bins[1:] + alphFe_bins[:-1])
Xgrid = np.meshgrid(FeH_centers, alphFe_centers)
Xgrid = np.array(Xgrid).reshape((2, 50 * 50)).T
H = clf.predict(scaler.transform(Xgrid)).reshape((50, 50))
for i in range(n_clusters):
Hcp = H.copy()
flag = (Hcp == i)
Hcp[flag] = 1
Hcp[~flag] = 0
ax.contour(FeH_centers, alphFe_centers, Hcp, [-0.5, 0.5],
linewidths=1, colors='k')
ax.xaxis.set_major_locator(plt.MultipleLocator(0.3))
ax.set_xlim(-1.101, 0.101)
ax.set_ylim(alphFe_bins[0], alphFe_bins[-1])
ax.set_xlabel(r'$\rm [Fe/H]$', fontsize=20)
ax.set_ylabel(r'$\rm [\alpha/Fe]$', fontsize=20)
plt.show()
And so it seems this is indeed a good way to partition data, but not necessarily to capture its clustering.
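One way to see why minimizing this objective cannot by itself choose $K$: the sum of squares (exposed by scikit-learn's KMeans as the inertia_ attribute) in practice keeps decreasing as $K$ grows, so there is no minimum over $K$ to find. A quick check, reusing X from the cells above:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

X_std = StandardScaler().fit_transform(X)
for k in (2, 4, 10):
    km = KMeans(n_clusters=k, n_init=10).fit(X_std)
    print("K = %2d  sum-of-squares objective = %.1f" % (k, km.inertia_))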
The mean-shift method builds a kernel density estimate of the data and looks for its peaks, associating each data point with the closest peak. It does this by moving the data points along the gradient of the log of the estimated density until they converge on the peaks. Taking the log helps the method converge in a few iterations, with points in regions of low density moving a considerable distance toward regions of high density at each step. Iterating $m$ times, the position $x_{i}$ is updated as
$x_{i}^{m+1} = x_{i}^{m} + a\,\nabla \log \hat{f}^{m}(x_{i}^{m})$
$= x_{i}^{m} + \frac{a}{\hat{f}^{m}(x_{i}^{m})}\,\nabla \hat{f}^{m}(x_{i}^{m})$
where $\hat{f}^{m}(x_{i}^{m})$ and its gradient are obtained from the kernel density estimate.
Convergence is governed by the bandwidth, $h$, of the kernel and the step-size parameter $a$. If you choose $a$ to match the Epanechnikov kernel,
$a=\frac{h^2}{D+2}$,
where $D$ is the dimension of the data,
then the parameterization of the "nonparametric" estimate effectively drops out (up to the choice of $h$): as the authors go on to say, the update rule above becomes
$x_{i}^{m+1} = $ the mean position of the points within distance $h$ of $x_{i}^{m}$.
In practice this seems to work much better than $K$-means.
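A minimal sketch of this flat-kernel update for a single point (pure NumPy; mean_shift_point is an illustrative name, not the scikit-learn implementation used below):
import numpy as np

def mean_shift_point(x, data, h, n_iter=100, tol=1e-5):
    # repeatedly replace x with the mean of all points within distance h
    for _ in range(n_iter):
        neighbors = data[np.linalg.norm(data - x, axis=1) < h]
        if len(neighbors) == 0:
            break
        x_new = neighbors.mean(axis=0)
        if np.linalg.norm(x_new - x) < tol:
            break
        x = x_new
    return x
Points whose trajectories converge to the same peak are assigned to the same cluster.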
In [9]:
# Author: Jake VanderPlas
# License: BSD
# The figure produced by this code is published in the textbook
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
# For more information, see http://astroML.github.com
# To report a bug or issue, use the following forum:
# https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.patches import Ellipse
from scipy.stats import norm
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn import preprocessing
from astroML.datasets import fetch_sdss_sspp
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# Get the data
np.random.seed(0)
data = fetch_sdss_sspp(cleaned=True)
# cut out some additional strange outliers
data = data[~((data['alphFe'] > 0.4) & (data['FeH'] > -0.3))]
X = np.vstack([data['FeH'], data['alphFe']]).T
#----------------------------------------------------------------------
# Compute clustering with MeanShift
#
# We'll work with the scaled data, because MeanShift finds circular clusters
X_scaled = preprocessing.scale(X)
# The following bandwidth can be automatically detected using
# the routine estimate_bandwidth(). Because bandwidth estimation
# is very expensive in memory and computation, we'll skip it here.
#bandwidth = estimate_bandwidth(X)
bandwidth = 0.4
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
ms.fit(X_scaled)
labels_unique = np.unique(ms.labels_)
n_clusters = len(labels_unique[labels_unique >= 0])
print(labels_unique)
print(bandwidth)
print("number of estimated clusters : %d" % n_clusters)
#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111)
# plot density
H, FeH_bins, alphFe_bins = np.histogram2d(data['FeH'], data['alphFe'], 51)
ax.imshow(H.T, origin='lower', interpolation='nearest', aspect='auto',
extent=[FeH_bins[0], FeH_bins[-1],
alphFe_bins[0], alphFe_bins[-1]],
cmap=plt.cm.binary)
# plot clusters
colors = ['b', 'g', 'r', 'k']
for i in range(n_clusters):
Xi = X[ms.labels_ == i]
H, b1, b2 = np.histogram2d(Xi[:, 0], Xi[:, 1], (FeH_bins, alphFe_bins))
bins = [0.1]
ax.contour(0.5 * (FeH_bins[1:] + FeH_bins[:-1]),
0.5 * (alphFe_bins[1:] + alphFe_bins[:-1]),
H.T, bins, colors='w')
ax.xaxis.set_major_locator(plt.MultipleLocator(0.3))
ax.set_xlim(-1.101, 0.101)
ax.set_ylim(alphFe_bins[0], alphFe_bins[-1])
ax.set_xlabel(r'$\rm [Fe/H]$', fontsize=20)
ax.set_ylabel(r'$\rm [\alpha/Fe]$', fontsize=20)
plt.show()
Try again but with $h/2$:
In [10]:
# Author: Jake VanderPlas
# License: BSD
# The figure produced by this code is published in the textbook
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
# For more information, see http://astroML.github.com
# To report a bug or issue, use the following forum:
# https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.patches import Ellipse
from scipy.stats import norm
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn import preprocessing
from astroML.datasets import fetch_sdss_sspp
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# Get the data
np.random.seed(0)
data = fetch_sdss_sspp(cleaned=True)
# cut out some additional strange outliers
data = data[~((data['alphFe'] > 0.4) & (data['FeH'] > -0.3))]
X = np.vstack([data['FeH'], data['alphFe']]).T
#----------------------------------------------------------------------
# Compute clustering with MeanShift
#
# We'll work with the scaled data, because MeanShift finds circular clusters
X_scaled = preprocessing.scale(X)
# The following bandwidth can be automatically detected using
# the routine estimate_bandwidth(). Because bandwidth estimation
# is very expensive in memory and computation, we'll skip it here.
#bandwidth = estimate_bandwidth(X)
bandwidth = 0.2
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
ms.fit(X_scaled)
labels_unique = np.unique(ms.labels_)
n_clusters = len(labels_unique[labels_unique >= 0])
print(labels_unique)
print(bandwidth)
print("number of estimated clusters : %d" % n_clusters)
#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111)
# plot density
H, FeH_bins, alphFe_bins = np.histogram2d(data['FeH'], data['alphFe'], 51)
ax.imshow(H.T, origin='lower', interpolation='nearest', aspect='auto',
extent=[FeH_bins[0], FeH_bins[-1],
alphFe_bins[0], alphFe_bins[-1]],
cmap=plt.cm.binary)
# plot clusters
colors = ['b', 'g', 'r', 'k']
for i in range(n_clusters):
Xi = X[ms.labels_ == i]
H, b1, b2 = np.histogram2d(Xi[:, 0], Xi[:, 1], (FeH_bins, alphFe_bins))
bins = [0.1]
ax.contour(0.5 * (FeH_bins[1:] + FeH_bins[:-1]),
0.5 * (alphFe_bins[1:] + alphFe_bins[:-1]),
H.T, bins, colors='w')
ax.xaxis.set_major_locator(plt.MultipleLocator(0.3))
ax.set_xlim(-1.101, 0.101)
ax.set_ylim(alphFe_bins[0], alphFe_bins[-1])
ax.set_xlabel(r'$\rm [Fe/H]$', fontsize=20)
ax.set_ylabel(r'$\rm [\alpha/Fe]$', fontsize=20)
plt.show()
and $2h$:
In [11]:
# Author: Jake VanderPlas
# License: BSD
# The figure produced by this code is published in the textbook
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
# For more information, see http://astroML.github.com
# To report a bug or issue, use the following forum:
# https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.patches import Ellipse
from scipy.stats import norm
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn import preprocessing
from astroML.datasets import fetch_sdss_sspp
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# Get the data
np.random.seed(0)
data = fetch_sdss_sspp(cleaned=True)
# cut out some additional strange outliers
data = data[~((data['alphFe'] > 0.4) & (data['FeH'] > -0.3))]
X = np.vstack([data['FeH'], data['alphFe']]).T
#----------------------------------------------------------------------
# Compute clustering with MeanShift
#
# We'll work with the scaled data, because MeanShift finds circular clusters
X_scaled = preprocessing.scale(X)
# The following bandwidth can be automatically detected using
# the routine estimate_bandwidth(). Because bandwidth estimation
# is very expensive in memory and computation, we'll skip it here.
#bandwidth = estimate_bandwidth(X)
bandwidth = 0.8
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
ms.fit(X_scaled)
labels_unique = np.unique(ms.labels_)
n_clusters = len(labels_unique[labels_unique >= 0])
print(labels_unique)
print(bandwidth)
print("number of estimated clusters : %d" % n_clusters)
#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111)
# plot density
H, FeH_bins, alphFe_bins = np.histogram2d(data['FeH'], data['alphFe'], 51)
ax.imshow(H.T, origin='lower', interpolation='nearest', aspect='auto',
extent=[FeH_bins[0], FeH_bins[-1],
alphFe_bins[0], alphFe_bins[-1]],
cmap=plt.cm.binary)
# plot clusters
colors = ['b', 'g', 'r', 'k']
for i in range(n_clusters):
Xi = X[ms.labels_ == i]
H, b1, b2 = np.histogram2d(Xi[:, 0], Xi[:, 1], (FeH_bins, alphFe_bins))
bins = [0.1]
ax.contour(0.5 * (FeH_bins[1:] + FeH_bins[:-1]),
0.5 * (alphFe_bins[1:] + alphFe_bins[:-1]),
H.T, bins, colors='w')
ax.xaxis.set_major_locator(plt.MultipleLocator(0.3))
ax.set_xlim(-1.101, 0.101)
ax.set_ylim(alphFe_bins[0], alphFe_bins[-1])
ax.set_xlabel(r'$\rm [Fe/H]$', fontsize=20)
ax.set_ylabel(r'$\rm [\alpha/Fe]$', fontsize=20)
plt.show()
So the result is indeed highly sensitive to the choice of $h$.
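Rather than guessing $h$, one can estimate it from the data; scikit-learn's estimate_bandwidth (imported but unused above) accepts an n_samples argument so the estimate can be made on a subsample, avoiding the full pairwise-distance cost the comments warn about. A sketch, reusing X_scaled from the cell above (the quantile value is an arbitrary choice here):
from sklearn.cluster import estimate_bandwidth

# estimate the bandwidth from 500 randomly chosen points
bandwidth = estimate_bandwidth(X_scaled, quantile=0.3, n_samples=500,
                               random_state=0)
print("estimated bandwidth: %.2f" % bandwidth)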
Hierarchical clustering takes a different approach, one not formally tied to a function of the underlying density, and relaxes the need to specify the number of clusters $K$ by finding all clusters at all scales. Start by partitioning the data into $N$ clusters, one for each point in the data set; then, at each step, merge the nearest pair of clusters (the bottom-up "agglomerative" approach, as opposed to the top-down "divisive" approach). Heuristically, this sounds very much like the angular correlation function $\omega(\theta)$.
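The astroML HierarchicalClustering used below is a single-linkage variant built on an (approximate) Euclidean minimum spanning tree: merging nearest pairs all the way up produces the MST, and cutting its longest edges leaves the clusters. A minimal sketch of that idea (mst_clusters is an illustrative name, not astroML's exact implementation; the 90th-percentile cutoff mirrors edge_cutoff = 0.9 below):
import numpy as np
from scipy.sparse.csgraph import minimum_spanning_tree, connected_components
from sklearn.neighbors import kneighbors_graph

def mst_clusters(X, n_neighbors=10, edge_cutoff=0.9):
    # sparse graph of distances from each point to its nearest neighbors
    G = kneighbors_graph(X, n_neighbors=n_neighbors, mode='distance')
    # approximate Euclidean minimum spanning tree over that graph
    mst = minimum_spanning_tree(G)
    # cut every edge longer than the chosen percentile of edge lengths
    mst.data[mst.data > np.percentile(mst.data, 100 * edge_cutoff)] = 0
    mst.eliminate_zeros()
    # the surviving connected components are the clusters
    return connected_components(mst, directed=False)[1]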
In [15]:
# Author: Jake VanderPlas
# License: BSD
# The figure produced by this code is published in the textbook
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
# For more information, see http://astroML.github.com
# To report a bug or issue, use the following forum:
# https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt
from scipy import sparse
from sklearn.mixture import GaussianMixture  # modern replacement for the old GMM class
from astroML.clustering import HierarchicalClustering, get_graph_segments
from astroML.datasets import fetch_great_wall
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# get data
X = fetch_great_wall()
xmin, xmax = (-375, -175)
ymin, ymax = (-300, 200)
#------------------------------------------------------------
# Compute the MST clustering model
n_neighbors = 10
edge_cutoff = 0.9
cluster_cutoff = 10
model = HierarchicalClustering(n_neighbors=n_neighbors,
edge_cutoff=edge_cutoff,
min_cluster_size=cluster_cutoff)
model.fit(X)
print " scale: %2g Mpc" % np.percentile(model.full_tree_.data,
100 * edge_cutoff)
n_components = model.n_components_
labels = model.labels_
#------------------------------------------------------------
# Get the x, y coordinates of the beginning and end of each line segment
T_x, T_y = get_graph_segments(model.X_train_,
model.full_tree_)
T_trunc_x, T_trunc_y = get_graph_segments(model.X_train_,
model.cluster_graph_)
#------------------------------------------------------------
# Fit a GMM to each individual cluster
Nx = 100
Ny = 250
Xgrid = np.vstack([g.ravel() for g in np.meshgrid(np.linspace(xmin, xmax, Nx),
                                                  np.linspace(ymin, ymax, Ny))]).T
density = np.zeros(Xgrid.shape[0])
for i in range(n_components):
ind = (labels == i)
Npts = ind.sum()
Nclusters = min(12, Npts // 5)
gmm = GaussianMixture(n_components=Nclusters).fit(X[ind])
dens = np.exp(gmm.score_samples(Xgrid))  # per-point log-likelihood -> density
density += dens / dens.max()
density = density.reshape((Ny, Nx))
#----------------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(10, 12))
fig.subplots_adjust(hspace=0, left=0.1, right=0.95, bottom=0.1, top=0.9)
ax = fig.add_subplot(311, aspect='equal')
ax.scatter(X[:, 1], X[:, 0], s=1, lw=0, c='k')
ax.set_xlim(ymin, ymax)
ax.set_ylim(xmin, xmax)
ax.xaxis.set_major_formatter(plt.NullFormatter())
ax.set_ylabel('(Mpc)')
ax = fig.add_subplot(312, aspect='equal')
ax.plot(T_y, T_x, c='k', lw=0.5)
ax.set_xlim(ymin, ymax)
ax.set_ylim(xmin, xmax)
ax.xaxis.set_major_formatter(plt.NullFormatter())
ax.set_ylabel('(Mpc)')
ax = fig.add_subplot(313, aspect='equal')
ax.plot(T_trunc_y, T_trunc_x, c='k', lw=0.5)
ax.imshow(density.T, origin='lower', cmap=plt.cm.hot_r,
extent=[ymin, ymax, xmin, xmax])
ax.set_xlim(ymin, ymax)
ax.set_ylim(xmin, xmax)
ax.set_xlabel('(Mpc)')
ax.set_ylabel('(Mpc)')
plt.show()