Machine learning exercise by Group 1 at AstroHackWeek 2017, Day 1.
First, we blatantly copy some of the code from the demo-SDSS notebook...
In [ ]:
from os import path
from astropy.table import Table
import h5py
import matplotlib.pyplot as plt
#plt.style.use('notebook.mplstyle')
%matplotlib inline
import numpy as np
from sklearn.cluster import KMeans
In [ ]:
# Root directory holding the AstroHackWeek 2017 ML tutorial datasets.
data_path = '/Users/Meredith/Astronomy/astrohack/ahw2017-ml-data/' # specific to my computer
In [ ]:
# Load the merged SDSS photometric table (photoPosPlate) from HDF5.
photoPos = Table.read(path.join(data_path, 'sdss', 'photoPosPlate-merged.hdf5'),
path='photoPosPlate')
In [ ]:
# Number of objects in the photometric catalog.
len(photoPos)
Pull the color information (u−g, g−r, r−i, and i−z) out of the photoPosPlate data file.
In [ ]:
# 01234 = ugriz filters
u_g = photoPos['PSFMAG'][:,0] - photoPos['PSFMAG'][:,1]
g_r = photoPos['PSFMAG'][:,1] - photoPos['PSFMAG'][:,2]
r_i = photoPos['PSFMAG'][:,2] - photoPos['PSFMAG'][:,3]
i_z = photoPos['PSFMAG'][:,3] - photoPos['PSFMAG'][:,4]
Let's take a look at how the spectral data was classified by SDSS into galaxies, QSOs, or stars
In [ ]:
# Load the merged spectroscopic catalog and tally the SDSS pipeline's
# classification of each spectrum (GALAXY / QSO / STAR).
specObj = Table.read(path.join(data_path, 'sdss', 'specObj-merged.hdf5'),
                     path='specObj')
spec_class = specObj['CLASS'].astype(str)
spec_classes, class_counts = np.unique(spec_class, return_counts=True)
for cls, count in zip(spec_classes, class_counts):
    print(cls, count)
In [ ]:
# One g-r vs r-i panel per spectroscopic class, on shared axes.
fig, axes = plt.subplots(1, len(spec_classes), figsize=(12.5,5),
                         sharex=True, sharey=True)
for ax, cls in zip(axes, spec_classes):
    in_class = spec_class == cls
    ax.plot(g_r[in_class], r_i[in_class],
            marker='.', linestyle='none', alpha=0.1)
    ax.set_title(cls)
    ax.set_xlabel('$g-r$ [mag]')
axes[0].set_xlim(-0.5, 2.5)
axes[0].set_ylim(-1, 2)
axes[0].set_ylabel('$r-i$ [mag]')
fig.tight_layout()
Hopefully our K-means clustering will show us that the dataset breaks into somewhat similarly-shaped pieces in color-color space.
To get an idea for how our data for K-means should be structured, we refer to the example at http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
In [ ]:
# Toy 2-D dataset from the scikit-learn KMeans docs: six points in two dimensions.
X = np.array([(1, 2), (1, 4), (1, 0),
              (4, 2), (4, 4), (4, 0)])
In [ ]:
X.shape # (n_samples, n_features) -- the layout scikit-learn estimators expect
In [ ]:
# Stack the four colors as columns: shape (n_objects, 4), same layout as X above.
colors = np.column_stack([u_g, g_r, r_i, i_z])
In [ ]:
colors.shape  # (n_objects, 4): one row per object, one column per color
In [ ]:
n_clusters = 8 # the number of clusters to use (chosen by hand; worth experimenting with)
In [ ]:
# Run K-means on the 4-D color space; fixed random_state for reproducibility.
# KMeans.fit returns the fitted estimator itself.
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans = kmeans.fit(colors)
In [ ]:
print(kmeans.labels_) # the label from 0 to n_clusters - 1 assigned to each point
In [ ]:
print(kmeans.cluster_centers_)  # centroids in (u-g, g-r, r-i, i-z) color space
In [ ]:
# make a new plot for each cluster center
for k in range(n_clusters):
plt.figure(figsize=(5,5))
idx = (kmeans.labels_ == k)
plt.scatter(g_r[idx], r_i[idx], alpha=0.1, marker='.')
plt.xlabel('$g - r$ [mag]')
plt.ylabel('$r - i$ [mag]')
plt.xlim([-0.5, 2.5])
plt.ylim([-1, 2])
plt.title('cluster label ' + str(k))
At a glance, it looks like clusters 0, 4, and 6 are mostly galaxies, clusters 1 and 2 are weird outliers, cluster 3 is QSOs (plus some stellar contamination?), and clusters 5 and 7 are mostly stars. We could almost certainly refine this better given more time.
Get the indices corresponding to each K-means label 0, 1, and 2 for comparison
In [ ]:
# Index tuples (np.where output) for the points in clusters 0, 1, and 2.
zeroidx, oneidx, twoidx = (np.where(kmeans.labels_ == k) for k in range(3))
In [ ]:
# How many points fell in each of clusters 0, 1, and 2.
for idx in (zeroidx, oneidx, twoidx):
    print(len(idx[0]))
In [ ]:
# Full sample as faint points, with the two small outlier clusters overplotted.
plt.figure(figsize=(5,5))
plt.plot(g_r, r_i, alpha=0.1, ls='None', marker='.')
for idx in (oneidx, twoidx):
    plt.plot(g_r[idx], r_i[idx], ls='None', marker='o', mec='k')
plt.xlabel('$g - r$ [mag]')
plt.ylabel('$r - i$ [mag]')
plt.xlim([-0.5, 2.5])
plt.ylim([-1, 2])
The two outlier points highlighted on top of the full dataset.
In [ ]: