In [ ]:
%matplotlib inline
In [ ]:
import h5py
import numpy as np
import sklearn.preprocessing
import sklearn.utils  # needed for sklearn.utils.shuffle below
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import astropy.io.fits as fits
import matplotlib
import matplotlib.pyplot as plt
import spectraldl.ondrejov as ondrejov
import spectraldl.preprocessing as preprocessing
In [ ]:
with h5py.File('data/data.hdf5', 'r') as f:
X = f['X'][...]
y = f['y'][...]
In [ ]:
def plot_scatter(X, y):
'''Scatter plot of the first two columns of X, colored by the labels y.'''
fig, ax = plt.subplots()
sc = ax.scatter(X[:, 0], X[:, 1], c=y, alpha=0.25)
fig.colorbar(sc)
In [ ]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
print('explained variance: {}%'.format(np.sum(pca.explained_variance_ratio_) * 100))
plot_scatter(X_pca, y)
In [ ]:
fig, (ax1, ax2) = plt.subplots(2, 1)
with fits.open('data/ondrejov/ui300025.fits') as hdulist:
ondrejov.plot_spectrum(hdulist, ax1)
ax1.set_title('max flux: ' + str(np.max(ondrejov.get_fluxes(hdulist))))
with fits.open('data/ondrejov/td260020.fits') as hdulist:
ondrejov.plot_spectrum(hdulist, ax2)
ax2.set_title('max flux: ' + str(np.max(ondrejov.get_fluxes(hdulist))))
fig.tight_layout()
If identifiers are plotted into the scatter plot, the outliers can be determined. Two of them are plotted above; they have some really high flux values.
But because in this classification problem the shape of the spectrum is the most important feature, the flux intensities can be suppressed to some range.
In the following code a min-max scaler and a scaler to zero mean and unit variance are applied to each individual spectrum. Note that the scaling is NOT applied to each feature.
The aim is to suppress the intensities so that only the information about the shape remains; the small sketch below illustrates the difference between per-sample and per-feature scaling.
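The toy example below is an added sketch, not part of the original analysis. It shows what axis=1 means in scikit-learn's scaling functions: each row, i.e. each spectrum, is scaled independently, instead of each column (wavelength bin) across all samples.
In [ ]:
# toy sketch (added example): two "spectra" with very different intensities but the same shape
toy = np.array([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]])
# axis=1 scales each sample (row) separately, so both rows become [-1, 0, 1]
print(sklearn.preprocessing.minmax_scale(toy, feature_range=(-1, 1), axis=1))
# axis=0 (the default) scales each feature (column) across samples instead
print(sklearn.preprocessing.minmax_scale(toy, feature_range=(-1, 1)))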
In [ ]:
# minmax scale to range (-1, 1)
# axis=1 means to scale individual samples
X_minmax = sklearn.preprocessing.minmax_scale(X, feature_range=(-1, 1), axis=1)
X_minmax_pca_model = PCA(n_components=140)
plot_scatter(X_minmax_pca_model.fit_transform(X_minmax), y)
In [ ]:
X_scale = sklearn.preprocessing.scale(X, axis=1)
X_scale_pca_model = PCA(n_components=140)
X_scale_pca = X_scale_pca_model.fit_transform(X_scale)
plot_scatter(X_scale_pca, y)
In [ ]:
np.max(X), np.min(X), np.max(X_scale), np.min(X_scale)
In [ ]:
fig, axs = plt.subplots(2, 2)
ax1, ax2, ax3, ax4 = axs.ravel()
for ax in axs.ravel():
ax.set_ylabel('number of spectra')
ax.set_xlabel('flux')
ax1.set_title('maxima of original spectra')
ax1.hist(np.max(X, axis=1), log=True)
ax2.set_title('minima of original spectra')
ax2.hist(np.min(X, axis=1), log=True)
ax3.set_title('maxima of scaled spectra')
ax3.hist(np.max(X_scale, axis=1), log=True)
ax4.set_title('minima of scaled spectra')
ax4.hist(np.min(X_scale, axis=1), log=True)
fig.tight_layout()
In [ ]:
fig, ax = plt.subplots()
ax.set_title('explained variance ratio')
ax.set_ylabel('variance ratio')
ax.set_xlabel('principal components')
cut = 10
xticks = np.arange(1, cut + 1)
ax.set_xticks(xticks)
ax.bar(xticks, X_scale_pca_model.explained_variance_ratio_[:cut]);
In [ ]:
# each class plotted individually
cmap = plt.get_cmap('viridis')
norm = matplotlib.colors.Normalize(vmin=0, vmax=2)
fig, axs = plt.subplots(1, 3)
titles = ['emission', 'absorption', 'double-peak']
labels = [0, 1, 2]
colors = [cmap(norm(l)) for l in labels]
for title, label, ax, color in zip(titles, labels, axs, colors):
ax.set_title(title)
ax.scatter(X_scale_pca[y == label][:, 0], X_scale_pca[y == label][:, 1], alpha=0.25, c=color)
The results above are much more promising. From the scatter plot it looks relatively easy to separate the emission and absorption spectra, but the double-peak spectra are mixed randomly in the space. They seem somewhat better separated when the spectra are scaled to zero mean and unit variance.
If the task were only to split emission from absorption spectra, a linear separation would work well. But because the double-peak spectra are not easily separable in the plot above, a deep network will hopefully find a better representation for their separation.
The insight that the shape, rather than the absolute flux, is the main feature helps to apply dimensionality reduction better, so the per-sample scaling should be used further.
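As a rough check of the linear-separability claim (an added sketch, not part of the original pipeline), a logistic regression can be fit on the first two principal components of the scaled data: once on emission versus absorption spectra only, and once on all three classes including double-peak.
In [ ]:
# added sketch: cross-validated accuracy of a linear model on the first two principal components
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# emission (0) vs absorption (1) only
mask = y != 2
score_two = cross_val_score(LogisticRegression(), X_scale_pca[mask][:, :2], y[mask], cv=5).mean()
# all three classes, double-peak (2) included
score_all = cross_val_score(LogisticRegression(), X_scale_pca[:, :2], y, cv=5).mean()
print('emission vs absorption: {:.3f}, all classes: {:.3f}'.format(score_two, score_all))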
In [ ]:
# publication plot
fig, (ax1, ax2) = plt.subplots(1, 2)
cmap = plt.get_cmap('viridis')
norm = matplotlib.colors.Normalize(vmin=0, vmax=2)
for label, name in zip([0, 2, 1], ['emission', 'double-peak', 'absorption']):
idx = y == label
color = cmap(norm(label))
ax1.scatter(X_pca[:, 0][idx], X_pca[:, 1][idx], c=color, alpha=0.5, label=name)
ax2.scatter(X_scale_pca[:, 0][idx], X_scale_pca[:, 1][idx], c=color, alpha=0.5, label=name)
ax1.set_title('original dataset')
ax1.set_xlabel('PC1')
ax1.set_ylabel('PC2')
ax1.legend(loc='lower right')
ax2.set_title('dataset with scaled samples')
ax2.set_xlabel('PC1')
ax2.set_ylabel('PC2')
ax2.legend(loc='lower right')
fig.tight_layout();
In [ ]:
# publication plot
fig, ax = plt.subplots()
cmap = plt.get_cmap('viridis')
norm = matplotlib.colors.Normalize(vmin=0, vmax=2)
for label, name in zip([0, 2, 1], ['emission', 'double-peak', 'absorption']):
idx = y == label
color = cmap(norm(label))
ax.scatter(X_scale_pca[:, 0][idx], X_scale_pca[:, 1][idx], c=color, alpha=0.5, label=name)
ax.set_title('dataset with scaled samples')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.legend(loc='lower right')
fig.tight_layout();
In [ ]:
# minmax scale to range (-1, 1)
X_f_minmax = sklearn.preprocessing.minmax_scale(X, feature_range=(-1, 1))
X_f_minmax_pca = PCA(n_components=2).fit_transform(X_f_minmax)
plot_scatter(X_f_minmax_pca, y)
In [ ]:
X_f_scale = sklearn.preprocessing.scale(X)
X_f_scale_pca = PCA(n_components=2).fit_transform(X_f_scale)
plot_scatter(X_f_scale_pca, y)
In [ ]:
# first reduce to 30 dimensions with PCA as in the t-SNE paper
X_pca = PCA(n_components=30).fit_transform(X_scale)
tsne = TSNE(
n_components=2, # 2D out array
#perplexity=40, # should be 5-50
#learning_rate=100, # should be 100-1000
init='pca', # start with PCA position
verbose=2,
)
X_tsne = tsne.fit_transform(X_pca)
In [ ]:
# publication plot
fig, ax = plt.subplots()
cmap = plt.get_cmap('viridis')
norm = matplotlib.colors.Normalize(vmin=0, vmax=2)
for label, name in enumerate(['emission', 'absorption', 'double-peak']):
idx = y == label
color = cmap(norm(label))
ax.scatter(X_tsne[:, 0][idx], X_tsne[:, 1][idx], c=color, alpha=0.5, label=name)
ax.set_title('t-SNE')
ax.legend()
ax.tick_params(which='both', bottom=False, left=False, labelbottom=False, labelleft=False)
In [ ]:
def sample_lamost(size):
with h5py.File('data/data.hdf5', 'r') as f:
# this random sampling is the fastest approach I found for h5py
# boolean index array of False values
idx = np.zeros((f['X_lam'].shape[0], ), dtype=bool)
# set the appropriate number of indices to True
idx[:size] = True
# randomly shuffle the index
np.random.shuffle(idx)
X_lam = f['X_lam'][idx, :]
return X_lam
X_lam = sample_lamost(5000)
In [ ]:
def foo_matrix(X_lam, X_ond, y_ond):
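'''Stack the LAMOST and Ondrejov spectra into one matrix; LAMOST spectra are unlabelled and get the label -1.'''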
size_lam, size_ond = X_lam.shape[0], X_ond.shape[0]
size = size_lam + size_ond
X = np.zeros((size, 140), dtype=np.float64)
y = np.zeros((size, ), dtype=np.int8)
X[:size_lam, :] = X_lam
y[:size_lam] = -1
X[size_lam:, :] = X_ond
y[size_lam:] = y_ond
return X, y
X, y = sklearn.utils.shuffle(X, y)
sample_size = 1000
X, y = X[:sample_size], y[:sample_size]
X_all, y_all = foo_matrix(X_lam, X, y)
In [ ]:
pca = PCA(n_components=30)
X_all_scaled = preprocessing.scale_samples(X_all)
X_all_pca = pca.fit_transform(X_all_scaled)
In [ ]:
# publication plot
fig, ax = plt.subplots()
cmap = plt.get_cmap('viridis')
norm = matplotlib.colors.Normalize(vmin=-1, vmax=2)
for label, name in zip([-1, 1, 0, 2], ['LAMOST', 'absorption', 'emission', 'double-peak']):
idx = y_all == label
color = cmap(norm(label))
ax.scatter(X_all_pca[:, 0][idx], X_all_pca[:, 1][idx], s=10, c=color, alpha=0.5, label=name)
ax.set_title('principal component analysis of LAMOST')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.legend();
In [ ]:
tsne_all = TSNE(
n_components=2, # 2D out array
#perplexity=40, # should be 5-50
#learning_rate=100, # should be 100-1000
init='pca', # start with PCA position
verbose=2,
)
X_all_tsne = tsne_all.fit_transform(X_all_pca)
In [ ]:
# publication plot
fig, ax = plt.subplots()
cmap = plt.get_cmap('viridis')
norm = matplotlib.colors.Normalize(vmin=-1, vmax=2)
for label, name in zip([-1, 1, 0, 2], ['LAMOST', 'absorption', 'emission', 'double-peak']):
idx = y_all == label
color = cmap(norm(label))
ax.scatter(X_all_tsne[:, 0][idx], X_all_tsne[:, 1][idx], c=color, alpha=0.5, label=name)
ax.set_title('t-SNE of LAMOST')
ax.legend();
ax.tick_params(which='both', bottom=False, left=False, labelbottom=False, labelleft=False)