Careful, these constants may be different for you:
In [7]:
DATA_PATH = '~/Desktop/sdss_dr7_photometry_source.csv.gz'
In [8]:
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.neighbors
%matplotlib inline
In [9]:
PSF_COLS = ('psfMag_u', 'psfMag_g', 'psfMag_r', 'psfMag_i', 'psfMag_z')
First import the training and testing sets
In [10]:
def load_data(x_cols=PSF_COLS,
class_col='class',
class_val='Galaxy',
train_samples_num=1000000):
# Cast x_cols to list so Pandas doesn't complain…
x_cols_l = list(x_cols)
data_iter = pd.read_csv(
DATA_PATH,
iterator=True,
chunksize=100000,
usecols=x_cols_l + [class_col])
# Filter out anything that is not a galaxy without loading the whole file into memory.
data = pd.concat(chunk[chunk[class_col] == class_val]
for chunk in data_iter)
train_X = data[:train_samples_num][x_cols_l].as_matrix()
assert train_X.shape == (train_samples_num, len(x_cols))
return train_X
data = load_data()
Fit the training data.
In [11]:
def fit(train_X,
bandwidth=1, # By experimentation.
kernel='epanechnikov', # Resembles Gaussian within short distance, but is faster.
leaf_size=400, # For speed.
rtol=1e-3): # Decreased accuracy, but better speed.
estimator = sklearn.neighbors.KernelDensity(bandwidth=bandwidth,
kernel=kernel,
leaf_size=leaf_size,
rtol=rtol)
estimator.fit(train_X)
return estimator
kde = fit(data)
In [12]:
def make_5D_grid(train_X,
grid_samples_per_axis=20): # Careful! This code is O(n^5) in this variable
mins = np.min(train_X, axis=0)
maxs = np.max(train_X, axis=0)
assert mins.shape == maxs.shape == (train_X.shape[1],)
# Produce the 5D grid. This is surprisingly nontrivial.
# http://stackoverflow.com/questions/28825219/
linspaces = [np.linspace(i, j, grid_samples_per_axis)
for i, j in zip(mins, maxs)]
mesh_grids = np.meshgrid(*linspaces,
indexing='ij') # Otherwise numpy swaps the first two dimensions… 😕
sample_points = np.array(mesh_grids)
return sample_points
grid = make_5D_grid(data)
In [17]:
def evaluate_density_at_sample_points(estimator, sample_points):
dims = sample_points.shape[0]
samples_per_axis = sample_points.shape[1]
assert sample_points.shape[1:] == (samples_per_axis,) * dims
sample_points = np.reshape(sample_points, (dims, samples_per_axis ** dims))
densities = estimator.score_samples(sample_points.T)
densities = np.reshape(densities, (samples_per_axis,) * dims)
# Convert from log densities
densities = np.exp(densities)
return densities
grid_densities = evaluate_density_at_sample_points(kde, grid)
In [20]:
def plot_against_one_variable(train_X, sample_points, densities,
bands=PSF_COLS,
bins=1000,
scale_coeff=2500):
dims = len(bands)
assert train_X.shape[1] == sample_points.shape[0] == dims
assert sample_points.shape[1:] == densities.shape
for i in range(dims):
fig, axes = plt.subplots()
# Make histogram.
axes.hist(train_X[:,i], # We only care about one of the five dimensions.
bins=bins,
label='Actual density')
# Make plot of estimated densities.
x_indices = tuple(0 if a != i else slice(None) # Linspace over
for a in range(dims)) # i-th dimension.
x_indices = (i,) + x_indices # Only take i-th dimension. Due to the
# above others are constant anyway.
x = sample_points[x_indices]
assert len(x.shape) == 1 # Sanity check to ensure it is 1D.
y_sum_axes = tuple(a for a in range(dims) if a != i) # Sum over all dimensions except i.
y = np.sum(densities, axis=y_sum_axes)
y *= scale_coeff
assert y.shape == x.shape
axes.plot(x, y, label='Estimated density')
# Labels
plt.ylabel('Count')
plt.xlabel('Magnitude')
plt.title(bands[i])
plt.legend()
plot_against_one_variable(data, grid, grid_densities)
In [46]:
def plot_against_two_variables(train_X, sample_points, densities,
bands=PSF_COLS,
bins=1000):
dims = len(bands)
assert train_X.shape[1] == sample_points.shape[0] == dims
assert sample_points.shape[1:] == densities.shape
mins = sample_points[(slice(None),) + (0,) * dims]
maxs = sample_points[(slice(None),) + (-1,) * dims]
plt.figure(figsize=(10, 40))
upto = 1
for i in range(dims):
for j in range(i + 1, dims):
plt.subplot((dims ** 2 - dims) // 2, 2, upto)
upto += 1
z_sum_axes = tuple(a for a in range(dims) if a != i and a != j) # Sum over all dimensions except i.
z = np.sum(densities, axis=z_sum_axes)
extent = [mins[i], maxs[i], mins[j], maxs[j]]
# plt.axis(extent)
plt.imshow(z.T,
cmap='hot',
interpolation='nearest',
extent=extent,
aspect='auto',
origin='lower')
plt.xlabel(bands[i])
plt.ylabel(bands[j])
plt.title('Estimated')
plt.xlim((16, 26))
plt.ylim((16, 24))
plt.subplot((dims ** 2 - dims) // 2, 2, upto)
upto += 1
plt.hexbin(train_X[:,i], train_X[:,j], gridsize=100)
plt.xlabel(bands[i])
plt.ylabel(bands[j])
plt.title('Actual')
plt.xlim((16, 26))
plt.ylim((16, 24))
plot_against_two_variables(data, grid, grid_densities)
In [ ]: