In [1]:
# These might be different for you
DFLENS_PATH = '/Users/jakubnabaglo/Desktop/old/lib_phz_2dfgals.fits'
In [2]:
import functools
import itertools
import time
import astropy.io.fits
import astropy.table
import matplotlib.pyplot as plt
import numpy as np
import scipy.linalg
import sklearn.linear_model
from hyperopt import fmin, tpe, hp, STATUS_OK, STATUS_FAIL
%matplotlib inline
In [7]:
# Load the data
data = astropy.table.Table.read(DFLENS_PATH).to_pandas()
zs_unshuffled = data['z'].to_numpy().astype(np.float32)
bands_unshuffled = data[['umag', 'gmag', 'rmag', 'imag', 'zmag', 'w1mag', 'w2mag']].to_numpy().astype(np.float32)
bands_vars_unshuffled = data[['s_umag', 's_gmag', 's_rmag',
                              's_imag', 's_zmag', 's_w1mag', 's_w2mag']].to_numpy().astype(np.float32)
bands_vars_unshuffled *= bands_vars_unshuffled  # Make standard deviations into variances
# 998001 == 999 ** 2: the squared sentinel error marking missing WISE photometry
no_w1_indices = bands_vars_unshuffled[:,5] == 998001
no_w2_indices = (bands_vars_unshuffled[:,6] == 998001) | np.isnan(bands_vars_unshuffled[:,6])
def get_colours(bands, bands_vars):
    # Overwrite bands in place with (r, r-w1, w1-w2, u-g, g-r, r-i, i-z), and
    # propagate the variances (colours of independent bands add in variance).
    u, g, r, i, z, w1, w2 = bands.T
    r_w1 = r - w1
    w1_w2 = w1 - w2
    u_g = u - g
    g_r = g - r
    r_i = r - i
    i_z = i - z
    u_var, g_var, r_var, i_var, z_var, w1_var, w2_var = bands_vars.T
    r_w1_var = r_var + w1_var
    w1_w2_var = w1_var + w2_var
    u_g_var = u_var + g_var
    g_r_var = g_var + r_var
    r_i_var = r_var + i_var
    i_z_var = i_var + z_var
    bands[:,0] = r
    bands[:,1] = r_w1
    bands[:,2] = w1_w2
    bands[:,3] = u_g
    bands[:,4] = g_r
    bands[:,5] = r_i
    bands[:,6] = i_z
    bands_vars[:,0] = r_var
    bands_vars[:,1] = r_w1_var
    bands_vars[:,2] = w1_w2_var
    bands_vars[:,3] = u_g_var
    bands_vars[:,4] = g_r_var
    bands_vars[:,5] = r_i_var
    bands_vars[:,6] = i_z_var
get_colours(bands_unshuffled, bands_vars_unshuffled)
def fill_blanks(blanks_indices, means, vars_):
    # Impute missing entries with the mean of the observed population; set their
    # variance to the population scatter plus the typical measurement variance.
    mean = np.mean(means[~blanks_indices], axis=0)
    means[blanks_indices] = mean
    deviations = means[~blanks_indices] - mean
    deviations *= deviations
    N = deviations.shape[0]
    mean_sq_deviation = deviations.sum(axis=0) / (N - 1)
    mean_variance = vars_[~blanks_indices].sum(axis=0) / (N - 1)
    vars_[blanks_indices] = mean_sq_deviation + mean_variance
# Fill in blanks where we don't have WISE data. We set the mean to the mean of the population and the variance to that
# of the population. This is a good representation of what we know about that data.
fill_blanks(no_w1_indices, bands_unshuffled[:, 1], bands_vars_unshuffled[:, 1])
fill_blanks(no_w1_indices | no_w2_indices, bands_unshuffled[:, 2], bands_vars_unshuffled[:, 2])
all_indices = np.arange(zs_unshuffled.shape[0])
np.random.shuffle(all_indices)
zs = zs_unshuffled[all_indices]
bands = bands_unshuffled[all_indices]
bands_vars = bands_vars_unshuffled[all_indices]
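In [ ]:
# Sanity check (not in the original analysis): a minimal sketch of fill_blanks on
# hypothetical toy values. The imputed mean should equal the mean of the observed
# entries; the imputed variance is the ddof-1 scatter of the observed means plus
# their measurement variances summed over N - 1.
toy_means = np.array([1.0, 2.0, 3.0, 0.0], dtype=np.float32)
toy_vars = np.array([0.1, 0.1, 0.1, 0.0], dtype=np.float32)
toy_blanks = np.array([False, False, False, True])
fill_blanks(toy_blanks, toy_means, toy_vars)
assert np.isclose(toy_means[3], 2.0)        # mean of the observed entries
assert np.isclose(toy_vars[3], 1.0 + 0.15)  # scatter (1.0) + 0.3 / (N - 1) = 1.15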
In [8]:
def gaussian_kernel(s, Mu, Mu_, Sigma, Sigma_, diag_dependent=False):
    """ Computes the Gaussian kernel, accounting for uncertainty in the data.
    Mu is the mean of the data and Sigma is the uncertainty (as a variance on
    each axis). s is the length scale of the kernel, as a variance on each axis.
    """
    N, f = Sigma.shape
    N_, f_ = Sigma_.shape
    assert f == f_
    assert not diag_dependent or N == N_
    det_s = np.prod(s)
    # Effective (diagonal) covariance for each pair: s + Sigma_i + Sigma_j
    gauss_covars = np.tile(Sigma_, (N, 1, 1))
    gauss_covars += Sigma.reshape((N, 1, f))
    gauss_covars += s
    inv_gauss_covars = np.reciprocal(gauss_covars, out=gauss_covars)
    diffs = np.tile(Mu_, (N, 1, 1))
    diffs -= Mu.reshape((N, 1, f))
    diffs = np.square(diffs, out=diffs)
    diffs *= inv_gauss_covars
    exponents = np.sum(diffs, axis=2)
    exponents *= -0.5
    exponents = np.exp(exponents, out=exponents)
    # Normalise by sqrt(det(s) / det(s + Sigma_i + Sigma_j))
    dets_gauss_covars = np.prod(inv_gauss_covars, axis=2)
    dets_gauss_covars *= det_s
    multipliers = np.sqrt(dets_gauss_covars, out=dets_gauss_covars)
    exponents *= multipliers
    return exponents
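In [ ]:
# For diagonal covariances the kernel above has the closed form
#     k(i, j) = sqrt(prod_d s_d / c_d) * exp(-0.5 * sum_d (mu_i,d - mu_j,d)**2 / c_d)
# with c_d = s_d + Sigma_i,d + Sigma_j,d: the expected RBF kernel under Gaussian
# input noise, normalised so that k = 1 for coincident noiseless points. A minimal
# check against that formula for one pair, on hypothetical toy values:
s_toy = np.array([0.5, 2.0])
Mu_toy = np.array([[0.0, 1.0], [1.0, 3.0]])
Sigma_toy = np.array([[0.1, 0.2], [0.3, 0.1]])
K_toy = gaussian_kernel(s_toy, Mu_toy, Mu_toy, Sigma_toy, Sigma_toy)
c = s_toy + Sigma_toy[0] + Sigma_toy[1]  # effective variances for the (0, 1) pair
d = Mu_toy[0] - Mu_toy[1]
k01 = np.sqrt(np.prod(s_toy / c)) * np.exp(-0.5 * np.sum(d * d / c))
assert np.isclose(K_toy[0, 1], k01)
# Note K_toy[0, 0] < 1: a point's own noise is counted twice against itself,
# which is why RedshiftGPR.fit below zeroes the diagonal before using K.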
In [9]:
class RedshiftGPR:
    def __init__(self, kernel):
        self.kernel = kernel
        self.L = None
        self.weights = None
        self.train_X = None
        self.train_X_var = None

    def fit(self, X, X_var, y, fit_variance=False):
        y = np.log1p(y)
        K = self.kernel(X, X, X_var, X_var)  # n * n
        # Estimate a per-point noise level from kernel-weighted leave-one-out
        # residuals, then place it (plus the unit signal variance) on the diagonal.
        K[np.diag_indices_from(K)] = 0
        mean_normalise = K.sum(axis=0)
        avgs = K @ y
        avgs /= mean_normalise
        self.y_mean = np.mean(y)
        self.y_std = np.std(y, ddof=1)
        sq_devs = avgs
        sq_devs -= y
        sq_devs = np.square(sq_devs, out=sq_devs)
        y -= self.y_mean
        y /= self.y_std
        sq_devs /= self.y_std * self.y_std
        avg_var = np.dot(K, sq_devs, out=sq_devs)
        avg_var /= mean_normalise
        self.avg_var = avg_var
        avg_var += 1
        K[np.diag_indices_from(K)] = avg_var
        K = K.astype(np.float32)
        y = y.astype(np.float32)
        if fit_variance:
            # Keep the Cholesky factor so predict() can return variances later
            self.L = scipy.linalg.cho_factor(K, lower=True, overwrite_a=True, check_finite=False)
            self.weights = scipy.linalg.cho_solve(self.L, y, check_finite=False)
        else:
            self.weights = scipy.linalg.solve(K, y, overwrite_a=True, check_finite=False, assume_a='pos')
        self.train_X = X
        self.train_X_var = X_var

    def predict(self, X, X_var, return_var=False):
        K_ = self.kernel(self.train_X, X, self.train_X_var, X_var)
        means = K_.T @ self.weights
        means *= self.y_std
        means += self.y_mean
        means = np.expm1(means, out=means)  # undo the log1p transform
        if return_var:
            var = scipy.linalg.cho_solve(self.L, K_, check_finite=False)
            var *= K_
            var = np.sum(var, axis=0)
            var = np.subtract(1, var, out=var)
            # var += self.alpha
            var *= self.y_std * self.y_std
            var *= (means + 1) ** 2  # propagate the variance through expm1
            return means, var
        else:
            return means
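In [ ]:
# Usage sketch (not in the original analysis): fit RedshiftGPR on a small slice
# with placeholder unit length scales and request predictive variances. The
# properly tuned fit happens in the next cell.
toy_kernel = functools.partial(gaussian_kernel, np.ones(bands.shape[1], dtype=np.float32))
toy_gpr = RedshiftGPR(toy_kernel)
toy_gpr.fit(bands[:500], bands_vars[:500], zs[:500], fit_variance=True)
toy_means, toy_var = toy_gpr.predict(bands[500:510], bands_vars[500:510], return_var=True)
print(toy_means, toy_var)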
In [12]:
class RedshiftGPRWithCV:
    def __init__(self, iters=1000):
        self.iters = iters
        self.gpr = None

    def fit(self, X, X_var, y, valid_X, valid_X_var, valid_y, refit=True, fit_variance=True):
        # We need a reasonable prior for the Bayesian optimisation. We combine two estimates:
        # 1. Perform linear regression and use the inverse magnitudes of the normalised weights
        lr = sklearn.linear_model.LinearRegression()
        lr.fit(X, np.log1p(y))
        lr_sigmas = np.log(1 / np.abs(lr.coef_ / np.log1p(y).std()))
        # 2. Find the median distance between points in each dimension
        distances = [[abs(b - b_)
                      for b, b_ in itertools.combinations(X[:,ax], 2)]
                     for ax in range(X.shape[1])]
        d_median = np.array([np.median(ld) for ld in distances])
        ld_median = np.log(d_median, out=d_median)
        # The mean and spread of the two estimates (in log space) define our prior.
        dist_mean = (lr_sigmas + ld_median) / 2
        dist_std = np.abs(lr_sigmas - ld_median) / 2
        counter = itertools.count()

        def objective(x):
            print(next(counter), end=' ')
            x = np.array(x)
            x = np.square(x, out=x)  # scales are sampled as std devs; the kernel wants variances
            pred = RedshiftGPR(functools.partial(gaussian_kernel, x))
            try:
                pred.fit(X, X_var, y, fit_variance=False)
            except np.linalg.LinAlgError:
                return dict(status=STATUS_FAIL)
            pred_y = pred.predict(valid_X, valid_X_var)
            pred_y -= valid_y
            pred_errs = np.abs(pred_y, out=pred_y)
            pred_errs /= 1 + valid_y
            # Loss is the 68.3rd percentile of |dz| / (1 + z) on the validation set
            loss = np.percentile(pred_errs, 68.3, overwrite_input=True)
            return dict(status=STATUS_OK, loss=loss)

        space = [hp.lognormal(str(ax), dist_mean[ax], dist_std[ax]) for ax in range(X.shape[1])]
        best = fmin(objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=self.iters)
        self.length_scales = np.array([best[str(ax)] for ax in range(X.shape[1])])
        if refit:
            self.gpr = RedshiftGPR(functools.partial(gaussian_kernel, self.length_scales ** 2))
            self.gpr.fit(X, X_var, y, fit_variance=fit_variance)

    def predict(self, X, X_var, return_var=False):
        return self.gpr.predict(X, X_var, return_var=return_var)
TRAIN_NUM = 2000
predictr = RedshiftGPRWithCV()
predictr.fit(bands[:TRAIN_NUM], bands_vars[:TRAIN_NUM], zs[:TRAIN_NUM],
bands[TRAIN_NUM:2*TRAIN_NUM], bands_vars[TRAIN_NUM:2*TRAIN_NUM], zs[TRAIN_NUM:2*TRAIN_NUM],
refit=False)
REAL_TRAIN_NUM = 5000
pred = RedshiftGPR(functools.partial(gaussian_kernel, predictr.length_scales ** 2))
pred.fit(bands[:REAL_TRAIN_NUM], bands_vars[:REAL_TRAIN_NUM], zs[:REAL_TRAIN_NUM], fit_variance=False)
In [13]:
print(predictr.length_scales)
TEST_NUM_TOTAL = 40000
TEST_NUM_SAMPLE = 1000
assert TEST_NUM_TOTAL + pred.train_X.shape[0] <= zs.shape[0]
def sample_indices(test_bands, test_zs):
    faint_objects = (17.7 <= test_bands[:,0]) & (test_bands[:,0] <= 19.5)
    blue = faint_objects & (test_bands[:,4] - 2.8 * test_zs < .5)
    red = faint_objects & (test_bands[:,4] - 2.8 * test_zs > .5)
    blues = np.arange(test_bands.shape[0])[blue][:int(TEST_NUM_SAMPLE * .6)]
    reds = np.arange(test_bands.shape[0])[red][:int(TEST_NUM_SAMPLE * .4)]
    indices = np.append(blues, reds)
    return indices, blues, reds
test_bands = bands[-TEST_NUM_TOTAL:]
test_bands_vars = bands_vars[-TEST_NUM_TOTAL:]
test_zs = zs[-TEST_NUM_TOTAL:]
all_sample, blues, reds = sample_indices(test_bands, test_zs)
preds_blues = pred.predict(test_bands[blues], test_bands_vars[blues])
errs_blues = np.abs(preds_blues - test_zs[blues]) / (1 + test_zs[blues])
print(preds_blues.min(), preds_blues.mean(), preds_blues.max(), preds_blues.std())
preds_reds = pred.predict(test_bands[reds], test_bands_vars[reds])
errs_reds = np.abs(preds_reds - test_zs[reds]) / (1 + test_zs[reds])
err_blues = np.percentile(errs_blues, 68.3)
err_reds = np.percentile(errs_reds, 68.3)
err_all_sample = np.percentile(np.append(errs_blues, errs_reds), 68.3)
print('all', err_all_sample)
print('blues', err_blues)
print('reds', err_reds)
In [14]:
z_pred_col = astropy.table.Column(data=pred.predict(bands_unshuffled, bands_vars_unshuffled), name='z_pred')
In [19]:
# Flag every row that was used for training or validation
in_train_arr = np.zeros((zs_unshuffled.shape[0],))
in_train_arr[all_indices[:max(REAL_TRAIN_NUM, 2 * TRAIN_NUM)]] = 1
in_train_col = astropy.table.Column(data=in_train_arr, name='in_train')
In [21]:
all_data = astropy.table.Table.read(DFLENS_PATH)
all_data.add_column(z_pred_col)
all_data.add_column(in_train_col)
all_data.write('/Users/jakubnabaglo/Desktop/chris_predictions.fits')
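In [ ]:
# Optional read-back check (not in the original analysis): confirm the two new
# columns survived the round trip through the FITS file just written above.
check = astropy.table.Table.read('/Users/jakubnabaglo/Desktop/chris_predictions.fits')
print(check.colnames[-2:])  # expect ['z_pred', 'in_train']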
In [ ]:
plt.hist([abs(b - b_) for b, b_ in itertools.combinations(bands[:1000,6], 2)], bins=100)
In [ ]:
AX = 4
# Length scales from an earlier optimisation run, and the inverse normalised
# linear-regression coefficients, compared against the pairwise-distance medians
OPTS = [1.22469276, 1.22474092, 0.80236046, 0.86353912, 0.46588492, 1.01890828, 1.164143]
OPTS_L = 1 / abs(np.array([0.34157744, 0.10560939, 1.29769242, -0.53296155, 2.38614941, 0.15689398, -0.48851654]))
all_ = [abs(b - b_) for b, b_ in itertools.combinations(bands[:1000,AX], 2)]
all_median = np.log(np.median(all_))
nonzero_median = np.log(np.median([a for a in all_ if a > 0]))
plt.hist(np.log([a for a in all_ if a > 0]), bins=100)
plt.axvline(np.log(OPTS[AX]), c='k')
plt.axvline(all_median, c='g');
plt.axvline(np.log(OPTS_L[AX]), c='orange');
plt.axvline(nonzero_median, c='purple');
In [ ]:
TEST_NUM_TOTAL = 40000
TEST_NUM_SAMPLE = 10000
assert TEST_NUM_TOTAL + pred.train_X.shape[0] <= zs.shape[0]
def sample_indices(test_bands, test_zs):
    bright_objects = test_bands[:,0] < 17.7
    blue = bright_objects & (test_bands[:,4] - 2.8 * test_zs < .5)
    red = bright_objects & (test_bands[:,4] - 2.8 * test_zs > .5)
    blues = np.arange(test_bands.shape[0])[blue][:int(TEST_NUM_SAMPLE * .4)]
    reds = np.arange(test_bands.shape[0])[red][:int(TEST_NUM_SAMPLE * .6)]
    indices = np.append(blues, reds)
    return indices, blues, reds
test_bands = bands[-TEST_NUM_TOTAL:]
test_bands_vars = bands_vars[-TEST_NUM_TOTAL:]
test_zs = zs[-TEST_NUM_TOTAL:]
all_sample, blues, reds = sample_indices(test_bands, test_zs)
preds_blues = pred.predict(test_bands[blues], test_bands_vars[blues])
errs_blues = np.abs(preds_blues - test_zs[blues]) / (1 + test_zs[blues])
preds_reds = pred.predict(test_bands[reds], test_bands_vars[reds])
errs_reds = np.abs(preds_reds - test_zs[reds]) / (1 + test_zs[reds])
err_blues = np.percentile(errs_blues, 68.3)
err_reds = np.percentile(errs_reds, 68.3)
err_all_sample = np.percentile(np.append(errs_blues, errs_reds), 68.3)
print('all', err_all_sample)
print('blues', err_blues)
print('reds', err_reds)
In [ ]:
# Scratch: r - i vs r - w1 colours for objects with non-positive redshift
xs = data['rmag'] - data['imag']
ys = data['rmag'] - data['w1mag']
far = data['z'] > 0
valid = data['w1mag'] < 98  # computed but not used below
plt.scatter(xs[~far], ys[~far])
In [ ]:
lr = sklearn.linear_model.LinearRegression()
lr.fit(bands, np.log1p(zs))
In [ ]:
lr.coef_ / np.log1p(zs).std(), lr.intercept_ / np.log1p(zs).std()
In [ ]:
(np.array([1.22469276, 1.22474092, 0.80236046, 0.86353912, 0.46588492, 1.01890828, 1.164143]) / (1 / np.abs(lr.coef_ / np.log1p(zs).std()))).mean()
In [ ]:
lr.coef_ / np.log1p(zs).std()