In [5]:
# Make the ATLAS-CDFS pipeline scripts importable.
# NOTE(review): hardcoded absolute path — breaks on other machines; consider a configurable root.
import sys
sys.path.insert(1, '/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/scripts')
import pipeline
import h5py, astropy.io.ascii as asc
import numpy
import scipy.stats
import matplotlib.pyplot as plt
%matplotlib inline
# Build SWIRE names/coordinates/features, labels, and per-quadrant train/test
# splits for both fields (CDFS and ELAIS-S1).
# NOTE(review): semantics of the pipeline.generate_* helpers inferred from their
# names and return shapes used below — confirm against pipeline.py.
swire_names_cdfs, swire_coords_cdfs, swire_features_cdfs = pipeline.generate_swire_features(field='cdfs')
swire_names_elais, swire_coords_elais, swire_features_elais = pipeline.generate_swire_features(field='elais')
swire_labels_cdfs = pipeline.generate_swire_labels(swire_names_cdfs, swire_coords_cdfs, field='cdfs')
swire_labels_elais = pipeline.generate_swire_labels(swire_names_elais, swire_coords_elais, field='elais')
(atlas_train_sets_cdfs, atlas_test_sets_cdfs), (swire_train_sets_cdfs, swire_test_sets_cdfs) = pipeline.generate_data_sets(swire_coords_cdfs, swire_labels_cdfs, field='cdfs')
(atlas_train_sets_elais, atlas_test_sets_elais), (swire_train_sets_elais, swire_test_sets_elais) = pipeline.generate_data_sets(swire_coords_elais, swire_labels_elais, field='elais')
# Master catalogue cross-matching ATLAS components to SWIRE objects.
table = asc.read('/Users/alger/data/Crowdastro/one-table-to-rule-them-all.tbl')
In [2]:
import scipy.spatial
# KD-tree over raw (RA, Dec) pairs in degrees, used below for ~1 arcmin
# candidate queries around each ATLAS position.
# NOTE(review): RA differences are not scaled by cos(Dec), so the query radius
# is slightly anisotropic on the sky — confirm this is acceptable at this scale.
swire_tree = scipy.spatial.KDTree(swire_coords_cdfs)
In [3]:
swire_name_to_index = {j:i for i, j in enumerate(swire_names_cdfs)}
In [29]:
# The distance feature in the crowdastro features is normalised and centred.
# Recover the normalisation constants so the same transform can be applied to
# freshly computed separations later in this notebook.
n_swire = len(swire_features_cdfs)
with h5py.File('/Users/alger/data/Crowdastro/crowdastro-swire.h5', 'r') as crowdastro_f:
    # The last n_swire numeric columns hold per-SWIRE distances; take the
    # minimum over the ATLAS axis to get each SWIRE object's nearest distance.
    distances = crowdastro_f['/atlas/cdfs/numeric'][:, -n_swire:].min(axis=0)
mean_dist = distances.mean()
distances -= mean_dist  # reuse the stored mean rather than recomputing it
# std is unchanged by centring, so this is the stdev of the raw distances too.
stdev_dist = distances.std()
In [30]:
# Compare three cross-identification strategies in each CDFS quadrant:
#   1. "Regular": pick the SWIRE candidate with the highest classifier score.
#   2. "Gaussian": weight classifier scores by a Gaussian in angular separation.
#   3. "Distance": substitute the true separation into the distance feature.
import sklearn.linear_model  # LogisticRegression lives here, not in sklearn.ensemble
import sklearn.ensemble, random, crowdastro.crowd.util, numpy, sklearn.metrics, astropy.coordinates
for quadrant in range(4):
    # Train a classifier on this quadrant's RGZ & Norris training set.
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced',
                                                 C=100000.0)
    train = swire_train_sets_cdfs[:, pipeline.SET_NAMES['RGZ & Norris'], quadrant].nonzero()[0]
    train_features = swire_features_cdfs[train]
    train_labels = swire_labels_cdfs[train, 0]
    lr.fit(train_features, train_labels)
    # Test on the cross-identification task.
    n_total = 0
    n_correct_regular = 0
    n_correct_dist = 0
    n_correct_gaussian = 0
    for atlas in atlas_test_sets_cdfs[:, pipeline.SET_NAMES['RGZ & Norris'], quadrant].nonzero()[0]:
        row = table[table['Key'] == atlas][0]
        ra = row['Component RA (Franzen)']
        dec = row['Component DEC (Franzen)']
        swire = row['Source SWIRE (Norris)']
        if not swire.startswith('SWIRE'):
            # No Norris cross-identification to compare against.
            continue
        # Candidate SWIRE hosts within 1 arcmin (flat-sky KD-tree query).
        nearby = swire_tree.query_ball_point(numpy.array([ra, dec]), 1 / 60)
        if not nearby:
            continue
        nearby_features = swire_features_cdfs[nearby]
        atpreds = lr.predict_proba(nearby_features)[:, 1]
        names = [swire_names_cdfs[n] for n in nearby]
        # Angular separations between the ATLAS source and each candidate.
        coord_atlas = astropy.coordinates.SkyCoord(ra=ra, dec=dec, unit='deg')
        coords_swire = astropy.coordinates.SkyCoord(ra=swire_coords_cdfs[nearby, 0],
                                                    dec=swire_coords_cdfs[nearby, 1],
                                                    unit='deg')
        separations = numpy.array(coord_atlas.separation(coords_swire).deg)
        # 1. Regular: highest raw classifier score wins.
        name = names[numpy.argmax(atpreds)]
        n_correct_regular += name == swire
        # 2. Gaussian multiplier (sigma = 1/120 deg = 30 arcsec).
        gaussians = scipy.stats.norm.pdf(separations, scale=1 / 120)
        gaussian_preds = atpreds * gaussians
        name = names[numpy.argmax(gaussian_preds)]
        n_correct_gaussian += name == swire
        # 3. Distance substitute: recompute predictions with the true
        # separation written into the distance feature (feature index 9),
        # normalised with the crowdastro mean/stdev recovered earlier.
        modified_features = nearby_features.copy()
        normalised_separations = (separations - mean_dist) / stdev_dist
        modified_features[:, 9] = normalised_separations
        dist_preds = lr.predict_proba(modified_features)[:, 1]
        name = names[numpy.argmax(dist_preds)]
        n_correct_dist += name == swire
        n_total += 1
    print('Regular:', n_correct_regular / n_total)
    print('Gaussian:', n_correct_gaussian / n_total)
    print('Distance:', n_correct_dist / n_total)
It seems that Gaussian does very well. Now, let's optimise over $\sigma$. We'll do this four times, once for each quadrant, to avoid biasing our tests.
In [ ]:
# Cache the parts of the calculation that never change.
_lrs = {}
_atres = {i:{} for i in range(4)}
In [99]:
def xid_acc(sigma, quadrant):
    """Cross-identification accuracy of Gaussian-weighted predictions.

    Trains (or reuses a cached) logistic regression classifier for the given
    quadrant, then scores each SWIRE candidate as the classifier probability
    multiplied by a Gaussian PDF of its angular separation with scale `sigma`.

    Evaluated on the *training* set so that sigma is not tuned on test data.

    Parameters
    ----------
    sigma : float
        Gaussian scale (standard deviation) in degrees.
    quadrant : int
        CDFS quadrant index in range(4).

    Returns
    -------
    float
        Fraction of ATLAS training sources whose top-scored SWIRE candidate
        matches the Norris cross-identification.
    """
    import sklearn.linear_model  # ensure the linear_model submodule is loaded

    if quadrant in _lrs:
        lr = _lrs[quadrant]
    else:
        # Train a classifier for this quadrant and cache it.
        lr = sklearn.linear_model.LogisticRegression(class_weight='balanced',
                                                     C=100000.0)
        train = swire_train_sets_cdfs[:, pipeline.SET_NAMES['RGZ & Norris'], quadrant].nonzero()[0]
        lr.fit(swire_features_cdfs[train], swire_labels_cdfs[train, 0])
        _lrs[quadrant] = lr

    n_total = 0
    n_correct_gaussian = 0
    for atlas in atlas_train_sets_cdfs[:, pipeline.SET_NAMES['RGZ & Norris'], quadrant].nonzero()[0]:
        if atlas in _atres[quadrant]:
            # Predictions, names and separations are sigma-independent, so
            # reuse the cached values for this source.
            atpreds, names, separations, swire = _atres[quadrant][atlas]
        else:
            row = table[table['Key'] == atlas][0]
            ra = row['Component RA (Franzen)']
            dec = row['Component DEC (Franzen)']
            swire = row['Source SWIRE (Norris)']
            if not swire.startswith('SWIRE'):
                # No Norris cross-identification to compare against.
                continue
            nearby = swire_tree.query_ball_point(numpy.array([ra, dec]), 1 / 60)
            if not nearby:
                continue
            nearby_features = swire_features_cdfs[nearby]
            atpreds = lr.predict_proba(nearby_features)[:, 1]
            names = [swire_names_cdfs[n] for n in nearby]
            # Angular separations between the ATLAS source and each candidate.
            coord_atlas = astropy.coordinates.SkyCoord(ra=ra, dec=dec, unit='deg')
            coords_swire = astropy.coordinates.SkyCoord(ra=swire_coords_cdfs[nearby, 0],
                                                        dec=swire_coords_cdfs[nearby, 1],
                                                        unit='deg')
            separations = numpy.array(coord_atlas.separation(coords_swire).deg)
            _atres[quadrant][atlas] = (atpreds, names, separations, swire)
        # Gaussian multiplier with the trial sigma.
        gaussians = scipy.stats.norm.pdf(separations, scale=sigma)
        gaussian_preds = atpreds * gaussians
        name = names[numpy.argmax(gaussian_preds)]
        n_correct_gaussian += name == swire
        n_total += 1
    print(sigma, n_correct_gaussian / n_total)
    return n_correct_gaussian / n_total
Let's use hyperopt to optimise $\sigma$. For our search space we'll use a uniform distribution over $\sigma \in [1/1000, 1/50]$ degrees, which brackets our initial guess of $\sigma = 1/120$.
In [95]:
import hyperopt
In [151]:
# One independent hyperopt trial history per quadrant.
trials = {quadrant: hyperopt.Trials() for quadrant in range(4)}
In [152]:
space = hyperopt.hp.uniform('sigma', 1 / 1000, 1 / 50)
In [153]:
# Optimise sigma separately in each quadrant. hyperopt minimises, so the
# objective is the negated cross-identification accuracy.
bests = {}
for q in range(4):
    # Bind q as a default so the objective is self-contained for this quadrant.
    objective = lambda s, quadrant=q: -xid_acc(s, quadrant)
    bests[q] = hyperopt.fmin(objective,
                             space=space,
                             algo=hyperopt.tpe.suggest,
                             max_evals=200,
                             trials=trials[q])
In [154]:
[1 / bests[q]['sigma'] for q in range(4)]
Out[154]:
In [156]:
# Accuracy against 1/sigma for every hyperopt trial, one colour per quadrant.
plt.xlabel('$1/\\sigma$')
plt.ylabel('Accuracy')
for q in range(4):
    # Recorded loss is negative accuracy, so negate it back for plotting.
    plt.scatter(1 / numpy.array(trials[q].vals['sigma']), [-r['loss'] for r in trials[q].results])
plt.yscale('log')
plt.xscale('log')
In [ ]: