This notebook generates a catalogue of host galaxies for ATLAS-CDFS objects. This process proceeds as follows:
This has some clear problems: What do we mean by "nearby"? What if we have two unrelated radio objects nearby each other? A model-based approach à la Fan et al. (2015) may resolve this kind of issue, but as we are investigating a model-free approach, we leave this for future research. We take nearby to mean within a $1'$ radius, as this is the radius that Radio Galaxy Zoo volunteers see.
In the code below, note that we internally represent ATLAS and SWIRE objects by IDs. These are arbitrary integers.
In [99]:
# Imports.
from typing import List
import astropy.io.ascii
import astropy.table
import h5py
import numpy
import sklearn.linear_model
import sklearn.cross_validation
# Globals.
# This file stores the ATLAS-CDFS and SWIRE-CDFS catalogues.
CROWDASTRO_PATH = '../data/crowdastro_swire.h5'
# This file stores the training features and labels.
TRAINING_PATH = '../data/training_swire.h5'
# ATLAS catalogue.
ATLAS_CATALOGUE_PATH = '../data/ATLASDR3_cmpcat_23July2015.csv'
# Path to output catalogue to.
OUTPUT_PATH = '../data/crowdastro_catalogue.dat'
# Radius we should consider an object "nearby".
NEARBY = 1 / 60 # 1 arcmin in degrees.
# Size of an ATLAS image vector.
IMAGE_SIZE = 200 * 200
# Number of numeric features before the distance features.
ATLAS_DIST_IDX = 2 + IMAGE_SIZE
In [48]:
def find_host(probabilities: numpy.ndarray, atlas_id: int) -> int:
"""Finds the host galaxy associated with an ATLAS object.
Arguments
---------
probabilities
(N,) array of predicted probabilities of SWIRE objects.
atlas_id
ID of the ATLAS object to find the host of.
Returns
-------
int
ID of predicted host galaxy.
"""
with h5py.File(CROWDASTRO_PATH, 'r') as cr, h5py.File(TRAINING_PATH, 'r') as tr:
# Get all nearby objects.
ir_distances = cr['/atlas/cdfs/numeric'][atlas_id, ATLAS_DIST_IDX:]
assert ir_distances.shape[0] == tr['features'].shape[0]
# Make a list of IDs of nearby objects.
nearby = sorted((ir_distances <= NEARBY).nonzero()[0])
# Find the best nearby candidate.
nearby_probabilities = probabilities[nearby]
# Select the highest probability object.
best_index = nearby_probabilities.argmax()
best_index = nearby[best_index] # Convert back into an IR index.
return best_index
In [56]:
def train_classifier(indices: List[int]) -> sklearn.linear_model.LogisticRegression:
"""Trains a classifier.
Arguments
---------
indices
List of infrared training indices.
Returns
-------
sklearn.linear_model.LogisticRegression
Trained logistic regression classifier.
"""
with h5py.File(TRAINING_PATH, 'r') as tr:
features = numpy.nan_to_num(tr['features'].value[indices])
labels = tr['labels'].value[indices]
lr = sklearn.linear_model.LogisticRegression(class_weight='balanced', penalty='l1')
lr.fit(features, labels)
return lr
In [57]:
def predict(classifier: sklearn.linear_model.LogisticRegression, indices: List[int]) -> numpy.ndarray:
"""Predicts probabilities for a set of IR objects.
Arguments
---------
classifier
Trained classifier.
indices
List of IR indices to predict probability of.
Returns
-------
numpy.ndarray
(N,) NumPy array of predicted probabilities.
"""
with h5py.File(TRAINING_PATH, 'r') as tr:
features = numpy.nan_to_num(tr['features'].value[indices])
return classifier.predict_proba(features)[:, 1]
In [51]:
def train_and_predict(n_splits: int=10) -> numpy.ndarray:
"""Generates probabilities for IR objects.
Notes
-----
Instances will be split according to ATLAS index, not IR index. This is
because there is overlap in IR objects' features, so we need to make sure
that this overlap is not present in the testing data.
Arguments
---------
n_splits
Number of splits in cross-validation.
Returns
-------
numpy.ndarray
(N,) NumPy array of predictions.
"""
with h5py.File(CROWDASTRO_PATH, 'r') as cr:
# Get the number of ATLAS IDs.
n_atlas = cr['/atlas/cdfs/numeric'].shape[0]
# Get the number of SWIRE IDs.
n_swire = cr['/swire/cdfs/numeric'].shape[0]
# Allocate the array of predicted probabilities.
probabilities = numpy.zeros((n_swire,))
# Split into training/testing sets.
kf = sklearn.cross_validation.KFold(n_atlas, n_folds=n_splits)
# Train and predict.
for train_indices, test_indices in kf:
nearby_train = (cr['/atlas/cdfs/numeric'].value[train_indices, ATLAS_DIST_IDX:]
<= NEARBY).nonzero()[0]
nearby_test = (cr['/atlas/cdfs/numeric'].value[test_indices, ATLAS_DIST_IDX:]
<= NEARBY).nonzero()[0]
classifier = train_classifier(nearby_train)
fold_probs = predict(classifier, nearby_test)
probabilities[nearby_test] = fold_probs
return probabilities
In [58]:
probabilities = train_and_predict()
In [60]:
with h5py.File(CROWDASTRO_PATH, 'r') as cr:
n_atlas = cr['/atlas/cdfs/numeric'].shape[0]
In [62]:
hosts = [find_host(probabilities, i) for i in range(n_atlas)]
In [84]:
# First, we need to get a list of the ATLAS and SWIRE object names.
with h5py.File(CROWDASTRO_PATH, 'r') as cr:
atlas_ids = cr['/atlas/cdfs/string'].value
atlas_locs = cr['/atlas/cdfs/numeric'][:, :2]
# Convert ATLAS IDs into names.
atlas_catalogue = astropy.io.ascii.read(ATLAS_CATALOGUE_PATH)
id_to_name = {r['ID']: r['name'] for r in atlas_catalogue}
atlas_names = [id_to_name[id_.decode('ascii')] for zooniverse_id, id_ in atlas_ids]
swire_names = [n.decode('ascii') for n in cr['/swire/cdfs/string']]
swire_locs = cr['/swire/cdfs/numeric'][:, :2]
In [104]:
# Now we can generate the catalogue.
names = ('radio_object', 'infrared_host', 'ra', 'dec')
table = astropy.table.Table(names=names, dtype=('S50', 'S50', 'float', 'float'))
for atlas_index, atlas_name in enumerate(atlas_names):
host = hosts[atlas_index]
swire_name = swire_names[host]
ra, dec = swire_locs[host]
table.add_row((atlas_name, swire_name, ra, dec))
astropy.io.ascii.write(table=table, output=OUTPUT_PATH)
In [ ]: