Classifying Galaxies

We want to classify each galaxy as either containing an AGN or not containing an AGN, assuming that all galaxies are independent.


In [67]:
import sys

from astropy.coordinates import SkyCoord
import h5py
import matplotlib.pyplot as plt
import numpy
import sklearn.linear_model
import sklearn.ensemble
import sklearn.metrics
import sklearn.neighbors

sys.path.insert(1, '..')
import crowdastro

NORRIS_DAT_PATH = '../data/norris_2006_atlas_classifications_ra_dec_only.dat'
TRAINING_H5_PATH = '../data/training.h5'
%matplotlib inline

How many infrared objects are there?


In [15]:
with h5py.File(TRAINING_H5_PATH) as training_f:
    print('Total:', training_f['features'].shape[0])
    print('Testing:', training_f['is_ir_test'].value.sum(),
          '({:.02%})'.format(training_f['is_ir_test'].value.sum() / training_f['features'].shape[0]))
    print('Training:', training_f['is_ir_train'].value.sum(),
          '({:.02%})'.format(training_f['is_ir_train'].value.sum() / training_f['features'].shape[0]))

    atlas_test = training_f['is_atlas_test'].value.sum()
    atlas_train = training_f['is_atlas_train'].value.sum()
    atlas_total = atlas_test + atlas_train
    print('Testing:', atlas_test, '({:.02%})'.format(atlas_test / atlas_total))
    print('Testing:', atlas_train, '({:.02%})'.format(atlas_train / atlas_total))


Total: 24140
Testing: 5922 (24.53%)
Training: 18218 (75.47%)
Testing: 452 (19.97%)
Testing: 1811 (80.03%)

Training the classifier


In [ ]:
with h5py.File(TRAINING_H5_PATH) as training_f:
    lr = sklearn.linear_model.LogisticRegression(n_jobs=-1, class_weight='balanced', C=100.0, penalty='l1')
    x = training_f['features'][training_f['is_ir_train'].value, :]
    y = training_f['labels'][training_f['is_ir_train'].value]
    lr.fit(x, y)

Testing the classifier


In [79]:
# Load Norris labels.
with h5py.File(TRAINING_H5_PATH, 'r') as training_f:
    ir_positions = training_f['positions'].value
ir_tree = sklearn.neighbors.KDTree(ir_positions)

with open(NORRIS_DAT_PATH, 'r') as norris_dat:
    norris_coords = [r.strip().split('|') for r in norris_dat]

norris_labels = numpy.zeros((len(ir_positions)))
for ra, dec in norris_coords:
    # Find a neighbour.
    skycoord = SkyCoord(ra=ra, dec=dec, unit=('hourangle', 'deg'))
    ra = skycoord.ra.degree
    dec = skycoord.dec.degree
    ((dist,),), ((ir,),) = ir_tree.query([(ra, dec)])
    if dist < 0.1:
        norris_labels[ir] = 1

In [ ]:
# Predict.
with h5py.File(TRAINING_H5_PATH) as training_f:
    test_indices = training_f['is_ir_test'].value
    x = training_f['features'][test_indices, :]
    t = norris_labels[test_indices]
    y = lr.predict(x)

Accuracy


In [ ]:
# Raw accuracy.
sklearn.metrics.accuracy_score(t, y)

In [ ]:
# Balanced accuracy.
cm = sklearn.metrics.confusion_matrix(t, y).astype(float)
cm /= cm.sum(axis=1).reshape((-1, 1))
cm.trace() / 2