Norris Labels

In this notebook, I will compare the Norris et al. (2006) labels to a) the crowdsourced labels, and b) the classifier outputs for logistic regression and random forests.


In [20]:
import sys

import h5py, numpy, sklearn.neighbors
from astropy.coordinates import SkyCoord

sys.path.insert(1, '..')
import crowdastro.train, crowdastro.test

TRAINING_H5_PATH = '../training.h5'
CROWDASTRO_H5_PATH = '../crowdastro.h5'
NORRIS_DAT_PATH = '../data/norris_2006_atlas_classifications_ra_dec_only.dat'
CLASSIFIER_OUT_PATH = '../classifier.pkl'
ASTRO_TRANSFORMER_OUT_PATH = '../astro_transformer.pkl'
IMAGE_TRANSFORMER_OUT_PATH = '../image_transformer.pkl'
IMAGE_SIZE = 200 * 200
ARCMIN = 1 / 60
N_ASTRO = 5

Crowdsourced label comparison


In [2]:
with h5py.File(TRAINING_H5_PATH, 'r') as training_h5:
    crowdsourced_labels = training_h5['labels'].value

with h5py.File(CROWDASTRO_H5_PATH, 'r') as crowdastro_h5:
    swire_names = crowdastro_h5['/wise/cdfs/string'].value
    swire_positions = crowdastro_h5['/wise/cdfs/numeric'].value[:, :2]

assert len(crowdsourced_labels) == len(swire_names)
swire_tree = sklearn.neighbors.KDTree(swire_positions)

Norris et al. used a different set of SWIRE objects to us, so we need to do a nearest neighbour search.


In [3]:
with open(NORRIS_DAT_PATH, 'r') as norris_dat:
    norris_coords = [r.strip().split('|') for r in norris_dat]

In [4]:
norris_labels = numpy.zeros(crowdsourced_labels.shape)
for ra, dec in norris_coords:
    # Find a neighbour.
    skycoord = SkyCoord(ra=ra, dec=dec, unit=('hourangle', 'deg'))
    ra = skycoord.ra.degree
    dec = skycoord.dec.degree
    ((dist,),), ((swire,),) = swire_tree.query([(ra, dec)])
    if dist < 0.1:
        norris_labels[swire] = 1

In [5]:
(numpy.logical_and(norris_labels == crowdsourced_labels, crowdsourced_labels == 1)).sum() / norris_labels.sum()


Out[5]:
0.70496453900709222

So there's about 62% agreement with volunteers for SWIRE, and 70% for WISE.

Classifier label comparison


In [21]:
with h5py.File(TRAINING_H5_PATH, 'r') as training_h5:
    classifier, astro_transformer, image_transformer = crowdastro.train.train(
        training_h5, CLASSIFIER_OUT_PATH, ASTRO_TRANSFORMER_OUT_PATH,
        IMAGE_TRANSFORMER_OUT_PATH, classifier='lr', use_astro=True,
        use_cnn=True, n_jobs=-1)

In [22]:
with h5py.File(TRAINING_H5_PATH, 'r') as training_h5:
    with h5py.File(CROWDASTRO_H5_PATH, 'r') as crowdastro_h5:
        test_indices = training_h5['is_atlas_test'].value
        numeric_subjects = crowdastro_h5['/atlas/cdfs/numeric'][test_indices, :]

        n_norris_agree = 0
        n_crowdsourced_agree = 0
        n_all_agree = 0
        n_either_agree = 0
        n_no_host = 0
        n_total = 0
        for subject in numeric_subjects:
            swire = subject[2 + IMAGE_SIZE:]
            nearby = swire < ARCMIN
            astro_inputs = numpy.minimum(training_h5['features'][nearby, :N_ASTRO],
                                         1500)
            image_inputs = training_h5['features'][nearby, N_ASTRO:]

            features = []
            features.append(astro_transformer.transform(astro_inputs))
            features.append(image_transformer.transform(image_inputs))
            inputs = numpy.hstack(features)

            crowdsourced_outputs = crowdsourced_labels[nearby]
            norris_outputs = norris_labels[nearby]
            
            if sum(crowdsourced_outputs) < 1 or sum(norris_outputs) < 1:
                # No hosts!
                n_no_host += 1
                continue

            selection = classifier.predict_proba(inputs)[:, 1].argmax()
            n_norris_agree += norris_outputs[selection]
            n_crowdsourced_agree += crowdsourced_outputs[selection]
            n_all_agree += norris_outputs[selection] * crowdsourced_outputs[selection]
            n_either_agree += norris_outputs[selection] or crowdsourced_outputs[selection]
            n_total += 1

In [23]:
print('LR agreement with RGZ: {:.02%}'.format(n_crowdsourced_agree / n_total))
print('LR agreement with Norris: {:.02%}'.format(n_norris_agree / n_total))
print('LR agreement with both: {:.02%}'.format(n_all_agree / n_total))
print('LR agreement with either: {:.02%}'.format(n_either_agree / n_total))
print('LR subjects with no Norris or RGZ host: {}'.format(n_no_host))


LR agreement with RGZ: 85.88%
LR agreement with Norris: 82.49%
LR agreement with both: 76.84%
LR agreement with either: 91.53%
LR subjects with no Norris or RGZ host: 275

In [25]:
rgz_agreements = []
norris_agreements = []
all_agreements = []
either_agreements = []
for _ in range(1):
    with h5py.File(TRAINING_H5_PATH, 'r') as training_h5:
        classifier, astro_transformer, image_transformer = crowdastro.train.train(
            training_h5, CLASSIFIER_OUT_PATH, ASTRO_TRANSFORMER_OUT_PATH,
            IMAGE_TRANSFORMER_OUT_PATH, classifier='rf', use_astro=True,
            use_cnn=True, n_jobs=-1)

    with h5py.File(TRAINING_H5_PATH, 'r') as training_h5:
        with h5py.File(CROWDASTRO_H5_PATH, 'r') as crowdastro_h5:
            test_indices = training_h5['is_atlas_test'].value
            numeric_subjects = crowdastro_h5['/atlas/cdfs/numeric'][test_indices, :]

            n_norris_agree = 0
            n_crowdsourced_agree = 0
            n_all_agree = 0
            n_either_agree = 0
            n_no_host = 0
            n_total = 0
            for subject in numeric_subjects:
                swire = subject[2 + IMAGE_SIZE:]
                nearby = swire < ARCMIN
                astro_inputs = numpy.minimum(training_h5['features'][nearby, :N_ASTRO],
                                             1500)
                image_inputs = training_h5['features'][nearby, N_ASTRO:]

                features = []
                features.append(astro_transformer.transform(astro_inputs))
                features.append(image_transformer.transform(image_inputs))
                inputs = numpy.hstack(features)

                crowdsourced_outputs = crowdsourced_labels[nearby]
                norris_outputs = norris_labels[nearby]

                if sum(crowdsourced_outputs) < 1 or sum(norris_outputs) < 1:
                    # No hosts!
                    n_no_host += 1
                    continue

                selection = classifier.predict_proba(inputs)[:, 1].argmax()
                n_norris_agree += norris_outputs[selection]
                n_crowdsourced_agree += crowdsourced_outputs[selection]
                n_all_agree += norris_outputs[selection] * crowdsourced_outputs[selection]
                n_either_agree += norris_outputs[selection] or crowdsourced_outputs[selection]
                n_total += 1
    rgz_agreements.append(n_crowdsourced_agree / n_total)
    norris_agreements.append(n_norris_agree / n_total)
    all_agreements.append(n_all_agree / n_total)
    either_agreements.append(n_either_agree / n_total)

In [26]:
print('Average RGZ agreement: ({:.02} +- {:.02})%'.format(numpy.mean(rgz_agreements), numpy.std(rgz_agreements)))
print('Average Norris agreement: ({:.02} +- {:.02})%'.format(numpy.mean(norris_agreements), numpy.std(norris_agreements)))
print('Average both agreement: ({:.02} +- {:.02})%'.format(numpy.mean(all_agreements), numpy.std(all_agreements)))
print('Average either agreement: ({:.02} +- {:.02})%'.format(numpy.mean(either_agreements), numpy.std(either_agreements)))


Average RGZ agreement: (0.94 +- 0.0)%
Average Norris agreement: (0.73 +- 0.0)%
Average both agreement: (0.7 +- 0.0)%
Average either agreement: (0.97 +- 0.0)%

In [ ]: