In [20]:
import sys
import h5py, numpy, sklearn.neighbors
from astropy.coordinates import SkyCoord
sys.path.insert(1, '..')
import crowdastro.train, crowdastro.test
TRAINING_H5_PATH = '../training.h5'
CROWDASTRO_H5_PATH = '../crowdastro.h5'
NORRIS_DAT_PATH = '../data/norris_2006_atlas_classifications_ra_dec_only.dat'
CLASSIFIER_OUT_PATH = '../classifier.pkl'
ASTRO_TRANSFORMER_OUT_PATH = '../astro_transformer.pkl'
IMAGE_TRANSFORMER_OUT_PATH = '../image_transformer.pkl'
IMAGE_SIZE = 200 * 200
ARCMIN = 1 / 60
N_ASTRO = 5
In [2]:
with h5py.File(TRAINING_H5_PATH, 'r') as training_h5:
crowdsourced_labels = training_h5['labels'].value
with h5py.File(CROWDASTRO_H5_PATH, 'r') as crowdastro_h5:
swire_names = crowdastro_h5['/wise/cdfs/string'].value
swire_positions = crowdastro_h5['/wise/cdfs/numeric'].value[:, :2]
assert len(crowdsourced_labels) == len(swire_names)
swire_tree = sklearn.neighbors.KDTree(swire_positions)
Norris et al. used a different set of SWIRE objects to us, so we need to do a nearest neighbour search.
In [3]:
with open(NORRIS_DAT_PATH, 'r') as norris_dat:
norris_coords = [r.strip().split('|') for r in norris_dat]
In [4]:
norris_labels = numpy.zeros(crowdsourced_labels.shape)
for ra, dec in norris_coords:
# Find a neighbour.
skycoord = SkyCoord(ra=ra, dec=dec, unit=('hourangle', 'deg'))
ra = skycoord.ra.degree
dec = skycoord.dec.degree
((dist,),), ((swire,),) = swire_tree.query([(ra, dec)])
if dist < 0.1:
norris_labels[swire] = 1
In [5]:
(numpy.logical_and(norris_labels == crowdsourced_labels, crowdsourced_labels == 1)).sum() / norris_labels.sum()
Out[5]:
So there's about 62% agreement with volunteers for SWIRE, and 70% for WISE.
In [21]:
with h5py.File(TRAINING_H5_PATH, 'r') as training_h5:
classifier, astro_transformer, image_transformer = crowdastro.train.train(
training_h5, CLASSIFIER_OUT_PATH, ASTRO_TRANSFORMER_OUT_PATH,
IMAGE_TRANSFORMER_OUT_PATH, classifier='lr', use_astro=True,
use_cnn=True, n_jobs=-1)
In [22]:
with h5py.File(TRAINING_H5_PATH, 'r') as training_h5:
with h5py.File(CROWDASTRO_H5_PATH, 'r') as crowdastro_h5:
test_indices = training_h5['is_atlas_test'].value
numeric_subjects = crowdastro_h5['/atlas/cdfs/numeric'][test_indices, :]
n_norris_agree = 0
n_crowdsourced_agree = 0
n_all_agree = 0
n_either_agree = 0
n_no_host = 0
n_total = 0
for subject in numeric_subjects:
swire = subject[2 + IMAGE_SIZE:]
nearby = swire < ARCMIN
astro_inputs = numpy.minimum(training_h5['features'][nearby, :N_ASTRO],
1500)
image_inputs = training_h5['features'][nearby, N_ASTRO:]
features = []
features.append(astro_transformer.transform(astro_inputs))
features.append(image_transformer.transform(image_inputs))
inputs = numpy.hstack(features)
crowdsourced_outputs = crowdsourced_labels[nearby]
norris_outputs = norris_labels[nearby]
if sum(crowdsourced_outputs) < 1 or sum(norris_outputs) < 1:
# No hosts!
n_no_host += 1
continue
selection = classifier.predict_proba(inputs)[:, 1].argmax()
n_norris_agree += norris_outputs[selection]
n_crowdsourced_agree += crowdsourced_outputs[selection]
n_all_agree += norris_outputs[selection] * crowdsourced_outputs[selection]
n_either_agree += norris_outputs[selection] or crowdsourced_outputs[selection]
n_total += 1
In [23]:
print('LR agreement with RGZ: {:.02%}'.format(n_crowdsourced_agree / n_total))
print('LR agreement with Norris: {:.02%}'.format(n_norris_agree / n_total))
print('LR agreement with both: {:.02%}'.format(n_all_agree / n_total))
print('LR agreement with either: {:.02%}'.format(n_either_agree / n_total))
print('LR subjects with no Norris or RGZ host: {}'.format(n_no_host))
In [25]:
rgz_agreements = []
norris_agreements = []
all_agreements = []
either_agreements = []
for _ in range(1):
with h5py.File(TRAINING_H5_PATH, 'r') as training_h5:
classifier, astro_transformer, image_transformer = crowdastro.train.train(
training_h5, CLASSIFIER_OUT_PATH, ASTRO_TRANSFORMER_OUT_PATH,
IMAGE_TRANSFORMER_OUT_PATH, classifier='rf', use_astro=True,
use_cnn=True, n_jobs=-1)
with h5py.File(TRAINING_H5_PATH, 'r') as training_h5:
with h5py.File(CROWDASTRO_H5_PATH, 'r') as crowdastro_h5:
test_indices = training_h5['is_atlas_test'].value
numeric_subjects = crowdastro_h5['/atlas/cdfs/numeric'][test_indices, :]
n_norris_agree = 0
n_crowdsourced_agree = 0
n_all_agree = 0
n_either_agree = 0
n_no_host = 0
n_total = 0
for subject in numeric_subjects:
swire = subject[2 + IMAGE_SIZE:]
nearby = swire < ARCMIN
astro_inputs = numpy.minimum(training_h5['features'][nearby, :N_ASTRO],
1500)
image_inputs = training_h5['features'][nearby, N_ASTRO:]
features = []
features.append(astro_transformer.transform(astro_inputs))
features.append(image_transformer.transform(image_inputs))
inputs = numpy.hstack(features)
crowdsourced_outputs = crowdsourced_labels[nearby]
norris_outputs = norris_labels[nearby]
if sum(crowdsourced_outputs) < 1 or sum(norris_outputs) < 1:
# No hosts!
n_no_host += 1
continue
selection = classifier.predict_proba(inputs)[:, 1].argmax()
n_norris_agree += norris_outputs[selection]
n_crowdsourced_agree += crowdsourced_outputs[selection]
n_all_agree += norris_outputs[selection] * crowdsourced_outputs[selection]
n_either_agree += norris_outputs[selection] or crowdsourced_outputs[selection]
n_total += 1
rgz_agreements.append(n_crowdsourced_agree / n_total)
norris_agreements.append(n_norris_agree / n_total)
all_agreements.append(n_all_agree / n_total)
either_agreements.append(n_either_agree / n_total)
In [26]:
print('Average RGZ agreement: ({:.02} +- {:.02})%'.format(numpy.mean(rgz_agreements), numpy.std(rgz_agreements)))
print('Average Norris agreement: ({:.02} +- {:.02})%'.format(numpy.mean(norris_agreements), numpy.std(norris_agreements)))
print('Average both agreement: ({:.02} +- {:.02})%'.format(numpy.mean(all_agreements), numpy.std(all_agreements)))
print('Average either agreement: ({:.02} +- {:.02})%'.format(numpy.mean(either_agreements), numpy.std(either_agreements)))
In [ ]: