In [1]:
import sys
import h5py, numpy
import sklearn.linear_model, sklearn.neighbors, sklearn.pipeline, sklearn.preprocessing
from astropy.coordinates import SkyCoord
import matplotlib.pyplot as plt
sys.path.insert(1, '..')
import crowdastro.train, crowdastro.test
TRAINING_H5_PATH = '../training.h5'
CROWDASTRO_H5_PATH = '../crowdastro.h5'
NORRIS_DAT_PATH = '../data/norris_2006_atlas_classifications_ra_dec_only.dat'
CLASSIFIER_OUT_PATH = '../classifier.pkl'
ASTRO_TRANSFORMER_OUT_PATH = '../astro_transformer.pkl'
IMAGE_TRANSFORMER_OUT_PATH = '../image_transformer.pkl'
IMAGE_SIZE = 200 * 200  # Flattened 200 x 200 pixel image patch.
ARCMIN = 1 / 60  # One arcminute, in degrees.
N_JOBS = 8
%matplotlib inline
In [2]:
# Load labels.
with h5py.File(TRAINING_H5_PATH, 'r') as training_h5:
    crowdsourced_labels = training_h5['labels'].value

with h5py.File(CROWDASTRO_H5_PATH, 'r') as crowdastro_h5:
    ir_names = crowdastro_h5['/wise/cdfs/string'].value
    ir_positions = crowdastro_h5['/wise/cdfs/numeric'].value[:, :2]
ir_tree = sklearn.neighbors.KDTree(ir_positions)

with open(NORRIS_DAT_PATH, 'r') as norris_dat:
    norris_coords = [r.strip().split('|') for r in norris_dat]

norris_labels = numpy.zeros((len(ir_positions)))
for ra, dec in norris_coords:
    # Find a neighbour.
    skycoord = SkyCoord(ra=ra, dec=dec, unit=('hourangle', 'deg'))
    ra = skycoord.ra.degree
    dec = skycoord.dec.degree
    ((dist,),), ((ir,),) = ir_tree.query([(ra, dec)])
    if dist < 0.1:
        norris_labels[ir] = 1
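As a quick sanity check on the cross-match above (a sketch only; the exact count depends on the catalogues behind crowdastro.h5 and the Norris table), we can count how many WISE objects picked up a positive Norris label:
In [ ]:
# Sanity check (illustrative): how many WISE objects received a positive
# Norris label from the nearest-neighbour cross-match above.
print('Norris positives: {} of {} WISE objects'.format(
    int(norris_labels.sum()), len(norris_labels)))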
In [3]:
def softmax(x):
    exp = numpy.exp(x - numpy.max(x))
    out = exp / exp.sum()
    return out


def train_and_test(hidden_atlas_training_indices):
    """
    hidden_atlas_training_indices: ATLAS indices to hide.
    """
    with h5py.File(TRAINING_H5_PATH, 'r') as training_h5, \
            h5py.File(CROWDASTRO_H5_PATH, 'r') as crowdastro_h5:
        n_static = 5 if training_h5.attrs['ir_survey'] == 'wise' else 6
        train_indices = training_h5['is_ir_train'].value
        atlas_train_indices = training_h5['is_atlas_train'].value

        # Remove all IR objects near hidden ATLAS objects.
        for atlas_index in hidden_atlas_training_indices:
            ir = crowdastro_h5['/atlas/cdfs/numeric'][atlas_index, n_static + IMAGE_SIZE:]
            nearby = (ir < ARCMIN).nonzero()[0]
            for ir_index in nearby:
                train_indices[ir_index] = 0

        n_ir = train_indices.sum()

        # We can now proceed as usual with training/testing.
        outputs = training_h5['labels'].value[train_indices]
        n = len(outputs)
        astro_inputs = numpy.minimum(
            training_h5['features'][train_indices, :n_static], 1500)
        image_inputs = training_h5['features'].value[train_indices, n_static:]

        astro_transformer = sklearn.pipeline.Pipeline([
            ('normalise', sklearn.preprocessing.Normalizer()),
            ('scale', sklearn.preprocessing.StandardScaler()),
        ])
        image_transformer = sklearn.pipeline.Pipeline([
            ('normalise', sklearn.preprocessing.Normalizer()),
        ])

        features = []
        features.append(astro_transformer.fit_transform(astro_inputs))
        features.append(image_transformer.fit_transform(image_inputs))
        inputs = numpy.hstack(features)

        classifier = sklearn.linear_model.LogisticRegression(
            class_weight='balanced', n_jobs=N_JOBS)
        classifier.fit(inputs, outputs)

        # Test the classifier.
        test_indices = training_h5['is_atlas_test'].value
        numeric_subjects = crowdastro_h5['/atlas/cdfs/numeric'][test_indices, :]

        n_norris_agree = 0
        n_crowdsourced_agree = 0
        n_all_agree = 0
        n_either_agree = 0
        n_no_host = 0
        n_total = 0
        for subject in numeric_subjects:
            swire = subject[2 + IMAGE_SIZE:]
            nearby = swire < ARCMIN
            astro_inputs = numpy.minimum(
                training_h5['features'][nearby, :n_static], 1500)
            image_inputs = training_h5['features'][nearby, n_static:]

            features = []
            features.append(astro_transformer.transform(astro_inputs))
            features.append(image_transformer.transform(image_inputs))
            inputs = numpy.hstack(features)

            crowdsourced_outputs = crowdsourced_labels[nearby]
            norris_outputs = norris_labels[nearby]

            if sum(crowdsourced_outputs) < 1 or sum(norris_outputs) < 1:
                # No hosts!
                n_no_host += 1
                continue

            selection = classifier.predict_proba(inputs)[:, 1].argmax()
            n_norris_agree += norris_outputs[selection]
            n_crowdsourced_agree += crowdsourced_outputs[selection]
            n_all_agree += norris_outputs[selection] * crowdsourced_outputs[selection]
            n_either_agree += norris_outputs[selection] or crowdsourced_outputs[selection]
            n_total += 1

        # Compute the uncertainties of the pool.
        pool_indices = training_h5['is_atlas_train'].value
        numeric_subjects = crowdastro_h5['/atlas/cdfs/numeric'][pool_indices, :]

        uncertainties = []
        for subject in numeric_subjects:
            swire = subject[2 + IMAGE_SIZE:]
            nearby = swire < ARCMIN
            astro_inputs = numpy.minimum(
                training_h5['features'][nearby, :n_static], 1500)
            image_inputs = training_h5['features'][nearby, n_static:]

            features = []
            features.append(astro_transformer.transform(astro_inputs))
            features.append(image_transformer.transform(image_inputs))
            inputs = numpy.hstack(features)

            probs = softmax(classifier.predict_proba(inputs)[:, 1])
            entropy = -numpy.sum(numpy.log(probs) * probs)
            uncertainties.append(entropy)

    return (n_norris_agree / n_total, n_crowdsourced_agree / n_total,
            n_all_agree / n_total, n_either_agree / n_total, uncertainties, n_ir)
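To make the uncertainty score used by train_and_test concrete, here is a toy illustration with made-up probability vectors (the helper subject_uncertainty exists only for this sketch): the per-candidate host probabilities within a subject are softmaxed and scored by entropy, so a subject with several similarly plausible candidates scores higher than one with a clear winner.
In [ ]:
# Toy illustration of the entropy-based uncertainty score (made-up numbers).
def subject_uncertainty(candidate_probs):
    probs = softmax(numpy.asarray(candidate_probs, dtype=float))
    return -numpy.sum(numpy.log(probs) * probs)

print(subject_uncertainty([0.9, 0.1, 0.1]))  # clear winner -> lower entropy
print(subject_uncertainty([0.5, 0.5, 0.5]))  # ambiguous -> maximum entropy (log 3)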
In [8]:
# Randomly hide 90% of labels.
with h5py.File(TRAINING_H5_PATH, 'r') as training_h5:
    atlas_train_indices = training_h5['is_atlas_train'].value

initial_hidden_atlas_training_indices = numpy.arange(atlas_train_indices.sum())
numpy.random.shuffle(initial_hidden_atlas_training_indices)
initial_hidden_atlas_training_indices = initial_hidden_atlas_training_indices[
    :9 * len(initial_hidden_atlas_training_indices) // 10]
initial_hidden_atlas_training_indices.sort()
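A quick check of the split (sketch; the exact numbers depend on the training set) confirms that roughly 90% of the ATLAS training subjects start out hidden:
In [ ]:
# Sanity check: fraction of ATLAS training subjects that start out hidden.
n_atlas_train = int(atlas_train_indices.sum())
print('{} of {} ATLAS training subjects hidden initially'.format(
    len(initial_hidden_atlas_training_indices), n_atlas_train))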
In [9]:
# Testing random label selection.
norris_accuracies_random = []
rgz_accuracies_random = []
all_accuracies_random = []
any_accuracies_random = []
n_ir_random = []
n_batch = 100
n_epochs = 25
numpy.random.seed(0)
# Use a copy: slicing a NumPy array with [:] returns a view, so the in-place
# shuffle below would otherwise scramble initial_hidden_atlas_training_indices.
hidden_atlas_training_indices = initial_hidden_atlas_training_indices.copy()

for epoch in range(n_epochs):
    print('Epoch {}/{}'.format(epoch + 1, n_epochs))

    # Train, test, and generate uncertainties.
    results = train_and_test(hidden_atlas_training_indices)
    norris_accuracies_random.append(results[0])
    rgz_accuracies_random.append(results[1])
    all_accuracies_random.append(results[2])
    any_accuracies_random.append(results[3])
    n_ir_random.append(results[5])

    # Choose n_batch new labels at random, i.e. reveal n_batch hidden subjects.
    if len(hidden_atlas_training_indices) < n_batch:
        break
    else:
        numpy.random.shuffle(hidden_atlas_training_indices)
        hidden_atlas_training_indices = hidden_atlas_training_indices[:-n_batch]
        hidden_atlas_training_indices.sort()
In [10]:
# Testing uncertainty sampling label selection.
norris_accuracies_uncsample = []
rgz_accuracies_uncsample = []
all_accuracies_uncsample = []
any_accuracies_uncsample = []
n_ir_uncsample = []
# As above, copy so that in-place operations do not modify the initial indices.
hidden_atlas_training_indices = initial_hidden_atlas_training_indices.copy()

for epoch in range(n_epochs):
    print('Epoch {}/{}'.format(epoch + 1, n_epochs))

    # Train, test, and generate uncertainties.
    results = train_and_test(hidden_atlas_training_indices)
    uncertainties = results[4]
    norris_accuracies_uncsample.append(results[0])
    rgz_accuracies_uncsample.append(results[1])
    all_accuracies_uncsample.append(results[2])
    any_accuracies_uncsample.append(results[3])
    n_ir_uncsample.append(results[5])

    # Choose the n_batch most uncertain objects to label, i.e. drop them
    # from the hidden set.
    if len(hidden_atlas_training_indices) < n_batch:
        break
    else:
        hidden_atlas_training_indices = numpy.array(
            sorted(hidden_atlas_training_indices, key=lambda z: uncertainties[z]))[:-n_batch]
        hidden_atlas_training_indices.sort()
In [11]:
plt.figure(figsize=(15, 10))
plt.subplot(2, 2, 1)
plt.plot(all_accuracies_random, c='pink')
plt.plot(any_accuracies_random, c='darkred')
plt.plot(all_accuracies_uncsample, c='lightgreen')
plt.plot(any_accuracies_uncsample, c='darkgreen')
plt.xlabel('{}-batch epochs'.format(n_batch))
plt.ylabel('Classification accuracy')
plt.legend(['Norris & RGZ (passive)', 'Norris | RGZ (passive)',
'Norris & RGZ (unc)', 'Norris | RGZ (unc)'], loc='lower right')
plt.subplot(2, 2, 2)
plt.plot(norris_accuracies_random, c='red')
plt.plot(norris_accuracies_uncsample, c='green')
plt.legend(['Norris (passive)', 'Norris (unc)'], loc='lower right')
plt.xlabel('{}-batch epochs'.format(n_batch))
plt.ylabel('Classification accuracy')
plt.subplot(2, 2, 3)
plt.plot(rgz_accuracies_random, c='red')
plt.plot(rgz_accuracies_uncsample, c='green')
plt.legend(['RGZ (passive)', 'RGZ (unc)'], loc='lower right')
plt.xlabel('{}-batch epochs'.format(n_batch))
plt.ylabel('Classification accuracy')
plt.subplot(2, 2, 4)
plt.plot(numpy.array(n_ir_random) - numpy.array(n_ir_uncsample))
plt.xlabel('{}-batch epochs'.format(n_batch))
plt.ylabel('Difference in number of IR examples')
plt.show()
Conclusion: Uncertainty sampling with this entropy-based score doesn't work very well here; it does not clearly outperform passive (random) label selection.