Is the redundancy high enough on the Radio Galaxy Zoo website? Do PG-means or KDE handle low redundancy cases differently? How often does taking the mean give an accurate or useful result (based on Norris labels)? If PG-means and KDE disagree, what do the disagreements look like? If there are similar results for both, which is faster?
Bonus question: Can we model the redundancy needed for a given radio object?
In [21]:
import itertools
import logging
import sys
from astropy.coordinates import SkyCoord
import h5py
import matplotlib.pyplot as plt
import numpy
import sklearn
sys.path.insert(1, '..')
# Needs MongoDB running.
import crowdastro.rgz_analysis.consensus as consensus, crowdastro.rgz_data as data
from crowdastro.consensuses import pg_means
ARCMIN = 1 / 60
CROWDASTRO_H5_PATH = '../data/crowdastro.h5'
IMAGE_SIZE = 200 * 200
NORRIS_DAT_PATH = '../data/norris_2006_atlas_classifications_ra_dec_only.dat'
TRAINING_H5_PATH = '../data/training.h5'
In [9]:
n_kde_success = 0
n_kde_failure_not_caught = 0
n_kde_failure_caught = 0
n_kde_total = 0
for subject in data.get_all_subjects(survey='atlas', field='cdfs').limit(2000):
c = consensus.consensus(subject['zooniverse_id'], None)
for answer in c['answer'].values():
if 'ir_peak' in answer and answer['peak_data']['npeaks'] < 10:
n_kde_success += 1
elif 'ir_peak' in answer:
n_kde_failure_not_caught += 1
elif 'ir' in answer:
n_kde_failure_caught += 1
n_kde_total += 1
In [10]:
print('KDE succeeded: {:.02%}'.format(n_kde_success / n_kde_total))
print('KDE failed (caught, mean used): {:.02%}'.format(n_kde_failure_caught / n_kde_total))
print('KDE failed (not caught): {:.02%}'.format(n_kde_failure_not_caught / n_kde_total))
In [50]:
n_pg_success = 0
n_pg_failure = 0
n_pg_total = 0
# Taken mostly from crowdastro.consensuses.find_consensuses.
with h5py.File(CROWDASTRO_H5_PATH, 'r') as f_h5:
class_positions = f_h5['/atlas/cdfs/classification_positions']
class_combinations = f_h5['/atlas/cdfs/classification_combinations']
ir_coords = f_h5['/{}/cdfs/numeric'.format('wise')][:, :2]
pos_groups = itertools.groupby(class_positions, key=lambda z: z[0])
com_groups = itertools.groupby(class_combinations, key=lambda z: z['index'])
for (i, pos_group), (j, com_group) in zip(pos_groups, com_groups):
assert i == j
com_group = list(com_group)
pos_group = list(pos_group)
total_classifications = 0
radio_counts = {}
for _, full_com, _ in com_group:
count = radio_counts.get(full_com, 0)
count += 1 / (full_com.count(b'|') + 1)
radio_counts[full_com] = count
total_classifications += 1 / (full_com.count(b'|') + 1)
radio_consensus = max(radio_counts, key=radio_counts.get)
for radio_signature in radio_consensus.split(b'|'):
n_pg_total += 1
percentage_consensus = (radio_counts[radio_consensus] /
total_classifications)
locations = []
for (_, x, y), (_, full, radio) in zip(pos_group, com_group):
if full == radio_consensus and radio == radio_signature:
locations.append((x, y))
locations = numpy.array(locations)
locations = locations[~numpy.all(numpy.isnan(locations), axis=1)]
(x, y), success = pg_means(locations)
if not success:
n_pg_failure += 1
else:
n_pg_success += 1
if numpy.isnan(x) or numpy.isnan(y):
continue
In [51]:
print('PG-means succeeded: {:.02%}'.format(n_pg_success / n_pg_total))
print('PG-means failed (caught): {:.02%}'.format(n_pg_failure / n_pg_total))
In [44]:
def lowest_bic(locations):
min_bic = float('inf')
min_gmm = None
for k in range(1, 5): # Assume no more than 5 candidate objects. Probably reasonable given ~20 clicks max.
gmm = sklearn.mixture.GMM(n_components=k, covariance_type='full')
try:
gmm.fit(locations)
except ValueError:
break
bic = gmm.bic(locations)
if bic < min_bic:
min_bic = bic
min_gmm = gmm
if not min_gmm:
return locations.mean(axis=0), False, 'mean'
if sum(w == max(min_gmm.weights_) for w in min_gmm.weights_) > 1:
success = False
reason = 'low_redundancy'
else:
success = True
reason = ''
return min_gmm.means_[min_gmm.weights_.argmax()], success, reason
In [48]:
n_bic_success = 0
n_bic_failure_mean = 0
n_bic_failure_tie = 0
n_bic_total = 0
# Taken mostly from crowdastro.consensuses.find_consensuses.
with h5py.File(CROWDASTRO_H5_PATH, 'r') as f_h5:
class_positions = f_h5['/atlas/cdfs/classification_positions']
class_combinations = f_h5['/atlas/cdfs/classification_combinations']
ir_coords = f_h5['/{}/cdfs/numeric'.format('wise')][:, :2]
pos_groups = itertools.groupby(class_positions, key=lambda z: z[0])
com_groups = itertools.groupby(class_combinations, key=lambda z: z['index'])
for (i, pos_group), (j, com_group) in zip(pos_groups, com_groups):
assert i == j
com_group = list(com_group)
pos_group = list(pos_group)
total_classifications = 0
radio_counts = {}
for _, full_com, _ in com_group:
count = radio_counts.get(full_com, 0)
count += 1 / (full_com.count(b'|') + 1)
radio_counts[full_com] = count
total_classifications += 1 / (full_com.count(b'|') + 1)
radio_consensus = max(radio_counts, key=radio_counts.get)
for radio_signature in radio_consensus.split(b'|'):
n_bic_total += 1
percentage_consensus = (radio_counts[radio_consensus] /
total_classifications)
locations = []
for (_, x, y), (_, full, radio) in zip(pos_group, com_group):
if full == radio_consensus and radio == radio_signature:
locations.append((x, y))
locations = numpy.array(locations)
locations = locations[~numpy.all(numpy.isnan(locations), axis=1)]
(x, y), success, reason = lowest_bic(locations)
if not success and reason == 'mean':
n_bic_failure_mean += 1
elif not success and reason == 'low_redundancy':
n_bic_failure_tie += 1
elif not success and reason:
raise ValueError('Unknown failure reason: {}'.format(reason))
else:
n_bic_success += 1
if numpy.isnan(x) or numpy.isnan(y):
continue
In [49]:
print('BIC succeeded: {:.02%}'.format(n_bic_success / n_bic_total))
print('BIC failed (caught, mean used): {:.02%}'.format(n_bic_failure_mean / n_bic_total))
print('BIC failed (caught, best guess): {:.02%}'.format(n_bic_failure_tie / n_bic_total))
This is a little misleading. The BIC method, since it uses Gaussian fitting, may fit a single Gaussian to a few points if it's the best fit it can find, reducing it to the mean. Let's compare the labels to Norris et al.
In [22]:
from crowdastro.consensuses import lowest_bic_gmm
In [ ]:
with h5py.File(CROWDASTRO_H5_PATH, 'r') as f_h5:
# Copied from crowdastro.consensuses.
class_positions = f_h5['/atlas/cdfs/classification_positions']
class_combinations = f_h5['/atlas/cdfs/classification_combinations']
assert len(class_positions) == len(class_combinations)
logging.debug('Finding consensuses for %d classifications.',
len(class_combinations))
# Pre-build the IR tree.
ir_coords = f_h5['/{}/cdfs/numeric'.format('swire')][:, :2]
ir_tree = sklearn.neighbors.KDTree(ir_coords)
cons_positions = []
cons_combinations = []
# Data integrity and assumptions checks.
assert numpy.array_equal(class_positions[:, 0],
class_combinations['index'])
assert numpy.array_equal(class_positions[:, 0],
sorted(class_positions[:, 0]))
pos_groups = itertools.groupby(class_positions, key=lambda z: z[0])
com_groups = itertools.groupby(class_combinations, key=lambda z: z['index'])
for (i, pos_group), (j, com_group) in zip(pos_groups, com_groups):
assert i == j
com_group = list(com_group) # For multiple iterations.
pos_group = list(pos_group)
total_classifications = 0
# Find the radio consensus. Be wary when counting: If there are multiple
# AGNs identified in one subject, *that classification will appear
# multiple times*. I'm going to deal with this by dividing the weight of
# each classification by how many pipes it contains plus one.
radio_counts = {} # Radio signature -> Count
for _, full_com, _ in com_group:
count = radio_counts.get(full_com, 0)
count += 1 / (full_com.count(b'|') + 1)
radio_counts[full_com] = count
total_classifications += 1 / (full_com.count(b'|') + 1)
for count in radio_counts.values():
# Despite the divisions, we should end up with integers overall.
assert numpy.isclose(round(count), count)
assert numpy.isclose(round(total_classifications),
total_classifications)
radio_consensus = max(radio_counts, key=radio_counts.get)
# Find the location consensus. For each radio combination, run a
# location consensus function on the positions associated with that
# combination.
for radio_signature in radio_consensus.split(b'|'):
percentage_consensus = (radio_counts[radio_consensus] /
total_classifications)
locations = []
for (_, x, y), (_, full, radio) in zip(pos_group, com_group):
if full == radio_consensus and radio == radio_signature:
locations.append((x, y))
locations = numpy.array(locations)
locations = locations[~numpy.all(numpy.isnan(locations), axis=1)]
(x, y), success = lowest_bic_gmm(locations)
if numpy.isnan(x) or numpy.isnan(y):
logging.debug('Skipping NaN PG-means output.')
continue
# Match the (x, y) position to an IR object.
dist, ind = ir_tree.query([(x, y)])
cons_positions.append((i, ind[0][0], success))
cons_combinations.append((i, radio_signature, percentage_consensus))
logging.debug('Found %d consensuses (before duplicate removal).',
len(cons_positions))
# Remove duplicates. For training data, I don't really care if radio
# combinations overlap (though I need to care if I generate a catalogue!) so
# just take duplicated locations and pick the one with the highest radio
# consensus that has success.
cons_objects = {} # Maps IR index to (ATLAS index, success,
# percentage_consensus)
for (atlas_i, ir_i, success), (atlas_j, radio, percentage) in zip(
cons_positions, cons_combinations):
assert atlas_i == atlas_j
if ir_i not in cons_objects:
cons_objects[ir_i] = (atlas_i, success, percentage)
continue
if cons_objects[ir_i][1] and not success:
# Preference successful KDE/PG-means.
continue
if not cons_objects[ir_i][1] and success:
# Preference successful KDE/PG-means.
cons_objects[ir_i] = (atlas_i, success, percentage)
continue
# If we get this far, we have the same success state. Choose based on
# radio consensus.
if percentage > cons_objects[ir_i][2]:
cons_objects[ir_i] = (atlas_i, success, percentage)
continue
logging.debug('Found %d consensuses.', int(len(cons_objects)))
cons_objects = numpy.array([(atlas_i, ir_i, success, percentage)
for ir_i, (atlas_i, success, percentage)
in sorted(cons_objects.items())])
In [ ]:
# Load Norris labels.
with h5py.File(TRAINING_H5_PATH, 'r') as training_h5:
crowdsourced_labels = training_h5['labels'].value
with h5py.File(CROWDASTRO_H5_PATH, 'r') as crowdastro_h5:
ir_names = crowdastro_h5['/swire/cdfs/string'].value
ir_positions = crowdastro_h5['/swire/cdfs/numeric'].value[:, :2]
ir_tree = sklearn.neighbors.KDTree(ir_positions)
with open(NORRIS_DAT_PATH, 'r') as norris_dat:
norris_coords = [r.strip().split('|') for r in norris_dat]
norris_labels = numpy.zeros((len(ir_positions)))
for ra, dec in norris_coords:
# Find a neighbour.
skycoord = SkyCoord(ra=ra, dec=dec, unit=('hourangle', 'deg'))
ra = skycoord.ra.degree
dec = skycoord.dec.degree
((dist,),), ((ir,),) = ir_tree.query([(ra, dec)])
if dist < 0.1:
norris_labels[ir] = 1
In [ ]:
# Convert radio labels into IR labels.
bic_labels = numpy.zeros(norris_labels.shape)
for _, ir_i, _, _ in cons_objects:
bic_labels[ir_i] = 1
In [ ]:
sklearn.metrics.confusion_matrix(bic_labels, norris_labels)
In [ ]:
sklearn.metrics.confusion_matrix(crowdsourced_labels, norris_labels)
In [ ]: