Redundancy and KDE/PG-means

Is the redundancy high enough on the Radio Galaxy Zoo website? Do PG-means or KDE handle low redundancy cases differently? How often does taking the mean give an accurate or useful result (based on Norris labels)? If PG-means and KDE disagree, what do the disagreements look like? If there are similar results for both, which is faster?

Bonus question: Can we model the redundancy needed for a given radio object?


In [21]:
import itertools
import logging
import sys

from astropy.coordinates import SkyCoord
import h5py
import matplotlib.pyplot as plt
import numpy
import sklearn

sys.path.insert(1, '..')
# Needs MongoDB running.
import crowdastro.rgz_analysis.consensus as consensus, crowdastro.rgz_data as data
from crowdastro.consensuses import pg_means

ARCMIN = 1 / 60
CROWDASTRO_H5_PATH = '../data/crowdastro.h5'
IMAGE_SIZE = 200 * 200
NORRIS_DAT_PATH = '../data/norris_2006_atlas_classifications_ra_dec_only.dat'
TRAINING_H5_PATH = '../data/training.h5'

How often does KDE fail?


In [9]:
n_kde_success = 0
n_kde_failure_not_caught = 0
n_kde_failure_caught = 0
n_kde_total = 0

for subject in data.get_all_subjects(survey='atlas', field='cdfs').limit(2000):
    c = consensus.consensus(subject['zooniverse_id'], None)
    for answer in c['answer'].values():
        if 'ir_peak' in answer and answer['peak_data']['npeaks'] < 10:
            n_kde_success += 1
        elif 'ir_peak' in answer:
            n_kde_failure_not_caught += 1
        elif 'ir' in answer:
            n_kde_failure_caught += 1
        n_kde_total += 1

In [10]:
print('KDE succeeded: {:.02%}'.format(n_kde_success / n_kde_total))
print('KDE failed (caught, mean used): {:.02%}'.format(n_kde_failure_caught / n_kde_total))
print('KDE failed (not caught): {:.02%}'.format(n_kde_failure_not_caught / n_kde_total))


KDE succeeded: 90.42%
KDE failed (caught, mean used): 8.88%
KDE failed (not caught): 0.68%

How often does PG-means fail?


In [50]:
n_pg_success = 0
n_pg_failure = 0
n_pg_total = 0

# Taken mostly from crowdastro.consensuses.find_consensuses.
with h5py.File(CROWDASTRO_H5_PATH, 'r') as f_h5:
    class_positions = f_h5['/atlas/cdfs/classification_positions']
    class_combinations = f_h5['/atlas/cdfs/classification_combinations']
    ir_coords = f_h5['/{}/cdfs/numeric'.format('wise')][:, :2]
    pos_groups = itertools.groupby(class_positions, key=lambda z: z[0])
    com_groups = itertools.groupby(class_combinations, key=lambda z: z['index'])
    for (i, pos_group), (j, com_group) in zip(pos_groups, com_groups):
        assert i == j
        com_group = list(com_group)
        pos_group = list(pos_group)
        total_classifications = 0
        radio_counts = {}
        for _, full_com, _ in com_group:
            count = radio_counts.get(full_com, 0)
            count += 1 / (full_com.count(b'|') + 1)
            radio_counts[full_com] = count
            total_classifications += 1 / (full_com.count(b'|') + 1)
        radio_consensus = max(radio_counts, key=radio_counts.get)
        for radio_signature in radio_consensus.split(b'|'):
            n_pg_total += 1
            percentage_consensus = (radio_counts[radio_consensus] /
                                    total_classifications)
            locations = []
            for (_, x, y), (_, full, radio) in zip(pos_group, com_group):
                if full == radio_consensus and radio == radio_signature:
                    locations.append((x, y))
            locations = numpy.array(locations)
            locations = locations[~numpy.all(numpy.isnan(locations), axis=1)]
            (x, y), success = pg_means(locations)
            
            if not success:
                n_pg_failure += 1
            else:
                n_pg_success += 1

            if numpy.isnan(x) or numpy.isnan(y):
                continue

In [51]:
print('PG-means succeeded: {:.02%}'.format(n_pg_success / n_pg_total))
print('PG-means failed (caught): {:.02%}'.format(n_pg_failure / n_pg_total))


PG-means succeeded: 13.01%
PG-means failed (caught): 86.99%

That's a surprisingly high number of failures. Most likely, the problem is a parameter problem, but instead of fiddling with parameters, let's just use a simpler method.

Lowest BIC GMM


In [44]:
def lowest_bic(locations):
    min_bic = float('inf')
    min_gmm = None
    for k in range(1, 5):  # Assume no more than 5 candidate objects. Probably reasonable given ~20 clicks max.
        gmm = sklearn.mixture.GMM(n_components=k, covariance_type='full')
        try:
            gmm.fit(locations)
        except ValueError:
            break
        bic = gmm.bic(locations)
        if bic < min_bic:
            min_bic = bic
            min_gmm = gmm
    
    if not min_gmm:
        return locations.mean(axis=0), False, 'mean'
    
    if sum(w == max(min_gmm.weights_) for w in min_gmm.weights_) > 1:
        success = False
        reason = 'low_redundancy'
    else:
        success = True
        reason = ''
    
    return min_gmm.means_[min_gmm.weights_.argmax()], success, reason

In [48]:
n_bic_success = 0
n_bic_failure_mean = 0
n_bic_failure_tie = 0
n_bic_total = 0

# Taken mostly from crowdastro.consensuses.find_consensuses.
with h5py.File(CROWDASTRO_H5_PATH, 'r') as f_h5:
    class_positions = f_h5['/atlas/cdfs/classification_positions']
    class_combinations = f_h5['/atlas/cdfs/classification_combinations']
    ir_coords = f_h5['/{}/cdfs/numeric'.format('wise')][:, :2]
    pos_groups = itertools.groupby(class_positions, key=lambda z: z[0])
    com_groups = itertools.groupby(class_combinations, key=lambda z: z['index'])
    for (i, pos_group), (j, com_group) in zip(pos_groups, com_groups):
        assert i == j
        com_group = list(com_group)
        pos_group = list(pos_group)
        total_classifications = 0
        radio_counts = {}
        for _, full_com, _ in com_group:
            count = radio_counts.get(full_com, 0)
            count += 1 / (full_com.count(b'|') + 1)
            radio_counts[full_com] = count
            total_classifications += 1 / (full_com.count(b'|') + 1)
        radio_consensus = max(radio_counts, key=radio_counts.get)
        for radio_signature in radio_consensus.split(b'|'):
            n_bic_total += 1
            percentage_consensus = (radio_counts[radio_consensus] /
                                    total_classifications)
            locations = []
            for (_, x, y), (_, full, radio) in zip(pos_group, com_group):
                if full == radio_consensus and radio == radio_signature:
                    locations.append((x, y))
            locations = numpy.array(locations)
            locations = locations[~numpy.all(numpy.isnan(locations), axis=1)]
            (x, y), success, reason = lowest_bic(locations)
            
            if not success and reason == 'mean':
                n_bic_failure_mean += 1
            elif not success and reason == 'low_redundancy':
                n_bic_failure_tie += 1
            elif not success and reason:
                raise ValueError('Unknown failure reason: {}'.format(reason))
            else:
                n_bic_success += 1

            if numpy.isnan(x) or numpy.isnan(y):
                continue

In [49]:
print('BIC succeeded: {:.02%}'.format(n_bic_success / n_bic_total))
print('BIC failed (caught, mean used): {:.02%}'.format(n_bic_failure_mean / n_bic_total))
print('BIC failed (caught, best guess): {:.02%}'.format(n_bic_failure_tie / n_bic_total))


BIC succeeded: 99.75%
BIC failed (caught, mean used): 0.25%
BIC failed (caught, best guess): 0.00%

This is a little misleading. The BIC method, since it uses Gaussian fitting, may fit a single Gaussian to a few points if it's the best fit it can find, reducing it to the mean. Let's compare the labels to Norris et al.


In [22]:
from crowdastro.consensuses import lowest_bic_gmm

In [ ]:
with h5py.File(CROWDASTRO_H5_PATH, 'r') as f_h5:
    # Copied from crowdastro.consensuses.
    class_positions = f_h5['/atlas/cdfs/classification_positions']
    class_combinations = f_h5['/atlas/cdfs/classification_combinations']
    assert len(class_positions) == len(class_combinations)

    logging.debug('Finding consensuses for %d classifications.',
                  len(class_combinations))

    # Pre-build the IR tree.
    ir_coords = f_h5['/{}/cdfs/numeric'.format('swire')][:, :2]
    ir_tree = sklearn.neighbors.KDTree(ir_coords)

    cons_positions = []
    cons_combinations = []

    # Data integrity and assumptions checks.
    assert numpy.array_equal(class_positions[:, 0],
                             class_combinations['index'])
    assert numpy.array_equal(class_positions[:, 0],
                             sorted(class_positions[:, 0]))

    pos_groups = itertools.groupby(class_positions, key=lambda z: z[0])
    com_groups = itertools.groupby(class_combinations, key=lambda z: z['index'])

    for (i, pos_group), (j, com_group) in zip(pos_groups, com_groups):
        assert i == j

        com_group = list(com_group)  # For multiple iterations.
        pos_group = list(pos_group)
        total_classifications = 0

        # Find the radio consensus. Be wary when counting: If there are multiple
        # AGNs identified in one subject, *that classification will appear
        # multiple times*. I'm going to deal with this by dividing the weight of
        # each classification by how many pipes it contains plus one.
        radio_counts = {}  # Radio signature -> Count
        for _, full_com, _ in com_group:
            count = radio_counts.get(full_com, 0)
            count += 1 / (full_com.count(b'|') + 1)
            radio_counts[full_com] = count

            total_classifications += 1 / (full_com.count(b'|') + 1)

        for count in radio_counts.values():
            # Despite the divisions, we should end up with integers overall.
            assert numpy.isclose(round(count), count)
        assert numpy.isclose(round(total_classifications),
                             total_classifications)

        radio_consensus = max(radio_counts, key=radio_counts.get)

        # Find the location consensus. For each radio combination, run a
        # location consensus function on the positions associated with that
        # combination.
        for radio_signature in radio_consensus.split(b'|'):
            percentage_consensus = (radio_counts[radio_consensus] /
                                    total_classifications)
            locations = []
            for (_, x, y), (_, full, radio) in zip(pos_group, com_group):
                if full == radio_consensus and radio == radio_signature:
                    locations.append((x, y))
            locations = numpy.array(locations)
            locations = locations[~numpy.all(numpy.isnan(locations), axis=1)]
            (x, y), success = lowest_bic_gmm(locations)

            if numpy.isnan(x) or numpy.isnan(y):
                logging.debug('Skipping NaN PG-means output.')
                continue

            # Match the (x, y) position to an IR object.
            dist, ind = ir_tree.query([(x, y)])

            cons_positions.append((i, ind[0][0], success))
            cons_combinations.append((i, radio_signature, percentage_consensus))

    logging.debug('Found %d consensuses (before duplicate removal).',
                  len(cons_positions))

    # Remove duplicates. For training data, I don't really care if radio
    # combinations overlap (though I need to care if I generate a catalogue!) so
    # just take duplicated locations and pick the one with the highest radio
    # consensus that has success.
    cons_objects = {}  # Maps IR index to (ATLAS index, success,
                       #                   percentage_consensus)
    for (atlas_i, ir_i, success), (atlas_j, radio, percentage) in zip(
            cons_positions, cons_combinations):
        assert atlas_i == atlas_j

        if ir_i not in cons_objects:
            cons_objects[ir_i] = (atlas_i, success, percentage)
            continue

        if cons_objects[ir_i][1] and not success:
            # Preference successful KDE/PG-means.
            continue

        if not cons_objects[ir_i][1] and success:
            # Preference successful KDE/PG-means.
            cons_objects[ir_i] = (atlas_i, success, percentage)
            continue

        # If we get this far, we have the same success state. Choose based on
        # radio consensus.
        if percentage > cons_objects[ir_i][2]:
            cons_objects[ir_i] = (atlas_i, success, percentage)
            continue

    logging.debug('Found %d consensuses.', int(len(cons_objects)))

    cons_objects = numpy.array([(atlas_i, ir_i, success, percentage)
            for ir_i, (atlas_i, success, percentage)
            in sorted(cons_objects.items())])

In [ ]:
# Load Norris labels.
with h5py.File(TRAINING_H5_PATH, 'r') as training_h5:
    crowdsourced_labels = training_h5['labels'].value

with h5py.File(CROWDASTRO_H5_PATH, 'r') as crowdastro_h5:
    ir_names = crowdastro_h5['/swire/cdfs/string'].value
    ir_positions = crowdastro_h5['/swire/cdfs/numeric'].value[:, :2]
ir_tree = sklearn.neighbors.KDTree(ir_positions)

with open(NORRIS_DAT_PATH, 'r') as norris_dat:
    norris_coords = [r.strip().split('|') for r in norris_dat]

norris_labels = numpy.zeros((len(ir_positions)))
for ra, dec in norris_coords:
    # Find a neighbour.
    skycoord = SkyCoord(ra=ra, dec=dec, unit=('hourangle', 'deg'))
    ra = skycoord.ra.degree
    dec = skycoord.dec.degree
    ((dist,),), ((ir,),) = ir_tree.query([(ra, dec)])
    if dist < 0.1:
        norris_labels[ir] = 1

In [ ]:
# Convert radio labels into IR labels.
bic_labels = numpy.zeros(norris_labels.shape)
for _, ir_i, _, _ in cons_objects:
    bic_labels[ir_i] = 1

In [ ]:
sklearn.metrics.confusion_matrix(bic_labels, norris_labels)

In [ ]:
sklearn.metrics.confusion_matrix(crowdsourced_labels, norris_labels)

In [ ]: