Training Data

In this notebook, I will try to assemble training data pairs: the inputs are subjects from the Radio Galaxy Zoo database together with potential hosts from the associated IR image, and the outputs are classifications.


In [41]:
import os.path
import pprint
import sys

import astropy.io.fits
import matplotlib.colors
import matplotlib.pyplot
import numpy
import pymongo
import requests
import scipy.ndimage.filters
import scipy.ndimage.measurements
import sklearn.cross_validation
import sklearn.decomposition
import sklearn.ensemble
import sklearn.linear_model
import sklearn.neural_network
import sklearn.svm

sys.path.insert(1, '..')
import crowdastro.rgz_analysis.consensus

%matplotlib inline
matplotlib.pyplot.rcParams['image.cmap'] = 'gray'

HOST = 'localhost'
PORT = 27017
DB_NAME = 'radio'

DATA_PATH = os.path.join('..', 'data')
ATLAS_CATALOGUE_PATH = os.path.join(DATA_PATH, 'ATLASDR3_cmpcat_23July2015.dat')
TILE_SIZE = '2x2'

FITS_IMAGE_WIDTH = 200
FITS_IMAGE_HEIGHT = 200

CLICK_IMAGE_WIDTH = 500
CLICK_IMAGE_HEIGHT = 500

CLICK_TO_FITS_X = FITS_IMAGE_WIDTH / CLICK_IMAGE_WIDTH
CLICK_TO_FITS_Y = FITS_IMAGE_HEIGHT / CLICK_IMAGE_HEIGHT
CLICK_TO_FITS = numpy.array([CLICK_TO_FITS_X, CLICK_TO_FITS_Y])

# Set up MongoDB.
client = pymongo.MongoClient(HOST, PORT)
db = client[DB_NAME]

"Simple" subjects

My first task is to select what I think would be a simple set of subjects. In the fits-format notebook, I found that about 30% of ATLAS subjects have just one set of radio contours.

I want to pull out all of these and use them as the training subjects. It's a lot easier to look for just the subjects that have contour_count = 1: the number of contours seems to be mostly unrelated to the number of radio sources, but if there's only one contour, there should be only one source. The benefit of doing things this way is that I can ignore the classifications collection for a bit.


In [15]:
subjects = list(db.radio_subjects.find({'metadata.survey': 'atlas', 'state': 'complete', 'metadata.contour_count': 1}))
print('Found {} subjects.'.format(len(subjects)))


Found 128 subjects.

That's a lot fewer than ideal (and fewer than expected), but we can fix this later. Let's have a look at some.


In [16]:
def open_fits(subject, field, wavelength):
    """Opens a FITS image.
    
    subject: RGZ subject.
    field: 'elais' or 'cdfs'.
    wavelength: 'ir' or 'radio'.
    -> FITS image file handle.
    """
    if field not in {'elais', 'cdfs'}:
        raise ValueError('field must be either "elais" or "cdfs".')

    if wavelength not in {'ir', 'radio'}:
        raise ValueError('wavelength must be either "ir" or "radio".')
    
    assert subject['metadata']['survey'] == 'atlas', 'Subject not from ATLAS survey.'

    cid = subject['metadata']['source']
    filename = '{}_{}.fits'.format(cid, wavelength)
    path = os.path.join(DATA_PATH, field, TILE_SIZE, filename)
    
    return astropy.io.fits.open(path, ignore_blank=True)

def plot_contours(subject, colour='green'):
    """Plots a subject's radio contours, fetched from the RGZ API."""
    uri = subject['location']['contours']
    contours = requests.get(uri).json()['contours']
    for row in contours:
        for col in row:
            xs = []
            ys = []
            for pair in col['arr']:
                xs.append(pair['x'])
                ys.append(pair['y'])
            # Contour y coordinates are top-down, so flip them to match the image.
            matplotlib.pyplot.plot(xs, FITS_IMAGE_HEIGHT - numpy.array(ys), c=colour)

def imshow(im, contrast=0.05):
    """Helper function for showing an image."""
    # Shift the image so its minimum is `contrast`, then display on a log scale.
    im = im - im.min() + contrast
    return matplotlib.pyplot.imshow(im,
            origin='lower',
            norm=matplotlib.colors.LogNorm(
                    vmin=im.min(),
                    vmax=im.max(),
            ),
    )

def show_subject(subject):
    with open_fits(subject, 'cdfs', 'ir') as fits_file:
        ir = fits_file[0].data
    
    with open_fits(subject, 'cdfs', 'radio') as fits_file:
        radio = fits_file[0].data
    
    matplotlib.pyplot.figure(figsize=(15, 15))
    matplotlib.pyplot.subplot(1, 2, 1)
    matplotlib.pyplot.title(subject['zooniverse_id'] + ' IR')
    matplotlib.pyplot.xlim((0, FITS_IMAGE_WIDTH))
    matplotlib.pyplot.ylim((0, FITS_IMAGE_HEIGHT))
    imshow(ir)
    plot_contours(subject)
    matplotlib.pyplot.subplot(1, 2, 2)
    matplotlib.pyplot.title(subject['zooniverse_id'] + ' Radio')
    matplotlib.pyplot.xlim((0, FITS_IMAGE_WIDTH))
    matplotlib.pyplot.ylim((0, FITS_IMAGE_HEIGHT))
    imshow(radio)
    plot_contours(subject)

In [17]:
show_subject(subjects[10])


K:\Languages\Anaconda3\lib\site-packages\astropy\io\fits\util.py:578: UserWarning: Could not find appropriate MS Visual C Runtime library or library is corrupt/misconfigured; cannot determine whether your file object was opened in append mode.  Please consider using a file object opened in write mode instead.
  'Could not find appropriate MS Visual C Runtime '

Potential hosts

Since we're representing this as a binary classification problem, let's get all the potential hosts in an image using the method from the potential_host_counting notebook. This is not ideal — it includes far too many hosts — but it'll do for now.


In [18]:
def potential_hosts(subject, sigma=0.5, threshold=0):
    with open_fits(subject, 'cdfs', 'ir') as fits_file:
        ir = fits_file[0].data

    # Blur the IR image, threshold it, and find local maxima in 10 x 10 neighbourhoods.
    neighborhood = numpy.ones((10, 10))
    blurred_ir = scipy.ndimage.filters.gaussian_filter(ir, sigma) > threshold
    local_max = scipy.ndimage.filters.maximum_filter(blurred_ir, footprint=neighborhood) == blurred_ir
    # Label each connected region of maxima and take its centroid as a potential host.
    region_labels, n_labels = scipy.ndimage.measurements.label(local_max)
    maxima = numpy.array(
            [numpy.array((region_labels == i + 1).nonzero()).T.mean(axis=0)
             for i in range(n_labels)]
    )
    # Discard maxima on the left/right image edges.
    maxima = maxima[numpy.logical_and(maxima[:, 1] != 0, maxima[:, 1] != FITS_IMAGE_WIDTH - 1)]
    return maxima

In [19]:
with open_fits(subjects[10], 'cdfs', 'ir') as fits_file:
    ir = fits_file[0].data

matplotlib.pyplot.figure(figsize=(15, 15))
matplotlib.pyplot.subplot(1, 2, 1)
matplotlib.pyplot.title(subjects[10]['zooniverse_id'] + ' IR')
matplotlib.pyplot.xlim((0, FITS_IMAGE_WIDTH))
matplotlib.pyplot.ylim((0, FITS_IMAGE_HEIGHT))
imshow(ir)

maxima = potential_hosts(subjects[10], sigma=1, threshold=0.05)
matplotlib.pyplot.scatter(maxima[:, 1], maxima[:, 0])

matplotlib.pyplot.show()



This is not a fantastic result, but it will do for now. Julie said that the rgz-analysis code found peaks through Gaussian fitting. I can't find the code for that, but I can use the idea later to get better potential hosts.
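
For later reference, here's a rough sketch of what Gaussian-fit peak finding might look like: fit a circular 2D Gaussian to a small cutout around each candidate maximum and take the fitted centre as the refined peak. This is only my guess at the idea, not the actual rgz-analysis code; gaussian_2d and refine_peak are hypothetical helpers, and the cutout size and initial guess are arbitrary.

In [ ]:
import scipy.optimize

def gaussian_2d(coords, amplitude, x0, y0, sigma):
    """Circular 2D Gaussian, flattened so curve_fit can use it."""
    x, y = coords
    return amplitude * numpy.exp(-((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2))

def refine_peak(im, x, y, size=5):
    """Refines a candidate peak at (x, y) by fitting a Gaussian to a cutout.

    Assumes the peak is at least `size` pixels from the image edge.
    """
    x, y = int(x), int(y)
    cutout = im[y - size : y + size + 1, x - size : x + size + 1]
    ys, xs = numpy.mgrid[-size : size + 1, -size : size + 1]
    p0 = (cutout.max(), 0, 0, 1)  # Initial guess: a narrow peak at the cutout centre.
    try:
        popt, _ = scipy.optimize.curve_fit(
                gaussian_2d, (xs.ravel(), ys.ravel()), cutout.ravel(), p0=p0)
    except RuntimeError:
        return x, y  # The fit failed to converge; fall back to the pixel maximum.
    return x + popt[1], y + popt[2]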

Crowdsourced labels

We also need to retrieve the labels for each subject. I'll use the rgz_analysis.consensus code for that.


In [20]:
def crowdsourced_label(subject):
    answers = crowdastro.rgz_analysis.consensus.consensus(subject['zooniverse_id'])['answer']
    answer = [answer for answer in answers.values() if answer['ind'] == 0][0]
    
    if 'ir' in answer:
        return answer['ir']
    
    if 'ir_peak' in answer:
        return answer['ir_peak']
    
    return None

In [21]:
with open_fits(subjects[10], 'cdfs', 'ir') as fits_file:
    ir = fits_file[0].data

matplotlib.pyplot.figure(figsize=(15, 15))
matplotlib.pyplot.subplot(1, 2, 1)
matplotlib.pyplot.title(subjects[10]['zooniverse_id'] + ' IR')
matplotlib.pyplot.xlim((0, FITS_IMAGE_WIDTH))
matplotlib.pyplot.ylim((0, FITS_IMAGE_HEIGHT))
imshow(ir)

maxima = potential_hosts(subjects[10], sigma=1, threshold=0.05)
matplotlib.pyplot.scatter(maxima[:, 1], maxima[:, 0])

label = crowdsourced_label(subjects[10])
# Clicks are upside-down, whereas the image and peaks found from it are not.
matplotlib.pyplot.scatter([CLICK_TO_FITS_X * label[0]], [FITS_IMAGE_HEIGHT - CLICK_TO_FITS_Y * label[1]], c='r')

matplotlib.pyplot.show()



That seems a reasonable answer.

Assembling the data

We now have

  • IR images
  • Radio contours
  • Radio images
  • A single point to classify
  • A way to label the points

That's effectively all we need. I want to throw all of this into logistic regression. What I'll do is get a neighbourhood of pixels around the potential host, do the same for the radio image, and naïvely throw it all into scikit-learn. This will almost certainly be ineffective, but it's a start.

Edit, 27/03/2016: According to the results of mean_images, the IR image doesn't really matter. We can quite possibly just ignore it for now, and I do this below.


In [83]:
def get_training_pairs(subject):
    with open_fits(subject, 'cdfs', 'ir') as fits_file:
        ir = fits_file[0].data

    with open_fits(subject, 'cdfs', 'radio') as fits_file:
        radio = fits_file[0].data
    
    radius = 20
    ir = numpy.pad(ir, radius, mode='linear_ramp')
    radio = numpy.pad(radio, radius, mode='linear_ramp')

    hosts = potential_hosts(subject, sigma=1, threshold=0.05)

    actual_host = crowdsourced_label(subject)

    if actual_host is None:
        return []

    actual_host = numpy.array(actual_host) * CLICK_TO_FITS

    nearest_host = min(hosts, key=lambda host: numpy.hypot(actual_host[0] - host[1], actual_host[1] - host[0]))
    
    pairs = []
    for host in hosts:
        host_y, host_x = int(host[0]), int(host[1])  # Cast to int for array indexing.
        # The IR neighbourhood is currently unused (see the mean_images note above).
        ir_neighbourhood = ir[host_x : host_x + 2 * radius, host_y : host_y + 2 * radius]
        radio_neighbourhood = radio[host_x : host_x + 2 * radius, host_y : host_y + 2 * radius]
        input_vec = radio_neighbourhood.flatten()
        label = (nearest_host == host).all()
        pairs.append((input_vec, label))
        
    return pairs

In [84]:
training_data = [pair for subject in subjects for pair in get_training_pairs(subject)]



In [85]:
print('Number of training samples:', len(training_data))


Number of training samples: 9396

Training

Here, I throw the data into logistic regression and see what happens.


In [87]:
xs = [x for x, _ in training_data]
ys = [int(y) for _, y in training_data]

xs_train, xs_test, ys_train, ys_test = sklearn.cross_validation.train_test_split(xs, ys, test_size=0.2, random_state=0)

In [47]:
lr = sklearn.linear_model.LogisticRegression(C=1e5, class_weight='auto')  # Note: 'auto' is deprecated from scikit-learn 0.17 in favour of 'balanced'.
lr.fit(xs_train, ys_train)


K:\Languages\Anaconda3\lib\site-packages\sklearn\utils\class_weight.py:62: DeprecationWarning: The class_weight='auto' heuristic is deprecated in 0.17 in favor of a new heuristic class_weight='balanced'. 'auto' will be removed in 0.19
  " 0.19", DeprecationWarning)
Out[47]:
LogisticRegression(C=100000.0, class_weight='auto', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [51]:
predictions = lr.predict(xs_test)
ys_test_array = numpy.array(ys_test)
n_true_positive = numpy.logical_and(predictions == ys_test_array, ys_test_array == 1).sum()
n_true_negative = numpy.logical_and(predictions == ys_test_array, ys_test_array == 0).sum()
n_false_positive = numpy.logical_and(predictions != ys_test_array, ys_test_array == 0).sum()
n_false_negative = numpy.logical_and(predictions != ys_test_array, ys_test_array == 1).sum()

print('True positives:', n_true_positive)
print('True negatives:', n_true_negative)
print('False positives:', n_false_positive)
print('False negatives:', n_false_negative)


True positives: 26
True negatives: 1332
False positives: 518
False negatives: 4

Originally, the logistic regression had essentially learned to always output False, which makes sense: the examples are overwhelmingly False, so always outputting False reaches a very easy minimum. Some ways around this might be to inflate the number of True examples, or to change the output encoding in some way. Cheng suggested simply weighting logistic regression's cost function to balance the Trues and Falses; scikit-learn's LogisticRegression has a class_weight argument for exactly this. The result is that the model assigns True far more often.
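
For reference, the 'balanced' heuristic that replaces 'auto' weights each class inversely to its frequency, i.e. w_c = n_samples / (n_classes * n_c) for class c. A quick illustrative check of the weights it would assign here (not part of the original analysis):

In [ ]:
# Illustrative only: the per-class weights class_weight='balanced' would use,
# computed as n_samples / (n_classes * n_c).
class_counts = numpy.bincount(ys_train)
class_weights = len(ys_train) / (2 * class_counts)
print('Weight for False: {:.2f}, for True: {:.2f}'.format(class_weights[0], class_weights[1]))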

ConvNet

Let's try a nonlinear model that learns some features.

This doesn't correctly weight the classes, since Keras doesn't support class weights and I haven't weighted the examples manually yet (one possible workaround is sketched below), but it does learn features.
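
A simple manual workaround would be to oversample the positive examples until the classes are roughly balanced. Here's a sketch; oversample_positives is a hypothetical helper I haven't run, not part of the pipeline yet.

In [ ]:
def oversample_positives(xs, ys):
    """Balances the classes by repeating the positive examples (illustrative)."""
    xs, ys = numpy.asarray(xs), numpy.asarray(ys).ravel()
    positives = xs[ys == 1]
    # Repeat the positives until they roughly match the negatives in number.
    n_copies = max((ys == 0).sum() // max((ys == 1).sum(), 1) - 1, 0)
    xs_balanced = numpy.concatenate([xs] + [positives] * n_copies)
    ys_balanced = numpy.concatenate([ys] + [numpy.ones(len(positives), dtype=int)] * n_copies)
    order = numpy.random.permutation(len(ys_balanced))
    return xs_balanced[order], ys_balanced[order]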


In [133]:
import keras.layers
import keras.models

radius = 20
input_shape = (1, radius * 2, radius * 2)
n_conv_filters = 10
conv_width = 4
hidden_dim = 256

model = keras.models.Sequential()
model.add(keras.layers.Convolution2D(n_conv_filters, conv_width, conv_width, border_mode='valid', input_shape=input_shape))
model.add(keras.layers.Activation('relu'))
model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(hidden_dim))
model.add(keras.layers.Activation('sigmoid'))
model.add(keras.layers.Dense(1))
model.add(keras.layers.Activation('sigmoid'))
model.compile(optimizer='sgd', loss='mse')

In [134]:
def get_training_pairs_im(subject):
    with open_fits(subject, 'cdfs', 'ir') as fits_file:
        ir = fits_file[0].data

    with open_fits(subject, 'cdfs', 'radio') as fits_file:
        radio = fits_file[0].data
    
    radius = 20
    ir = numpy.pad(ir, radius, mode='linear_ramp')
    radio = numpy.pad(radio, radius, mode='linear_ramp')

    hosts = potential_hosts(subject, sigma=1, threshold=0.05)

    actual_host = crowdsourced_label(subject)

    if actual_host is None:
        return []

    actual_host = numpy.array(actual_host) * CLICK_TO_FITS

    nearest_host = min(hosts, key=lambda host: numpy.hypot(actual_host[0] - host[1], actual_host[1] - host[0]))
    
    pairs = []
    for host in hosts:
        host_y, host_x = int(host[0]), int(host[1])  # Cast to int for array indexing.
        ir_neighbourhood = ir[host_x : host_x + 2 * radius, host_y : host_y + 2 * radius]
        radio_neighbourhood = radio[host_x : host_x + 2 * radius, host_y : host_y + 2 * radius]
        input_vec = radio_neighbourhood
        label = (nearest_host == host).all()
        pairs.append((input_vec, label))
        
    return pairs
training_data_im = [pair for subject in subjects for pair in get_training_pairs_im(subject)]
xs = [x.reshape((1, radius * 2, radius * 2)) for x, _ in training_data_im]
ys = [[int(y)] for _, y in training_data_im]

xs_train, xs_test, ys_train, ys_test = sklearn.cross_validation.train_test_split(xs, ys, test_size=0.2, random_state=0)



In [136]:
xs_train = numpy.array(xs_train)
ys_train = numpy.array(ys_train)
xs_test = numpy.array(xs_test)
ys_test = numpy.array(ys_test)

In [122]:
tp = []
tn = []
fp = []
fn = []
correct_pos = []
correct_neg = []
total_epochs = 0

In [138]:
import IPython.display

for i in range(10):
    print('Epoch', i + 1)
    model.fit(xs_train, ys_train.reshape((-1, 1)), nb_epoch=1, batch_size=1)
    for j, kernel in enumerate(model.get_weights()[0]):
        kernel = kernel[0]
        matplotlib.pyplot.subplot(10, 10, j + 1)
        matplotlib.pyplot.axis('off')
        matplotlib.pyplot.imshow(kernel, cmap='gray')
        matplotlib.pyplot.subplots_adjust(hspace=0, wspace=0)

    predictions = model.predict(xs_test).round()
    n_true_positive = numpy.logical_and(predictions == ys_test, ys_test == 1).sum()
    n_true_negative = numpy.logical_and(predictions == ys_test, ys_test == 0).sum()
    n_false_positive = numpy.logical_and(predictions != ys_test, ys_test == 0).sum()
    n_false_negative = numpy.logical_and(predictions != ys_test, ys_test == 1).sum()
    tp.append(n_true_positive)
    tn.append(n_true_negative)
    fp.append(n_false_positive)
    fn.append(n_false_negative)
    correct_pos.append(n_true_positive / (n_true_positive + n_false_negative))
    correct_neg.append(n_true_negative / (n_true_negative + n_false_positive))
    total_epochs += 1

    IPython.display.clear_output(wait=True)
    
    print('Convolutional filters:')
    matplotlib.pyplot.show()
#     IPython.display.display(matplotlib.pyplot.gcf())

    print('Model over time:')
    epoch_range = numpy.arange(total_epochs) + 1
    matplotlib.pyplot.plot(epoch_range, correct_pos)
    matplotlib.pyplot.plot(epoch_range, correct_neg)
    matplotlib.pyplot.xlabel('Epochs')
    matplotlib.pyplot.ylabel('Fraction correct')
    matplotlib.pyplot.legend(['Positive', 'Negative'])
    matplotlib.pyplot.show()
#     IPython.display.display(matplotlib.pyplot.gcf())


Convolutional filters:
Model over time:

In [131]:
predictions = model.predict(xs_test).round()
n_true_positive = numpy.logical_and(predictions == ys_test, ys_test == 1).sum()
n_true_negative = numpy.logical_and(predictions == ys_test, ys_test == 0).sum()
n_false_positive = numpy.logical_and(predictions != ys_test, ys_test == 0).sum()
n_false_negative = numpy.logical_and(predictions != ys_test, ys_test == 1).sum()

print('True positives:', n_true_positive)
print('True negatives:', n_true_negative)
print('False positives:', n_false_positive)
print('False negatives:', n_false_negative)


True positives: 10
True negatives: 1844
False positives: 6
False negatives: 20

In [132]:
# TODO: Class weights. Can we fake some data by adding Gaussian noise?
# TODO: IID. The data are not independent - can we use this?
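
A sketch of the Gaussian-noise idea from the first TODO; augment_with_noise is hypothetical, and the noise scale would need tuning against the actual pixel statistics.

In [ ]:
def augment_with_noise(xs, ys, n_copies=5, scale=0.01):
    """Fakes extra positive examples by adding Gaussian noise (illustrative)."""
    xs, ys = numpy.asarray(xs), numpy.asarray(ys).ravel()
    positives = xs[ys == 1]
    noisy = [positives + numpy.random.normal(scale=scale, size=positives.shape)
             for _ in range(n_copies)]
    xs_augmented = numpy.concatenate([xs] + noisy)
    ys_augmented = numpy.concatenate([ys] + [numpy.ones(len(positives), dtype=int)] * n_copies)
    return xs_augmented, ys_augmented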

In [ ]: