Simple Pipeline

This notebook contains the training pipeline for the simple (one-host) data set.

Load training data


In [ ]:
## Imports.

import sys

import h5py
import IPython.core.display
import keras.backend
import keras.models
import numpy

sys.path.insert(1, '..')
import crowdastro.data

## Constants.

TRAINING_DATA_PATH = '../crowdastro-data/training_simple_flipped.h5'
COLUMNS = ['zooniverse_id', 'source', 'x', 'y', 'flux24', 'flux36', 'flux45', 'flux58', 'flux80', 'label']
N_DATA = 84298
N_FEATURES = 160  # 155 (convolutional with PCA) + 5 (astro) dimensional.
CNN_MODEL_PATH = '../crowdastro-data/cnn_model.json'
CNN_WEIGHTS_PATH = '../crowdastro-data/my_cnn_weights.h5'
PCA_PATH = '../crowdastro-data/pca.h5'
RADIUS = 40  # 0.4'
PADDING = 150  # (5' - 2') / 2
GATOR_CACHE = 'gator_cache'
DB_PATH = '../crowdastro-data/processed.db'

In [ ]:
## Load CNN and PCA.

with open(CNN_MODEL_PATH) as f:
    cnn = keras.models.model_from_json(f.read())
cnn.load_weights(CNN_WEIGHTS_PATH)
cnn.compile(optimizer='sgd', loss='mse')
get_convolutional_features_ = keras.backend.function([cnn.layers[0].input], [cnn.layers[4].output])
get_convolutional_features = lambda p: get_convolutional_features_([p])[0].reshape((1, -1))

with h5py.File(PCA_PATH) as f:
    pca = f['conv_2'][:]

## Load training data.

subjects = []
xs = numpy.zeros((N_DATA, N_FEATURES))
ts = numpy.zeros((N_DATA,))
radio_patches = numpy.zeros((100, 1, RADIUS * 2, RADIUS * 2))  # Buffer.

last_zid = None
subject = None
radio = None

with h5py.File(TRAINING_DATA_PATH) as f:
    data = f['data']
    
    for index, datapoint in enumerate(data):
        if index % 100 == 0 and index > 0:
            IPython.core.display.clear_output()
            print('{:.02%}'.format((index + 1) / N_DATA))
            
            # Batch the features.
            convolutional_features = numpy.dot(get_convolutional_features(radio_patches).reshape((100, -1)), pca.T)
            xs[index - 100 : index, :-5] = convolutional_features

        datapoint = tuple(datapoint)
        zooniverse_id = datapoint[0].decode('ascii')
        x = datapoint[2]
        y = datapoint[3]
        astro_features = datapoint[4:9]
        label = datapoint[-1]
        
        # Fetch image patch.
        if last_zid != zooniverse_id:
            subject = crowdastro.data.db.radio_subjects.find_one({'zooniverse_id': zooniverse_id})
            radio = crowdastro.data.get_radio(subject, size='5x5')
            last_zid = zooniverse_id

        patch = radio[x - RADIUS + PADDING : x + RADIUS + PADDING, y - RADIUS + PADDING : y + RADIUS + PADDING]
        
        # Store patch so we can get CNN features later.
        radio_patches[index % 100, 0] = patch
        
        # Store astro features.
        xs[index, -5:] = astro_features
        
        # Store label.
        ts[index] = label

In [ ]:
# Can't just train_test_split - have to make sure we have training and testing *subjects*.
# Assume xs/ts ordered by subject.
# In future, should have separate test subjects!
xs_train = numpy.vstack([xs[:N_DATA//4], xs[-N_DATA//4:]])
xs_test = xs[N_DATA//4:-N_DATA//4]
ts_train = numpy.hstack([ts[:N_DATA//4], ts[-N_DATA//4:]])
ts_test = ts[N_DATA//4:-N_DATA//4]

In [ ]:
# Normalise and scale.
import sklearn.preprocessing

normaliser = sklearn.preprocessing.Normalizer()
scaler = sklearn.preprocessing.StandardScaler().fit(normaliser.transform(xs_train[:, -5:]))
xs_train = numpy.hstack([normaliser.transform(xs_train[:, :-5]),
                         scaler.transform(normaliser.transform(xs_train[:, -5:]))])
xs_test = numpy.hstack([normaliser.transform(xs_test[:, :-5]),
                        scaler.transform(normaliser.transform(xs_test[:, -5:]))])

Testing framework


In [ ]:
from crowdastro.training_data import remove_nans
import crowdastro.config

def test_subject(subject, conn, classifier, stellarity_cutoff=1):
    radio = crowdastro.data.get_radio(subject, size='5x5')
    potential_hosts = crowdastro.data.get_potential_hosts(subject, GATOR_CACHE)
    
    potential_hosts = {i:j for i, j in potential_hosts.items() if j['stell_36'] < 0.75}
    
    cur = conn.cursor()
    consensus = list(cur.execute('SELECT source_x, source_y FROM consensuses_kde WHERE zooniverse_id = ?'
                                 'AND radio_agreement >= 0.5',
                                 [subject['zooniverse_id']]))
    if not consensus:
        raise ValueError('Found null in data assumed sanitised.')
    
    ((cx, cy),) = consensus
    if not cx or not cy:
        raise ValueError('Found null in data assumed sanitised.')
    # Consensuses are inverted vertically w.r.t. the potential hosts.
    cy = crowdastro.config.get('fits_image_height') - cy

    
    n = len(potential_hosts)
    xs = numpy.zeros((n, N_FEATURES))
    radio_patches = numpy.zeros((n, 1, RADIUS * 2, RADIUS * 2))
    ts = numpy.zeros((n,))
    
    points = []
    
    for index, ((x, y), data) in enumerate(potential_hosts.items()):
        patch = radio[x - RADIUS + PADDING : x + RADIUS + PADDING, y - RADIUS + PADDING : y + RADIUS + PADDING]
        radio_patches[index, 0] = patch
        xs[index, -5] = remove_nans(data['flux_ap2_24'])
        xs[index, -4] = remove_nans(data['flux_ap2_36'])
        xs[index, -3] = remove_nans(data['flux_ap2_45'])
        xs[index, -2] = remove_nans(data['flux_ap2_58'])
        xs[index, -1] = remove_nans(data['flux_ap2_80'])

        points.append((x, y))
    
    closest = min(enumerate(points), key=lambda z: numpy.hypot(z[1][0] - cx, z[1][1] - cy))
    assert points[closest[0]] == closest[1]
    
    xs[:, :-5] = numpy.dot(get_convolutional_features(radio_patches).reshape((n, -1)), pca.T)
    
    xs = numpy.hstack([normaliser.transform(xs[:, :-5]),
                       scaler.transform(normaliser.transform(xs[:, -5:]))])
    
    probs = classifier.predict_proba(xs)[:, 1]
    selection = probs.argmax()
    return selection, closest[0], probs, points

Train logistic regression


In [ ]:
import sklearn.linear_model

lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
lr.fit(xs_train, ts_train)

In [ ]:
lr.score(xs_test, ts_test)

In [ ]:
import sqlite3

import matplotlib.pyplot
%matplotlib inline

import crowdastro.show

def softmax(x):
    exp = numpy.exp(x)
    return exp / numpy.sum(exp, axis=0)

with sqlite3.connect(DB_PATH) as conn:
    cur = conn.cursor()
    zooniverse_ids = list(cur.execute("""SELECT zooniverse_id
                                         FROM consensuses_kde
                                         GROUP BY zooniverse_id
                                         HAVING COUNT(zooniverse_id) = 1"""))
    assert len(zooniverse_ids) in {1087, 1093}, len(zooniverse_ids)
    n_correct = 0
    n_total = 0
    split_index = len(zooniverse_ids)//4
    for index, (zooniverse_id,) in enumerate(zooniverse_ids[split_index:-split_index]):
        subject = crowdastro.data.get_subject(zooniverse_id)
        try:
#             print(zooniverse_id)
            guess, groundtruth, probs, points = test_subject(subject, conn, lr)
#             print(points, guess, groundtruth)
            n_correct += numpy.hypot(points[guess][0] - points[groundtruth][0],
                                     points[guess][1] - points[groundtruth][1]) <= 5 # 5 px radius = 3''
            n_total += 1
#             print(sorted(points))
            print('{:.02%} / {:.02%}'.format(index / (len(zooniverse_ids[split_index:-split_index])),
                                             n_correct / n_total))
            
            if index < 50:
                matplotlib.pyplot.figure(figsize=(20, 10))
                print(zooniverse_id)
                matplotlib.pyplot.subplot(1, 2, 1)
                crowdastro.show.subject(subject)
                matplotlib.pyplot.scatter(*zip(*points), c=softmax(probs), s=200, cmap='cool')
                matplotlib.pyplot.axis('off')
                matplotlib.pyplot.colorbar()
                matplotlib.pyplot.scatter(points[guess][0], points[guess][1], c='green', s=200, marker='+')
    #             print(points[groundtruth][0], 200-points[groundtruth][1])
                matplotlib.pyplot.scatter(points[groundtruth][0], points[groundtruth][1], c='red', s=200, marker='x')
                matplotlib.pyplot.subplot(1, 2, 2)
                matplotlib.pyplot.scatter(range(len(probs)), sorted(softmax(probs)), marker='o', c=sorted(softmax(probs)),
                                          cmap='cool', s=200)
                matplotlib.pyplot.xlabel('Index')
                matplotlib.pyplot.ylabel('Probability of being the true host')
                matplotlib.pyplot.show()
        except ValueError:
            continue

    print('{:.02%}'.format(n_correct / n_total))

Train random forest


In [ ]:
import sklearn.ensemble

rf = sklearn.ensemble.RandomForestClassifier(class_weight='balanced')
rf.fit(xs_train, ts_train)

In [ ]:
rf.score(xs_test, ts_test)

In [ ]:
with sqlite3.connect(DB_PATH) as conn:
    cur = conn.cursor()
    zooniverse_ids = list(cur.execute("""SELECT zooniverse_id
                                         FROM consensuses_kde
                                         GROUP BY zooniverse_id
                                         HAVING COUNT(zooniverse_id) = 1"""))
    assert len(zooniverse_ids) in {1087, 1093}, len(zooniverse_ids)
    n_correct = 0
    n_total = 0
    split_index = len(zooniverse_ids)//4
    for index, (zooniverse_id,) in enumerate(zooniverse_ids[split_index:-split_index]):
        subject = crowdastro.data.get_subject(zooniverse_id)
        try:
#             print(zooniverse_id)
            guess, groundtruth, probs, points = test_subject(subject, conn, rf)
            n_correct += numpy.hypot(points[guess][0] - points[groundtruth][0],
                                     points[guess][1] - points[groundtruth][1]) <= 5 # 5 px radius = 3''
            n_total += 1
#             print(sorted(points))
            print('{:.02%} / {:.02%}'.format(index / (len(zooniverse_ids[split_index:-split_index])),
                                             n_correct / n_total))
            
            if index < 50:
                matplotlib.pyplot.figure(figsize=(20, 10))
                print(zooniverse_id)
                matplotlib.pyplot.subplot(1, 2, 1)
                crowdastro.show.subject(subject)
                matplotlib.pyplot.scatter(*zip(*points), c=softmax(probs), s=200, cmap='cool')
                matplotlib.pyplot.axis('off')
                matplotlib.pyplot.colorbar()
                matplotlib.pyplot.scatter(points[guess][0], points[guess][1], c='green', s=200, marker='+')
    #             print(points[groundtruth][0], 200-points[groundtruth][1])
                matplotlib.pyplot.scatter(points[groundtruth][0], points[groundtruth][1], c='red', s=200, marker='x')
                matplotlib.pyplot.subplot(1, 2, 2)
                matplotlib.pyplot.scatter(range(len(probs)), sorted(softmax(probs)), marker='o', c=sorted(softmax(probs)),
                                          cmap='cool', s=200)
                matplotlib.pyplot.xlabel('Index')
                matplotlib.pyplot.ylabel('Probability of being the true host')
                matplotlib.pyplot.show()
        except ValueError:
            continue

    print('{:.02%}'.format(n_correct / n_total))

In [ ]:
rf.feature_importances_

In [ ]:
matplotlib.pyplot.figure(figsize=(15, 15))
matplotlib.pyplot.xlabel('Feature index')
matplotlib.pyplot.ylabel('Importance')
matplotlib.pyplot.plot(numpy.abs(rf.feature_importances_ / numpy.max(rf.feature_importances_)), color='blue')
matplotlib.pyplot.plot(numpy.abs(-lr.coef_[0] / numpy.min(lr.coef_)), color='red')
matplotlib.pyplot.show()

In [ ]:
lr.coef_ / numpy.max(lr.coef_)

In [ ]: