Distance from Centre

Host galaxies are usually in the centre. Should we use this fact?

In this notebook, I'll try three different sets of features: one set including linear distance from the centre, one set including Gaussian distance from the centre, and one set with neither.


In [15]:
# Setup taken from notebook 17.

import itertools
import sys

import bson
import h5py
import keras.layers
import keras.models
import matplotlib.pyplot
import numpy
import pandas
import sklearn.cross_validation
import sklearn.dummy
import sklearn.linear_model
import sklearn.metrics

sys.path.insert(1, '..')
import crowdastro.data
import crowdastro.show

with pandas.HDFStore('../crowdastro-data/training.h5') as store:
    data = store['data']

n = 5000

# I'm gathering up the radio patches first so I can run them through the CNN at the same time
# as one big matrix operation. In principle this would run on the GPU.
radio_patches = numpy.zeros((n, 80, 80))
labels = numpy.zeros((n,))
linear_distances = numpy.zeros((n,))
gaussian_distances = numpy.zeros((n,))

radius = 40
padding = 150

for idx, row in data.head(n).iterrows():
    sid = bson.objectid.ObjectId(row['subject_id'][0].decode('ascii'))
    x = row['x'][0]
    y = row['y'][0]
    label = row['is_host'][0]
    
    labels[idx] = label
    
    subject = crowdastro.data.db.radio_subjects.find_one({'_id': sid})
    radio = crowdastro.data.get_radio(subject, size='5x5')
    patch = radio[x - radius + padding : x + radius + padding, y - radius + padding : y + radius + padding]
    radio_patches[idx, :] = patch
    
    linear_distances[idx] = numpy.hypot(x - 100, y - 100)
    gaussian_distances[idx] = numpy.exp(-((x - 100) ** 2 / (2 * 50 ** 2) + (y - 100) ** 2 / (2 * 50 ** 2)))

# Load the CNN.

with open('../crowdastro-data/cnn_model_2.json', 'r') as f:
    cnn = keras.models.model_from_json(f.read())

cnn.load_weights('../crowdastro-data/cnn_weights_2.h5')

cnn.layers = cnn.layers[:5]  # Pop the layers after the second convolution's activation.
cnn.add(keras.layers.Flatten())

cnn.compile(optimizer='sgd', loss='mse')  # I don't actually care about the optimiser or loss.

# Load the PCA.
with h5py.File('../crowdastro-data/pca.h5') as f:
    pca = f['conv_2'][:]

# Find the radio features.
radio_features = cnn.predict(radio_patches.reshape(n, 1, 80, 80)) @ pca.T

# Add on the astronomy features.
features = numpy.hstack([radio_features, data.ix[:n-1, 'flux_ap2_24':'flux_ap2_80'].as_matrix()])
features = numpy.nan_to_num(features)


K:\Languages\Anaconda3\lib\site-packages\astropy\io\fits\util.py:578: UserWarning: Could not find appropriate MS Visual C Runtime library or library is corrupt/misconfigured; cannot determine whether your file object was opened in append mode.  Please consider using a file object opened in write mode instead.
  'Could not find appropriate MS Visual C Runtime '
K:\Languages\Anaconda3\lib\site-packages\ipykernel\__main__.py:47: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future

In [16]:
subject_ids = set()
for idx, row in data.ix[n:n * 2].iterrows():
    sid = row['subject_id'][0]
    subject_ids.add(sid)

Baseline — no distance


In [23]:
def test(features, labels):
    xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
            features, labels, test_size=0.2, random_state=0)
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(xs_train, ts_train)
    print('Classification accuracy: {:.02%}'.format(lr.score(xs_test, ts_test)))

    hits = 0
    attempts = 0

    for subject_id in subject_ids:
        indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
        potential_hosts = numpy.nan_to_num(data.as_matrix()[indices][:, 1:-1].astype(float))
        labels = numpy.nan_to_num(data.as_matrix()[indices][:, -1].astype(bool))

        subject = crowdastro.data.db.radio_subjects.find_one({'_id': bson.objectid.ObjectId(subject_id.decode('ascii'))})
        radio = crowdastro.data.get_radio(subject, size='5x5')

        radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
        for index, (x, y, *astro) in enumerate(potential_hosts):
            patch = radio[x - radius + padding : x + radius + padding, y - radius + padding : y + radius + padding]
            radio_patches[index, 0, :] = patch

        radio_features = cnn.predict(radio_patches) @ pca.T
        astro_features = potential_hosts[:, 2:]
        features = numpy.hstack([radio_features, astro_features])

        scores = lr.predict_proba(features)[:, 1].reshape(-1)
        predicted_host = scores.argmax()
        if labels[predicted_host]:
            hits += 1
        attempts += 1

    print('Problem accuracy: {:.02%}'.format(hits / attempts))

test(features, labels)


Classification accuracy: 84.70%
Problem accuracy: 61.59%
K:\Languages\Anaconda3\lib\site-packages\astropy\io\fits\util.py:578: UserWarning: Could not find appropriate MS Visual C Runtime library or library is corrupt/misconfigured; cannot determine whether your file object was opened in append mode.  Please consider using a file object opened in write mode instead.
  'Could not find appropriate MS Visual C Runtime '
K:\Languages\Anaconda3\lib\site-packages\ipykernel\__main__.py:21: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future

Linear distance


In [24]:
def test(features, labels, linear_distances):
    features = numpy.hstack([features, linear_distances.reshape(-1, 1)])

    xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
            features, labels, test_size=0.2, random_state=0)
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(xs_train, ts_train)
    print('Classification accuracy: {:.02%}'.format(lr.score(xs_test, ts_test)))

    hits = 0
    attempts = 0

    for subject_id in subject_ids:
        indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
        potential_hosts = numpy.nan_to_num(data.as_matrix()[indices][:, 1:-1].astype(float))
        labels = numpy.nan_to_num(data.as_matrix()[indices][:, -1].astype(bool))

        subject = crowdastro.data.db.radio_subjects.find_one({'_id': bson.objectid.ObjectId(subject_id.decode('ascii'))})
        radio = crowdastro.data.get_radio(subject, size='5x5')

        radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
        linear_features = numpy.zeros((len(potential_hosts),))
        for index, (x, y, *astro) in enumerate(potential_hosts):
            patch = radio[x - radius + padding : x + radius + padding, y - radius + padding : y + radius + padding]
            radio_patches[index, 0, :] = patch
            linear_features[index] = numpy.hypot(x - 100, y - 100)

        radio_features = cnn.predict(radio_patches) @ pca.T
        astro_features = potential_hosts[:, 2:]
        features = numpy.hstack([radio_features, astro_features, linear_features.reshape(-1, 1)])

        scores = lr.predict_proba(features)[:, 1].reshape(-1)
        predicted_host = scores.argmax()
        if labels[predicted_host]:
            hits += 1
        attempts += 1
    
    assert lr.coef_.shape[1] == 161
    print('Linear distance weight:', lr.coef_[0, -1])

    print('Problem accuracy: {:.02%}'.format(hits / attempts))

test(features, labels, linear_distances)


Classification accuracy: 87.90%
Linear distance weight: -0.00328479306217
Problem accuracy: 63.04%
K:\Languages\Anaconda3\lib\site-packages\astropy\io\fits\util.py:578: UserWarning: Could not find appropriate MS Visual C Runtime library or library is corrupt/misconfigured; cannot determine whether your file object was opened in append mode.  Please consider using a file object opened in write mode instead.
  'Could not find appropriate MS Visual C Runtime '
K:\Languages\Anaconda3\lib\site-packages\ipykernel\__main__.py:24: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future

Gaussian distance


In [27]:
def test(features, labels, gaussian_distances):
    features = numpy.hstack([features, gaussian_distances.reshape(-1, 1)])

    xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
            features, labels, test_size=0.2, random_state=0)
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(xs_train, ts_train)
    print('Classification accuracy: {:.02%}'.format(lr.score(xs_test, ts_test)))

    hits = 0
    attempts = 0

    for subject_id in subject_ids:
        indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
        potential_hosts = numpy.nan_to_num(data.as_matrix()[indices][:, 1:-1].astype(float))
        labels = numpy.nan_to_num(data.as_matrix()[indices][:, -1].astype(bool))

        subject = crowdastro.data.db.radio_subjects.find_one({'_id': bson.objectid.ObjectId(subject_id.decode('ascii'))})
        radio = crowdastro.data.get_radio(subject, size='5x5')

        radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
        gaussian_features = numpy.zeros((len(potential_hosts),))
        for index, (x, y, *astro) in enumerate(potential_hosts):
            patch = radio[x - radius + padding : x + radius + padding, y - radius + padding : y + radius + padding]
            radio_patches[index, 0, :] = patch
            gaussian_features[index] = numpy.exp(-((x - 100) ** 2 / (2 * 50 ** 2) + (y - 100) ** 2 / (2 * 50 ** 2)))

        radio_features = cnn.predict(radio_patches) @ pca.T
        astro_features = potential_hosts[:, 2:]
        features = numpy.hstack([radio_features, astro_features, gaussian_features.reshape(-1, 1)])

        scores = lr.predict_proba(features)[:, 1].reshape(-1)
        predicted_host = scores.argmax()
        if labels[predicted_host]:
            hits += 1
        attempts += 1
    
    assert lr.coef_.shape[1] == 161
    print('Linear distance weight:', lr.coef_[0, -1])

    print('Problem accuracy: {:.02%}'.format(hits / attempts))

test(features, labels, gaussian_distances)


Classification accuracy: 86.00%
Linear distance weight: 0.0186958708631
Problem accuracy: 62.32%
K:\Languages\Anaconda3\lib\site-packages\astropy\io\fits\util.py:578: UserWarning: Could not find appropriate MS Visual C Runtime library or library is corrupt/misconfigured; cannot determine whether your file object was opened in append mode.  Please consider using a file object opened in write mode instead.
  'Could not find appropriate MS Visual C Runtime '
K:\Languages\Anaconda3\lib\site-packages\ipykernel\__main__.py:24: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future

So it seems that linear and Gaussian distance are both useful, and linear is more useful.


In [ ]: