In [15]:
# Setup taken from notebook 17.
import itertools
import sys
import bson
import h5py
import keras.layers
import keras.models
import matplotlib.pyplot
import numpy
import pandas
import sklearn.cross_validation
import sklearn.dummy
import sklearn.linear_model
import sklearn.metrics
sys.path.insert(1, '..')
import crowdastro.data
import crowdastro.show
with pandas.HDFStore('../crowdastro-data/training.h5') as store:
    data = store['data']
n = 5000
# I'm gathering up the radio patches first so I can run them through the CNN at the same time
# as one big matrix operation. In principle this would run on the GPU.
radio_patches = numpy.zeros((n, 80, 80))
labels = numpy.zeros((n,))
linear_distances = numpy.zeros((n,))
gaussian_distances = numpy.zeros((n,))
radius = 40
padding = 150
for idx, row in data.head(n).iterrows():
    sid = bson.objectid.ObjectId(row['subject_id'][0].decode('ascii'))
    x = row['x'][0]
    y = row['y'][0]
    label = row['is_host'][0]
    labels[idx] = label

    subject = crowdastro.data.db.radio_subjects.find_one({'_id': sid})
    radio = crowdastro.data.get_radio(subject, size='5x5')
    patch = radio[x - radius + padding : x + radius + padding, y - radius + padding : y + radius + padding]
    radio_patches[idx, :] = patch
    linear_distances[idx] = numpy.hypot(x - 100, y - 100)
    gaussian_distances[idx] = numpy.exp(-((x - 100) ** 2 / (2 * 50 ** 2) + (y - 100) ** 2 / (2 * 50 ** 2)))
# Load the CNN.
with open('../crowdastro-data/cnn_model_2.json', 'r') as f:
    cnn = keras.models.model_from_json(f.read())
cnn.load_weights('../crowdastro-data/cnn_weights_2.h5')
cnn.layers = cnn.layers[:5] # Pop the layers after the second convolution's activation.
cnn.add(keras.layers.Flatten())
cnn.compile(optimizer='sgd', loss='mse') # I don't actually care about the optimiser or loss.
# Load the PCA.
with h5py.File('../crowdastro-data/pca.h5') as f:
    pca = f['conv_2'][:]
# Find the radio features.
radio_features = cnn.predict(radio_patches.reshape(n, 1, 80, 80)) @ pca.T
# Add on the astronomy features.
features = numpy.hstack([radio_features, data.ix[:n-1, 'flux_ap2_24':'flux_ap2_80'].as_matrix()])
features = numpy.nan_to_num(features)
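As a quick sanity check (not part of the original run), the shapes of the assembled matrices can be confirmed before fitting anything; the coefficient asserts further down imply 160 base features per candidate (the PCA-reduced radio features plus the flux columns).
In [ ]:
# Sanity check on the assembled training data. Only names defined above are used.
assert radio_patches.shape == (n, 80, 80)
assert labels.shape == (n,)
print('Radio features:', radio_features.shape)
print('Combined features:', features.shape)
print('Positive fraction: {:.02%}'.format(labels.mean()))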
In [16]:
subject_ids = set()
for idx, row in data.ix[n:n * 2].iterrows():
    sid = row['subject_id'][0]
    subject_ids.add(sid)
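These rows (from n up to 2n) were not used to build the training features above; they serve as a held-out set of subjects for the per-subject evaluation below. A quick count of the distinct subjects involved (not shown in the original output):
In [ ]:
# Number of distinct subjects used for the per-subject ('problem') evaluation.
print('Held-out subjects:', len(subject_ids))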
In [23]:
def test(features, labels):
    xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
        features, labels, test_size=0.2, random_state=0)

    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(xs_train, ts_train)
    print('Classification accuracy: {:.02%}'.format(lr.score(xs_test, ts_test)))

    hits = 0
    attempts = 0
    for subject_id in subject_ids:
        indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
        potential_hosts = numpy.nan_to_num(data.as_matrix()[indices][:, 1:-1].astype(float))
        labels = numpy.nan_to_num(data.as_matrix()[indices][:, -1].astype(bool))

        subject = crowdastro.data.db.radio_subjects.find_one({'_id': bson.objectid.ObjectId(subject_id.decode('ascii'))})
        radio = crowdastro.data.get_radio(subject, size='5x5')

        radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
        for index, (x, y, *astro) in enumerate(potential_hosts):
            patch = radio[x - radius + padding : x + radius + padding, y - radius + padding : y + radius + padding]
            radio_patches[index, 0, :] = patch

        radio_features = cnn.predict(radio_patches) @ pca.T
        astro_features = potential_hosts[:, 2:]
        features = numpy.hstack([radio_features, astro_features])

        scores = lr.predict_proba(features)[:, 1].reshape(-1)
        predicted_host = scores.argmax()
        if labels[predicted_host]:
            hits += 1
        attempts += 1

    print('Problem accuracy: {:.02%}'.format(hits / attempts))
test(features, labels)
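Note that the two numbers measure different things: classification accuracy scores each candidate independently on a held-out split, while "problem accuracy" asks whether the top-scoring candidate within each subject is a true host. A toy illustration of that ranking step (hypothetical numbers, not output from this notebook):
In [ ]:
# Hypothetical example of the per-subject ranking: a subject counts as a hit
# only if the candidate with the highest score is a true host.
toy_scores = numpy.array([0.2, 0.7, 0.4])       # stand-in for lr.predict_proba(...)[:, 1]
toy_labels = numpy.array([False, True, False])  # stand-in for the true host labels
print('Hit' if toy_labels[toy_scores.argmax()] else 'Miss')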
In [24]:
def test(features, labels, linear_distances):
    features = numpy.hstack([features, linear_distances.reshape(-1, 1)])
    xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
        features, labels, test_size=0.2, random_state=0)

    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(xs_train, ts_train)
    print('Classification accuracy: {:.02%}'.format(lr.score(xs_test, ts_test)))

    hits = 0
    attempts = 0
    for subject_id in subject_ids:
        indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
        potential_hosts = numpy.nan_to_num(data.as_matrix()[indices][:, 1:-1].astype(float))
        labels = numpy.nan_to_num(data.as_matrix()[indices][:, -1].astype(bool))

        subject = crowdastro.data.db.radio_subjects.find_one({'_id': bson.objectid.ObjectId(subject_id.decode('ascii'))})
        radio = crowdastro.data.get_radio(subject, size='5x5')

        radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
        linear_features = numpy.zeros((len(potential_hosts),))
        for index, (x, y, *astro) in enumerate(potential_hosts):
            patch = radio[x - radius + padding : x + radius + padding, y - radius + padding : y + radius + padding]
            radio_patches[index, 0, :] = patch
            linear_features[index] = numpy.hypot(x - 100, y - 100)

        radio_features = cnn.predict(radio_patches) @ pca.T
        astro_features = potential_hosts[:, 2:]
        features = numpy.hstack([radio_features, astro_features, linear_features.reshape(-1, 1)])

        scores = lr.predict_proba(features)[:, 1].reshape(-1)
        predicted_host = scores.argmax()
        if labels[predicted_host]:
            hits += 1
        attempts += 1

    assert lr.coef_.shape[1] == 161
    print('Linear distance weight:', lr.coef_[0, -1])
    print('Problem accuracy: {:.02%}'.format(hits / attempts))
test(features, labels, linear_distances)
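The added feature is just the Euclidean distance of each candidate from the assumed subject centre at (100, 100), so a negative learned weight favours candidates near the centre. A small worked example with hypothetical coordinates:
In [ ]:
# Linear distance feature for three hypothetical candidate positions.
xs = numpy.array([100, 130, 60])
ys = numpy.array([100, 60, 100])
print(numpy.hypot(xs - 100, ys - 100))  # distances: 0, 50, 40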
In [27]:
def test(features, labels, gaussian_distances):
    features = numpy.hstack([features, gaussian_distances.reshape(-1, 1)])
    xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
        features, labels, test_size=0.2, random_state=0)

    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(xs_train, ts_train)
    print('Classification accuracy: {:.02%}'.format(lr.score(xs_test, ts_test)))

    hits = 0
    attempts = 0
    for subject_id in subject_ids:
        indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
        potential_hosts = numpy.nan_to_num(data.as_matrix()[indices][:, 1:-1].astype(float))
        labels = numpy.nan_to_num(data.as_matrix()[indices][:, -1].astype(bool))

        subject = crowdastro.data.db.radio_subjects.find_one({'_id': bson.objectid.ObjectId(subject_id.decode('ascii'))})
        radio = crowdastro.data.get_radio(subject, size='5x5')

        radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
        gaussian_features = numpy.zeros((len(potential_hosts),))
        for index, (x, y, *astro) in enumerate(potential_hosts):
            patch = radio[x - radius + padding : x + radius + padding, y - radius + padding : y + radius + padding]
            radio_patches[index, 0, :] = patch
            gaussian_features[index] = numpy.exp(-((x - 100) ** 2 / (2 * 50 ** 2) + (y - 100) ** 2 / (2 * 50 ** 2)))

        radio_features = cnn.predict(radio_patches) @ pca.T
        astro_features = potential_hosts[:, 2:]
        features = numpy.hstack([radio_features, astro_features, gaussian_features.reshape(-1, 1)])

        scores = lr.predict_proba(features)[:, 1].reshape(-1)
        predicted_host = scores.argmax()
        if labels[predicted_host]:
            hits += 1
        attempts += 1

    assert lr.coef_.shape[1] == 161
    print('Gaussian distance weight:', lr.coef_[0, -1])
    print('Problem accuracy: {:.02%}'.format(hits / attempts))
test(features, labels, gaussian_distances)
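With σ = 50 pixels, the Gaussian feature is exp(-r² / (2σ²)), where r is the same centre distance as above, so it carries the same ordering information as the linear distance and differs only in how distance is scaled for the linear classifier. Evaluating it at the same hypothetical distances:
In [ ]:
# Gaussian distance feature at the hypothetical centre distances from above.
r = numpy.array([0.0, 50.0, 40.0])
print(numpy.exp(-r ** 2 / (2 * 50 ** 2)))  # roughly 1.0, 0.61, 0.73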
So it seems that the linear and Gaussian distance features are both useful, and the linear distance is the more useful of the two.