In [3]:
# Setup taken from notebook 17.
import itertools
import sys
import bson
import h5py
import keras.layers
import keras.models
import matplotlib.pyplot
import numpy
import pandas
import sklearn.cross_validation
import sklearn.dummy
import sklearn.linear_model
import sklearn.metrics
sys.path.insert(1, '..')
import crowdastro.data
import crowdastro.show
with pandas.HDFStore('crowdastro-data/training.h5') as store:
    data = store['data']
n = 5000
# I'm gathering up the radio patches first so I can run them through the CNN at the same time
# as one big matrix operation. In principle this would run on the GPU.
radio_patches = numpy.zeros((n, 80, 80))
labels = numpy.zeros((n,))
linear_distances = numpy.zeros((n,))
gaussian_distances = numpy.zeros((n,))
radius = 40
padding = 150
for idx, row in data.head(n).iterrows():
    sid = bson.objectid.ObjectId(row['subject_id'][0].decode('ascii'))
    x = row['x'][0]
    y = row['y'][0]
    label = row['is_host'][0]

    labels[idx] = label

    subject = crowdastro.data.db.radio_subjects.find_one({'_id': sid})
    radio = crowdastro.data.get_radio(subject, size='5x5')
    patch = radio[x - radius + padding : x + radius + padding, y - radius + padding : y + radius + padding]
    radio_patches[idx, :] = patch

    # Distance of (x, y) from (100, 100): straight-line distance, and a Gaussian
    # falloff with sigma = 50.
    linear_distances[idx] = numpy.hypot(x - 100, y - 100)
    gaussian_distances[idx] = numpy.exp(-((x - 100) ** 2 / (2 * 50 ** 2) + (y - 100) ** 2 / (2 * 50 ** 2)))
# Load the CNN.
with open('crowdastro-data/cnn_model_2.json', 'r') as f:
    cnn = keras.models.model_from_json(f.read())
cnn.load_weights('crowdastro-data/cnn_weights_2.h5')
cnn.layers = cnn.layers[:5] # Pop the layers after the second convolution's activation.
cnn.add(keras.layers.Flatten())
cnn.compile(optimizer='sgd', loss='mse') # I don't actually care about the optimiser or loss.
# Load the PCA.
with h5py.File('crowdastro-data/pca.h5') as f:
    pca = f['conv_2'][:]
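
# Sanity check (not part of the notebook 17 setup): the truncated CNN's flattened
# output must be as wide as the PCA components matrix, or the projection below
# (features @ pca.T) will fail.
assert cnn.predict(numpy.zeros((1, 1, 80, 80))).shape[1] == pca.shape[1]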
# Find the radio features.
radio_features = cnn.predict(radio_patches.reshape(n, 1, 80, 80)) @ pca.T
# Find the astronomy features.
astro_features = data.ix[:n-1, 'flux_ap2_24':'flux_ap2_80'].as_matrix()
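
A quick sanity check on the two feature matrices, assuming the setup cell above ran as-is (this cell is a small added sketch, not from notebook 17): the flux columns can contain missing values, which is why astro_features is passed through numpy.nan_to_num before fitting further below.

In [ ]:
print('Radio features:', radio_features.shape)
print('Astronomy features:', astro_features.shape)
print('Missing flux values:', numpy.isnan(astro_features).sum())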
In [5]:
subject_ids = set()
for idx, row in data.ix[n:n * 2].iterrows():
    sid = row['subject_id'][0]
    subject_ids.add(sid)
In [6]:
def test(radio_features, labels):
    xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
            radio_features, labels, test_size=0.2, random_state=0)
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(xs_train, ts_train)
    print('Classification accuracy: {:.02%}'.format(lr.score(xs_test, ts_test)))

    # Problem accuracy: for each subject in the next block of rows (unseen during
    # training), score every candidate host and count a hit if the highest-scoring
    # candidate is a true host.
    hits = 0
    attempts = 0
    for subject_id in subject_ids:
        indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
        potential_hosts = numpy.nan_to_num(data.as_matrix()[indices][:, 1:-1].astype(float))
        labels = numpy.nan_to_num(data.as_matrix()[indices][:, -1].astype(bool))

        subject = crowdastro.data.db.radio_subjects.find_one({'_id': bson.objectid.ObjectId(subject_id.decode('ascii'))})
        radio = crowdastro.data.get_radio(subject, size='5x5')
        radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
        for index, (x, y, *astro) in enumerate(potential_hosts):
            patch = radio[x - radius + padding : x + radius + padding, y - radius + padding : y + radius + padding]
            radio_patches[index, 0, :] = patch

        radio_features = cnn.predict(radio_patches) @ pca.T
        scores = lr.predict_proba(radio_features)[:, 1].reshape(-1)
        predicted_host = scores.argmax()
        if labels[predicted_host]:
            hits += 1
        attempts += 1

    print('Problem accuracy: {:.02%}'.format(hits / attempts))

test(radio_features, labels)
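
For reference, the imported sklearn.dummy module can give a majority-class baseline for the classification accuracy above. This is a minimal sketch assuming the same train/test split as inside test() (same arrays, same random_state); it is not a result from the original notebook. Since most candidate hosts are not true hosts, this baseline is the number the raw classification accuracy has to beat.

In [ ]:
# Majority-class baseline on the same split as test() above.
xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
        radio_features, labels, test_size=0.2, random_state=0)
dummy = sklearn.dummy.DummyClassifier(strategy='most_frequent')
dummy.fit(xs_train, ts_train)
print('Dummy classification accuracy: {:.02%}'.format(dummy.score(xs_test, ts_test)))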
In [8]:
def test(astro_features, labels):
    xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
            astro_features, labels, test_size=0.2, random_state=0)
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(xs_train, ts_train)
    print('Classification accuracy: {:.02%}'.format(lr.score(xs_test, ts_test)))

    # Problem accuracy, as above, but scoring each candidate host with its
    # astronomy features instead of its radio features.
    hits = 0
    attempts = 0
    for subject_id in subject_ids:
        indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
        potential_hosts = numpy.nan_to_num(data.as_matrix()[indices][:, 1:-1].astype(float))
        labels = numpy.nan_to_num(data.as_matrix()[indices][:, -1].astype(bool))

        subject = crowdastro.data.db.radio_subjects.find_one({'_id': bson.objectid.ObjectId(subject_id.decode('ascii'))})
        radio = crowdastro.data.get_radio(subject, size='5x5')
        radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
        for index, (x, y, *astro) in enumerate(potential_hosts):
            patch = radio[x - radius + padding : x + radius + padding, y - radius + padding : y + radius + padding]
            radio_patches[index, 0, :] = patch

        # The radio features are still computed as in the previous cell, but only
        # the astronomy features (the columns after x and y) are used for scoring.
        radio_features = cnn.predict(radio_patches) @ pca.T
        astro_features = potential_hosts[:, 2:]

        scores = lr.predict_proba(astro_features)[:, 1].reshape(-1)
        predicted_host = scores.argmax()
        if labels[predicted_host]:
            hits += 1
        attempts += 1

    print('Problem accuracy: {:.02%}'.format(hits / attempts))

test(numpy.nan_to_num(astro_features), labels)
In [ ]: