In [11]:
# Setup taken from notebook 17.
import itertools
import sys
import bson
import h5py
import keras.layers
import keras.models
import matplotlib.pyplot
import numpy
import pandas
import sklearn.cross_validation
import sklearn.dummy
import sklearn.linear_model
import sklearn.metrics
sys.path.insert(1, '..')
import crowdastro.data
import crowdastro.show
# Load the training table produced by the earlier pipeline: one row per
# candidate host with coordinates, astronomy features, and a crowd label.
with pandas.HDFStore('../crowdastro-data/training.h5') as store:
    data = store['data']

n = 5000  # number of candidate hosts used for training

# Gather up the radio patches first so they can be run through the CNN at the
# same time as one big matrix operation. In principle this would run on the GPU.
radio_patches = numpy.zeros((n, 80, 80))
labels = numpy.zeros((n,))
linear_distances = numpy.zeros((n,))
radius = 40    # half-width of the square patch cut out around each candidate
padding = 150  # offset into the padded radio image — TODO confirm against notebook 17

for idx, row in data.head(n).iterrows():
    sid = bson.objectid.ObjectId(row['subject_id'][0].decode('ascii'))
    x = row['x'][0]
    y = row['y'][0]
    labels[idx] = row['is_host'][0]

    subject = crowdastro.data.db.radio_subjects.find_one({'_id': sid})
    radio = crowdastro.data.get_radio(subject, size='5x5')
    patch = radio[x - radius + padding : x + radius + padding,
                  y - radius + padding : y + radius + padding]
    radio_patches[idx, :] = patch
    # Distance of the candidate from (100, 100) — presumably the subject
    # centre in pixels; verify against the data pipeline.
    linear_distances[idx] = numpy.hypot(x - 100, y - 100)

# Load the CNN.
with open('../crowdastro-data/cnn_model_2.json', 'r') as f:
    cnn = keras.models.model_from_json(f.read())
cnn.load_weights('../crowdastro-data/cnn_weights_2.h5')
cnn.layers = cnn.layers[:5]  # Pop the layers after the second convolution's activation.
cnn.add(keras.layers.Flatten())
cnn.compile(optimizer='sgd', loss='mse')  # Optimiser/loss are irrelevant — we only predict.

# Load the PCA components used to compress the convolutional outputs.
with h5py.File('../crowdastro-data/pca.h5') as f:
    pca = f['conv_2'][:]

# Find the radio features: CNN activations projected onto the PCA basis.
radio_features = cnn.predict(radio_patches.reshape(n, 1, 80, 80)) @ pca.T

# Add on the astronomy (flux) features and the linear distance feature.
features = numpy.hstack([radio_features,
                         data.ix[:n - 1, 'flux_ap2_24':'flux_ap2_80'].as_matrix(),
                         linear_distances.reshape(-1, 1)])
features = numpy.nan_to_num(features)
In [12]:
# Collect the subject IDs for the next n rows (rows n..2n). These subjects are
# disjoint from the training rows and form the "problem" evaluation set.
subject_ids = set()
for _, row in data.ix[n:n * 2].iterrows():
    subject_ids.add(row['subject_id'][0])
In [13]:
def test(features, labels):
    """Baseline: train logistic regression on raw features and report both
    per-candidate classification accuracy and per-subject "problem" accuracy
    (fraction of subjects whose top-scoring candidate is a true host).
    """
    xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
            features, labels, test_size=0.2, random_state=0)
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(xs_train, ts_train)
    print('Classification accuracy: {:.02%}'.format(lr.score(xs_test, ts_test)))

    hits = 0
    attempts = 0
    for subject_id in subject_ids:
        # All candidate hosts belonging to this subject.
        indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
        potential_hosts = numpy.nan_to_num(
                data.as_matrix()[indices][:, 1:-1].astype(float))
        subject_labels = numpy.nan_to_num(
                data.as_matrix()[indices][:, -1].astype(bool))
        subject = crowdastro.data.db.radio_subjects.find_one(
                {'_id': bson.objectid.ObjectId(subject_id.decode('ascii'))})
        radio = crowdastro.data.get_radio(subject, size='5x5')

        # Rebuild the radio/distance features for each candidate, exactly as
        # in the training setup.
        radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
        linear_features = numpy.zeros((len(potential_hosts),))
        for index, (x, y, *astro) in enumerate(potential_hosts):
            patch = radio[x - radius + padding : x + radius + padding,
                          y - radius + padding : y + radius + padding]
            radio_patches[index, 0, :] = patch
            linear_features[index] = numpy.hypot(x - 100, y - 100)
        radio_features = cnn.predict(radio_patches) @ pca.T
        astro_features = potential_hosts[:, 2:]
        subject_features = numpy.hstack([radio_features, astro_features,
                                         linear_features.reshape(-1, 1)])

        # The candidate with the highest host probability is the prediction.
        scores = lr.predict_proba(subject_features)[:, 1].reshape(-1)
        predicted_host = scores.argmax()
        if subject_labels[predicted_host]:
            hits += 1
        attempts += 1
    print('Problem accuracy: {:.02%}'.format(hits / attempts))

test(features, labels)
In [16]:
import sklearn.preprocessing

def test(features, labels):
    """As the baseline, but standardise features (zero mean, unit variance)
    with a StandardScaler fitted on the training split; the same scaler is
    applied to each subject's candidates at evaluation time.
    """
    xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
            features, labels, test_size=0.2, random_state=0)
    scaler = sklearn.preprocessing.StandardScaler().fit(xs_train)
    xs_train = scaler.transform(xs_train)
    xs_test = scaler.transform(xs_test)
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(xs_train, ts_train)
    print('Classification accuracy: {:.02%}'.format(lr.score(xs_test, ts_test)))

    hits = 0
    attempts = 0
    for subject_id in subject_ids:
        indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
        potential_hosts = numpy.nan_to_num(
                data.as_matrix()[indices][:, 1:-1].astype(float))
        subject_labels = numpy.nan_to_num(
                data.as_matrix()[indices][:, -1].astype(bool))
        subject = crowdastro.data.db.radio_subjects.find_one(
                {'_id': bson.objectid.ObjectId(subject_id.decode('ascii'))})
        radio = crowdastro.data.get_radio(subject, size='5x5')

        radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
        linear_features = numpy.zeros((len(potential_hosts), 1))
        for index, (x, y, *astro) in enumerate(potential_hosts):
            patch = radio[x - radius + padding : x + radius + padding,
                          y - radius + padding : y + radius + padding]
            radio_patches[index, 0, :] = patch
            linear_features[index] = numpy.hypot(x - 100, y - 100)
        radio_features = cnn.predict(radio_patches) @ pca.T
        astro_features = potential_hosts[:, 2:]
        subject_features = numpy.hstack(
                [radio_features, astro_features, linear_features])
        subject_features = scaler.transform(subject_features)

        scores = lr.predict_proba(subject_features)[:, 1].reshape(-1)
        predicted_host = scores.argmax()
        if subject_labels[predicted_host]:
            hits += 1
        attempts += 1
    print('Problem accuracy: {:.02%}'.format(hits / attempts))

test(features, labels)
Scaling inputs = good. This isn't terribly surprising.
In [17]:
def test(features):
    """Fit a StandardScaler and return the per-feature scale factors (the
    standard deviations it would divide by) — a quick look at how differently
    scaled the features are.
    """
    return sklearn.preprocessing.StandardScaler().fit(features).scale_

print(test(features))
In [21]:
def test(features, labels):
    """As the baseline, but with labels remapped from {0, 1} to {-1, +1} to
    check whether the label encoding affects LogisticRegression.
    """
    labels = labels * 2 - 1  # {0, 1} -> {-1, +1}
    xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
            features, labels, test_size=0.2, random_state=0)
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(xs_train, ts_train)
    print('Classification accuracy: {:.02%}'.format(lr.score(xs_test, ts_test)))

    hits = 0
    attempts = 0
    for subject_id in subject_ids:
        indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
        potential_hosts = numpy.nan_to_num(
                data.as_matrix()[indices][:, 1:-1].astype(float))
        # Same {-1, +1} remapping as the training labels.
        subject_labels = numpy.nan_to_num(
                data.as_matrix()[indices][:, -1].astype(bool)) * 2 - 1
        subject = crowdastro.data.db.radio_subjects.find_one(
                {'_id': bson.objectid.ObjectId(subject_id.decode('ascii'))})
        radio = crowdastro.data.get_radio(subject, size='5x5')

        radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
        linear_features = numpy.zeros((len(potential_hosts), 1))
        for index, (x, y, *astro) in enumerate(potential_hosts):
            patch = radio[x - radius + padding : x + radius + padding,
                          y - radius + padding : y + radius + padding]
            radio_patches[index, 0, :] = patch
            linear_features[index] = numpy.hypot(x - 100, y - 100)
        radio_features = cnn.predict(radio_patches) @ pca.T
        astro_features = potential_hosts[:, 2:]
        subject_features = numpy.hstack(
                [radio_features, astro_features, linear_features])

        scores = lr.predict_proba(subject_features)[:, 1].reshape(-1)
        predicted_host = scores.argmax()
        if subject_labels[predicted_host] == 1:
            hits += 1
        attempts += 1
    print('Problem accuracy: {:.02%}'.format(hits / attempts))

test(features, labels)
This had no effect (maybe sklearn does this internally?).
In [27]:
def test(features, labels):
    """As the baseline, but L2-normalise each sample (row) before training.
    Normalizer is stateless, so no fit on the training split is needed.
    """
    xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
            features, labels, test_size=0.2, random_state=0)
    normaliser = sklearn.preprocessing.Normalizer()
    xs_train = normaliser.transform(xs_train)
    xs_test = normaliser.transform(xs_test)
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(xs_train, ts_train)
    print('Classification accuracy: {:.02%}'.format(lr.score(xs_test, ts_test)))

    hits = 0
    attempts = 0
    for subject_id in subject_ids:
        indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
        potential_hosts = numpy.nan_to_num(
                data.as_matrix()[indices][:, 1:-1].astype(float))
        subject_labels = numpy.nan_to_num(
                data.as_matrix()[indices][:, -1].astype(bool))
        subject = crowdastro.data.db.radio_subjects.find_one(
                {'_id': bson.objectid.ObjectId(subject_id.decode('ascii'))})
        radio = crowdastro.data.get_radio(subject, size='5x5')

        radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
        linear_features = numpy.zeros((len(potential_hosts), 1))
        for index, (x, y, *astro) in enumerate(potential_hosts):
            patch = radio[x - radius + padding : x + radius + padding,
                          y - radius + padding : y + radius + padding]
            radio_patches[index, 0, :] = patch
            linear_features[index] = numpy.hypot(x - 100, y - 100)
        radio_features = cnn.predict(radio_patches) @ pca.T
        astro_features = potential_hosts[:, 2:]
        subject_features = numpy.hstack(
                [radio_features, astro_features, linear_features])
        subject_features = normaliser.transform(subject_features)

        scores = lr.predict_proba(subject_features)[:, 1].reshape(-1)
        predicted_host = scores.argmax()
        if subject_labels[predicted_host]:
            hits += 1
        attempts += 1
    print('Problem accuracy: {:.02%}'.format(hits / attempts))

test(features, labels)
Normalising massively raises the accuracy. This isn't surprising by itself, but the size of the accuracy gain is surprising.
In [26]:
def test(features, labels):
    """Normalise every sample, then standardise the normalised features with
    a StandardScaler fitted on the (normalised) training split.
    """
    xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
            features, labels, test_size=0.2, random_state=0)
    normaliser = sklearn.preprocessing.Normalizer()
    scaler = sklearn.preprocessing.StandardScaler().fit(
            normaliser.transform(xs_train))
    xs_train = scaler.transform(normaliser.transform(xs_train))
    xs_test = scaler.transform(normaliser.transform(xs_test))
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(xs_train, ts_train)
    print('Classification accuracy: {:.02%}'.format(lr.score(xs_test, ts_test)))

    hits = 0
    attempts = 0
    for subject_id in subject_ids:
        indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
        potential_hosts = numpy.nan_to_num(
                data.as_matrix()[indices][:, 1:-1].astype(float))
        subject_labels = numpy.nan_to_num(
                data.as_matrix()[indices][:, -1].astype(bool))
        subject = crowdastro.data.db.radio_subjects.find_one(
                {'_id': bson.objectid.ObjectId(subject_id.decode('ascii'))})
        radio = crowdastro.data.get_radio(subject, size='5x5')

        radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
        linear_features = numpy.zeros((len(potential_hosts), 1))
        for index, (x, y, *astro) in enumerate(potential_hosts):
            patch = radio[x - radius + padding : x + radius + padding,
                          y - radius + padding : y + radius + padding]
            radio_patches[index, 0, :] = patch
            linear_features[index] = numpy.hypot(x - 100, y - 100)
        radio_features = cnn.predict(radio_patches) @ pca.T
        astro_features = potential_hosts[:, 2:]
        subject_features = numpy.hstack(
                [radio_features, astro_features, linear_features])
        subject_features = scaler.transform(normaliser.transform(subject_features))

        scores = lr.predict_proba(subject_features)[:, 1].reshape(-1)
        predicted_host = scores.argmax()
        if subject_labels[predicted_host]:
            hits += 1
        attempts += 1
    print('Problem accuracy: {:.02%}'.format(hits / attempts))

test(features, labels)
While better than scaling alone, this is worse than just normalising. I wonder if there's a relationship between the scales of the CNN features that we're missing somehow when we scale? To test this, I'll normalise all the data, but only scale the last 6 features.
In [28]:
def test(features, labels):
    """Normalise every sample, but additionally standardise only the last six
    features (the astronomy fluxes plus the linear distance), leaving the CNN
    features merely normalised.
    """
    xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
            features, labels, test_size=0.2, random_state=0)
    normaliser = sklearn.preprocessing.Normalizer()
    # The scaler sees only the (normalised) last six columns.
    scaler = sklearn.preprocessing.StandardScaler().fit(
            normaliser.transform(xs_train[:, -6:]))
    xs_train = numpy.hstack(
            [normaliser.transform(xs_train[:, :-6]),
             scaler.transform(normaliser.transform(xs_train[:, -6:]))])
    xs_test = numpy.hstack(
            [normaliser.transform(xs_test[:, :-6]),
             scaler.transform(normaliser.transform(xs_test[:, -6:]))])
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(xs_train, ts_train)
    print('Classification accuracy: {:.02%}'.format(lr.score(xs_test, ts_test)))

    hits = 0
    attempts = 0
    for subject_id in subject_ids:
        indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
        potential_hosts = numpy.nan_to_num(
                data.as_matrix()[indices][:, 1:-1].astype(float))
        subject_labels = numpy.nan_to_num(
                data.as_matrix()[indices][:, -1].astype(bool))
        subject = crowdastro.data.db.radio_subjects.find_one(
                {'_id': bson.objectid.ObjectId(subject_id.decode('ascii'))})
        radio = crowdastro.data.get_radio(subject, size='5x5')

        radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
        linear_features = numpy.zeros((len(potential_hosts), 1))
        for index, (x, y, *astro) in enumerate(potential_hosts):
            patch = radio[x - radius + padding : x + radius + padding,
                          y - radius + padding : y + radius + padding]
            radio_patches[index, 0, :] = patch
            linear_features[index] = numpy.hypot(x - 100, y - 100)
        radio_features = cnn.predict(radio_patches) @ pca.T
        astro_features = potential_hosts[:, 2:]
        subject_features = numpy.hstack(
                [radio_features, astro_features, linear_features])
        subject_features = numpy.hstack(
                [normaliser.transform(subject_features[:, :-6]),
                 scaler.transform(normaliser.transform(subject_features[:, -6:]))])

        scores = lr.predict_proba(subject_features)[:, 1].reshape(-1)
        predicted_host = scores.argmax()
        if subject_labels[predicted_host]:
            hits += 1
        attempts += 1
    print('Problem accuracy: {:.02%}'.format(hits / attempts))

test(features, labels)
That worked really well. I'll have to find out why. I'll try looking at the output of one of the more difficult-to-classify subjects.
In [29]:
%matplotlib inline
# Fetch and display a specific subject for qualitative inspection —
# presumably one that is difficult to classify (see prose below).
subject = crowdastro.data.db.radio_subjects.find_one({'zooniverse_id': 'ARG0003r8e'})
crowdastro.show.subject(subject)
In [37]:
def test(features, labels, subject):
    """Train the normalise-all/scale-last-six classifier, then visualise its
    scores over the candidate hosts of a single subject as a scatter overlay.

    Returns the fitted (normaliser, scaler, lr) so later cells can reuse the
    trained pipeline.
    """
    xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
            features, labels, test_size=0.2, random_state=0)
    normaliser = sklearn.preprocessing.Normalizer()
    scaler = sklearn.preprocessing.StandardScaler().fit(
            normaliser.transform(xs_train[:, -6:]))
    xs_train = numpy.hstack(
            [normaliser.transform(xs_train[:, :-6]),
             scaler.transform(normaliser.transform(xs_train[:, -6:]))])
    xs_test = numpy.hstack(
            [normaliser.transform(xs_test[:, :-6]),
             scaler.transform(normaliser.transform(xs_test[:, -6:]))])
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(xs_train, ts_train)
    print('Classification accuracy: {:.02%}'.format(lr.score(xs_test, ts_test)))

    # Look up the candidate rows for this one subject.
    subject_id = str(subject['_id']).encode('ascii')
    indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
    potential_hosts = numpy.nan_to_num(
            data.as_matrix()[indices][:, 1:-1].astype(float))
    subject_labels = numpy.nan_to_num(
            data.as_matrix()[indices][:, -1].astype(bool))  # (unused; kept for parity)
    radio = crowdastro.data.get_radio(subject, size='5x5')

    radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
    linear_features = numpy.zeros((len(potential_hosts), 1))
    for index, (x, y, *astro) in enumerate(potential_hosts):
        patch = radio[x - radius + padding : x + radius + padding,
                      y - radius + padding : y + radius + padding]
        radio_patches[index, 0, :] = patch
        linear_features[index] = numpy.hypot(x - 100, y - 100)
    radio_features = cnn.predict(radio_patches) @ pca.T
    astro_features = potential_hosts[:, 2:]
    subject_features = numpy.hstack(
            [radio_features, astro_features, linear_features])
    subject_features = numpy.hstack(
            [normaliser.transform(subject_features[:, :-6]),
             scaler.transform(normaliser.transform(subject_features[:, -6:]))])

    scores = lr.predict_proba(subject_features)[:, 1].reshape(-1)
    crowdastro.show.subject(subject)
    matplotlib.pyplot.scatter(potential_hosts[:, 0], potential_hosts[:, 1], c=scores)
    matplotlib.pyplot.show()
    return normaliser, scaler, lr

normaliser, scaler, classifier = test(features, labels, subject)
Interestingly, this classifier does a lot better at finding the middle of the compact source. It also highlights some odd points off to the side.
What happens if we run the entropy check over all the data with this classifier? What does this classifier consider "hard"?
In [39]:
def softmax(x):
    """Softmax over axis 0: exp(x) / sum(exp(x)).

    The maximum is subtracted before exponentiating — mathematically a no-op
    (the shift cancels in the ratio), but it prevents ``numpy.exp`` from
    overflowing to inf/nan for large inputs, which the original
    implementation did.

    Args:
        x: array-like of scores; the distribution is taken along axis 0.

    Returns:
        Array of the same shape as ``x`` summing to 1 along axis 0.
    """
    exp = numpy.exp(x - numpy.max(x, axis=0, keepdims=True))
    return exp / numpy.sum(exp, axis=0)
def test(features, labels, normaliser, scaler, lr):
    """Scan all held-out subjects and return the one whose candidate-score
    distribution has maximum entropy — i.e. the subject the trained
    classifier is least certain about.

    Note: ``features`` and ``labels`` are not used by this function; they are
    kept for signature consistency with the earlier cells.
    """
    max_entropy = float('-inf')
    max_subject = None
    for subject_id in subject_ids:
        indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
        potential_hosts = numpy.nan_to_num(
                data.as_matrix()[indices][:, 1:-1].astype(float))
        subject_labels = numpy.nan_to_num(
                data.as_matrix()[indices][:, -1].astype(bool))  # (unused; kept for parity)
        subject = crowdastro.data.db.radio_subjects.find_one(
                {'_id': bson.objectid.ObjectId(subject_id.decode('ascii'))})
        radio = crowdastro.data.get_radio(subject, size='5x5')

        radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
        linear_features = numpy.zeros((len(potential_hosts), 1))
        for index, (x, y, *astro) in enumerate(potential_hosts):
            patch = radio[x - radius + padding : x + radius + padding,
                          y - radius + padding : y + radius + padding]
            radio_patches[index, 0, :] = patch
            linear_features[index] = numpy.hypot(x - 100, y - 100)
        radio_features = cnn.predict(radio_patches) @ pca.T
        astro_features = potential_hosts[:, 2:]
        subject_features = numpy.hstack(
                [radio_features, astro_features, linear_features])
        subject_features = numpy.hstack(
                [normaliser.transform(subject_features[:, :-6]),
                 scaler.transform(normaliser.transform(subject_features[:, -6:]))])

        # Softmax turns the per-candidate scores into a distribution so the
        # entropies are comparable across subjects with different candidate counts.
        probabilities = softmax(lr.predict_proba(subject_features)[:, 1].reshape(-1))
        entropy = -(probabilities * numpy.log(probabilities)).sum()
        if entropy > max_entropy:
            max_entropy = entropy
            max_subject = subject
    return max_subject

max_subject = test(features, labels, normaliser, scaler, classifier)
In [40]:
# Display the maximum-entropy (least certain) subject found above.
crowdastro.show.subject(max_subject)
Looks like this one is still hard to classify.
In [42]:
def test(features, labels, subject):
    """Train the normalise-all/scale-last-six classifier and plot the sorted
    candidate scores for a single subject, to inspect how sharply the
    classifier separates its top candidate from the rest.

    Returns the fitted (normaliser, scaler, lr).
    """
    xs_train, xs_test, ts_train, ts_test = sklearn.cross_validation.train_test_split(
            features, labels, test_size=0.2, random_state=0)
    normaliser = sklearn.preprocessing.Normalizer()
    scaler = sklearn.preprocessing.StandardScaler().fit(
            normaliser.transform(xs_train[:, -6:]))
    xs_train = numpy.hstack(
            [normaliser.transform(xs_train[:, :-6]),
             scaler.transform(normaliser.transform(xs_train[:, -6:]))])
    xs_test = numpy.hstack(
            [normaliser.transform(xs_test[:, :-6]),
             scaler.transform(normaliser.transform(xs_test[:, -6:]))])
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(xs_train, ts_train)
    print('Classification accuracy: {:.02%}'.format(lr.score(xs_test, ts_test)))

    subject_id = str(subject['_id']).encode('ascii')
    indices = (data['subject_id'] == subject_id).as_matrix().reshape(-1)
    potential_hosts = numpy.nan_to_num(
            data.as_matrix()[indices][:, 1:-1].astype(float))
    subject_labels = numpy.nan_to_num(
            data.as_matrix()[indices][:, -1].astype(bool))  # (unused; kept for parity)
    radio = crowdastro.data.get_radio(subject, size='5x5')

    radio_patches = numpy.zeros((len(potential_hosts), 1, radius * 2, radius * 2))
    linear_features = numpy.zeros((len(potential_hosts), 1))
    for index, (x, y, *astro) in enumerate(potential_hosts):
        patch = radio[x - radius + padding : x + radius + padding,
                      y - radius + padding : y + radius + padding]
        radio_patches[index, 0, :] = patch
        linear_features[index] = numpy.hypot(x - 100, y - 100)
    radio_features = cnn.predict(radio_patches) @ pca.T
    astro_features = potential_hosts[:, 2:]
    subject_features = numpy.hstack(
            [radio_features, astro_features, linear_features])
    subject_features = numpy.hstack(
            [normaliser.transform(subject_features[:, :-6]),
             scaler.transform(normaliser.transform(subject_features[:, -6:]))])

    scores = lr.predict_proba(subject_features)[:, 1].reshape(-1)
    matplotlib.pyplot.plot(sorted(scores), marker='x')
    matplotlib.pyplot.show()
    return normaliser, scaler, lr

test(features, labels, subject)
Out[42]:
One interesting fact is that this curve is steeper than for the non-scaled/non-normalised/no distance classifier.
In [ ]: