In [ ]:
## Imports.
import sys
import h5py
import IPython.core.display
import keras.backend
import keras.models
import numpy
sys.path.insert(1, '..')
import crowdastro.data
## Constants.
TRAINING_DATA_PATH = '../crowdastro-data/training_simple_flipped.h5'
COLUMNS = ['zooniverse_id', 'source', 'x', 'y', 'flux24', 'flux36', 'flux45', 'flux58', 'flux80', 'label']
N_DATA = 84298
N_FEATURES = 160 # 155 (convolutional with PCA) + 5 (astro) dimensional.
CNN_MODEL_PATH = '../crowdastro-data/cnn_model.json'
CNN_WEIGHTS_PATH = '../crowdastro-data/my_cnn_weights.h5'
PCA_PATH = '../crowdastro-data/pca.h5'
RADIUS = 40 # 0.4'
PADDING = 150 # (5' - 2') / 2
GATOR_CACHE = 'gator_cache'
DB_PATH = '../crowdastro-data/processed.db'
In [ ]:
## Load CNN and PCA.
with open(CNN_MODEL_PATH) as f:
cnn = keras.models.model_from_json(f.read())
cnn.load_weights(CNN_WEIGHTS_PATH)
cnn.compile(optimizer='sgd', loss='mse')
get_convolutional_features_ = keras.backend.function([cnn.layers[0].input], [cnn.layers[4].output])
get_convolutional_features = lambda p: get_convolutional_features_([p])[0].reshape((1, -1))
with h5py.File(PCA_PATH) as f:
pca = f['conv_2'][:]
## Load training data.
subjects = []
xs = numpy.zeros((N_DATA, N_FEATURES))
ts = numpy.zeros((N_DATA,))
radio_patches = numpy.zeros((100, 1, RADIUS * 2, RADIUS * 2)) # Buffer.
last_zid = None
subject = None
radio = None
with h5py.File(TRAINING_DATA_PATH) as f:
data = f['data']
for index, datapoint in enumerate(data):
if index % 100 == 0 and index > 0:
IPython.core.display.clear_output()
print('{:.02%}'.format((index + 1) / N_DATA))
# Batch the features.
convolutional_features = numpy.dot(get_convolutional_features(radio_patches).reshape((100, -1)), pca.T)
xs[index - 100 : index, :-5] = convolutional_features
datapoint = tuple(datapoint)
zooniverse_id = datapoint[0].decode('ascii')
x = datapoint[2]
y = datapoint[3]
astro_features = datapoint[4:9]
label = datapoint[-1]
# Fetch image patch.
if last_zid != zooniverse_id:
subject = crowdastro.data.db.radio_subjects.find_one({'zooniverse_id': zooniverse_id})
radio = crowdastro.data.get_radio(subject, size='5x5')
last_zid = zooniverse_id
patch = radio[x - RADIUS + PADDING : x + RADIUS + PADDING, y - RADIUS + PADDING : y + RADIUS + PADDING]
# Store patch so we can get CNN features later.
radio_patches[index % 100, 0] = patch
# Store astro features.
xs[index, -5:] = astro_features
# Store label.
ts[index] = label
In [ ]:
# Can't just train_test_split - have to make sure we have training and testing *subjects*.
# Assume xs/ts ordered by subject.
# In future, should have separate test subjects!
xs_train = numpy.vstack([xs[:N_DATA//4], xs[-N_DATA//4:]])
xs_test = xs[N_DATA//4:-N_DATA//4]
ts_train = numpy.hstack([ts[:N_DATA//4], ts[-N_DATA//4:]])
ts_test = ts[N_DATA//4:-N_DATA//4]
In [ ]:
# Normalise and scale.
import sklearn.preprocessing
normaliser = sklearn.preprocessing.Normalizer()
scaler = sklearn.preprocessing.StandardScaler().fit(normaliser.transform(xs_train[:, -5:]))
xs_train = numpy.hstack([normaliser.transform(xs_train[:, :-5]),
scaler.transform(normaliser.transform(xs_train[:, -5:]))])
xs_test = numpy.hstack([normaliser.transform(xs_test[:, :-5]),
scaler.transform(normaliser.transform(xs_test[:, -5:]))])
In [ ]:
from crowdastro.training_data import remove_nans
import crowdastro.config
def test_subject(subject, conn, classifier, stellarity_cutoff=1):
radio = crowdastro.data.get_radio(subject, size='5x5')
potential_hosts = crowdastro.data.get_potential_hosts(subject, GATOR_CACHE)
potential_hosts = {i:j for i, j in potential_hosts.items() if j['stell_36'] < 0.75}
cur = conn.cursor()
consensus = list(cur.execute('SELECT source_x, source_y FROM consensuses_kde WHERE zooniverse_id = ?'
'AND radio_agreement >= 0.5',
[subject['zooniverse_id']]))
if not consensus:
raise ValueError('Found null in data assumed sanitised.')
((cx, cy),) = consensus
if not cx or not cy:
raise ValueError('Found null in data assumed sanitised.')
# Consensuses are inverted vertically w.r.t. the potential hosts.
cy = crowdastro.config.get('fits_image_height') - cy
n = len(potential_hosts)
xs = numpy.zeros((n, N_FEATURES))
radio_patches = numpy.zeros((n, 1, RADIUS * 2, RADIUS * 2))
ts = numpy.zeros((n,))
points = []
for index, ((x, y), data) in enumerate(potential_hosts.items()):
patch = radio[x - RADIUS + PADDING : x + RADIUS + PADDING, y - RADIUS + PADDING : y + RADIUS + PADDING]
radio_patches[index, 0] = patch
xs[index, -5] = remove_nans(data['flux_ap2_24'])
xs[index, -4] = remove_nans(data['flux_ap2_36'])
xs[index, -3] = remove_nans(data['flux_ap2_45'])
xs[index, -2] = remove_nans(data['flux_ap2_58'])
xs[index, -1] = remove_nans(data['flux_ap2_80'])
points.append((x, y))
closest = min(enumerate(points), key=lambda z: numpy.hypot(z[1][0] - cx, z[1][1] - cy))
assert points[closest[0]] == closest[1]
xs[:, :-5] = numpy.dot(get_convolutional_features(radio_patches).reshape((n, -1)), pca.T)
xs = numpy.hstack([normaliser.transform(xs[:, :-5]),
scaler.transform(normaliser.transform(xs[:, -5:]))])
probs = classifier.predict_proba(xs)[:, 1]
selection = probs.argmax()
return selection, closest[0], probs, points
In [ ]:
import sklearn.linear_model
lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
lr.fit(xs_train, ts_train)
In [ ]:
lr.score(xs_test, ts_test)
In [ ]:
import sqlite3
import matplotlib.pyplot
%matplotlib inline
import crowdastro.show
def softmax(x):
exp = numpy.exp(x)
return exp / numpy.sum(exp, axis=0)
with sqlite3.connect(DB_PATH) as conn:
cur = conn.cursor()
zooniverse_ids = list(cur.execute("""SELECT zooniverse_id
FROM consensuses_kde
GROUP BY zooniverse_id
HAVING COUNT(zooniverse_id) = 1"""))
assert len(zooniverse_ids) in {1087, 1093}, len(zooniverse_ids)
n_correct = 0
n_total = 0
split_index = len(zooniverse_ids)//4
for index, (zooniverse_id,) in enumerate(zooniverse_ids[split_index:-split_index]):
subject = crowdastro.data.get_subject(zooniverse_id)
try:
# print(zooniverse_id)
guess, groundtruth, probs, points = test_subject(subject, conn, lr)
# print(points, guess, groundtruth)
n_correct += numpy.hypot(points[guess][0] - points[groundtruth][0],
points[guess][1] - points[groundtruth][1]) <= 5 # 5 px radius = 3''
n_total += 1
# print(sorted(points))
print('{:.02%} / {:.02%}'.format(index / (len(zooniverse_ids[split_index:-split_index])),
n_correct / n_total))
if index < 50:
matplotlib.pyplot.figure(figsize=(20, 10))
print(zooniverse_id)
matplotlib.pyplot.subplot(1, 2, 1)
crowdastro.show.subject(subject)
matplotlib.pyplot.scatter(*zip(*points), c=softmax(probs), s=200, cmap='cool')
matplotlib.pyplot.axis('off')
matplotlib.pyplot.colorbar()
matplotlib.pyplot.scatter(points[guess][0], points[guess][1], c='green', s=200, marker='+')
# print(points[groundtruth][0], 200-points[groundtruth][1])
matplotlib.pyplot.scatter(points[groundtruth][0], points[groundtruth][1], c='red', s=200, marker='x')
matplotlib.pyplot.subplot(1, 2, 2)
matplotlib.pyplot.scatter(range(len(probs)), sorted(softmax(probs)), marker='o', c=sorted(softmax(probs)),
cmap='cool', s=200)
matplotlib.pyplot.xlabel('Index')
matplotlib.pyplot.ylabel('Probability of being the true host')
matplotlib.pyplot.show()
except ValueError:
continue
print('{:.02%}'.format(n_correct / n_total))
In [ ]:
import sklearn.ensemble
rf = sklearn.ensemble.RandomForestClassifier(class_weight='balanced')
rf.fit(xs_train, ts_train)
In [ ]:
rf.score(xs_test, ts_test)
In [ ]:
with sqlite3.connect(DB_PATH) as conn:
cur = conn.cursor()
zooniverse_ids = list(cur.execute("""SELECT zooniverse_id
FROM consensuses_kde
GROUP BY zooniverse_id
HAVING COUNT(zooniverse_id) = 1"""))
assert len(zooniverse_ids) in {1087, 1093}, len(zooniverse_ids)
n_correct = 0
n_total = 0
split_index = len(zooniverse_ids)//4
for index, (zooniverse_id,) in enumerate(zooniverse_ids[split_index:-split_index]):
subject = crowdastro.data.get_subject(zooniverse_id)
try:
# print(zooniverse_id)
guess, groundtruth, probs, points = test_subject(subject, conn, rf)
n_correct += numpy.hypot(points[guess][0] - points[groundtruth][0],
points[guess][1] - points[groundtruth][1]) <= 5 # 5 px radius = 3''
n_total += 1
# print(sorted(points))
print('{:.02%} / {:.02%}'.format(index / (len(zooniverse_ids[split_index:-split_index])),
n_correct / n_total))
if index < 50:
matplotlib.pyplot.figure(figsize=(20, 10))
print(zooniverse_id)
matplotlib.pyplot.subplot(1, 2, 1)
crowdastro.show.subject(subject)
matplotlib.pyplot.scatter(*zip(*points), c=softmax(probs), s=200, cmap='cool')
matplotlib.pyplot.axis('off')
matplotlib.pyplot.colorbar()
matplotlib.pyplot.scatter(points[guess][0], points[guess][1], c='green', s=200, marker='+')
# print(points[groundtruth][0], 200-points[groundtruth][1])
matplotlib.pyplot.scatter(points[groundtruth][0], points[groundtruth][1], c='red', s=200, marker='x')
matplotlib.pyplot.subplot(1, 2, 2)
matplotlib.pyplot.scatter(range(len(probs)), sorted(softmax(probs)), marker='o', c=sorted(softmax(probs)),
cmap='cool', s=200)
matplotlib.pyplot.xlabel('Index')
matplotlib.pyplot.ylabel('Probability of being the true host')
matplotlib.pyplot.show()
except ValueError:
continue
print('{:.02%}'.format(n_correct / n_total))
In [ ]:
rf.feature_importances_
In [ ]:
matplotlib.pyplot.figure(figsize=(15, 15))
matplotlib.pyplot.xlabel('Feature index')
matplotlib.pyplot.ylabel('Importance')
matplotlib.pyplot.plot(numpy.abs(rf.feature_importances_ / numpy.max(rf.feature_importances_)), color='blue')
matplotlib.pyplot.plot(numpy.abs(-lr.coef_[0] / numpy.min(lr.coef_)), color='red')
matplotlib.pyplot.show()
In [ ]:
lr.coef_ / numpy.max(lr.coef_)
In [ ]: