In [20]:
import h5py, sklearn.linear_model, sklearn.metrics, matplotlib.pyplot as plt, seaborn, sklearn.neighbors, numpy
from astropy.coordinates import SkyCoord
%matplotlib inline
NORRIS_DAT_PATH = '../data/norris_2006_atlas_classifications_ra_dec_only.dat'
# Load Norris labels.
with h5py.File('../data/training.h5', 'r') as training_f:
ir_positions = training_f['positions'].value
ir_tree = sklearn.neighbors.KDTree(ir_positions)
with open(NORRIS_DAT_PATH, 'r') as norris_dat:
norris_coords = [r.strip().split('|') for r in norris_dat]
norris_labels = numpy.zeros((len(ir_positions)))
for ra, dec in norris_coords:
# Find a neighbour.
skycoord = SkyCoord(ra=ra, dec=dec, unit=('hourangle', 'deg'))
ra = skycoord.ra.degree
dec = skycoord.dec.degree
((dist,),), ((ir,),) = ir_tree.query([(ra, dec)])
if dist < 0.1:
norris_labels[ir] = 1
with h5py.File('../data/training.h5') as f:
features = f['features'].value
labels = f['labels'].value
ir_test = f['is_ir_test'].value.astype(bool)
ir_train = f['is_ir_train'].value.astype(bool)
In [7]:
lr = sklearn.linear_model.LogisticRegression(C=100.0, class_weight='balanced')
In [8]:
lr.fit(features[ir_train], labels[ir_train])
Out[8]:
In [11]:
cm = sklearn.metrics.confusion_matrix(labels[ir_test], lr.predict(features[ir_test]))
tp = cm[1, 1]
n, p = cm.sum(axis=1)
tn = cm[0, 0]
ba = (tp / p + tn / n) / 2
ba
Out[11]:
In [24]:
score = lr.decision_function(features[ir_test])
predicted_labels = lr.predict(features[ir_test])
pos_score = score[norris_labels[ir_test] == 1]
neg_score = score[norris_labels[ir_test] == 0]
assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
plt.figure(figsize=(10, 5))
seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
plt.show()
In [77]:
# Positive examples incorrectly classified as negative.
false_negatives = numpy.logical_and(norris_labels[ir_test] == 1, predicted_labels == 0).nonzero()[0]
false_negatives = false_negatives[score[false_negatives].argsort()]
true_positives = numpy.logical_and(norris_labels[ir_test] == 1, predicted_labels == 1).nonzero()[0]
true_positives = true_positives[score[true_positives].argsort()]
most_negative = numpy.arange(features.shape[0])[ir_test][false_negatives[:5]]
least_negative = numpy.arange(features.shape[0])[ir_test][false_negatives[-5:]]
most_positive = numpy.arange(features.shape[0])[ir_test][true_positives[-5:]]
print(lr.predict(features[most_negative]), norris_labels[most_negative], lr.decision_function(features[most_negative]))
print(lr.predict(features[least_negative]), norris_labels[least_negative], lr.decision_function(features[least_negative]))
print(lr.predict(features[most_positive]), norris_labels[most_positive], lr.decision_function(features[most_positive]))
In [80]:
# Get raw radio patches to look at.
with h5py.File('../data/crowdastro.h5') as f:
plt.figure(figsize=(15, 9))
for i, im in enumerate(f['/wise/cdfs/numeric'][sorted(most_negative), -1024:].reshape((5, 32, 32))):
plt.subplot(3, 5, i + 1)
plt.pcolor(im, cmap='inferno')
plt.xlim((0, 32))
plt.ylim((0, 32))
for i, im in enumerate(f['/wise/cdfs/numeric'][sorted(least_negative), -1024:].reshape((5, 32, 32))):
plt.subplot(3, 5, i + 6)
plt.pcolor(im, cmap='inferno')
plt.xlim((0, 32))
plt.ylim((0, 32))
for i, im in enumerate(f['/wise/cdfs/numeric'][sorted(most_positive), -1024:].reshape((5, 32, 32))):
plt.subplot(3, 5, i + 11)
plt.pcolor(im, cmap='inferno')
plt.xlim((0, 32))
plt.ylim((0, 32))
plt.show()
print(f['/wise/cdfs/numeric'][sorted(most_negative), :2])
print(f['/wise/cdfs/numeric'][sorted(least_negative), :2])
print(f['/wise/cdfs/numeric'][sorted(most_positive), :2])
In [ ]:
# Get the corresponding IR, too.
import requests