In [67]:
import sys
from astropy.coordinates import SkyCoord
import h5py
import matplotlib.pyplot as plt
import numpy
import sklearn.linear_model
import sklearn.ensemble
import sklearn.metrics
import sklearn.neighbors
sys.path.insert(1, '..')
import crowdastro
NORRIS_DAT_PATH = '../data/norris_2006_atlas_classifications_ra_dec_only.dat'
TRAINING_H5_PATH = '../data/training.h5'
%matplotlib inline
In [15]:
with h5py.File(TRAINING_H5_PATH) as training_f:
print('Total:', training_f['features'].shape[0])
print('Testing:', training_f['is_ir_test'].value.sum(),
'({:.02%})'.format(training_f['is_ir_test'].value.sum() / training_f['features'].shape[0]))
print('Training:', training_f['is_ir_train'].value.sum(),
'({:.02%})'.format(training_f['is_ir_train'].value.sum() / training_f['features'].shape[0]))
atlas_test = training_f['is_atlas_test'].value.sum()
atlas_train = training_f['is_atlas_train'].value.sum()
atlas_total = atlas_test + atlas_train
print('Testing:', atlas_test, '({:.02%})'.format(atlas_test / atlas_total))
print('Testing:', atlas_train, '({:.02%})'.format(atlas_train / atlas_total))
In [ ]:
with h5py.File(TRAINING_H5_PATH) as training_f:
lr = sklearn.linear_model.LogisticRegression(n_jobs=-1, class_weight='balanced', C=100.0, penalty='l1')
x = training_f['features'][training_f['is_ir_train'].value, :]
y = training_f['labels'][training_f['is_ir_train'].value]
lr.fit(x, y)
In [79]:
# Load Norris labels.
with h5py.File(TRAINING_H5_PATH, 'r') as training_f:
ir_positions = training_f['positions'].value
ir_tree = sklearn.neighbors.KDTree(ir_positions)
with open(NORRIS_DAT_PATH, 'r') as norris_dat:
norris_coords = [r.strip().split('|') for r in norris_dat]
norris_labels = numpy.zeros((len(ir_positions)))
for ra, dec in norris_coords:
# Find a neighbour.
skycoord = SkyCoord(ra=ra, dec=dec, unit=('hourangle', 'deg'))
ra = skycoord.ra.degree
dec = skycoord.dec.degree
((dist,),), ((ir,),) = ir_tree.query([(ra, dec)])
if dist < 0.1:
norris_labels[ir] = 1
In [ ]:
# Predict.
with h5py.File(TRAINING_H5_PATH) as training_f:
test_indices = training_f['is_ir_test'].value
x = training_f['features'][test_indices, :]
t = norris_labels[test_indices]
y = lr.predict(x)
In [ ]:
# Raw accuracy.
sklearn.metrics.accuracy_score(t, y)
In [ ]:
# Balanced accuracy.
cm = sklearn.metrics.confusion_matrix(t, y).astype(float)
cm /= cm.sum(axis=1).reshape((-1, 1))
cm.trace() / 2