In [9]:
from astropy.coordinates import SkyCoord
import h5py
import keras
import numpy
import sklearn.ensemble
import sklearn.linear_model
import sklearn.metrics
import sklearn.neighbors
In [10]:
def balanced_accuracy(y_true, y_pred):
    """Mean of the true positive rate and the true negative rate."""
    tp = (numpy.logical_and(y_true == y_pred, y_true == 1)).sum()
    p = (y_true == 1).sum()
    tn = (numpy.logical_and(y_true == y_pred, y_true == 0)).sum()
    n = (y_true == 0).sum()
    return (tp / p + tn / n) / 2
In [11]:
C = 100.0
In [12]:
# Load the label sets, features, and test sets.
with h5py.File('../data/crowdastro.h5', 'r') as crowdastro_h5:
    with h5py.File('../data/training.h5', 'r') as training_h5:
        rgz_mv = training_h5['labels'].value
        fan = crowdastro_h5['/wise/cdfs/fan_labels'].value
        norris = crowdastro_h5['/wise/cdfs/norris_labels'].value
        names = crowdastro_h5['/wise/cdfs/string'].value
        test_sets = crowdastro_h5['/wise/cdfs/test_sets'].value
        features = training_h5['features'].value
In [13]:
# Balanced accuracy of each label set against the Norris labels, per test set.
for name, labels in (('Norris', norris), ('Fan', fan), ('RGZ MV', rgz_mv)):
    print('{:<15}'.format(name), end='\t')
    for test_set in test_sets:
        print('{:.02%}'.format(balanced_accuracy(norris[test_set], labels[test_set])), end='\t')
    print()
In [14]:
# Logistic regression trained on each label set, evaluated against the Norris labels.
for name, labels in (('LR(Norris)', norris), ('LR(Fan)', fan), ('LR(RGZ MV)', rgz_mv)):
    print('{:<15}'.format(name), end='\t')
    for test_set in test_sets:
        train_set = sorted(set(range(len(labels))) - set(test_set))
        lr = sklearn.linear_model.LogisticRegression(C=C, class_weight='balanced')
        lr.fit(features[train_set], labels[train_set])
        print('{:.02%}'.format(balanced_accuracy(norris[test_set], lr.predict(features[test_set]))), end='\t')
    print()
In [18]:
for name, labels in (('RF(Norris)', norris), ('RF(Fan)', fan), ('RF(RGZ MV)', rgz_mv)):
    print('{:<15}'.format(name), end='\t')
    for test_set in test_sets:
        train_set = sorted(set(range(len(labels))) - set(test_set))
        rf = sklearn.ensemble.RandomForestClassifier(class_weight='balanced')
        rf.fit(features[train_set], labels[train_set])
        print('{:.02%}'.format(balanced_accuracy(norris[test_set], rf.predict(features[test_set]))), end='\t')
    print()
In [66]:
# Build per-classification crowd labels: for each Radio Galaxy Zoo classification,
# the nearest WISE object within 1 arcminute is a positive example and every other
# WISE object within 1 arcminute is a negative example.
with h5py.File('../data/crowdastro.h5', 'r') as crowdastro_h5:
    classifications = crowdastro_h5['/atlas/cdfs/classification_positions'].value
    ir_positions = crowdastro_h5['/wise/cdfs/numeric'][:, :2]
    crowd_indices = []
    crowd_labels = []
    ir_tree = sklearn.neighbors.KDTree(ir_positions)
    for atlas_index, ra, dec in classifications:
        if numpy.isnan(ra) or numpy.isnan(dec):
            continue

        # Normalise the classification position.
        skycoord = SkyCoord(ra=ra, dec=dec, unit=('deg', 'deg'))
        ra = skycoord.ra.degree
        dec = skycoord.dec.degree

        # All WISE objects within 1 arcminute of the click, and the nearest one.
        irs, dists = ir_tree.query_radius([(ra, dec)], 1 / 60, return_distance=True)
        min_dists, min_irs = ir_tree.query([(ra, dec)])
        if min_dists[0][0] > 1 / 60:
            continue

        selected = min_irs[0][0]
        for ir in irs[0]:
            crowd_indices.append(ir)
            if ir == selected:
                crowd_labels.append(1)
            else:
                crowd_labels.append(0)

crowd_indices = numpy.array(crowd_indices)
crowd_labels = numpy.array(crowd_labels)
In [67]:
# Logistic regression trained directly on the individual crowd classifications,
# excluding objects that appear in the test set.
print('{:<15}'.format('LR(RGZ)'), end='\t')
for test_set in test_sets:
    test_set_set = set(test_set)
    train_features = []
    train_labels = []
    for index, label in zip(crowd_indices, crowd_labels):
        if index not in test_set_set:
            train_features.append(features[index])
            train_labels.append(label)
    lr = sklearn.linear_model.LogisticRegression(C=C, class_weight='balanced')
    lr.fit(train_features, train_labels)
    print('{:.02%}'.format(balanced_accuracy(norris[test_set], lr.predict(features[test_set]))), end='\t')
print()
In [68]:
print('{:<15}'.format('RF(RGZ)'), end='\t')
for test_set in test_sets:
    test_set_set = set(test_set)
    train_features = []
    train_labels = []
    for index, label in zip(crowd_indices, crowd_labels):
        if index not in test_set_set:
            train_features.append(features[index])
            train_labels.append(label)
    rf = sklearn.ensemble.RandomForestClassifier(class_weight='balanced')
    rf.fit(train_features, train_labels)
    print('{:.02%}'.format(balanced_accuracy(norris[test_set], rf.predict(features[test_set]))), end='\t')
print()
In [33]:
def make_model():
    """Fully connected network with a sigmoid output for binary classification."""
    model = keras.models.Sequential()
    model.add(keras.layers.core.Dense(256, input_shape=(features.shape[1],), activation='tanh'))
    model.add(keras.layers.core.Dense(128, activation='tanh'))
    model.add(keras.layers.core.Dense(64, activation='tanh'))
    model.add(keras.layers.core.Dense(1, activation='sigmoid'))
    model.compile('adagrad', 'binary_crossentropy')
    return model
In [35]:
for name, labels in (('NN(Norris)', norris), ('NN(Fan)', fan), ('NN(RGZ MV)', rgz_mv)):
    print('{:<15}'.format(name), end='\t')
    for test_set in test_sets:
        train_set = sorted(set(range(len(labels))) - set(test_set))
        # Rebuild the network so that each run starts from fresh weights.
        model = make_model()
        model.fit(features[train_set], labels[train_set], verbose=0, nb_epoch=100)
        predictions = model.predict(features[test_set]).round().ravel()
        print('{:.02%}'.format(balanced_accuracy(norris[test_set], predictions)), end='\t')
    print()