Predictors

Now that we have our test sets, we can go ahead and train and test some predictors.


In [9]:
from astropy.coordinates import SkyCoord
import h5py
import keras
import numpy
import sklearn.ensemble
import sklearn.linear_model
import sklearn.metrics
import sklearn.neighbors

In [10]:
def balanced_accuracy(y_true, y_pred):
    tp = (numpy.logical_and(y_true == y_pred, y_true == 1)).sum()
    p = (y_true == 1).sum()
    tn = (numpy.logical_and(y_true == y_pred, y_true == 0)).sum()
    n = (y_true == 0).sum()
    
    return (tp / p + tn / n) / 2

In [11]:
C = 100.0

Label sets: Norris, Fan, RGZ MV


In [12]:
with h5py.File('../data/crowdastro.h5') as crowdastro_h5:
    with h5py.File('../data/training.h5') as training_h5:
        rgz_mv = training_h5['labels'].value
        fan = crowdastro_h5['/wise/cdfs/fan_labels'].value
        norris = crowdastro_h5['/wise/cdfs/norris_labels'].value
        names = crowdastro_h5['/wise/cdfs/string'].value
        test_sets = crowdastro_h5['/wise/cdfs/test_sets'].value
        features = training_h5['features'].value

Testing


In [13]:
for name, labels in (('Norris', norris), ('Fan', fan), ('RGZ MV', rgz_mv)):
    print('{:<15}'.format(name), end='\t')
    for test_set in test_sets:
        print('{:.02%}'.format(balanced_accuracy(norris[test_set], labels[test_set])), end='\t')
    print()


Norris         	100.00%	100.00%	100.00%	100.00%	100.00%	
Fan            	100.00%	100.00%	100.00%	100.00%	100.00%	
RGZ MV         	83.76%	84.34%	84.49%	84.84%	86.63%	

Logistic regression trained on the label sets


In [14]:
for name, labels in (('LR(Norris)', norris), ('LR(Fan)', fan), ('LR(RGZ MV)', rgz_mv)):
    print('{:<15}'.format(name), end='\t')
    for test_set in test_sets:
        train_set = sorted(set(range(len(labels))) - set(test_set))
        lr = sklearn.linear_model.LogisticRegression(C=C, class_weight='balanced')
        lr.fit(features[train_set], labels[train_set])
        print('{:.02%}'.format(balanced_accuracy(norris[test_set], lr.predict(features[test_set]))), end='\t')
    print()


LR(Norris)     	86.73%	86.87%	86.46%	86.34%	88.13%	
LR(Fan)        	87.28%	86.55%	86.82%	87.13%	88.70%	
LR(RGZ MV)     	85.17%	84.75%	84.93%	85.41%	86.43%	

Random forests trained on the label sets


In [18]:
for name, labels in (('RF(Norris)', norris), ('RF(Fan)', fan), ('RF(RGZ MV)', rgz_mv)):
    print('{:<15}'.format(name), end='\t')
    for test_set in test_sets:
        train_set = sorted(set(range(len(labels))) - set(test_set))
        rf = sklearn.ensemble.RandomForestClassifier(class_weight='balanced')
        rf.fit(features[train_set], labels[train_set])
        print('{:.02%}'.format(balanced_accuracy(norris[test_set], rf.predict(features[test_set]))), end='\t')
    print()


RF(Norris)     	74.11%	75.83%	73.37%	74.24%	75.96%	
RF(Fan)        	71.79%	72.22%	69.85%	72.13%	74.00%	
RF(RGZ MV)     	80.05%	78.04%	78.70%	81.30%	80.94%	

Logistic regression trained directly on crowd labels


In [66]:
with h5py.File('../data/crowdastro.h5') as crowdastro_h5:
    classifications = crowdastro_h5['/atlas/cdfs/classification_positions'].value
    ir_positions = crowdastro_h5['/wise/cdfs/numeric'][:, :2]
    crowd_indices = []
    crowd_labels = []
    ir_tree = sklearn.neighbors.KDTree(ir_positions)
    for atlas_index, ra, dec in classifications:
        if numpy.isnan(ra) or numpy.isnan(dec):
            continue

        skycoord = SkyCoord(ra=ra, dec=dec, unit=('deg', 'deg'))
        ra = skycoord.ra.degree
        dec = skycoord.dec.degree
        irs, dists = ir_tree.query_radius([(ra, dec)], 1 / 60, return_distance=True)
        min_dists, min_irs = ir_tree.query([(ra, dec)])
        if min_dists[0][0] > 1 / 60:
            continue

        selected = min_irs[0][0]
        for ir in irs[0]:
            crowd_indices.append(ir)
            if ir == selected:
                crowd_labels.append(1)
            else:
                crowd_labels.append(0)
        
    crowd_indices = numpy.array(crowd_indices)
    crowd_labels = numpy.array(crowd_labels)

In [67]:
print('{:<15}'.format('LR(RGZ)'), end='\t')
for test_set in test_sets:
    test_set_set = set(test_set)
    
    train_features = []
    train_labels = []
    
    for index, label in zip(crowd_indices, crowd_labels):
        if index not in test_set_set:
            train_features.append(features[index])
            train_labels.append(label)
    
    lr = sklearn.linear_model.LogisticRegression(C=C, class_weight='balanced')
    lr.fit(train_features, train_labels)

    print('{:.02%}'.format(balanced_accuracy(norris[test_set], lr.predict(features[test_set]))), end='\t')
print()


LR(RGZ)        	85.18%	85.22%	84.77%	85.27%	85.03%	

In [68]:
print('{:<15}'.format('RF(RGZ)'), end='\t')
for test_set in test_sets:
    test_set_set = set(test_set)
    
    train_features = []
    train_labels = []
    
    for index, label in zip(crowd_indices, crowd_labels):
        if index not in test_set_set:
            train_features.append(features[index])
            train_labels.append(label)
    
    rf = sklearn.ensemble.RandomForestClassifier(class_weight='balanced')
    rf.fit(train_features, train_labels)

    print('{:.02%}'.format(balanced_accuracy(norris[test_set], rf.predict(features[test_set]))), end='\t')
print()


RF(RGZ)        	83.37%	83.67%	84.46%	85.53%	85.40%	

Neural network trained on the label sets


In [33]:
model = keras.models.Sequential()
model.add(keras.layers.core.Dense(256, input_shape=(features.shape[1],), activation='tanh'))
model.add(keras.layers.core.Dense(128, activation='tanh'))
model.add(keras.layers.core.Dense(64, activation='tanh'))
model.add(keras.layers.core.Dense(1, activation='sigmoid'))

In [35]:
for name, labels in (('NN(Norris)', norris), ('NN(Fan)', fan), ('NN(RGZ MV)', rgz_mv)):
    print('{:<15}'.format(name), end='\t')
    for test_set in test_sets:
        train_set = sorted(set(range(len(labels))) - set(test_set))
        model.compile('adagrad', 'binary_crossentropy')
        model.fit(features[train_set], labels[train_set], verbose=0, nb_epoch=100)
        predictions = model.predict(features[test_set]).round().ravel()
        print('{:.02%}'.format(balanced_accuracy(norris[test_set], predictions)), end='\t')
    print()


NN(Norris)     	73.56%	77.62%	76.08%	78.63%	76.13%	
NN(Fan)        	75.83%	75.76%	75.75%	75.69%	77.28%	
NN(RGZ MV)     	83.55%	83.97%	83.70%	84.72%	85.72%	

In [ ]: