Colour-Colour Features

Are the other colour–colour features (differences of log-fluxes, i.e. magnitude differences) useful predictors?


In [1]:
import sklearn.linear_model, h5py, numpy, crowdastro.crowd.util

In [20]:
with h5py.File('/Users/alger/data/Crowdastro/all_training_data_01_05_17.h5', 'r') as f:
    # h5py's Dataset.value is deprecated (removed in h5py 3.0); read with [...] instead.
    features = f['features'][...]
    labels = f['norris_labels'][...]
    train_sets = f['sets/RGZ & Norris/train'][...]
    test_sets = f['sets/RGZ & Norris/test'][...]
with h5py.File('/Users/alger/data/Crowdastro/crowdastro-swire.h5', 'r') as f:
    # Columns 2:7 of the numeric table hold the five Spitzer fluxes
    # (3.6, 4.5, 5.8, 8.0 and 24 micron bands) -- assumed flux units; TODO confirm.
    swire_features = f['/swire/cdfs/numeric'][:, 2:2 + 5]

# Spitzer sensitivity limit for each band, keyed by band (36 == 3.6 micron, etc.).
SPITZER_SENSITIVITIES = {
    36: 7.3,
    45: 9.7,
    58: 27.5,
    80: 32.5,
    24: 450,
}

# Replace missing-flux sentinels with the band's sensitivity limit so the
# log-ratios below stay finite. -99 marks a missing flux in every band; the
# three longest-wavelength bands additionally use 0 as a sentinel.
# NOTE(review): the 3.6/4.5 micron columns are not checked for 0 -- confirm intended.
SWIRE_BANDS = [36, 45, 58, 80, 24]
for column, band in enumerate(SWIRE_BANDS):
    missing = swire_features[:, column] == -99
    if band in {58, 80, 24}:
        missing |= swire_features[:, column] == 0
    swire_features[missing, column] = SPITZER_SENSITIVITIES[band]

# Colour-colour features: differences of log-fluxes (equivalently, log flux
# ratios) for each pair of Spitzer bands.
colour_colour_36_45 = numpy.log10(swire_features[:, 0]) - numpy.log10(swire_features[:, 1])
colour_colour_36_58 = numpy.log10(swire_features[:, 0]) - numpy.log10(swire_features[:, 2])
colour_colour_36_80 = numpy.log10(swire_features[:, 0]) - numpy.log10(swire_features[:, 3])
colour_colour_45_58 = numpy.log10(swire_features[:, 1]) - numpy.log10(swire_features[:, 2])
colour_colour_45_80 = numpy.log10(swire_features[:, 1]) - numpy.log10(swire_features[:, 3])
colour_colour_58_80 = numpy.log10(swire_features[:, 2]) - numpy.log10(swire_features[:, 3])

In [16]:
# Train logistic regression on the base feature set as a baseline.
# NOTE(review): the original comment said "the 36-45 and 45-58 features",
# but all columns of `features` are used here.
# Standardise ONCE, on a copy: the original did `f = features; f -= f.mean(...)`
# inside the loop, which (a) mutated the global `features` array in place,
# clobbering it for later cells, and (b) redundantly re-standardised every fold.
f = (features - features.mean(axis=0)) / features.std(axis=0)
bas = []  # balanced accuracy per cross-validation fold
for train, test in zip(train_sets, test_sets):
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(f[train], labels[train])
    p = lr.predict(f[test])
    bas.append(crowdastro.crowd.util.balanced_accuracy(labels[test], p))
print(numpy.mean(bas), numpy.std(bas))


[-0.10719221  0.47285233 -0.2232971  ...,  0.03450133  0.07687134
  0.04083214]
[-0.12296261  0.38621728 -0.39697595 ..., -0.09677584 -0.0425222
 -0.03138334]
[-0.10374743  0.51388107 -0.33630897 ..., -0.1169043  -0.0833884
 -0.17638748]
[-0.10126773  0.41021902 -0.28329018 ..., -0.04553762 -0.02926188
 -0.02935066]
0.944788992331 0.0141064182402

In [21]:
# Train logistic regression with all six colour-colour (log flux ratio)
# features appended to the base feature set.
# Build and standardise the design matrix ONCE: the original rebuilt the
# identical vstack and re-standardised it inside every fold of the loop
# (loop-invariant work repeated once per cross-validation split).
f_all = numpy.vstack([features.T, colour_colour_36_45, colour_colour_36_58,
                      colour_colour_36_80, colour_colour_45_58, colour_colour_45_80,
                      colour_colour_58_80]).T
f_all = (f_all - f_all.mean(axis=0)) / f_all.std(axis=0)
bas_all = []  # balanced accuracy per cross-validation fold
for train, test in zip(train_sets, test_sets):
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(f_all[train], labels[train])
    p = lr.predict(f_all[test])
    bas_all.append(crowdastro.crowd.util.balanced_accuracy(labels[test], p))
print(numpy.mean(bas_all), numpy.std(bas_all))


(42252, 1035)
(42252, 1035)
(42252, 1035)
(42252, 1035)
0.945969430012 0.0171863151418

In [28]:
list(lr.coef_.ravel()[:5]) + list(lr.coef_.ravel()[-6:])


Out[28]:
[0.20804105640636134,
 -0.21774015580042197,
 -0.35497569981104871,
 0.13257619859275122,
 -2.4503358849367789,
 -0.4137405445307476,
 0.19434280406804824,
 0.042171695507318287,
 0.43063967035838507,
 0.21440396177362994,
 -0.2226582290557802]

In [ ]: